Merge changes from github.

Change: 152200430
A. Unique TensorFlower 2017-04-04 16:10:08 -08:00 committed by TensorFlower Gardener
parent 9477900946
commit ccbc8991db
147 changed files with 10934 additions and 3336 deletions

1
.gitignore vendored
View File

@ -2,6 +2,7 @@
.ipynb_checkpoints
node_modules
/.bazelrc
/.tf_configure.bazelrc
/bazel-*
/third_party/py/numpy/numpy_include
/tools/bazel.rc

View File

@ -1,6 +1,7 @@
<div align="center">
<img src="https://www.tensorflow.org/images/tf_logo_transp.png"><br><br>
</div>
-----------------
| **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
@ -33,12 +34,12 @@ and discussion.**
People who are a little more adventurous can also try our nightly binaries:
* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.0.1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/))
* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.0.1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/))
* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.1.0rc0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/))
* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.1.0rc0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/))
* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/))
@ -59,11 +60,11 @@ Hello, TensorFlow!
>>>
```
##For more information
## For more information
* [TensorFlow website](http://tensorflow.org)
* [TensorFlow whitepaper](http://download.tensorflow.org/paper/whitepaper2015.pdf)
* [TensorFlow Model Zoo](https://github.com/tensorflow/models)
* [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730)
The TensorFlow community has created amazing things with TensorFlow, please see the [resources section of tensorflow.org](https://www.tensorflow.org/versions/master/resources#community) for an incomplete list.
The TensorFlow community has created amazing things with TensorFlow, please see the [resources section of tensorflow.org](https://www.tensorflow.org/about/#community) for an incomplete list.

View File

@ -1,3 +1,110 @@
# Release 1.1.0
## Major Features and Improvements
* Added Java API support for Windows.
* Added `tf.spectral` module. Moved existing FFT ops to `tf.spectral` while
keeping an alias in the old location (`tf.*`).
* Added 1D, 2D and 3D Fourier transform ops for real signals to `tf.spectral`.
* Added a `tf.bincount` function (see the sketch after this list).
* Added Keras 2 API to contrib.
* Added a new lightweight queue-like object - `RecordInput`.
* Added `tf.contrib.image.compose_transforms` function.
* Bring `tf.estimator.*` into the API. Non-deprecated functionality from `tf.contrib.learn.Estimator` is moved to `tf.estimator.Estimator` with cosmetic changes.
* Docker images: TF images on gcr.io and Docker Hub are upgraded to ubuntu:16.04.
* Added the following features to TensorFlow Debugger (tfdbg):
* Ability to inspect Python source file against TF ops and tensors (command `print_source` / `ps`)
* New navigation bar in Curses-based UI
* NodeStepper (command `invoke_stepper`) now uses intermediate tensor dumps. It also uses `TensorHandles` as direct feeds during successive `cont` calls for improved performance and reduced memory consumption.
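A minimal sketch of the relocated FFT ops and the new `tf.bincount` (assuming TensorFlow 1.1.0; the input values are illustrative, and the new real-signal transforms such as `tf.spectral.rfft` follow the same pattern):
```python
import numpy as np
import tensorflow as tf

# The existing FFT ops are now exposed under tf.spectral (the old tf.* aliases remain).
signal = tf.constant(np.exp(2j * np.pi * np.arange(8) / 8), dtype=tf.complex64)
spectrum = tf.spectral.fft(signal)  # peak expected at bin 1

# tf.bincount counts occurrences of each non-negative integer value.
counts = tf.bincount(tf.constant([0, 1, 1, 3]))  # expected: [1, 2, 0, 1]

with tf.Session() as sess:
    print(sess.run(spectrum))
    print(sess.run(counts))
```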
## Deprecations
* TensorFlow 1.1.0 will be the last time we release a binary with Mac GPU support. Going forward, we will stop testing on Mac GPU systems. We continue to welcome patches that maintain Mac GPU support, and we will try to keep the Mac GPU build working.
## Changes to contrib APIs
* The behavior of RNNCells is now stricter due to the transition towards making RNNCells act more like Keras layers.
* If an RNNCell is used twice in two different variable scopes, an error is raised describing how to avoid this behavior.
* If an RNNCell is used in a variable scope with existing conflicting variables, an error is raised showing that the RNNCell must be constructed with argument `reuse=True` (see the sketch after this list).
* Deprecated contrib/distributions `pmf`, `pdf`, `log_pmf`, `log_pdf`.
* Moved `bayesflow.special_math` to distributions.
* `tf.contrib.tensor_forest.python.tensor_forest.RandomForestDeviceAssigner` removed.
* Changed some MVN classes and parameters:
* `tf.contrib.distributions.MultivariateNormalFull` replaced by `tf.contrib.distributions.MultivariateNormalTriL`.
* `tf.contrib.distributions.MultivariateNormalCholesky` replaced by `tf.contrib.distributions.MultivariateNormalTriL`
* `tf.contrib.distributions.MultivariateNormalDiagWithSoftplusStDev` replaced
by `tf.contrib.distributions.MultivariateNormalDiagWithSoftplusScale`
* `tf.contrib.distributions.MultivariateNormalDiag` arguments changed from `mu`, `diag_stddev` to `loc`, `scale_diag`.
* `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT` removed.
* `tf.contrib.distributions.MultivariateNormalDiagPlusLowRank` added.
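A minimal sketch of the stricter `RNNCell` reuse behavior described above (assuming TensorFlow 1.1.0 and `tf.contrib.rnn.BasicLSTMCell`; shapes and scope names are illustrative):
```python
import tensorflow as tf

inputs = tf.placeholder(tf.float32, [None, 8])
state = (tf.placeholder(tf.float32, [None, 16]),   # (c, h), since state_is_tuple=True
         tf.placeholder(tf.float32, [None, 16]))

with tf.variable_scope("rnn"):
    cell = tf.contrib.rnn.BasicLSTMCell(16)
    out1, state1 = cell(inputs, state)             # first use creates the variables

with tf.variable_scope("rnn"):
    # Re-entering a scope that already holds the cell's variables raises an error
    # unless the cell is constructed with reuse=True.
    reuse_cell = tf.contrib.rnn.BasicLSTMCell(16, reuse=True)
    out2, state2 = reuse_cell(inputs, state)
```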
## Bug Fixes and Other Changes
* Java: Support for loading models exported using the SavedModel API (courtesy @EronWright).
* Go: Added support for incremental graph execution.
* Fix a bug in the WALS solver when single-threaded.
* Added support for integer sparse feature values in `tf.contrib.layers.sparse_column_with_keys`.
* Fixed `tf.set_random_seed(0)` to be deterministic for all ops.
* Stability improvements for the GCS file system support.
* Improved TensorForest performance.
* Added support for multiple filename globs in `tf.matching_files`.
* `LogMessage` now includes a timestamp at the beginning of each message.
* Added MultiBox person detector example standalone binary.
* Android demo: Makefile build functionality added to build.gradle to fully support building the TensorFlow demo for Android on Windows.
* Android demo: read MultiBox priors from a text file rather than a protobuf.
* Added colocation constraints to `StagingArea`.
* `sparse_matmul_op` reenabled for Android builds.
* Restrict weights rank to be the same as the broadcast target, to avoid ambiguity on broadcast rules.
* Upgraded libxsmm to 1.7.1 and applied other changes for performance and memory usage.
* Fixed bfloat16 integration of LIBXSMM sparse mat-mul.
* Improved performance and reduced memory usage by allowing ops to forward input buffers to output buffers and perform computations in place.
* Improved the performance of CPU assignment for strings.
* Sped up matrix * vector multiplication and matrix * matrix multiplication with unknown shapes.
* C API: Graph imports now support input remapping, control dependencies, and returning imported nodes (see `TF_GraphImportGraphDefWithReturnOutputs()`)
* Multiple C++ API updates.
* Multiple TensorBoard updates including:
* Users can now view image summaries at various sampled steps (instead of just the last step).
* Bugs involving switching runs as well as the image dashboard are fixed.
* Removed data download links from TensorBoard.
* TensorBoard uses a relative data directory, for easier embedding.
* TensorBoard automatically ignores outliers for domain calculation, and formats proportional values consistently.
* Multiple tfdbg bug fixes:
* Fixed Windows compatibility issues.
* Command history now persists across runs.
## Thanks to our Contributors
This release contains contributions from many people at Google, as well as:
A. Besir Kurtulmus, Adal Chiriliuc, @akash, Alec-Desouza, Alex Rothberg, Alex
Sergeev, Alexander Heinecke, Allen Guo, Andreas Madsen, Ankesh Anand, Anton
Loss, @Aravind, @Arie, Ashutosh Das, Aurélien Geron, Bairen Yi, @bakunyo, Ben
Visser, Brady Zhou, Calpa Liu, Changming Sun, Chi Zeng, Chih Cheng Liang,
Christopher Berner, Clark Zinzow, @Conchylicultor, Courtial Florian, Dan Ellis,
Dan J, Dan Jarvis, Daniel Ylitalo, Darren Garvey, David Norman, David Truong,
@DavidNorman, Dimitar Pavlov, Dmitry Persiyanov, @Eddie, @elirex, Erfan
Noury, Eron Wright, Evgeny Mazovetskiy, Fabrizio (Misto) Milo, @fanlu, Fisher
Coder, Franck Dernoncourt, Gagan Goel, Gao, Xiang, @Gautam, Gefu Tang,
@guilherme, @guschmue, Hannah Provenza, Hans Pabst, @hartb, Hsiao Yi, Huazuo
Gao, Igor Chorążewicz, Ivan Smirnov, Jakub Kolodziejczyk, Jason Gavris, Jason
Morton, Jay Young, Jayaram Bobba, Jeremy Sawruk, Jiaming Liu, Jihun Choi,
@jiqiu, Joan Thibault, John C F, Jojy G Varghese, Jon Malmaud, Julian Berman,
Julian Niedermeier, Junpeng Lao, Kai Sasaki, @Kankroc, Karl Lessard, Kyle
Bostelmann, @Lezcano, Li Yi, Luo Yun, @lurker, Mahmoud-Abuzaina, Mandeep Singh,
Marek Kolodziej, Mark Szepieniec, Martial Hue, Medhat Omr, Memo Akten, Michael
Gharbi, Michaël Defferrard, Milan Straka, @MircoT, @mlucool, Muammar Ibn Faisal,
Nayana Thorat, @nghiattran, Nicholas Connor, Nikolaas Steenbergen, Niraj Patel,
Niranjan Hasabnis, @Panmari, Pavel Bulanov, Philip Pries Henningsen, Philipp
Jund, @polonez, Prayag Verma, Rahul Kavi, Raphael Gontijo Lopes, @rasbt, Raven
Iqqe, Reid Pryzant, Richard Shin, Rizwan Asif, Russell Kaplan, Ryo Asakura,
Rüdiger Busche, Saisai Shao, Sam Abrahams, @sanosay, Sean Papay, @seaotterman,
@selay01, Shaurya Sharma, Sriram Narayanamoorthy, Stefano Probst, @taknevski,
@tbonza, @teldridge11, Yuan (Terry) Tang, Tim Anglade, Tomas Reimers, Tomer Gafner,
Valentin Iovene, Vamsi Sripathi, Viktor Malyi, Vit Stepanovs, Vivek Rane, Vlad
Firoiu, @wangg12, @will, Xiaoyu Tao, Yaroslav Bulatov,
@Yufeng, Yuming Wang, Yuxin Wu, Zafar Takhirov, Ziming Dong
We are also grateful to all who filed issues or helped resolve them, asked and
answered questions, and were part of inspiring discussions.
# Release 1.0.1
## Bug Fixes and Other Changes
@ -94,7 +201,7 @@ To help you upgrade your existing TensorFlow Python code to match the API change
* In the C++ API (in tensorflow/cc), Input, Output, etc. have moved
from the tensorflow::ops namespace to tensorflow.
* Change arg order for `{softmax,sparse_softmax,sigmoid}_cross_entropy_with_logits` to be (labels, predictions), and force use of named args (see the sketch after this list).
* tf.nn.rnn_cell.* and most functions in tf.nn.rnn.* (with the exception of dynamic_rnn and raw_rnn) are temporarily in tf.contrib.rnn. They will be moved back into core for TF 1.1.
* tf.nn.rnn_cell.* and most functions in tf.nn.rnn.* (with the exception of dynamic_rnn and raw_rnn) are temporarily in tf.contrib.rnn. They will be moved back into core for TF 1.2.
* `tf.nn.sampled_softmax_loss` and `tf.nn.nce_loss` have both changed their API such that the parameters are now passed in the order `labels, inputs` rather than `inputs, labels`.
* The shape keyword argument of the `SparseTensor` constructor was renamed to `dense_shape` between TensorFlow 0.12 and TensorFlow 1.0.
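A minimal sketch of the argument changes listed above (assuming TensorFlow 1.0 or later; the tensors are illustrative):
```python
import tensorflow as tf

logits = tf.constant([[2.0, 1.0, 0.1]])
labels = tf.constant([[1.0, 0.0, 0.0]])

# Named arguments are now required, and labels come before logits/predictions.
loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)

# SparseTensor now takes dense_shape instead of the old shape keyword.
sp = tf.SparseTensor(indices=[[0, 0], [1, 2]],
                     values=[1.0, 2.0],
                     dense_shape=[2, 3])
```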

View File

@ -20,7 +20,9 @@ load("//tensorflow:workspace.bzl", "tf_workspace")
#android_sdk_repository(
# name = "androidsdk",
# api_level = 23,
# build_tools_version = "25.0.1",
# # Ensure that you have the build_tools_version below installed in the
# # SDK manager as it updates periodically.
# build_tools_version = "25.0.2",
# # Replace with path to Android SDK on your system
# path = "<PATH_TO_SDK>",
#)
@ -29,7 +31,9 @@ load("//tensorflow:workspace.bzl", "tf_workspace")
#android_ndk_repository(
# name="androidndk",
# path="<PATH_TO_NDK>",
# api_level=14) # This needs to be 14 or higher to compile TensorFlow.
# # This needs to be 14 or higher to compile TensorFlow.
# # Note that the NDK version is not the API level.
# api_level=14)
# Please add all new TensorFlow dependencies in workspace.bzl.
tf_workspace()

90
configure vendored
View File

@ -8,9 +8,6 @@ pushd `dirname $0` > /dev/null
SOURCE_BASE_DIR=`pwd -P`
popd > /dev/null
# This file contains customized config settings.
touch .bazelrc
PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"
function is_linux() {
@ -38,14 +35,6 @@ function is_windows() {
fi
}
function bazel_fetch() {
if [ -z "$TF_BAZEL_TARGETS" ]; then
bazel fetch "//tensorflow/... -//tensorflow/contrib/nccl/... -//tensorflow/examples/android/..."
else
bazel fetch $TF_BAZEL_TARGETS
fi
}
function sed_hyphen_i() {
if is_macos; then
sed -i '' "$@"
@ -54,6 +43,21 @@ function sed_hyphen_i() {
fi
}
function write_to_bazelrc() {
echo "$1" >> .tf_configure.bazelrc
}
function write_action_env_to_bazelrc() {
write_to_bazelrc "build --action_env $1=\"$2\""
}
# This file contains customized config settings.
rm -f .tf_configure.bazelrc
touch .tf_configure.bazelrc
touch .bazelrc
sed_hyphen_i "/tf_configure/d" .bazelrc
echo "import .tf_configure.bazelrc" >> .bazelrc
# Delete any leftover BUILD files from the Makefile build, which would interfere
# with Bazel parsing.
MAKEFILE_DOWNLOAD_DIR=tensorflow/contrib/makefile/downloads
@ -164,6 +168,7 @@ if is_windows; then
TF_NEED_HDFS=0
TF_NEED_JEMALLOC=0
TF_NEED_OPENCL=0
TF_CUDA_CLANG=0
fi
if is_linux; then
@ -181,9 +186,8 @@ else
TF_NEED_JEMALLOC=0
fi
sed_hyphen_i -e "/with_jemalloc/d" .bazelrc
if [[ "$TF_NEED_JEMALLOC" == "1" ]]; then
echo 'build --define with_jemalloc=true' >>.bazelrc
write_to_bazelrc 'build --define with_jemalloc=true'
fi
while [[ "$TF_NEED_GCP" == "" ]]; do
@ -200,9 +204,8 @@ while [[ "$TF_NEED_GCP" == "" ]]; do
esac
done
sed_hyphen_i -e "/with_gcp_support/d" .bazelrc
if [[ "$TF_NEED_GCP" == "1" ]]; then
echo 'build --define with_gcp_support=true' >>.bazelrc
write_to_bazelrc 'build --define with_gcp_support=true'
fi
while [[ "$TF_NEED_HDFS" == "" ]]; do
@ -219,9 +222,8 @@ while [[ "$TF_NEED_HDFS" == "" ]]; do
esac
done
sed_hyphen_i -e "/with_hdfs_support/d" .bazelrc
if [[ "$TF_NEED_HDFS" == "1" ]]; then
echo 'build --define with_hdfs_support=true' >>.bazelrc
write_to_bazelrc 'build --define with_hdfs_support=true'
fi
## Enable XLA.
@ -235,9 +237,8 @@ while [[ "$TF_ENABLE_XLA" == "" ]]; do
esac
done
sed_hyphen_i -e "/with_xla_support/d" .bazelrc
if [[ "$TF_ENABLE_XLA" == "1" ]]; then
echo 'build --define with_xla_support=true' >>.bazelrc
write_to_bazelrc 'build --define with_xla_support=true'
fi
@ -279,23 +280,11 @@ while [ "$TF_NEED_CUDA" == "" ]; do
esac
done
sed_hyphen_i -e "/--action_env TF_NEED_CUDA/d" .bazelrc
sed_hyphen_i -e "/--action_env CUD/d" .bazelrc
sed_hyphen_i -e "/--action_env GCC_HOST/d" .bazelrc
sed_hyphen_i -e "/--action_env TF_CUD/d" .bazelrc
sed_hyphen_i -e "/--action_env CLANG_CUDA/d" .bazelrc
export TF_NEED_CUDA
echo "build --action_env TF_NEED_CUDA=$TF_NEED_CUDA" >>.bazelrc
write_action_env_to_bazelrc "TF_NEED_CUDA" "$TF_NEED_CUDA"
export TF_NEED_OPENCL
if [[ "$TF_NEED_CUDA" == "0" ]] && [[ "$TF_NEED_OPENCL" == "0" ]]; then
echo "Configuration finished"
bazel_fetch
exit
fi
if [ "$TF_NEED_CUDA" == "1" ]; then
while [[ "$TF_CUDA_CLANG" == "" ]]; do
read -p "Do you want to use clang as CUDA compiler? [y/N] " INPUT
@ -308,7 +297,7 @@ while [[ "$TF_CUDA_CLANG" == "" ]]; do
done
export TF_CUDA_CLANG
echo "build --action_env TF_CUDA_CLANG=$TF_CUDA_CLANG" >>.bazelrc
write_action_env_to_bazelrc "TF_CUDA_CLANG" "$TF_CUDA_CLANG"
# Set up which gcc nvcc should use as the host compiler
# No need to set this on Windows
@ -324,7 +313,7 @@ while [[ "$TF_CUDA_CLANG" != "1" ]] && ! is_windows && true; do
fi
if [ -e "$GCC_HOST_COMPILER_PATH" ]; then
export GCC_HOST_COMPILER_PATH
echo "build --action_env GCC_HOST_COMPILER_PATH=\"$GCC_HOST_COMPILER_PATH\"" >>.bazelrc
write_action_env_to_bazelrc "GCC_HOST_COMPILER_PATH" "$GCC_HOST_COMPILER_PATH"
break
fi
echo "Invalid gcc path. ${GCC_HOST_COMPILER_PATH} cannot be found" 1>&2
@ -348,7 +337,7 @@ while [[ "$TF_CUDA_CLANG" == "1" ]] && true; do
fi
if [ -e "$CLANG_CUDA_COMPILER_PATH" ]; then
export CLANG_CUDA_COMPILER_PATH
echo "build --action_env CLANG_CUDA_COMPILER_PATH=\"$CLANG_CUDA_COMPILER_PATH\"" >>.bazelrc
write_action_env_to_bazelrc "CLANG_CUDA_COMPILER_PATH" "$CLANG_CUDA_COMPILER_PATH"
break
fi
echo "Invalid clang path. ${CLANG_CUDA_COMPILER_PATH} cannot be found" 1>&2
@ -399,10 +388,9 @@ while true; do
if [ -e "${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH}" ]; then
export CUDA_TOOLKIT_PATH
echo "build --action_env CUDA_TOOLKIT_PATH=\"$CUDA_TOOLKIT_PATH\"" >>.bazelrc
write_action_env_to_bazelrc "CUDA_TOOLKIT_PATH" "$CUDA_TOOLKIT_PATH"
export TF_CUDA_VERSION
echo "build --action_env TF_CUDA_VERSION=$TF_CUDA_VERSION" >>.bazelrc
write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION"
break
fi
echo "Invalid path to CUDA $TF_CUDA_VERSION toolkit. ${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH} cannot be found"
@ -417,9 +405,9 @@ done
# Find out where the cuDNN library is installed
while true; do
# Configure the Cudnn version to use.
# Configure the cuDNN version to use.
if [ -z "$TF_CUDNN_VERSION" ]; then
read -p "Please specify the Cudnn version you want to use. [Leave empty to use system default]: " TF_CUDNN_VERSION
read -p "Please specify the cuDNN version you want to use. [Leave empty to use system default]: " TF_CUDNN_VERSION
fi
fromuser=""
@ -454,10 +442,9 @@ while true; do
if [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_ALT_PATH}" -o -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_PATH}" ]; then
export TF_CUDNN_VERSION
echo "build --action_env TF_CUDNN_VERSION=$TF_CUDNN_VERSION" >>.bazelrc
write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION"
export CUDNN_INSTALL_PATH
echo "build --action_env CUDNN_INSTALL_PATH=\"$CUDNN_INSTALL_PATH\"" >>.bazelrc
write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "$CUDNN_INSTALL_PATH"
break
fi
@ -470,10 +457,9 @@ while true; do
CUDNN_PATH_FROM_LDCONFIG="$($LDCONFIG_BIN -p | sed -n 's/.*libcudnn.so .* => \(.*\)/\1/p')"
if [ -e "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}" ]; then
export TF_CUDNN_VERSION
echo "build --action_env TF_CUDNN_VERSION=$TF_CUDNN_VERSION" >>.bazelrc
write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION"
export CUDNN_INSTALL_PATH="$(dirname ${CUDNN_PATH_FROM_LDCONFIG})"
echo "build --action_env CUDNN_INSTALL_PATH=\"$CUDNN_INSTALL_PATH\"" >>.bazelrc
write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "$CUDNN_INSTALL_PATH"
break
fi
fi
@ -525,7 +511,7 @@ EOF
fi
else
export TF_CUDA_COMPUTE_CAPABILITIES
echo "build --action_env TF_CUDA_COMPUTE_CAPABILITIES=$TF_CUDA_COMPUTE_CAPABILITIES" >>.bazelrc
write_action_env_to_bazelrc "TF_CUDA_COMPUTE_CAPABILITIES" "$TF_CUDA_COMPUTE_CAPABILITIES"
break
fi
TF_CUDA_COMPUTE_CAPABILITIES=""
@ -536,9 +522,9 @@ if is_windows; then
export CUDA_PATH="$CUDA_TOOLKIT_PATH"
export CUDA_COMPUTE_CAPABILITIES="$TF_CUDA_COMPUTE_CAPABILITIES"
export NO_WHOLE_ARCHIVE_OPTION=1
# Set GCC_HOST_COMPILER_PATH to keep cuda_configure.bzl happy
export GCC_HOST_COMPILER_PATH="/usr/bin/dummy_compiler"
write_action_env_to_bazelrc "CUDA_PATH" "$CUDA_PATH"
write_action_env_to_bazelrc "CUDA_COMPUTE_CAPABILITIES" "$CUDA_COMPUTE_CAPABILITIES"
write_action_env_to_bazelrc "NO_WHOLE_ARCHIVE_OPTION" "1"
fi
# end of if "$TF_NEED_CUDA" == "1"
@ -629,6 +615,6 @@ done
# end of if "$TF_NEED_OPENCL" == "1"
fi
bazel_fetch
# TODO(gunan): Remove once bazel correctly handles changes in remote repositories.
bazel clean
echo "Configuration finished"

View File

@ -72,7 +72,7 @@ def tfadd_with_ckpt_saver(out_dir):
saver.save(sess, ckpt_file)
# Without the SaverDef, the restore op won't be named correctly.
saver_file = '%s/test_graph_tfadd_with_ckpt_saver.saver' % out_dir
with open(saver_file, 'w') as f:
with open(saver_file, 'wb') as f:
f.write(saver.as_saver_def().SerializeToString())
@ -113,7 +113,7 @@ def write_graph(build_graph, out_dir):
with g.as_default():
build_graph(out_dir)
filename = '%s/test_graph_%s.pb' % (out_dir, build_graph.__name__)
with open(filename, 'w') as f:
with open(filename, 'wb') as f:
f.write(g.as_graph_def().SerializeToString())

View File

@ -116,12 +116,14 @@ class NAryOpsTest(XLATestCase):
np.array([1, 1], dtype=np.int32)],
expected=np.array([[], []], dtype=np.float32))
self._testNAry(lambda x: array_ops.strided_slice(*x),
[np.array([[], [], []], dtype=np.float32),
np.array([1, 0], dtype=np.int64),
np.array([3, 0], dtype=np.int64),
np.array([1, 1], dtype=np.int64)],
expected=np.array([[], []], dtype=np.float32))
if np.int64 in self.int_types:
self._testNAry(
lambda x: array_ops.strided_slice(*x), [
np.array([[], [], []], dtype=np.float32), np.array(
[1, 0], dtype=np.int64), np.array([3, 0], dtype=np.int64),
np.array([1, 1], dtype=np.int64)
],
expected=np.array([[], []], dtype=np.float32))
self._testNAry(lambda x: array_ops.strided_slice(*x),
[np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]],

View File

@ -33,7 +33,7 @@ from tensorflow.python.platform import test
# MaxPoolGrad.
def _AvgPoolGrad(inputs, outputs, output_gradients, ksize, strides, padding):
del outputs # Unused by average-pooling gradients.
return gen_nn_ops.avg_pool3d_grad(
return gen_nn_ops._avg_pool3d_grad(
inputs.get_shape().as_list(),
output_gradients,
ksize=ksize,
@ -263,7 +263,7 @@ class Pooling3DTest(XLATestCase):
def testMaxPoolGradValidPadding1_1_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
gen_nn_ops.max_pool3d_grad,
gen_nn_ops._max_pool3d_grad,
input_sizes=[1, 3, 3, 3, 1],
ksize=[1, 1, 1],
strides=[1, 1, 1],
@ -272,7 +272,7 @@ class Pooling3DTest(XLATestCase):
def testMaxPoolGradValidPadding2_1_6_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
gen_nn_ops.max_pool3d_grad,
gen_nn_ops._max_pool3d_grad,
input_sizes=[2, 3, 3, 6, 3],
ksize=[2, 2, 2],
strides=[1, 1, 1],
@ -281,7 +281,7 @@ class Pooling3DTest(XLATestCase):
def testMaxPoolGradValidPadding2_1_7_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
gen_nn_ops.max_pool3d_grad,
gen_nn_ops._max_pool3d_grad,
input_sizes=[2, 3, 5, 7, 3],
ksize=[2, 2, 2],
strides=[1, 1, 1],
@ -290,7 +290,7 @@ class Pooling3DTest(XLATestCase):
def testMaxPoolGradValidPadding2_2_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
gen_nn_ops.max_pool3d_grad,
gen_nn_ops._max_pool3d_grad,
input_sizes=[2, 2, 2, 2, 3],
ksize=[2, 2, 2],
strides=[2, 2, 2],
@ -299,7 +299,7 @@ class Pooling3DTest(XLATestCase):
def testMaxPoolGradSamePadding1_1_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
gen_nn_ops.max_pool3d_grad,
gen_nn_ops._max_pool3d_grad,
input_sizes=[2, 3, 2, 4, 1],
ksize=[1, 1, 1],
strides=[1, 1, 1],
@ -308,7 +308,7 @@ class Pooling3DTest(XLATestCase):
def testMaxPoolGradSamePadding2_1_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
gen_nn_ops.max_pool3d_grad,
gen_nn_ops._max_pool3d_grad,
input_sizes=[2, 3, 2, 4, 1],
ksize=[2, 2, 2],
strides=[1, 1, 1],
@ -317,7 +317,7 @@ class Pooling3DTest(XLATestCase):
def testMaxPoolGradSamePadding2_2_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
gen_nn_ops.max_pool3d_grad,
gen_nn_ops._max_pool3d_grad,
input_sizes=[2, 5, 2, 4, 3],
ksize=[2, 2, 2],
strides=[2, 2, 2],
@ -326,7 +326,7 @@ class Pooling3DTest(XLATestCase):
def testMaxPoolGradSamePadding3_1_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
gen_nn_ops.max_pool3d_grad,
gen_nn_ops._max_pool3d_grad,
input_sizes=[1, 3, 3, 7, 1],
ksize=[3, 3, 3],
strides=[1, 1, 1],

View File

@ -5,7 +5,8 @@ def TF_SRC_DIR = projectDir.toString() + "/../../../.."
android {
compileSdkVersion 24
buildToolsVersion '25.0.1'
// Check local build_tools_version as this is liable to change within Android Studio.
buildToolsVersion '25.0.2'
// for debugging native code purpose
publishNonDefault true

View File

@ -22,6 +22,7 @@ option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
option(tensorflow_ENABLE_SSL_SUPPORT "Enable boringssl support" OFF)
option(tensorflow_ENABLE_GRPC_SUPPORT "Enable gRPC support" ON)
option(tensorflow_ENABLE_HDFS_SUPPORT "Enable HDFS support" OFF)
option(tensorflow_ENABLE_JEMALLOC_SUPPORT "Enable jemalloc support" OFF)
option(tensorflow_BUILD_CC_EXAMPLE "Build the C++ tutorial example" ON)
option(tensorflow_BUILD_PYTHON_BINDINGS "Build the Python bindings" ON)
option(tensorflow_BUILD_ALL_KERNELS "Build all OpKernels" ON)
@ -29,6 +30,7 @@ option(tensorflow_BUILD_CONTRIB_KERNELS "Build OpKernels from tensorflow/contrib
option(tensorflow_BUILD_CC_TESTS "Build cc unit tests " OFF)
option(tensorflow_BUILD_PYTHON_TESTS "Build python unit tests " OFF)
option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for the native processor architecture (if available)" ON)
option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions")
if (NOT WIN32)
# Threads: defines CMAKE_THREAD_LIBS_INIT and adds -pthread compile option
@ -81,6 +83,22 @@ if (tensorflow_OPTIMIZE_FOR_NATIVE_ARCH)
endif()
endif()
# MSVC SIMD instructions
if (tensorflow_WIN_CPU_SIMD_OPTIONS)
if (WIN32)
CHECK_CXX_COMPILER_FLAG("${tensorflow_WIN_CPU_SIMD_OPTIONS}" COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
if(COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${tensorflow_WIN_CPU_SIMD_OPTIONS}")
else()
message(FATAL_ERROR "${tensorflow_WIN_CPU_SIMD_OPTIONS} not supported")
endif()
endif()
endif()
if (tensorflow_ENABLE_JEMALLOC_SUPPORT)
add_definitions(-DTENSORFLOW_USE_JEMALLOC -DJEMALLOC_EXPORT=)
endif()
# External dependencies
include(zlib)
include(gif)
@ -148,6 +166,12 @@ if(tensorflow_ENABLE_GRPC_SUPPORT)
list(APPEND tensorflow_EXTERNAL_DEPENDENCIES grpc)
include_directories(${GRPC_INCLUDE_DIRS})
endif()
if(tensorflow_ENABLE_JEMALLOC_SUPPORT)
include(jemalloc)
list(APPEND tensorflow_EXTERNAL_LIBRARIES ${jemalloc_STATIC_LIBRARIES})
list(APPEND tensorflow_EXTERNAL_DEPENDENCIES jemalloc)
include_directories(${jemalloc_INCLUDE_DIRS})
endif()
if(WIN32)
list(APPEND tensorflow_EXTERNAL_LIBRARIES wsock32 ws2_32 shlwapi)
endif()
@ -202,7 +226,6 @@ endif()
# Let's get to work!
include(tf_core_framework.cmake)
include(tf_tools.cmake)
# NOTE: Disabled until issue #3996 is fixed.
# include(tf_stream_executor.cmake)
if (tensorflow_ENABLE_GPU)
@ -223,6 +246,7 @@ if(tensorflow_BUILD_CC_EXAMPLE)
include(tf_tutorials.cmake)
include(tf_label_image_example.cmake)
endif()
include(tf_tools.cmake)
if(tensorflow_BUILD_PYTHON_BINDINGS)
include(tensorboard)
include(tf_python.cmake)

View File

@ -45,7 +45,7 @@ bindings.
### Pre-requisites
* CMake version 3.5 up to 3.6
* CMake version 3.5 or later.
* [Git](http://git-scm.com)
@ -181,7 +181,11 @@ Step-by-step Windows build
More? -Dtensorflow_ENABLE_GPU=ON ^
More? -DCUDNN_HOME="D:\...\cudnn"
```
To enable SIMD instructions with MSVC, such as AVX and SSE, define the option as follows:
```
More? -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX
```
Note that the `-DCMAKE_BUILD_TYPE=Release` flag must match the build
configuration that you choose when invoking `msbuild`. The known-good
values are `Release` and `RelWithDebInfo`. The `Debug` build type is

View File

@ -0,0 +1,33 @@
include (ExternalProject)
set(jemalloc_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/jemalloc/src/jemalloc/include)
set(jemalloc_URL https://github.com/jemalloc/jemalloc-cmake/archive/jemalloc-cmake.4.3.1.tar.gz)
set(jemalloc_HASH SHA256=f9be9a05fe906deb5c1c8ca818071a7d2e27d66fd87f5ba9a7bf3750bcedeaf0)
set(jemalloc_BUILD ${CMAKE_CURRENT_BINARY_DIR}/jemalloc/src/jemalloc)
if (WIN32)
set(jemalloc_INCLUDE_DIRS
${jemalloc_INCLUDE_DIRS}
${CMAKE_CURRENT_BINARY_DIR}/jemalloc/src/jemalloc/include/msvc_compat
)
set(jemalloc_ADDITIONAL_CMAKE_OPTIONS -A x64)
set(jemalloc_STATIC_LIBRARIES ${jemalloc_BUILD}/Release/jemalloc.lib)
else()
set(jemalloc_STATIC_LIBRARIES ${jemalloc_BUILD}/Release/jemalloc.a)
endif()
ExternalProject_Add(jemalloc
PREFIX jemalloc
URL ${jemalloc_URL}
URL_HASH ${jemalloc_HASH}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
BUILD_IN_SOURCE 1
CONFIGURE_COMMAND ${CMAKE_COMMAND}
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-Dwith-jemalloc-prefix:STRING=jemalloc_
-Dwithout-export:BOOL=ON
${jemalloc_ADDITIONAL_CMAKE_OPTIONS}
BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release --target jemalloc
INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "Skipping install step."
)

View File

@ -63,7 +63,6 @@ add_executable(${transform_graph}
target_link_libraries(${transform_graph} PUBLIC
tf_protos_cc
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
@ -83,7 +82,6 @@ add_executable(${summarize_graph}
target_link_libraries(${summarize_graph} PUBLIC
tf_protos_cc
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
@ -103,7 +101,6 @@ add_executable(${compare_graphs}
target_link_libraries(${compare_graphs} PUBLIC
tf_protos_cc
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
@ -118,6 +115,8 @@ add_executable(${benchmark_model}
$<TARGET_OBJECTS:tf_core_ops>
$<TARGET_OBJECTS:tf_core_direct_session>
$<TARGET_OBJECTS:tf_core_kernels>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
target_link_libraries(${benchmark_model} PUBLIC

View File

@ -29,7 +29,6 @@
dispatch_queue_t videoDataOutputQueue;
AVCaptureStillImageOutput *stillImageOutput;
UIView *flashView;
UIImage *square;
BOOL isUsingFrontFacingCamera;
AVSpeechSynthesizer *synth;
NSMutableDictionary *oldPredictionValues;

View File

@ -369,13 +369,8 @@ didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer
isUsingFrontFacingCamera = !isUsingFrontFacingCamera;
}
- (void)didReceiveMemoryWarning {
[super didReceiveMemoryWarning];
}
- (void)viewDidLoad {
[super viewDidLoad];
square = [UIImage imageNamed:@"squarePNG"];
synth = [[AVSpeechSynthesizer alloc] init];
labelLayers = [[NSMutableArray alloc] init];
oldPredictionValues = [[NSMutableDictionary alloc] init];
@ -399,26 +394,6 @@ didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer
[self setupAVCapture];
}
- (void)viewDidUnload {
[super viewDidUnload];
}
- (void)viewWillAppear:(BOOL)animated {
[super viewWillAppear:animated];
}
- (void)viewDidAppear:(BOOL)animated {
[super viewDidAppear:animated];
}
- (void)viewWillDisappear:(BOOL)animated {
[super viewWillDisappear:animated];
}
- (void)viewDidDisappear:(BOOL)animated {
[super viewDidDisappear:animated];
}
- (BOOL)shouldAutorotateToInterfaceOrientation:
(UIInterfaceOrientation)interfaceOrientation {
return (interfaceOrientation == UIInterfaceOrientationPortrait);

View File

@ -13,7 +13,6 @@
591D3ECF1CFF7FCE0059011C /* ImageIO.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 591D3ECE1CFF7FCE0059011C /* ImageIO.framework */; };
591D3ED21CFF85C30059011C /* ios_image_load.mm in Sources */ = {isa = PBXBuildFile; fileRef = 591D3ED11CFF85C30059011C /* ios_image_load.mm */; };
591D3ED51CFF85FD0059011C /* tensorflow_utils.mm in Sources */ = {isa = PBXBuildFile; fileRef = 591D3ED31CFF85FD0059011C /* tensorflow_utils.mm */; };
591D3EDA1CFFA83A0059011C /* grace_hopper.jpg in Resources */ = {isa = PBXBuildFile; fileRef = 591D3ED71CFFA83A0059011C /* grace_hopper.jpg */; };
591D3EDB1CFFA83A0059011C /* imagenet_comp_graph_label_strings.txt in Resources */ = {isa = PBXBuildFile; fileRef = 591D3ED81CFFA83A0059011C /* imagenet_comp_graph_label_strings.txt */; };
591D3EDC1CFFA83A0059011C /* tensorflow_inception_graph.pb in Resources */ = {isa = PBXBuildFile; fileRef = 591D3ED91CFFA83A0059011C /* tensorflow_inception_graph.pb */; };
591D3EDF1CFFAD230059011C /* libprotobuf-lite.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 591D3EDD1CFFAD230059011C /* libprotobuf-lite.a */; };
@ -38,7 +37,6 @@
591D3ED11CFF85C30059011C /* ios_image_load.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = ios_image_load.mm; sourceTree = SOURCE_ROOT; };
591D3ED31CFF85FD0059011C /* tensorflow_utils.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = tensorflow_utils.mm; sourceTree = SOURCE_ROOT; };
591D3ED41CFF85FD0059011C /* tensorflow_utils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tensorflow_utils.h; sourceTree = SOURCE_ROOT; };
591D3ED71CFFA83A0059011C /* grace_hopper.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = grace_hopper.jpg; sourceTree = "<group>"; };
591D3ED81CFFA83A0059011C /* imagenet_comp_graph_label_strings.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = imagenet_comp_graph_label_strings.txt; sourceTree = "<group>"; };
591D3ED91CFFA83A0059011C /* tensorflow_inception_graph.pb */ = {isa = PBXFileReference; lastKnownFileType = file; path = tensorflow_inception_graph.pb; sourceTree = "<group>"; };
591D3EDD1CFFAD230059011C /* libprotobuf-lite.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "libprotobuf-lite.a"; path = "../../makefile/gen/protobuf_ios/lib/libprotobuf-lite.a"; sourceTree = "<group>"; };
@ -79,7 +77,6 @@
591D3ED61CFFA83A0059011C /* data */ = {
isa = PBXGroup;
children = (
591D3ED71CFFA83A0059011C /* grace_hopper.jpg */,
591D3ED81CFFA83A0059011C /* imagenet_comp_graph_label_strings.txt */,
591D3ED91CFFA83A0059011C /* tensorflow_inception_graph.pb */,
);
@ -199,7 +196,6 @@
isa = PBXResourcesBuildPhase;
buildActionMask = 2147483647;
files = (
591D3EDA1CFFA83A0059011C /* grace_hopper.jpg in Resources */,
591D3EDC1CFFA83A0059011C /* tensorflow_inception_graph.pb in Resources */,
592FF90D18EDD0DA00C164F8 /* MainStoryboard_iPhone.storyboard in Resources */,
591D3EDB1CFFA83A0059011C /* imagenet_comp_graph_label_strings.txt in Resources */,

Binary file not shown. (Removed image, 72 KiB.)

Binary file not shown. (Removed image, 9.2 KiB.)

View File

@ -17,24 +17,31 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from six.moves import xrange # pylint: disable=redefined-builtin
from tensorflow.contrib.framework.python.framework import tensor_util as contrib_tensor_util
from tensorflow.contrib.layers.python.ops import sparse_feature_cross_op
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import data_flow_ops
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import tf_logging as logging
__all__ = [
"safe_embedding_lookup_sparse", "scattered_embedding_lookup",
"scattered_embedding_lookup_sparse", "embedding_lookup_unique"
"scattered_embedding_lookup_sparse", "embedding_lookup_unique",
"embedding_lookup_sparse_with_distributed_aggregation"
]
@ -548,3 +555,351 @@ def _sampled_scattered_embedding_lookup_sparse(params,
return math_ops.unsorted_segment_sum(embeddings, segment_ids,
num_segments=num_segments,
name=name_scope)
def embedding_lookup_sparse_with_distributed_aggregation(
params,
sp_ids,
sp_weights,
partition_strategy="mod",
name=None,
combiner=None,
max_norm=None):
"""Computes embeddings for the given ids and weights.
Embeddings belonging to same param are aggregated on that device first. This
op is intended to decrease data transmission and improve parallelism. See
`tf.nn.embedding_lookup_sparse` for the functionality and example of this op.
Args:
params: A single tensor representing the complete embedding tensor,
or a list of P tensors all of same shape except for the first dimension,
representing sharded embedding tensors. Alternatively, a
`PartitionedVariable`, created by partitioning along dimension 0. Each
element must be appropriately sized for the given `partition_strategy`.
sp_ids: N x M SparseTensor of int64 ids (typically from FeatureValueToId),
where N is typically batch size and M is arbitrary.
sp_weights: either a SparseTensor of float / double weights, or None to
indicate all weights should be taken to be 1. If specified, sp_weights
must have exactly the same shape and indices as sp_ids.
partition_strategy: A string specifying the partitioning strategy, relevant
if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
is `"mod"`. See `tf.nn.embedding_lookup` for more details.
name: Optional name for the op.
combiner: A string specifying the reduction op. Currently "mean", "sqrtn"
and "sum" are supported.
"sum" computes the weighted sum of the embedding results for each row.
"mean" is the weighted sum divided by the total weight.
"sqrtn" is the weighted sum divided by the square root of the sum of the
squares of the weights.
max_norm: If not None, each embedding is normalized to have l2 norm equal
to max_norm before combining.
Returns:
A dense tensor representing the combined embeddings for the
sparse ids. For each row in the dense tensor represented by sp_ids, the op
looks up the embeddings for all ids in that row, multiplies them by the
corresponding weight, and combines these embeddings as specified.
Raises:
TypeError: If sp_ids is not a SparseTensor, or if sp_weights is neither
None nor SparseTensor.
ValueError: If combiner is not one of {"mean", "sqrtn", "sum"}.
"""
if combiner is None:
logging.warn("The default value of combiner will change from \"mean\" "
"to \"sqrtn\" after 2016/11/01.")
combiner = "mean"
if combiner not in ("mean", "sqrtn", "sum"):
raise ValueError("combiner must be one of 'mean', 'sqrtn' or 'sum'")
if isinstance(params, variables.PartitionedVariable):
params = list(params) # Iterate to get the underlying Variables.
if not isinstance(params, list):
params = [params]
if not isinstance(sp_ids, sparse_tensor.SparseTensor):
raise TypeError("sp_ids must be SparseTensor")
ignore_weights = sp_weights is None
if not ignore_weights:
if not isinstance(sp_weights, sparse_tensor.SparseTensor):
raise TypeError("sp_weights must be either None or SparseTensor")
sp_ids.values.get_shape().assert_is_compatible_with(
sp_weights.values.get_shape())
sp_ids.indices.get_shape().assert_is_compatible_with(
sp_weights.indices.get_shape())
sp_ids.dense_shape.get_shape().assert_is_compatible_with(
sp_weights.dense_shape.get_shape())
# TODO(yleon): Add enhanced node assertions to verify that sp_ids and
# sp_weights have equal indices and shapes.
with ops.name_scope(name, "embedding_lookup_sparse",
params + [sp_ids]) as name:
segment_ids = sp_ids.indices[:, 0]
if segment_ids.dtype != dtypes.int32:
segment_ids = math_ops.cast(segment_ids, dtypes.int32)
ids = sp_ids.values
if ignore_weights:
ids, idx = array_ops.unique(ids)
else:
idx = None
weights = None if ignore_weights else sp_weights.values
embeddings = _embedding_lookup_with_distributed_aggregation(
params,
ids,
partition_strategy=partition_strategy,
max_norm=max_norm,
weights=weights,
idx=idx,
segment_ids=segment_ids)
# Set weights to all one if ignore weights.
if ignore_weights:
weights = array_ops.fill([array_ops.shape(segment_ids)[0]], 1)
if weights.dtype != embeddings.dtype:
weights = math_ops.cast(weights, embeddings.dtype)
# Reshape weights.
ones = array_ops.fill(
array_ops.expand_dims(array_ops.rank(embeddings) - 1, 0), 1)
bcast_weights_shape = array_ops.concat([array_ops.shape(weights), ones], 0)
orig_weights_shape = weights.get_shape()
weights = array_ops.reshape(weights, bcast_weights_shape)
if embeddings.get_shape().ndims is not None:
weights.set_shape(
orig_weights_shape.concatenate(
[1 for _ in range(embeddings.get_shape().ndims - 1)]))
if combiner == "mean":
weight_sum = math_ops.segment_sum(weights, segment_ids)
embeddings = math_ops.div(embeddings, weight_sum)
elif combiner == "sqrtn":
weights_squared = math_ops.pow(weights, 2)
weight_sum = math_ops.segment_sum(weights_squared, segment_ids)
weight_sum_sqrt = math_ops.sqrt(weight_sum)
embeddings = math_ops.div(embeddings, weight_sum_sqrt)
elif combiner != "sum":
assert False, "Unrecognized combiner"
return embeddings
def _do_gather(params, ids, validate_indices=True, name=None):
"""Deals with doing gather differently for resource variables."""
if isinstance(params, resource_variable_ops.ResourceVariable):
return params.sparse_read(ids, name=name)
return array_ops.gather(
params, ids, name=name, validate_indices=validate_indices)
def _embedding_lookup_with_distributed_aggregation(params,
ids,
partition_strategy="mod",
name=None,
validate_indices=True,
max_norm=None,
weights=None,
idx=None,
segment_ids=None):
"""Lookup helper for embedding_lookup_sparse_with_distributed_aggregation."""
if params is None or params == []: # pylint: disable=g-explicit-bool-comparison
raise ValueError("Need at least one param")
if isinstance(params, variables.PartitionedVariable):
params = list(params) # Iterate to get the underlying Variables.
if not isinstance(params, list):
params = [params]
def maybe_normalize(x):
if max_norm is not None:
if x.get_shape().ndims is not None:
ndims = x.get_shape().ndims
else:
ndims = array_ops.size(array_ops.shape(x))
return clip_ops.clip_by_norm(x, max_norm, axes=list(range(1, ndims)))
return x
with ops.name_scope(name, "embedding_lookup_with_distributed_aggregation",
params + [ids]) as name:
np = len(params) # Number of partitions
# Preserve the resource variable status to avoid accidental dense reads.
if not any(
isinstance(p, resource_variable_ops.ResourceVariable) for p in params):
params = ops.convert_n_to_tensor_or_indexed_slices(params, name="params")
if np == 1:
with ops.colocate_with(params[0]):
ret = maybe_normalize(
_do_gather(params[0], ids, validate_indices=validate_indices))
ignore_weights = weights is None
if not ignore_weights:
if weights.dtype != ret.dtype:
weights = math_ops.cast(weights, ret.dtype)
# Reshape to allow broadcast
ones = array_ops.fill(
array_ops.expand_dims(array_ops.rank(ret) - 1, 0), 1)
bcast_weights_shape = array_ops.concat(
[array_ops.shape(weights), ones], 0)
orig_weights_shape = weights.get_shape()
weights = array_ops.reshape(weights, bcast_weights_shape)
# Set weights shape after reshape
if ret.get_shape().ndims is not None:
weights.set_shape(
orig_weights_shape.concatenate(
[1 for _ in range(ret.get_shape().ndims - 1)]))
ret *= weights
return math_ops.segment_sum(ret, segment_ids, name=name)
else:
return math_ops.sparse_segment_sum(ret, idx, segment_ids, name=name)
else:
ids = ops.convert_to_tensor(ids, name="ids")
flat_ids = array_ops.reshape(ids, [-1])
original_indices = math_ops.range(array_ops.size(flat_ids))
# Create p_assignments and set new_ids depending on the strategy.
if partition_strategy == "mod":
p_assignments = flat_ids % np
new_ids = flat_ids // np
elif partition_strategy == "div":
# Compute num_total_ids as the sum of dim-0 of params, then assign to
# partitions based on a constant number of ids per partition. Optimize
# if we already know the full shape statically.
dim_0_size = params[0].get_shape()[0]
for p in xrange(1, np):
dim_0_size += params[p].get_shape()[0]
if dim_0_size.value:
num_total_ids = constant_op.constant(dim_0_size.value, flat_ids.dtype)
else:
dim_0_sizes = []
for p in xrange(np):
if params[p].get_shape()[0].value is not None:
dim_0_sizes.append(params[p].get_shape()[0].value)
else:
with ops.colocate_with(params[p]):
dim_0_sizes.append(array_ops.shape(params[p])[0])
num_total_ids = math_ops.reduce_sum(
math_ops.cast(array_ops.stack(dim_0_sizes), flat_ids.dtype))
ids_per_partition = num_total_ids // np
extras = num_total_ids % np
p_assignments = math_ops.maximum(flat_ids // (ids_per_partition + 1), (
flat_ids - extras) // ids_per_partition)
# Emulate a conditional using a boolean indicator tensor
is_in_first_extras_partitions = math_ops.cast(p_assignments < extras,
flat_ids.dtype)
new_ids = (is_in_first_extras_partitions * (flat_ids %
(ids_per_partition + 1)) +
(1 - is_in_first_extras_partitions) * (
(flat_ids - extras) % ids_per_partition))
else:
raise ValueError("Unrecognized partition strategy: " +
partition_strategy)
# Cast partition assignments to int32 for use in dynamic_partition.
# There really should not be more than 2^32 partitions.
p_assignments = math_ops.cast(p_assignments, dtypes.int32)
# Partition list of ids based on assignments into np separate lists
gather_ids = data_flow_ops.dynamic_partition(new_ids, p_assignments, np)
# Similarly, partition the original indices.
pindices = data_flow_ops.dynamic_partition(original_indices,
p_assignments, np)
# Do np separate lookups, finding embeddings for plist[p] in params[p]
partitioned_result = []
for p in xrange(np):
with ops.colocate_with(params[p]):
partitioned_result.append(
_do_gather(
params[p], gather_ids[p], validate_indices=validate_indices))
ignore_weights = weights is None
if not ignore_weights:
# Partition weights according to pindices.
partitioned_weight = []
for p in xrange(np):
partitioned_weight.append(array_ops.gather(weights, pindices[p]))
# Reshape each partition result.
element_shape = params[0].get_shape()[1:]
for p in params[1:]:
element_shape = element_shape.merge_with(p.get_shape()[1:])
if element_shape.is_fully_defined():
for p in xrange(np):
with ops.colocate_with(params[p]):
partitioned_result[p] = array_ops.reshape(
partitioned_result[p],
array_ops.concat([array_ops.shape(pindices[p]), element_shape],
0))
else:
with ops.colocate_with(params[0]):
params_shape = array_ops.shape(params[0])
for p in xrange(np):
with ops.colocate_with(params[p]):
partitioned_result[p] = array_ops.reshape(
partitioned_result[p],
array_ops.concat([
array_ops.shape(pindices[p]), array_ops.slice(
params_shape, [1], [-1])
], 0))
# Normalize each partition result.
for p in xrange(np):
with ops.colocate_with(params[p]):
partitioned_result[p] = maybe_normalize(partitioned_result[p])
if not ignore_weights:
# Multiply each partition result with partition weights.
for p in xrange(np):
with ops.colocate_with(params[p]):
if partitioned_weight[p].dtype != partitioned_result[p].dtype:
partitioned_weight[p] = math_ops.cast(partitioned_weight[p],
partitioned_result[p].dtype)
# Reshape partition weights.
ones = array_ops.fill(
array_ops.expand_dims(
array_ops.rank(partitioned_result[p]) - 1, 0), 1)
bcast_weights_shape = array_ops.concat(
[array_ops.shape(partitioned_weight[p]), ones], 0)
orig_weights_shape = partitioned_weight[p].get_shape()
partitioned_weight[p] = array_ops.reshape(partitioned_weight[p],
bcast_weights_shape)
if partitioned_result[p].get_shape().ndims is not None:
partitioned_weight[p].set_shape(
orig_weights_shape.concatenate([
1
for _ in range(partitioned_result[p].get_shape().ndims -
1)
]))
partitioned_result[p] *= partitioned_weight[p]
partitioned_segment_ids = []
for p in xrange(np):
if not ignore_weights:
# Partition segment_ids according to pindices.
p_segment_ids = array_ops.gather(segment_ids, pindices[p])
# Number the p_segment_ids to meet segment_sum's requirements. Note
# that unique_p_segment_ids contains unique segment ids of this
# partition and these ids' order is unchanged.
unique_p_segment_ids, unique_p_segment_idx = array_ops.unique(
p_segment_ids)
partitioned_segment_ids.append(unique_p_segment_ids)
# segment_sum this partition's result.
with ops.colocate_with(params[p]):
partitioned_result[p] = math_ops.segment_sum(
partitioned_result[p], unique_p_segment_idx)
else:
# When ignoring weights, we need to get the indices of elements in idx
# and segment_ids.
_, exclude_idx = array_ops.setdiff1d(idx, pindices[p])
all_idx = math_ops.range(array_ops.shape(idx)[0])
_, include_idx = array_ops.setdiff1d(all_idx, exclude_idx)
# Gather segment_ids and idx according to these indices.
p_segment_ids = array_ops.gather(segment_ids, include_idx)
p_idx = array_ops.gather(idx, include_idx)
# Number the p_segment_ids, same as ignore_weights case above.
unique_p_segment_ids, unique_p_segment_idx = array_ops.unique(
p_segment_ids)
_, unique_p_idx_idx = array_ops.unique(p_idx)
partitioned_segment_ids.append(unique_p_segment_ids)
with ops.colocate_with(params[p]):
partitioned_result[p] = math_ops.sparse_segment_sum(
partitioned_result[p], unique_p_idx_idx, unique_p_segment_idx)
# Concat each partition's segment_ids and result for final segment_sum.
concat_segment_ids = array_ops.concat(partitioned_segment_ids, 0)
concat_partitioned_result = array_ops.concat(partitioned_result, 0)
return math_ops.unsorted_segment_sum(
concat_partitioned_result,
concat_segment_ids,
math_ops.reduce_max(concat_segment_ids) + 1,
name=name)
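
The final combine above concatenates the per-partition partial sums together with their segment ids and lets `unsorted_segment_sum` merge rows that belong to the same segment. A minimal NumPy sketch with hypothetical partition outputs (not the real op) illustrates the idea:

```python
import numpy as np

# Hypothetical partition outputs: each partition is already segment-summed
# locally and carries the (unique) segment ids it covers.
partitioned_result = [np.array([[1., 1.], [2., 2.]]),   # segments 0 and 2
                      np.array([[10., 10.]])]           # segment 0 again
partitioned_segment_ids = [np.array([0, 2]), np.array([0])]

concat_result = np.concatenate(partitioned_result, 0)
concat_ids = np.concatenate(partitioned_segment_ids, 0)

# Equivalent of math_ops.unsorted_segment_sum: rows sharing a segment id are
# added together, so per-partition partial sums merge into the final result.
final = np.zeros((concat_ids.max() + 1,) + concat_result.shape[1:])
for row, seg in zip(concat_result, concat_ids):
  final[seg] += row
print(final)  # [[11. 11.] [0. 0.] [2. 2.]]
```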

View File

@ -31,10 +31,13 @@ from tensorflow.python.framework import dtypes
from tensorflow.python.framework import errors_impl
from tensorflow.python.framework import random_seed
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gradient_checker
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import partitioned_variables
from tensorflow.python.platform import test
from tensorflow.python.util import compat
class SafeEmbeddingLookupSparseTest(test.TestCase):
@ -143,8 +146,8 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
self.assertAllClose(
embedding_lookup_result,
[(embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4,
[0] * 4, embedding_weights[0][2],
(embedding_weights[0][0] + embedding_weights[0][1]) / 2.0])
[0] * 4, embedding_weights[0][2], (
embedding_weights[0][0] + embedding_weights[0][1]) / 2.0])
def test_safe_embedding_lookup_sparse_partitioned(self):
with self.test_session():
@ -169,8 +172,8 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
self.assertRaises(ValueError, embedding_ops.safe_embedding_lookup_sparse,
embedding_weights, sparse_ids)
embedding_weights = [
constant_op.constant(
w, dtype=dtypes.float64) for w in embedding_weights
constant_op.constant(w, dtype=dtypes.float64)
for w in embedding_weights
]
self.assertRaises(ValueError, embedding_ops.safe_embedding_lookup_sparse,
embedding_weights, sparse_ids, sparse_weights)
@ -183,11 +186,10 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
embedding_weights, sparse_ids, sparse_weights).eval())
self.assertAllClose(
embedding_lookup_result,
[[(1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1]) /
3.0, [0] * 4, [0] * 4],
[embedding_weights[0][2], [0] * 4, [0] * 4]])
self.assertAllClose(embedding_lookup_result, [[
(1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1]) / 3.0,
[0] * 4, [0] * 4
], [embedding_weights[0][2], [0] * 4, [0] * 4]])
def test_safe_embedding_lookup_sparse_3d_return_special_vector(self):
with self.test_session():
@ -213,14 +215,13 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
embedding_weights, sparse_ids, None).eval())
self.assertAllClose(
embedding_lookup_result,
[[(embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4,
[0] * 4], [
embedding_weights[0][2],
(embedding_weights[0][0] + embedding_weights[0][1]) / 2.0,
[0] * 4
]])
self.assertAllClose(embedding_lookup_result, [[(
embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4, [
0
] * 4], [
embedding_weights[0][2],
(embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4
]])
def test_safe_embedding_lookup_sparse_3d_partitioned(self):
with self.test_session():
@ -231,13 +232,12 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
embedding_weights, sparse_ids, None).eval())
embedding_weights = list(itertools.chain(*embedding_weights))
self.assertAllClose(embedding_lookup_result,
[[(embedding_weights[0] + embedding_weights[1]) / 2.0,
[0] * 4, [0] * 4], [
embedding_weights[2],
(embedding_weights[0] + embedding_weights[1]) /
2.0, [0] * 4
]])
self.assertAllClose(embedding_lookup_result, [[
(embedding_weights[0] + embedding_weights[1]) / 2.0, [0] * 4, [0] * 4
], [
embedding_weights[2],
(embedding_weights[0] + embedding_weights[1]) / 2.0, [0] * 4
]])
def test_safe_embedding_lookup_sparse_3d_partitioned_inconsistent_weights(
self):
@ -249,8 +249,8 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
self.assertRaises(ValueError, embedding_ops.safe_embedding_lookup_sparse,
embedding_weights, sparse_ids)
embedding_weights = [
constant_op.constant(
w, dtype=dtypes.float64) for w in embedding_weights
constant_op.constant(w, dtype=dtypes.float64)
for w in embedding_weights
]
self.assertRaises(ValueError, embedding_ops.safe_embedding_lookup_sparse,
embedding_weights, sparse_ids, sparse_weights)
@ -299,8 +299,8 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
self.assertAllEqual(embedding_lookup_result[0],
embedding_lookup_result[1])
# Different embedding expected for different value.
embedding_diff = np.min((embedding_lookup_result[2] -
embedding_lookup_result[0])**2)
embedding_diff = np.min(
(embedding_lookup_result[2] - embedding_lookup_result[0])**2)
self.assertGreater(embedding_diff, 0)
def test_scattered_embedding_coverage(self):
@ -318,8 +318,8 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
def test_scattered_embedding_multi_dimension(self):
with self.test_session():
embedding_weights = self._random_weights()
values = constant_op.constant(
[["foo", "bar", "bar"], ["bar", "bar", "foo"]])
values = constant_op.constant([["foo", "bar", "bar"],
["bar", "bar", "foo"]])
embedding_lookup_result = embedding_ops.scattered_embedding_lookup(
embedding_weights, values, dimension=10).eval()
@ -338,8 +338,8 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
embedding_lookup_result = (
embedding_ops.scattered_embedding_lookup_sparse(
embedding_weights, sparse_tensor, dimension=5, combiner="mean")
.eval())
embedding_weights, sparse_tensor, dimension=5,
combiner="mean").eval())
self.assertAllEqual(embedding_lookup_result.shape, [5, 5])
# Same non-zero embedding for the empty rows filled with a default value.
@ -431,8 +431,8 @@ class SampledScatteredEmbeddingLookupTest(test.TestCase):
def test_hashed_embedding_multi_dimension(self):
with self.test_session():
embedding_weights = self._random_weights()
values = constant_op.constant(
[["foo", "bar", "bar"], ["bar", "bar", "foo"]])
values = constant_op.constant([["foo", "bar", "bar"],
["bar", "bar", "foo"]])
sampled_candidates = constant_op.constant(
[[[1, 3, 4, 6], [1, 7, 8, 9], [1, 7, 8, 9]],
[[1, 7, 8, 9], [1, 7, 8, 9], [1, 3, 4, 6]]])
@ -489,8 +489,8 @@ class SampledScatteredEmbeddingLookupSparseTest(test.TestCase):
result = embedding_ops._sampled_scattered_embedding_lookup_sparse(
params, sp_values, dimension=5, hash_key=self._hash_key)
self.assertAllClose(result.eval(), [[0., 0., 0., 0., 0.],
[.3, .2, .2, .3, .1],
self.assertAllClose(result.eval(), [[0., 0., 0., 0.,
0.], [.3, .2, .2, .3, .1],
[0., 0., 0., 0., 0.]])
def test_output_values_with_sampled_candidates(self):
@ -563,5 +563,224 @@ class SampledScatteredEmbeddingLookupSparseTest(test.TestCase):
self.assertAllClose(result.eval(), result_abc.eval())
def _PName(param_id):
return "p" + str(param_id)
def _EmbeddingParams(num_shards,
vocab_size,
dtype=dtypes.float32,
shape=None,
use_shapeless_placeholder=False):
p = []
params = {}
feed_dict = {}
if not shape:
shape = [10]
for i in range(num_shards):
shard_shape = [vocab_size // num_shards] + shape
if i < vocab_size % num_shards: # Excess goes evenly on the first shards
shard_shape[0] += 1
param_name = _PName(i)
if use_shapeless_placeholder:
param = array_ops.placeholder(dtype, shape=None, name=param_name)
else:
param = constant_op.constant(
1.0, shape=shard_shape, dtype=dtype, name=param_name)
p.append(param)
np_type = "f" if dtype == dtypes.float32 else "d"
val = (np.random.rand(*shard_shape).astype(np_type)) + 1
params[param_name + ":0"] = val
feed_dict[param.name] = val
return p, params, feed_dict
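
For reference, the sharding rule in `_EmbeddingParams` gives the first `vocab_size % num_shards` shards one extra row each. A short sketch with hypothetical sizes:

```python
# Sketch of the sharding rule above for hypothetical sizes: the remainder of
# vocab_size / num_shards goes to the first shards, one extra row each.
vocab_size, num_shards, shape = 13, 5, [10]
shard_shapes = []
for i in range(num_shards):
  rows = vocab_size // num_shards + (1 if i < vocab_size % num_shards else 0)
  shard_shapes.append([rows] + shape)
print(shard_shapes)  # [[3, 10], [3, 10], [3, 10], [2, 10], [2, 10]]
```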
def _EmbeddingResult(params,
id_vals,
num_shards,
vocab_size,
partition_strategy="mod",
weight_vals=None):
if weight_vals is None:
weight_vals = np.copy(id_vals)
weight_vals.fill(1)
values = []
weights = []
weights_squared = []
for ids, wts in zip(id_vals, weight_vals):
value_aggregation = None
weight_aggregation = None
squared_weight_aggregation = None
if isinstance(ids, compat.integral_types):
ids = [ids]
wts = [wts]
for i, weight_value in zip(ids, wts):
if partition_strategy == "mod":
val = np.copy(params[_PName(i % num_shards) + ":0"][
i // num_shards, :]) * weight_value
elif partition_strategy == "div":
ids_per_partition, extras = divmod(vocab_size, num_shards)
threshold = extras * (ids_per_partition + 1)
if i < threshold:
partition = i // (ids_per_partition + 1)
offset = i % (ids_per_partition + 1)
else:
partition = extras + (i - threshold) // ids_per_partition
offset = (i - threshold) % ids_per_partition
val = np.copy(
params[_PName(partition) + ":0"][offset, :]) * weight_value
else:
assert False
if value_aggregation is None:
assert weight_aggregation is None
assert squared_weight_aggregation is None
value_aggregation = val
weight_aggregation = weight_value
squared_weight_aggregation = weight_value * weight_value
else:
assert weight_aggregation is not None
assert squared_weight_aggregation is not None
value_aggregation += val
weight_aggregation += weight_value
squared_weight_aggregation += weight_value * weight_value
values.append(value_aggregation)
weights.append(weight_aggregation)
weights_squared.append(squared_weight_aggregation)
values = np.array(values).astype(np.float32)
weights = np.array(weights).astype(np.float32)
weights_squared = np.array(weights_squared).astype(np.float32)
return values, weights, weights_squared
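
The `"div"` branch of `_EmbeddingResult` assigns contiguous id ranges to shards, with the first `extras` shards holding one extra id. A small sketch (the helper name and sizes are hypothetical; the arithmetic is the same as above) shows where each id lands:

```python
def div_placement(i, vocab_size=13, num_shards=5):
  """Hypothetical helper mirroring the "div" partition arithmetic above."""
  ids_per_partition, extras = divmod(vocab_size, num_shards)
  threshold = extras * (ids_per_partition + 1)
  if i < threshold:
    return i // (ids_per_partition + 1), i % (ids_per_partition + 1)
  return (extras + (i - threshold) // ids_per_partition,
          (i - threshold) % ids_per_partition)

print([div_placement(i) for i in range(13)])
# ids 0-2 land in shard 0, 3-5 in shard 1, 6-8 in shard 2, 9-10 in shard 3 and
# 11-12 in shard 4: contiguous ranges, unlike the round-robin "mod" strategy.
```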
class EmbeddingLookupSparseWithDistributedAggregationTest(test.TestCase):
def _RandomIdsAndWeights(self, batch_size, vocab_size):
max_val_per_entry = 6
vals_per_batch_entry = np.random.randint(
1, max_val_per_entry, size=batch_size)
num_vals = np.sum(vals_per_batch_entry)
ids = np.random.randint(vocab_size, size=num_vals)
weights = 1 + np.random.rand(num_vals)
indices = []
for batch_entry, num_val in enumerate(vals_per_batch_entry):
for val_index in range(num_val):
indices.append([batch_entry, val_index])
shape = [batch_size, max_val_per_entry]
sp_ids = sparse_tensor_lib.SparseTensor(
constant_op.constant(indices, dtypes.int64),
constant_op.constant(ids, dtypes.int32),
constant_op.constant(shape, dtypes.int64))
sp_weights = sparse_tensor_lib.SparseTensor(
constant_op.constant(indices, dtypes.int64),
constant_op.constant(weights, dtypes.float32),
constant_op.constant(shape, dtypes.int64))
return sp_ids, sp_weights, ids, weights, vals_per_batch_entry
def _GroupByBatchEntry(self, vals, vals_per_batch_entry):
grouped_vals = []
index = 0
for num_val in vals_per_batch_entry:
grouped_vals.append(list(vals[index:(index + num_val)]))
index += num_val
return grouped_vals
def testEmbeddingLookupSparse(self):
vocab_size = 13
batch_size = 10
param_shape = [2, 5]
expected_lookup_result_shape = [None] + param_shape
sp_ids, sp_weights, ids, weights, vals_per_batch_entry = (
self._RandomIdsAndWeights(batch_size, vocab_size))
grouped_ids = self._GroupByBatchEntry(ids, vals_per_batch_entry)
grouped_weights = self._GroupByBatchEntry(weights, vals_per_batch_entry)
grouped_ignored_weights = self._GroupByBatchEntry(
np.ones(np.sum(vals_per_batch_entry)), vals_per_batch_entry)
for num_shards, combiner, dtype, ignore_weights in itertools.product(
[1, 5], ["sum", "mean", "sqrtn"], [dtypes.float32,
dtypes.float64], [True, False]):
with self.test_session():
p, params, feed_dict = _EmbeddingParams(
num_shards, vocab_size, shape=param_shape, dtype=dtype)
embedding_sum = \
embedding_ops.embedding_lookup_sparse_with_distributed_aggregation(
p,
sp_ids,
None if ignore_weights else sp_weights,
combiner=combiner)
self.assertEqual(embedding_sum.get_shape().as_list(),
expected_lookup_result_shape)
tf_embedding_sum = embedding_sum.eval(feed_dict=feed_dict)
np_embedding_sum, np_weight_sum, np_weight_sq_sum = _EmbeddingResult(
params,
grouped_ids,
num_shards,
vocab_size,
weight_vals=grouped_ignored_weights
if ignore_weights else grouped_weights)
if combiner == "mean":
np_embedding_sum /= np.reshape(np_weight_sum, (batch_size, 1, 1))
if combiner == "sqrtn":
np_embedding_sum /= np.reshape(
np.sqrt(np_weight_sq_sum), (batch_size, 1, 1))
self.assertAllClose(np_embedding_sum, tf_embedding_sum)
def testGradientsEmbeddingLookupSparse(self):
vocab_size = 12
batch_size = 4
param_shape = [2, 3]
sp_ids, sp_weights, _, _, _ = (self._RandomIdsAndWeights(
batch_size, vocab_size))
for num_shards, combiner, dtype, ignore_weights in itertools.product(
[1, 3], ["sum", "mean", "sqrtn"], [dtypes.float32,
dtypes.float64], [True, False]):
with self.test_session():
x, params, _ = _EmbeddingParams(
num_shards, vocab_size, shape=param_shape, dtype=dtype)
y = embedding_ops.embedding_lookup_sparse_with_distributed_aggregation(
x,
sp_ids,
None if ignore_weights else sp_weights,
combiner=combiner)
x_name = [_PName(i) for i in range(num_shards)]
x_init_value = [params[x_n + ":0"] for x_n in x_name]
x_shape = [i.shape for i in x_init_value]
y_shape = [batch_size] + list(params[_PName(0) + ":0"].shape[1:])
err = gradient_checker.compute_gradient_error(
x, x_shape, y, y_shape, x_init_value=x_init_value)
self.assertLess(err, 1e-5 if dtype == dtypes.float64 else 2e-3)
def testIncompatibleShapes(self):
with self.test_session():
x, _, _ = _EmbeddingParams(1, 10, dtype=dtypes.float32)
sp_ids = sparse_tensor_lib.SparseTensor(
constant_op.constant([[0, 0], [0, 1], [1, 0]], dtypes.int64),
constant_op.constant([0, 1, 2], dtypes.int32),
constant_op.constant([2, 2], dtypes.int64))
sp_weights = sparse_tensor_lib.SparseTensor(
constant_op.constant([[0, 0], [0, 1]], dtypes.int64),
constant_op.constant([12.0, 5.0], dtypes.float32),
constant_op.constant([1, 2], dtypes.int64))
with self.assertRaises(ValueError):
embedding_ops.embedding_lookup_sparse_with_distributed_aggregation(
x, sp_ids, sp_weights, combiner="mean")
if __name__ == "__main__":
test.main()

View File

@ -791,9 +791,11 @@ def weighted_sparse_column(sparse_id_column,
weight or value of the corresponding sparse id feature.
dtype: Type of weights, such as `tf.float32`. Only floating and integer
weights are supported.
Returns:
A _WeightedSparseColumn composed of two sparse features: one representing the
id, the other representing the weight (value) of the id feature in that example.
Raises:
ValueError: if dtype is not convertible to float.
"""

View File

@ -9,7 +9,7 @@ TF Learn is a simplified interface for TensorFlow, to get people started on pred
### Why *TensorFlow Learn*?
- To smooth the transition from the [scikit-learn](http://scikit-learn.org/stable/) world of one-liner machine learning into the more open world of building different shapes of ML models. You can start by using [fit](../../../../g3doc/api_docs/python/contrib.learn.md#Estimator.fit)/[predict](../../../../g3doc/api_docs/python/contrib.learn.md#Estimator.predict) and slide into TensorFlow APIs as you are getting comfortable.
- To smooth the transition from the [scikit-learn](http://scikit-learn.org/stable/) world of one-liner machine learning into the more open world of building different shapes of ML models. You can start by using [fit](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/Estimator#fit)/[predict](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/Estimator#predict) and slide into TensorFlow APIs as you are getting comfortable.
- To provide a set of reference models that will be easy to integrate with existing code.
## Installation
@ -43,17 +43,17 @@ Optionally you can install [scikit-learn](http://scikit-learn.org/stable/) and [
### Existing Estimator Implementations
- [`LinearClassifier`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/linear.py)
([docs](../../../../g3doc/api_docs/python/contrib.learn.md#LinearClassifier))
([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/LinearClassifier))
- [`LinearRegressor`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/linear.py)
([docs](../../../../g3doc/api_docs/python/contrib.learn.md#LinearRegressor))
([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/LinearRegressor))
- [`DNNClassifier`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn.py)
([docs](../../../../g3doc/api_docs/python/contrib.learn.md#DNNClassifier))
([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNClassifier))
- [`DNNRegressor`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn.py)
([docs](../../../../g3doc/api_docs/python/contrib.learn.md#DNNRegressor))
([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNRegressor))
- [`DNNLinearCombinedClassifier`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py)
([docs](../../../../g3doc/api_docs/python/contrib.learn.md#DNNLinearCombinedClassifier))
([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNLinearCombinedClassifier))
- [`DNNLinearCombinedRegressor`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py)
([docs](../../../../g3doc/api_docs/python/contrib.learn.md#DNNLinearCombinedRegressor))
([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNLinearCombinedRegressor))
- [`SVM`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/svm.py)
([docs](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/g3doc/svm.md))
- [`GMM`](https://www.tensorflow.org/code/tensorflow/contrib/factorization/python/ops/gmm.py)
@ -67,7 +67,7 @@ Below are a few simple examples of the API. For more examples, please see [examp
General tips:
- It's useful to rescale a dataset to 0 mean and unit standard deviation before passing it to an [`Estimator`](../../../../g3doc/api_docs/python/contrib.learn.md#estimators). [Stochastic Gradient Descent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) doesn't always do the right thing when variables are at very different scales.
- It's useful to rescale a dataset to 0 mean and unit standard deviation before passing it to an [`Estimator`](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/Estimator). [Stochastic Gradient Descent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) doesn't always do the right thing when variables are at very different scales.
- Categorical variables should be managed before passing input to the estimator.
@ -219,7 +219,7 @@ INFO:tensorflow:Loss for final step: 0.0162506.</pre>
## Summaries
If you supply a `model_dir` argument to your `Estimator`s, TensorFlow will write summaries for ``loss`` and histograms for variables in this directory. (You can also add custom summaries in your custom model function by calling [Summary](../../../../g3doc/api_docs/python/train.md#summary-operations) operations.)
If you supply a `model_dir` argument to your `Estimator`s, TensorFlow will write summaries for ``loss`` and histograms for variables in this directory. (You can also add custom summaries in your custom model function by calling [Summary](https://www.tensorflow.org/api_guides/python/summary) operations.)
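
A minimal sketch of the `model_dir` behaviour described above, assuming a canned `tf.contrib.learn` estimator; the feature column and the path are hypothetical:

```python
import tensorflow as tf

feature_columns = [tf.contrib.layers.real_valued_column('x')]
estimator = tf.contrib.learn.LinearRegressor(
    feature_columns=feature_columns, model_dir='/tmp/my_model')
# After fitting, /tmp/my_model holds event files with the loss summary and
# variable histograms; point TensorBoard's --logdir at this directory.
```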
To view the summaries in TensorBoard, run the following command, where `logdir` is the `model_dir` for your `Estimator`:

View File

@ -22,6 +22,7 @@ from __future__ import print_function
# pylint: disable=unused-import
from tensorflow.python.estimator.inputs.queues.feeding_functions import _ArrayFeedFn
from tensorflow.python.estimator.inputs.queues.feeding_functions import _enqueue_data as enqueue_data
from tensorflow.python.estimator.inputs.queues.feeding_functions import _GeneratorFeedFn
from tensorflow.python.estimator.inputs.queues.feeding_functions import _OrderedDictNumpyFeedFn
from tensorflow.python.estimator.inputs.queues.feeding_functions import _PandasFeedFn
# pylint: enable=unused-import

View File

@ -200,6 +200,7 @@ class RunConfig(ClusterConfig):
parameter servers), you probably want to use `learn_runner.EstimatorConfig`
instead.
"""
_USE_DEFAULT = 0
def __init__(self,
master=None,
@ -208,7 +209,7 @@ class RunConfig(ClusterConfig):
gpu_memory_fraction=1,
tf_random_seed=None,
save_summary_steps=100,
save_checkpoints_secs=600,
save_checkpoints_secs=_USE_DEFAULT,
save_checkpoints_steps=None,
keep_checkpoint_max=5,
keep_checkpoint_every_n_hours=10000,
@ -260,6 +261,11 @@ class RunConfig(ClusterConfig):
self._tf_random_seed = tf_random_seed
self._save_summary_steps = save_summary_steps
self._save_checkpoints_secs = save_checkpoints_secs
if save_checkpoints_secs == RunConfig._USE_DEFAULT:
if save_checkpoints_steps is None:
self._save_checkpoints_secs = 600
else:
self._save_checkpoints_secs = None
self._save_checkpoints_steps = save_checkpoints_steps
# TODO(weiho): Remove these after ModelFn refactoring, when users can
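
The `_USE_DEFAULT` sentinel added above lets `RunConfig` tell "argument omitted" apart from an explicit value, so the 600-second default only applies when step-based checkpointing is not requested. A self-contained sketch of the same pattern (not the real class):

```python
class _Config(object):
  """Minimal sketch of the sentinel-default pattern used above, not the real
  RunConfig: a class-level sentinel distinguishes "argument omitted" from an
  explicit value such as None."""
  _USE_DEFAULT = 0

  def __init__(self, save_checkpoints_secs=_USE_DEFAULT,
               save_checkpoints_steps=None):
    self._save_checkpoints_secs = save_checkpoints_secs
    if save_checkpoints_secs == _Config._USE_DEFAULT:
      # Fall back to the 600s default only when steps-based checkpointing was
      # not requested either.
      self._save_checkpoints_secs = (
          600 if save_checkpoints_steps is None else None)
    self._save_checkpoints_steps = save_checkpoints_steps

print(_Config()._save_checkpoints_secs)                            # 600
print(_Config(save_checkpoints_steps=100)._save_checkpoints_secs)  # None
print(_Config(save_checkpoints_secs=30)._save_checkpoints_secs)    # 30
```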

View File

@ -35,3 +35,4 @@ from tensorflow.contrib.learn.python.learn.learn_io.pandas_io import extract_pan
from tensorflow.contrib.learn.python.learn.learn_io.pandas_io import extract_pandas_matrix
from tensorflow.contrib.learn.python.learn.learn_io.pandas_io import HAS_PANDAS
from tensorflow.contrib.learn.python.learn.learn_io.pandas_io import pandas_input_fn
from tensorflow.contrib.learn.python.learn.learn_io.generator_io import generator_input_fn

View File

@ -0,0 +1,134 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Methods to allow generator of dict with numpy arrays."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from types import FunctionType, GeneratorType
from collections import Container
from tensorflow.contrib.learn.python.learn.dataframe.queues import feeding_functions
def generator_input_fn(x,
target_key=None,
batch_size=128,
num_epochs=1,
shuffle=True,
queue_capacity=1000,
num_threads=1):
"""Returns input function that would dicts of numpy arrays
yielded from a generator.
It is assumed that every dict yielded from the dictionary represents
a single sample. The generator should consume a single epoch of the data.
This returns a function outputting `features` and `target` based on the dict
of numpy arrays. The dict `features` has the same keys as an element yielded
from x.
Example:
```python
def generator():
for index in range(10):
yield {'height': np.random.randint(32,36),
'age': np.random.randint(18, 80),
'label': np.ones(1)}
with tf.Session() as session:
input_fn = generator_io.generator_input_fn(
generator, target_key="label", batch_size=2, shuffle=False,
num_epochs=1)
```
Args:
x: Generator Function, returns a `Generator` that will yield the data
as `dict`s of numpy arrays.
target_key: String or Container of Strings, the key or keys of the numpy
arrays in the yielded dicts to use as target.
batch_size: Integer, size of batches to return.
num_epochs: Integer, number of epochs to iterate over data. If `None` will
run forever.
shuffle: Boolean, if True shuffles the queue. Avoid shuffle at prediction
time.
queue_capacity: Integer, size of queue to accumulate.
num_threads: Integer, number of threads used for reading and enqueueing.
Returns:
Function that returns a feature `dict` of `Tensors` and an optional label
`dict` of `Tensors`; if `target_key` is a `str`, the label is a single `Tensor`.
Raises:
TypeError: `x` is not `FunctionType`.
TypeError: `x()` is not `GeneratorType`.
TypeError: `next(x())` is not `dict`.
TypeError: `target_key` is not `str` or `target_key` is not `Container`
of `str`.
KeyError: `target_key` not a key or `target_key[index]` not in next(`x()`).
KeyError: `key` mismatch between dicts emitted from `x()`.
"""
if not isinstance(x, FunctionType):
raise TypeError(
'x must be generator function; got {}'.format(type(x).__name__))
generator = x()
if not isinstance(generator, GeneratorType):
raise TypeError(
'x() must be generator; got {}'.format(type(generator).__name__))
data = next(generator)
if not isinstance(data, dict):
raise TypeError('x() must yield dict; got {}'.format(type(data).__name__))
input_keys = sorted(next(x()).keys())
if target_key is not None:
if isinstance(target_key, str):
target_key = [target_key]
elif isinstance(target_key, Container):
for item in target_key:
if not isinstance(item, str):
raise TypeError('target_key must be str or Container of str; got {}'.
format(type(item).__name__))
if item not in input_keys:
raise KeyError(
'target_key not in yielded dict. Expected {} keys; got {}'.format(
input_keys, item))
else:
raise TypeError('target_key must be str or Container of str; got {}'.
format(type(target_key).__name__))
def _generator_input_fn():
"""generator input function."""
queue = feeding_functions.enqueue_data(
x,
queue_capacity,
shuffle=shuffle,
num_threads=num_threads,
enqueue_size=batch_size,
num_epochs=num_epochs)
features = (queue.dequeue_many(batch_size)
if num_epochs is None else queue.dequeue_up_to(batch_size))
if not isinstance(features, list):
features = [features]
features = dict(zip(input_keys, features))
if target_key is not None:
if len(target_key) > 1:
target = {key: features.pop(key) for key in target_key}
else:
target = features.pop(target_key[0])
return features, target
return features
return _generator_input_fn
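
A possible end-to-end usage sketch of `generator_input_fn`, pairing it with a canned estimator; the generator contents, feature columns, and estimator choice are hypothetical and only illustrate the intended call pattern:

```python
import numpy as np
import tensorflow as tf
from tensorflow.contrib.learn.python.learn.learn_io import generator_io

# Hypothetical generator: one epoch of samples, each a dict of numpy arrays.
def generator():
  for _ in range(100):
    yield {'height': np.random.uniform(150, 200, size=(1,)).astype(np.float32),
           'age': np.random.randint(18, 80, size=(1,)).astype(np.float32),
           'label': np.random.rand(1).astype(np.float32)}

input_fn = generator_io.generator_input_fn(
    generator, target_key='label', batch_size=16, shuffle=True, num_epochs=1)

# Hypothetical pairing with a canned estimator.
feature_columns = [tf.contrib.layers.real_valued_column('height'),
                   tf.contrib.layers.real_valued_column('age')]
estimator = tf.contrib.learn.DNNRegressor(
    feature_columns=feature_columns, hidden_units=[8, 8])
estimator.fit(input_fn=input_fn, steps=10)
```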

View File

@ -0,0 +1,348 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for numpy_io."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
# TODO: #6568 Remove this hack that makes dlopen() not crash.
if hasattr(sys, 'getdlopenflags') and hasattr(sys, 'setdlopenflags'):
import ctypes
sys.setdlopenflags(sys.getdlopenflags() | ctypes.RTLD_GLOBAL)
import numpy as np
from tensorflow.contrib.learn.python.learn.learn_io import generator_io
from tensorflow.python.framework import errors
from tensorflow.python.platform import test
from tensorflow.python.training import coordinator
from tensorflow.python.training import queue_runner_impl
class GeneratorIoTest(test.TestCase):
def testGeneratorInputFn(self):
def generator():
for index in range(2):
yield {
'a': np.ones(1) * index,
'b': np.ones(1) * index + 32,
'label': np.ones(1) * index - 32
}
with self.test_session() as session:
input_fn = generator_io.generator_input_fn(
generator,
target_key='label',
batch_size=2,
shuffle=False,
num_epochs=1)
features, target = input_fn()
coord = coordinator.Coordinator()
threads = queue_runner_impl.start_queue_runners(session, coord=coord)
res = session.run([features, target])
self.assertAllEqual(res[0]['a'], np.asarray([0, 1]).reshape(-1, 1))
self.assertAllEqual(res[0]['b'], np.asarray([32, 33]).reshape(-1, 1))
self.assertAllEqual(res[1], np.asarray([-32, -31]).reshape(-1, 1))
session.run([features])
with self.assertRaises(errors.OutOfRangeError):
session.run([features, target])
coord.request_stop()
coord.join(threads)
def testGeneratorSingleInputFn(self):
def generator():
for index in range(2):
yield {'a': np.ones(1) * index}
with self.test_session() as session:
input_fn = generator_io.generator_input_fn(
generator, target_key=None, batch_size=2, shuffle=False, num_epochs=1)
features = input_fn()
coord = coordinator.Coordinator()
threads = queue_runner_impl.start_queue_runners(session, coord=coord)
res = session.run([features])
self.assertAllEqual(res[0]['a'], np.asarray([0, 1]).reshape(-1, 1))
session.run([features])
with self.assertRaises(errors.OutOfRangeError):
session.run([features])
coord.request_stop()
coord.join(threads)
def testGeneratorInputFnLabelDict(self):
def generator():
for index in range(2):
yield {
'a': np.ones(1) * index,
'b': np.ones(1) * index + 32,
'label': np.ones(1) * index - 32,
'label2': np.ones(1) * index - 64,
}
with self.test_session() as session:
input_fn = generator_io.generator_input_fn(
generator,
target_key=['label', 'label2'],
batch_size=2,
shuffle=False,
num_epochs=1)
features, target = input_fn()
coord = coordinator.Coordinator()
threads = queue_runner_impl.start_queue_runners(session, coord=coord)
res = session.run([features, target])
self.assertAllEqual(res[0]['a'], np.asarray([0, 1]).reshape(-1, 1))
self.assertAllEqual(res[0]['b'], np.asarray([32, 33]).reshape(-1, 1))
self.assertAllEqual(res[1]['label'], np.asarray([-32, -31]).reshape(
-1, 1))
self.assertAllEqual(res[1]['label2'],
np.asarray([-64, -63]).reshape(-1, 1))
session.run([features])
with self.assertRaises(errors.OutOfRangeError):
session.run([features, target])
coord.request_stop()
coord.join(threads)
def testGeneratorInputFnWithDifferentDimensionsOfFeatures(self):
def generator():
for index in range(100):
yield {
'a': np.ones((10, 10)) * index,
'b': np.ones((5, 5)) * index + 32,
'label': np.ones((3, 3)) * index - 32
}
with self.test_session() as session:
input_fn = generator_io.generator_input_fn(
generator,
target_key='label',
batch_size=2,
shuffle=False,
num_epochs=1)
features, target = input_fn()
coord = coordinator.Coordinator()
threads = queue_runner_impl.start_queue_runners(session, coord=coord)
res = session.run([features, target])
self.assertAllEqual(res[0]['a'],
np.vstack((np.zeros((10, 10)), np.ones(
(10, 10)))).reshape(2, 10, 10))
self.assertAllEqual(res[0]['b'],
np.vstack((np.zeros((5, 5)), np.ones(
(5, 5)))).reshape(2, 5, 5) + 32)
self.assertAllEqual(res[1],
np.vstack((np.zeros((3, 3)), np.ones(
(3, 3)))).reshape(2, 3, 3) - 32)
coord.request_stop()
coord.join(threads)
def testGeneratorInputFnWithXAsNonGeneratorFunction(self):
x = np.arange(32, 36)
with self.test_session():
with self.assertRaisesRegexp(TypeError, 'x must be generator function'):
failing_input_fn = generator_io.generator_input_fn(
x, batch_size=2, shuffle=False, num_epochs=1)
failing_input_fn()
def testGeneratorInputFnWithXAsNonGenerator(self):
def generator():
return np.arange(32, 36)
with self.test_session():
with self.assertRaisesRegexp(TypeError, 'x\(\) must be generator'):
failing_input_fn = generator_io.generator_input_fn(
generator, batch_size=2, shuffle=False, num_epochs=1)
failing_input_fn()
def testGeneratorInputFnWithXAsNonGeneratorYieldingDicts(self):
def generator():
yield np.arange(32, 36)
with self.test_session():
with self.assertRaisesRegexp(TypeError, 'x\(\) must yield dict'):
failing_input_fn = generator_io.generator_input_fn(
generator, batch_size=2, shuffle=False, num_epochs=1)
failing_input_fn()
def testGeneratorInputFNWithTargetLabelNotString(self):
def generator():
for index in range(2):
yield {
'a': np.ones((10, 10)) * index,
'b': np.ones((5, 5)) * index + 32,
'label': np.ones((3, 3)) * index - 32
}
y = np.arange(32, 36)
with self.test_session():
with self.assertRaisesRegexp(TypeError, 'target_key must be str or'
' Container of str'):
failing_input_fn = generator_io.generator_input_fn(
generator, target_key=y, batch_size=2, shuffle=False, num_epochs=1)
failing_input_fn()
def testGeneratorInputFNWithTargetLabelListNotString(self):
def generator():
for index in range(2):
yield {
'a': np.ones((10, 10)) * index,
'b': np.ones((5, 5)) * index + 32,
'label': np.ones((3, 3)) * index - 32
}
y = ['label', np.arange(10)]
with self.test_session():
with self.assertRaisesRegexp(TypeError, 'target_key must be str or'
' Container of str'):
failing_input_fn = generator_io.generator_input_fn(
generator, target_key=y, batch_size=2, shuffle=False, num_epochs=1)
failing_input_fn()
def testGeneratorInputFNWithTargetLabelNotInDict(self):
def generator():
for index in range(2):
yield {
'a': np.ones((10, 10)) * index,
'b': np.ones((5, 5)) * index + 32,
'label': np.ones((3, 3)) * index - 32
}
y = ['label', 'target']
with self.test_session():
with self.assertRaisesRegexp(KeyError, 'target_key not in yielded dict'):
failing_input_fn = generator_io.generator_input_fn(
generator, target_key=y, batch_size=2, shuffle=False, num_epochs=1)
failing_input_fn()
def testGeneratorInputFnWithNoTargetKey(self):
def generator():
for index in range(2):
yield {
'a': np.ones(1) * index,
'b': np.ones(1) * index + 32,
'label': np.ones(1) * index - 32
}
with self.test_session() as session:
input_fn = generator_io.generator_input_fn(
generator, target_key=None, batch_size=2, shuffle=False, num_epochs=1)
features = input_fn()
coord = coordinator.Coordinator()
threads = queue_runner_impl.start_queue_runners(session, coord=coord)
res = session.run(features)
self.assertAllEqual(res['a'], np.asarray([0, 1]).reshape(-1, 1))
self.assertAllEqual(res['b'], np.asarray([32, 33]).reshape(-1, 1))
self.assertAllEqual(res['label'], np.asarray([-32, -31]).reshape(-1, 1))
session.run([features])
with self.assertRaises(errors.OutOfRangeError):
session.run([features])
coord.request_stop()
coord.join(threads)
def testGeneratorInputFnWithBatchLargerthanData(self):
def generator():
for index in range(2):
yield {
'a': np.ones(1) * index,
'b': np.ones(1) * index + 32,
'label': np.ones(1) * index - 32
}
with self.test_session() as session:
input_fn = generator_io.generator_input_fn(
generator, target_key=None, batch_size=4, shuffle=False, num_epochs=1)
features = input_fn()
coord = coordinator.Coordinator()
threads = queue_runner_impl.start_queue_runners(session, coord=coord)
res = session.run(features)
self.assertAllEqual(res['a'], np.asarray([0, 1, 0, 1]).reshape(-1, 1))
self.assertAllEqual(res['b'], np.asarray([32, 33, 32, 33]).reshape(-1, 1))
self.assertAllEqual(res['label'],
np.asarray([-32, -31, -32, -31]).reshape(-1, 1))
with self.assertRaises(errors.OutOfRangeError):
session.run([features])
coord.request_stop()
coord.join(threads)
def testGeneratorInputFnWithMismatchinGeneratorKeys(self):
def generator():
index = 0
yield {
'a': np.ones(1) * index,
'b': np.ones(1) * index + 32,
'label': np.ones(1) * index - 32
}
index = 1
yield {
'a': np.ones(1) * index,
'c': np.ones(1) * index + 32,
'label': np.ones(1) * index - 32
}
with self.test_session() as session:
input_fn = generator_io.generator_input_fn(
generator, target_key=None, batch_size=2, shuffle=False, num_epochs=1)
features = input_fn()
coord = coordinator.Coordinator()
threads = queue_runner_impl.start_queue_runners(session, coord=coord)
with self.assertRaises(errors.OutOfRangeError):
session.run([features])
with self.assertRaisesRegexp(KeyError, 'key mismatch between dicts emitted'
' by GenFunExpected'):
coord.request_stop()
coord.join(threads)
if __name__ == '__main__':
test.main()

View File

@ -370,6 +370,7 @@ ifeq ($(TARGET),IOS)
ifeq ($(IOS_ARCH),I386)
CXXFLAGS += -mios-simulator-version-min=$(MIN_SDK_VERSION) \
-arch i386 \
-mno-sse \
-fembed-bitcode \
-D__thread= \
-DUSE_GEMM_FOR_CONV \

View File

@ -75,7 +75,7 @@ To run the executable, use:
```bash
tensorflow/contrib/makefile/gen/bin/benchmark \
--graph=~/graphs/inception/tensorflow_inception_graph.pb
--graph=$HOME/graphs/inception/tensorflow_inception_graph.pb
```
## Android

View File

@ -99,8 +99,13 @@ class ExternalOptimizerInterface(object):
slice(start, end) for start, end in zip(accumulated_dims[:-1],
accumulated_dims[1:])]
def minimize(self, session=None, feed_dict=None, fetches=None,
step_callback=None, loss_callback=None):
def minimize(self,
session=None,
feed_dict=None,
fetches=None,
step_callback=None,
loss_callback=None,
**run_kwargs):
"""Minimize a scalar `Tensor`.
Variables subject to optimization are updated in-place at the end of
@ -120,6 +125,7 @@ class ExternalOptimizerInterface(object):
flattened into a single vector.
loss_callback: A function to be called every time the loss and gradients
are computed, with evaluated fetches supplied as positional arguments.
**run_kwargs: kwargs to pass to `session.run`.
"""
session = session or ops.get_default_session()
feed_dict = feed_dict or {}
@ -160,8 +166,10 @@ class ExternalOptimizerInterface(object):
for packing_slice in self._packing_slices]
# Set optimization variables to their new values.
session.run(self._var_updates,
feed_dict=dict(zip(self._update_placeholders, var_vals)))
session.run(
self._var_updates,
feed_dict=dict(zip(self._update_placeholders, var_vals)),
**run_kwargs)
def _minimize(self, initial_val, loss_grad_func, equality_funcs,
equality_grad_funcs, inequality_funcs, inequality_grad_funcs,
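
The new `**run_kwargs` argument is forwarded to the `session.run` call that applies the variable updates, so callers can pass things like tracing options through `minimize`. A hedged sketch, assuming the `ScipyOptimizerInterface` subclass in `tf.contrib.opt`:

```python
import tensorflow as tf

x = tf.Variable([2.0, 3.0])
loss = tf.reduce_sum(tf.square(x))
optimizer = tf.contrib.opt.ScipyOptimizerInterface(loss)

# Extra keyword arguments are passed straight through to session.run,
# e.g. tracing options and a RunMetadata collector.
run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  optimizer.minimize(sess, options=run_options, run_metadata=run_metadata)
```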

View File

@ -78,7 +78,7 @@ ci = tanh(ci)
cs = ci .* i + cs_prev .* f
cs = clip(cs, cell_clip)
o = sigmoid(cs * wco + f)
o = sigmoid(cs * wco + o)
co = tanh(cs)
h = co .* o
```
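
A toy NumPy walk-through of the corrected recurrence above (scalar values, symmetric clipping assumed), showing that the cell-state peephole `wco` feeds the output gate's pre-activation `o` rather than the forget gate `f`:

```python
import numpy as np

def sigmoid(z):
  return 1.0 / (1.0 + np.exp(-z))

# i and f are gate activations, o is the output gate's pre-activation, and ci
# is the candidate cell input before tanh (toy scalar values). Clipping is
# assumed to be symmetric in cell_clip.
i, f, o, ci = 0.9, 0.1, 0.2, 0.5
cs_prev, wco, cell_clip = 0.3, 0.4, 3.0

ci = np.tanh(ci)
cs = ci * i + cs_prev * f
cs = np.clip(cs, -cell_clip, cell_clip)
o = sigmoid(cs * wco + o)   # peephole from cs feeds the output gate, not f
co = np.tanh(cs)
h = co * o
print(cs, o, h)
```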

View File

@ -322,9 +322,10 @@ class BahdanauAttention(_BaseAttentionMechanism):
Args:
query: Tensor of dtype matching `self.values` and shape
`[batch_size, query_depth]`.
Returns:
score: Tensor of dtype matching `self.values` and shape
`[batch_size, self.num_units]`.
`[batch_size, max_time]` (`max_time` is memory's `max_time`).
"""
with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
processed_query = self.query_layer(query) if self.query_layer else query
@ -522,7 +523,8 @@ class AttentionWrapper(core_rnn_cell.RNNCell):
- Step 5: Calculate the context vector as the inner product between the
alignments and the attention_mechanism's values (memory).
- Step 6: Calculate the attention output by concatenating the cell output
and context through the attention layer.
and context through the attention layer (a linear layer with
`attention_size` outputs).
Args:
inputs: (Possibly nested tuple of) Tensor, the input at this time step.
@ -531,10 +533,10 @@ class AttentionWrapper(core_rnn_cell.RNNCell):
scope: Must be `None`.
Returns:
A tuple `(attention, next_state)`, where:
A tuple `(attention_or_cell_output, next_state)`, where:
- `attention` is the attention passed to the layer above.
- `next_state` is an instance of `AttentionWrapperState`
- `attention_or_cell_output` is either the attention or the cell output, depending on `output_attention`.
- `next_state` is an instance of `DynamicAttentionWrapperState`
containing the state calculated at this time step.
Raises:

View File

@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Seq2seq loss operations for use in sequence models.
"""
@ -28,22 +27,33 @@ from tensorflow.python.ops import nn_ops
__all__ = ["sequence_loss"]
def sequence_loss(logits, targets, weights,
average_across_timesteps=True, average_across_batch=True,
softmax_loss_function=None, name=None):
"""Weighted cross-entropy loss for a sequence of logits (per example).
def sequence_loss(logits,
targets,
weights,
average_across_timesteps=True,
average_across_batch=True,
softmax_loss_function=None,
name=None):
"""Weighted cross-entropy loss for a sequence of logits.
Depending on the values of `average_across_timesteps` and
`average_across_batch`, the return Tensor will have rank 0, 1, or 2 as these
arguments reduce the cross-entropy at each target, which has shape
`[batch_size, sequence_length]`, over their respective dimensions. For
example, if `average_across_timesteps` is `True` and `average_across_batch`
is `False`, then the return Tensor will have shape `[batch_size]`.
Args:
logits: A 3D Tensor of shape
[batch_size x sequence_length x num_decoder_symbols] and dtype float.
logits: A Tensor of shape
`[batch_size, sequence_length, num_decoder_symbols]` and dtype float.
The logits correspond to the prediction across all classes at each
timestep.
targets: A 2D Tensor of shape [batch_size x sequence_length] and dtype
targets: A Tensor of shape `[batch_size, sequence_length]` and dtype
int. The target represents the true class at each timestep.
weights: A 2D Tensor of shape [batch_size x sequence_length] and dtype
float. Weights constitutes the weighting of each prediction in the
sequence. When using weights as masking set all valid timesteps to 1 and
all padded timesteps to 0.
weights: A Tensor of shape `[batch_size, sequence_length]` and dtype
float. `weights` constitutes the weighting of each prediction in the
sequence. When using `weights` as masking, set all valid timesteps to 1
and all padded timesteps to 0, e.g. a mask returned by `tf.sequence_mask`.
average_across_timesteps: If set, sum the cost across the sequence
dimension and divide the cost by the total label weight across timesteps.
average_across_batch: If set, sum the cost across the batch dimension and
@ -55,7 +65,10 @@ def sequence_loss(logits, targets, weights,
name: Optional name for this operation, defaults to "sequence_loss".
Returns:
A scalar float Tensor: The average log-perplexity per symbol (weighted).
A float Tensor of rank 0, 1, or 2 depending on the
`average_across_timesteps` and `average_across_batch` arguments. By default,
it has rank 0 (scalar) and is the weighted average cross-entropy
(log-perplexity) per symbol.
Raises:
ValueError: logits does not have 3 dimensions or targets does not have 2

View File

@ -721,7 +721,8 @@ cc_library(
"//tensorflow/core/kernels:quantized_ops",
]) + if_mkl([
"//tensorflow/core/kernels:mkl_conv_op",
"//tensorflow/core/kernels:mkl_matmul_op",
"//tensorflow/core/kernels:mkl_pooling_ops",
"//tensorflow/core/kernels:mkl_relu_op",
"//tensorflow/core/kernels:mkl_tfconv_op",
]),
)
@ -2094,7 +2095,8 @@ tf_cc_test_mkl(
"//tensorflow/cc:scope",
"//tensorflow/cc:sendrecv_ops",
"//tensorflow/core/kernels:mkl_conv_op",
"//tensorflow/core/kernels:mkl_matmul_op",
"//tensorflow/core/kernels:mkl_pooling_ops",
"//tensorflow/core/kernels:mkl_relu_op",
"//tensorflow/core/kernels:mkl_tfconv_op",
"//tensorflow/core/kernels:ops_util",
"//third_party/eigen3",

File diff suppressed because it is too large

View File

@ -18,7 +18,10 @@ limitations under the License.
#include "tensorflow/core/graph/mkl_layout_pass.h"
#include "tensorflow/core/util/mkl_util.h"
#include <algorithm>
#include <string>
#include <vector>
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/graph.h"
@ -107,10 +110,345 @@ class MklLayoutPassTest : public ::testing::Test {
};
REGISTER_OP("Input").Output("o: float").SetIsStateful();
REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
REGISTER_OP("MklInput").Output("o: uint8").SetIsStateful();
REGISTER_OP("MklInput2").Output("o: uint8").Output("o1: uint8").SetIsStateful();
/////////////////////////////////////////////////////////////////////
// Unit tests related to node merge optimization
/////////////////////////////////////////////////////////////////////
TEST_F(MklLayoutPassTest, Basic) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }"
"node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Mul);D(Mul)|"
"A->C;A->D;B->C:1;B->D:1");
}
// Test set 1: Conv2D + AddBias
// C=MklConv2D(A,M,B,N); E=BiasAdd(C,D); Z=Sub(E,Y)
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'M' op: 'MklInput'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'N' op: 'MklInput'}"
"node { name: 'C' op: 'MklConv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'M', 'B', 'N']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'BiasAdd'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['C', 'D'] }"
"node { name: 'Y' op: 'Input'}"
"node { name: 'Z' op: 'Sub'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['E', 'Y']}");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);D(Input);DMT/_0(Const);E(MklConv2DWithBias);"
"M(MklInput);N(MklInput);Y(Input);Z(Sub)|A->E;B->E:2;D->E:4;"
"DMT/_0->E:5;E->Z;M->E:1;N->E:3;Y->Z:1");
}
// C=MklConv2D(A,M:1,B,N:1); E=BiasAdd(C,D); Z=Sub(E,Y)
// Test for correct output slots selected
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive1) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'M' op: 'MklInput2'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'N' op: 'MklInput2'}"
"node { name: 'C' op: 'MklConv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'M:1', 'B', 'N:1']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'BiasAdd'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['C', 'D'] }"
"node { name: 'Y' op: 'Input'}"
"node { name: 'Z' op: 'Sub'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['E', 'Y']}");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);D(Input);DMT/_0(Const);E(MklConv2DWithBias);"
"M(MklInput2);N(MklInput2);Y(Input);Z(Sub)|A->E;B->E:2;D->E:4;"
"DMT/_0->E:5;E->Z;M:1->E:1;N:1->E:3;Y->Z:1");
}
// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y);
// This is a case of node rewrite followed by node merge.
// We will first rewrite Conv2D to MklConv2D, and then merge MklConv2D
// with BiasAdd to produce MklConv2DWithBias.
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive2) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'Conv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'BiasAdd'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['C', 'D'] }"
"node { name: 'Y' op: 'Input'}"
"node { name: 'Z' op: 'Sub'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['E', 'Y']}");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
"DMT/_2(Const);E(MklConv2DWithBias);Y(Input);Z(Sub)|"
"A->E;B->E:2;D->E:4;DMT/_0->E:1;DMT/_1->E:3;DMT/_2->E:5;"
"E->Z;Y->Z:1");
}
// Graph contains only MklConv2D, no AddBias.
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_NoAddBias) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'M' op: 'MklInput'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'N' op: 'MklInput'}"
"node { name: 'C' op: 'MklConv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'M', 'B', 'N']}");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(MklConv2D);M(MklInput);N(MklInput)|"
"A->C;B->C:2;M->C:1;N->C:3");
}
// MklConv2D output does not go to BiasAdd.
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow1) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'M' op: 'MklInput'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'N' op: 'MklInput'}"
"node { name: 'C' op: 'MklConv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'M', 'B', 'N']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'Input'}"
"node { name: 'F' op: 'BiasAdd'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['D', 'E'] }"); // Output of MklConv2D does not go to BiasAdd.
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(MklConv2D);D(Input);E(Input);F(BiasAdd);"
"M(MklInput);N(MklInput)|A->C;B->C:2;D->F;E->F:1;M->C:1;N->C:3");
}
// MklConv2D has two outgoing edges: BiasAdd and some other dummy node (Add).
// Merge should not be done in such a case.
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'M' op: 'MklInput'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'N' op: 'MklInput'}"
"node { name: 'C' op: 'MklConv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'M', 'B', 'N']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'Input'}"
"node { name: 'F' op: 'BiasAdd'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['D', 'E'] }" // Conv2D has two outputs.
// No merge should happen.
"node { name: 'G' op: 'Add'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'E'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(MklConv2D);D(Input);E(Input);F(BiasAdd);"
"G(Add);M(MklInput);N(MklInput)|A->C;B->C:2;C->G;D->F;"
"E->F:1;E->G:1;M->C:1;N->C:3");
}
// data_format attribute value mismatch. Merge should not be done
// in such a case.
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_AttrMismatch) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'M' op: 'MklInput'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'N' op: 'MklInput'}"
"node { name: 'C' op: 'MklConv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'M', 'B', 'N']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'BiasAdd'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NHCW' } }"
" input: ['C', 'D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(MklConv2D);D(Input);E(BiasAdd);M(MklInput);"
"N(MklInput)|A->C;B->C:2;C->E;D->E:1;M->C:1;N->C:3");
}
// No MklConv2D in context, but Conv2D in context.
// Only Conv2D would be rewritten to MklConv2D, but no rewrite
// for BiasAddGrad should happen.
// C=MklConv2D(A,M,B,N); D=Sub(C,A); E=BiasAddGrad(D)
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Neg_NoMklConv2DWithBias) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'M' op: 'MklInput'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'N' op: 'MklInput'}"
"node { name: 'C' op: 'MklConv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'M', 'B', 'N']}"
"node { name: 'D' op: 'Sub'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'A']}"
"node { name: 'E' op: 'BiasAddGrad'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(MklConv2D);D(Sub);E(BiasAddGrad);"
"M(MklInput);N(MklInput)|A->C;A->D:1;B->C:2;C->D;D->E;"
"M->C:1;N->C:3");
}
// No Conv2D in the context for BiasAddGrad. No rewrite should happen.
// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D)
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'Add'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Sub'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'A']}"
"node { name: 'E' op: 'BiasAddGrad'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Add);D(Sub);E(BiasAddGrad)|"
"A->C;A->D:1;B->C:1;C->D;D->E");
}
// No Conv2D in the context for BiasAddGrad, but MatMul in context.
// Rewrite should happen, but name of BiasAddGrad does not change.
// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D)
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D_MatMul) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'MatMul'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'transpose_a' value { b: false } }"
" attr { key: 'transpose_b' value { b: false } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Sub'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'A']}"
"node { name: 'E' op: 'BiasAddGrad'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(MatMul);D(Sub);E(BiasAddGrad)|"
"A->C;A->D:1;B->C:1;C->D;D->E");
}
// Test set 3: MatMul..BiasAddGrad -> BiasAddGrad rewrite tests
// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D)
TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Positive) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'MatMul'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'transpose_a' value { b: false } }"
" attr { key: 'transpose_b' value { b: false } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Sub'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'A']}"
"node { name: 'E' op: 'BiasAddGrad'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(MatMul);D(Sub);E(BiasAddGrad)|"
"A->C;A->D:1;B->C:1;C->D;D->E");
}
// No MatMul in the context for BiasAddGrad. No rewrite should happen.
// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D)
TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Negative_NoMatMul) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'Add'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Sub'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'A']}"
"node { name: 'E' op: 'BiasAddGrad'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Add);D(Sub);E(BiasAddGrad)|"
"A->C;A->D:1;B->C:1;C->D;D->E");
}
/////////////////////////////////////////////////////////////////////
// Unit tests related to rewriting node to Mkl node
/////////////////////////////////////////////////////////////////////
// Single Conv2D Op; No Mkl layer on the input and on the output.
// We will generate dummy Mkl tensor as 2nd input of Conv2D.
TEST_F(MklLayoutPassTest, Conv2D_Basic) {
TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Basic) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
@ -130,7 +468,7 @@ TEST_F(MklLayoutPassTest, Conv2D_Basic) {
// 2 Conv2D Ops in sequence. Both should get transformed and 1st Conv2D will
// have 2 outputs, both of which will be inputs to next Conv2D.
TEST_F(MklLayoutPassTest, Conv2D_Positive1) {
TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
@ -156,6 +494,104 @@ TEST_F(MklLayoutPassTest, Conv2D_Positive1) {
"C:1->D:3;D->E:1;DMT/_0->C:1;DMT/_1->C:3;DMT/_2->D:1");
}
// Conv2D with a half-precision (DT_HALF) input, which is not supported by Mkl
TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Negative_UnsupportedType) {
InitGraph(
"node { name: 'A' op: 'HalfInput'}"
"node { name: 'B' op: 'HalfInput'}"
"node { name: 'C' op: 'Conv2D'"
" attr { key: 'T' value { type: DT_HALF } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_HALF } }"
" input: ['B', 'C'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(HalfInput);B(HalfInput);C(Conv2D);D(Mul)|"
"A->C;B->C:1;B->D;C->D:1");
}
/////////////////////////////////////////////////////////////////////
// Unit tests related to rewriting node for workspace edges
/////////////////////////////////////////////////////////////////////
/* Test MaxPool->MaxPoolGrad replacement by workspace+rewrite nodes. */
TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Positive) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'MaxPool'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
" input: ['A'] }"
"node { name: 'C' op: 'Input'}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'MaxPoolGrad'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
" input: ['C', 'B', 'D'] }"
"node { name: 'F' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'E'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(MklMaxPool);C(Input);D(Input);DMT/_0(Const);"
"DMT/_1(Const);DMT/_2(Const);E(MklMaxPoolGrad);F(Mul)|"
"A->B;B->E:2;B:1->E:3;B:2->E:6;B:3->E:7;C->E;C->F;D->E:4;"
"DMT/_0->B:1;DMT/_1->E:1;DMT/_2->E:5;E->F:1");
}
// Test MaxPool->MaxPoolGrad replacement when only one of them is present.
// In this case, we will rewrite MaxPool node but workspace edges will not
// be present.
TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative1) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'MaxPool'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
" input: ['A'] }"
"node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(MklMaxPool);C(Mul);DMT/_0(Const)|"
"A->B;A->C;B->C:1;DMT/_0->B:1");
}
// Test MaxPool->MaxPoolGrad replacement when only one of them is present.
// In this case, we will rewrite MaxPoolGrad, and for the workspace tensor and
// its Mkl part we will generate dummy tensors.
TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative2) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'Input'}"
"node { name: 'D' op: 'MaxPoolGrad'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
" input: ['A', 'B', 'C'] }"
"node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Input);D(MklMaxPoolGrad);DMT/_0(Const);"
"DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Mul)|"
"A->D;A->E;B->D:2;C->D:4;D->E:1;DMT/_0->D:1;DMT/_1->D:3;"
"DMT/_2->D:5;DMT/_3->D:6;DMT/_4->D:7");
}
/////////////////////////////////////////////////////////////////////
static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
testing::StopTiming();
string s;

View File

@ -81,9 +81,10 @@ class MklToTfConversionPass : public GraphOptimizationPass {
// Is the input Op supported by Mkl-specific layout?
//
// @input op_name string of the op
// @input T Datatype to use for checking input op
// @return true if op is Mkl supported; false, otherwise.
inline bool IsMklSupportedOp(const string& op_name) const {
return mkl_layer_registry::IsMklLayer(op_name);
inline bool IsMklSupportedOp(const string& op_name, DataType T) const {
return mkl_layer_registry::IsMklLayer(op_name, T);
}
// Insert layout conversion node on the edge pointed by 'e' from graph 'g'.
@ -188,6 +189,13 @@ bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) {
continue;
}
// We skip adding MklToTf on an edge between X->MklToTf or
// MklToTf->X, where X is any layer.
if (src->type_string().compare("MklToTf") == 0 ||
dst->type_string().compare("MklToTf") == 0) {
continue;
}
VLOG(1) << "MklToTfConversionPass: InsertConversionNodes: "
<< src->type_string() << " and " << dst->type_string();
@ -202,8 +210,9 @@ bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) {
GetNodeAttr(dst->def(), "T", &dst_datatype);
// Check if src is Mkl-compliant, while dst is not Mkl-compliant.
if (IsMklSupportedOp(src->type_string()) &&
!IsMklSupportedOp(dst->type_string())) {
if (IsMklSupportedOp(src->type_string(), src_datatype) &&
!IsMklSupportedOp(dst->type_string(), dst_datatype)) {
VLOG(1) << "MklToTfConversionPass: Scheduled nodes " << src->name()
<< " and " << dst->name() << " for inserting conversion nodes";
candidate_edges.push_back(const_cast<Edge*>(e));

View File

@ -17,7 +17,10 @@ limitations under the License.
#include "tensorflow/core/graph/mkl_tfconversion_pass.h"
#include <algorithm>
#include <string>
#include <vector>
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/graph.h"
@ -146,31 +149,34 @@ TEST_F(MklToTfConversionPass, Positive) {
"C:1->Mkl2Tf/_0:1;D->E:1;M->C:1;Mkl2Tf/_0->E;N->C:3");
}
// MklConv2D followed by Non-Mkl layer, and MklConv2D uses half type
// C=MklConv2D(A,M,B,N); E=Sub(C,D)
// MklToTf node should be inserted.
TEST_F(MklToTfConversionPass, Positive_Type) {
// MklConv2D followed by MklToTf op followed by Non-Mkl layer.
// C=MklConv2D(A,M,B,N); D=MklToTf(C:0, C:1) F=Sub(D,E)
// MklToTf node should not be inserted again.
TEST_F(MklToTfConversionPass, Negative_DoubleInsert) {
InitGraph(
"node { name: 'A' op: 'HalfInput'}"
"node { name: 'A' op: 'Input'}"
"node { name: 'M' op: 'MklInput'}"
"node { name: 'B' op: 'HalfInput'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'N' op: 'MklInput'}"
"node { name: 'C' op: 'MklConv2D'"
" attr { key: 'T' value { type: DT_HALF } }"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'M', 'B', 'N']}"
"node { name: 'D' op: 'HalfInput'}"
"node { name: 'E' op: 'Sub'"
" attr {key: 'T' value { type: DT_HALF } }"
" input: ['C', 'D']}");
"node { name: 'D' op: 'MklToTf'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['C:0', 'C:1']}"
"node { name: 'E' op: 'Input'}"
"node { name: 'F' op: 'Sub'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['D', 'E']}");
EXPECT_EQ(DoRunMklToTfConversionPass(),
"A(HalfInput);B(HalfInput);C(MklConv2D);D(HalfInput);"
"E(Sub);M(MklInput);Mkl2Tf/_0(MklToTf);N(MklInput)|"
"A->C;B->C:2;C->Mkl2Tf/_0;C:1->Mkl2Tf/_0:1;D->E:1;"
"M->C:1;Mkl2Tf/_0->E;N->C:3");
"A(Input);B(Input);C(MklConv2D);D(MklToTf);E(Input);"
"F(Sub);M(MklInput);N(MklInput)|"
"A->C;B->C:2;C->D;C:1->D:1;D->F;E->F:1;M->C:1;N->C:3");
}
// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y);

View File

@ -27,6 +27,7 @@ load(
"tf_copts",
"tf_opts_nortti_if_android",
"tf_kernel_library",
"tf_mkl_kernel_library",
"cc_header_only_library",
)
load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
@ -2241,6 +2242,12 @@ tf_kernel_library(
tf_kernel_library(
name = "matmul_op",
srcs = [
"matmul_op.cc",
] + if_mkl([
"mkl_matmul_op.cc",
]),
hdrs = ["matmul_op.h"],
defines = select({
":xsmm": [
"TENSORFLOW_USE_LIBXSMM",
@ -2248,13 +2255,14 @@ tf_kernel_library(
],
"//conditions:default": [],
}),
prefix = "matmul_op",
deps = MATH_DEPS + select({
":xsmm": [
"@libxsmm_archive//:xsmm_avx",
],
"//conditions:default": [],
}),
}) + if_mkl([
"//third_party/mkl:intel_binary_blob",
]),
)
tf_kernel_library(
@ -2770,6 +2778,7 @@ tf_kernel_library(
"cudnn_pooling_gpu.h",
"fractional_pool_common.h",
"maxpooling_op.h",
"pooling_ops_3d.h",
"pooling_ops_common.h",
],
gpu_srcs = [
@ -2780,6 +2789,8 @@ tf_kernel_library(
"maxpooling_op_gpu.h",
"pooling_ops_common.h",
"pooling_ops_common_gpu.h",
"pooling_ops_3d_gpu.h",
"pooling_ops_3d_gpu.cu.cc",
],
deps = [
":conv_2d",
@ -4468,49 +4479,69 @@ tf_cc_test(
],
)
if_mkl(
tf_kernel_library(
name = "mkl_matmul_op",
prefix = "mkl_matmul",
deps = [
":math",
"//third_party/mkl:intel_binary_blob",
],
),
tf_mkl_kernel_library(
name = "mkl_conv_op",
prefix = "mkl_conv",
deps = [
":bounds_check",
":conv_ops",
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:nn_ops_op_lib",
"//third_party/mkl:intel_binary_blob",
],
)
if_mkl(
tf_kernel_library(
name = "mkl_conv_op",
prefix = "mkl_conv",
deps = [
":bounds_check",
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:nn_ops_op_lib",
"//third_party/mkl:intel_binary_blob",
],
),
tf_mkl_kernel_library(
name = "mkl_tfconv_op",
prefix = "mkl_tfconv",
deps = [
":bounds_check",
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:nn_ops_op_lib",
"//third_party/mkl:intel_binary_blob",
],
)
if_mkl(
tf_kernel_library(
name = "mkl_tfconv_op",
prefix = "mkl_tfconv",
deps = [
":bounds_check",
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:nn_ops_op_lib",
"//third_party/mkl:intel_binary_blob",
],
),
tf_mkl_kernel_library(
name = "mkl_pooling_ops",
srcs = [
"mkl_avgpooling_op.cc",
"mkl_maxpooling_op.cc",
"mkl_pooling_ops_common.cc",
],
hdrs = ["mkl_pooling_ops_common.h"],
deps = [
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:nn_ops_op_lib",
"//third_party/mkl:intel_binary_blob",
],
)
tf_mkl_kernel_library(
name = "mkl_relu_op",
prefix = "mkl_relu",
deps = [
":bounds_check",
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:nn_ops_op_lib",
"//third_party/mkl:intel_binary_blob",
],
)
# -----------------------------------------------------------------------------

View File

@ -30,6 +30,9 @@ limitations under the License.
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/conv_2d.h"
#ifdef TENSORFLOW_USE_LIBXSMM
#include "tensorflow/core/kernels/xsmm_conv2d.h"
#endif
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
@ -88,6 +91,75 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_LIBXSMM
template <typename Device, class T>
struct LaunchXsmmBackwardFilter {
bool operator()(OpKernelContext* context, const Device& d,
typename TTypes<T, 4>::ConstTensor input_backward,
typename TTypes<T, 4>::Tensor kernel,
typename TTypes<T, 4>::ConstTensor output_backward,
int input_rows, int input_cols, int row_stride,
int col_stride, int pad_h, int pad_w,
TensorFormat data_format) const {
return false;
}
};
template <>
struct LaunchXsmmBackwardFilter<CPUDevice, float> {
bool operator()(OpKernelContext* context, const CPUDevice& d,
typename TTypes<float, 4>::ConstTensor input,
typename TTypes<float, 4>::Tensor filter,
typename TTypes<float, 4>::ConstTensor output, int input_rows,
int input_cols, int row_stride, int col_stride, int pad_h,
int pad_w, TensorFormat data_format) const {
auto batch = input.dimension(0);
auto in_depth = input.dimension(3);
auto out_depth = output.dimension(3);
auto filter_rows = filter.dimension(0);
auto filter_cols = filter.dimension(1);
auto num_threads =
context->device()->tensorflow_cpu_worker_threads()->num_threads;
// See libxsmm_dnn.h for this struct definition.
libxsmm_dnn_conv_desc desc;
desc.N = batch;
desc.C = in_depth;
desc.H = input_rows;
desc.W = input_cols;
desc.K = out_depth;
desc.R = filter_rows;
desc.S = filter_cols;
desc.u = row_stride;
desc.v = col_stride;
desc.pad_h = pad_h;
desc.pad_w = pad_w;
desc.pad_h_in = 0; // pad_rows; // ignored by libxsmm for now.
desc.pad_w_in = 0; // pad_cols; // ignored by libxsmm for now.
desc.pad_h_out = 0;
desc.pad_w_out = 0;
desc.threads = num_threads;
desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT;
desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
if (!CanUseXsmmConv2D(desc, data_format)) {
return false;
}
auto input_ptr = input.data();
auto filter_ptr = filter.data();
auto output_ptr = output.data();
bool success = functor::XsmmBkwFilterConv2D<CPUDevice, float>()(
context, desc, input_ptr, filter_ptr, output_ptr);
return success;
}
};
#endif
template <typename Device, class T>
class Conv2DFastBackpropFilterOp : public OpKernel {
public:
@ -135,6 +207,36 @@ class Conv2DFastBackpropFilterOp : public OpKernel {
OP_REQUIRES_OK(context,
context->allocate_output(0, filter_shape, &filter_backprop));
#if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
int64 pad_top, pad_bottom;
int64 pad_left, pad_right;
OP_REQUIRES_OK(
context,
GetWindowedOutputSizeVerbose(
dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
dims.spatial_dims[0].stride, padding_,
&dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
OP_REQUIRES_OK(
context,
GetWindowedOutputSizeVerbose(
dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
dims.spatial_dims[1].stride, padding_,
&dims.spatial_dims[1].output_size, &pad_left, &pad_right));
if (pad_left == pad_right && pad_top == pad_bottom) {
if (LaunchXsmmBackwardFilter<Device, T>()(
context, context->eigen_device<Device>(), input.tensor<T, 4>(),
filter_backprop->tensor<T, 4>(), out_backprop.tensor<T, 4>(),
dims.spatial_dims[0].input_size, dims.spatial_dims[1].input_size,
(int)dims.spatial_dims[0].stride,
(int)dims.spatial_dims[1].stride, (int)pad_top, (int)pad_left,
data_format_)) {
return;
}
}
#endif
functor::SpatialConvolutionBackwardKernel<Device, T>()(
context->eigen_device<Device>(), filter_backprop->tensor<T, 4>(),
input.tensor<T, 4>(), out_backprop.tensor<T, 4>(),
@ -213,6 +315,19 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
dims.spatial_dims[1].stride, padding_,
&dims.spatial_dims[1].output_size, &pad_left, &pad_right));
#if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
if (pad_left == pad_right && pad_top == pad_bottom) {
if (LaunchXsmmBackwardFilter<Device, T>()(
context, context->eigen_device<Device>(), input.tensor<T, 4>(),
filter_backprop->tensor<T, 4>(), out_backprop.tensor<T, 4>(),
dims.spatial_dims[0].input_size, dims.spatial_dims[1].input_size,
(int)dims.spatial_dims[0].stride,
(int)dims.spatial_dims[1].stride, (int)pad_top, (int)pad_left,
data_format_)) {
return;
}
}
#endif
// The total dimension size of each kernel.
const int filter_total_size = dims.spatial_dims[0].filter_size *
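The pad_left == pad_right && pad_top == pad_bottom gate above exists because libxsmm only handles symmetric padding, while SAME padding can be asymmetric. The standalone sketch below mirrors the standard SAME-padding arithmetic that GetWindowedOutputSizeVerbose reports (the helper name is made up for illustration):
#include <algorithm>
#include <cassert>
// SAME padding: output = ceil(input / stride); the total padding needed is
// split between the two sides, with the extra element (if any) on the
// trailing side -- which is exactly the case where the libxsmm path is skipped.
void SamePadding(int input, int filter, int stride,
                 int* output, int* pad_before, int* pad_after) {
  *output = (input + stride - 1) / stride;
  const int needed = std::max(0, (*output - 1) * stride + filter - input);
  *pad_before = needed / 2;
  *pad_after = needed - *pad_before;
}
int main() {
  int out, before, after;
  SamePadding(5, 3, 2, &out, &before, &after);  // symmetric: pad 1 / 1
  assert(out == 3 && before == 1 && after == 1);
  SamePadding(4, 3, 2, &out, &before, &after);  // asymmetric: pad 0 / 1
  assert(out == 2 && before == 0 && after == 1);
  return 0;
}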

View File

@ -131,7 +131,8 @@ struct LaunchXsmmBackwardInputConvolution {
typename TTypes<T, 4>::ConstTensor kernel,
typename TTypes<T, 4>::ConstTensor output_backward,
int input_rows, int input_cols, int row_stride,
int col_stride, TensorFormat data_format) const {
int col_stride, int pad_h, int pad_w,
TensorFormat data_format) const {
return false;
}
};
@ -143,7 +144,8 @@ struct LaunchXsmmBackwardInputConvolution<CPUDevice, float> {
typename TTypes<float, 4>::ConstTensor kernel,
typename TTypes<float, 4>::ConstTensor output_backward,
int input_rows, int input_cols, int row_stride,
int col_stride, TensorFormat data_format) const {
int col_stride, int pad_h, int pad_w,
TensorFormat data_format) const {
auto batch = input_backward.dimension(0);
auto in_depth = input_backward.dimension(3);
auto out_depth = output_backward.dimension(3);
@ -162,10 +164,10 @@ struct LaunchXsmmBackwardInputConvolution<CPUDevice, float> {
desc.S = filter_cols;
desc.u = row_stride;
desc.v = col_stride;
desc.pad_h = 0;
desc.pad_w = 0;
desc.pad_h_in = 0; // pad_rows; // ignored by libxsmm for now.
desc.pad_w_in = 0; // pad_cols; // ignored by libxsmm for now.
desc.pad_h = pad_h;
desc.pad_w = pad_w;
desc.pad_h_in = 0;
desc.pad_w_in = 0;
desc.pad_h_out = 0;
desc.pad_w_out = 0;
desc.threads = num_threads;
@ -174,7 +176,7 @@ struct LaunchXsmmBackwardInputConvolution<CPUDevice, float> {
desc.filter_format =
LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; // LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
desc.options = LIBXSMM_DNN_CONV_OPTION_WU_EXT_FILTER_REDUCE;
desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
auto input_ptr = input_backward.data();
@ -236,13 +238,31 @@ class Conv2DFastBackpropInputOp : public OpKernel {
context->allocate_output(0, input_shape, &in_backprop));
#if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
if (LaunchXsmmBackwardInputConvolution<Device, T>()(
context, context->eigen_device<Device>(),
in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
out_backprop.tensor<T, 4>(), dims.spatial_dims[0].input_size,
dims.spatial_dims[1].input_size, dims.spatial_dims[0].stride,
dims.spatial_dims[1].stride, data_format_)) {
return;
int64 pad_top, pad_bottom;
int64 pad_left, pad_right;
OP_REQUIRES_OK(
context,
GetWindowedOutputSizeVerbose(
dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
dims.spatial_dims[0].stride, padding_,
&dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
OP_REQUIRES_OK(
context,
GetWindowedOutputSizeVerbose(
dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
dims.spatial_dims[1].stride, padding_,
&dims.spatial_dims[1].output_size, &pad_left, &pad_right));
if (pad_left == pad_right && pad_top == pad_bottom) {
if (LaunchXsmmBackwardInputConvolution<Device, T>()(
context, context->eigen_device<Device>(),
in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
out_backprop.tensor<T, 4>(), dims.spatial_dims[0].input_size,
dims.spatial_dims[1].input_size, (int)dims.spatial_dims[0].stride,
(int)dims.spatial_dims[1].stride, (int)pad_top, (int)pad_left,
data_format_)) {
return;
}
}
#endif
@ -309,19 +329,9 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
OP_REQUIRES_OK(context,
context->allocate_output(0, input_shape, &in_backprop));
// TODO(andydavis) Consider moving code shared with
// Conv2DCustomBackpropFilterOp into a shared helper function.
#if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
if (LaunchXsmmBackwardInputConvolution<Device, T>()(
context, context->eigen_device<Device>(),
in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
out_backprop.tensor<T, 4>(), dims.spatial_dims[0].input_size,
dims.spatial_dims[1].input_size, dims.spatial_dims[0].stride,
dims.spatial_dims[1].stride, data_format_)) {
return;
}
#endif
// TODO(andydavis) Consider moving code shared with
// Conv2DCustomBackpropFilterOp into a shared helper function.
int64 pad_top, pad_bottom;
int64 pad_left, pad_right;
OP_REQUIRES_OK(
@ -337,6 +347,34 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
dims.spatial_dims[1].stride, padding_,
&dims.spatial_dims[1].output_size, &pad_left, &pad_right));
if (pad_left == pad_right && pad_top == pad_bottom) {
if (LaunchXsmmBackwardInputConvolution<Device, T>()(
context, context->eigen_device<Device>(),
in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
out_backprop.tensor<T, 4>(), dims.spatial_dims[0].input_size,
dims.spatial_dims[1].input_size, (int)dims.spatial_dims[0].stride,
(int)dims.spatial_dims[1].stride, (int)pad_top, (int)pad_left,
data_format_)) {
return;
}
}
#else
int64 pad_top, pad_bottom;
int64 pad_left, pad_right;
#endif
OP_REQUIRES_OK(
context,
GetWindowedOutputSizeVerbose(
dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
dims.spatial_dims[0].stride, padding_,
&dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
OP_REQUIRES_OK(
context,
GetWindowedOutputSizeVerbose(
dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
dims.spatial_dims[1].stride, padding_,
&dims.spatial_dims[1].output_size, &pad_left, &pad_right));
// The total dimension size of each kernel.
const int filter_total_size = dims.spatial_dims[0].filter_size *
dims.spatial_dims[1].filter_size *

View File

@ -213,8 +213,8 @@ class LaunchXsmmConvOp<CPUDevice, float> {
desc.v = stride_cols;
desc.pad_h = pad_rows;
desc.pad_w = pad_cols;
desc.pad_h_in = pad_rows; // libxsmm supports only physical padding for now
desc.pad_w_in = pad_cols; // libxsmm supports only physical padding for now
desc.pad_h_in = 0;
desc.pad_w_in = 0;
desc.pad_h_out = 0;
desc.pad_w_out = 0;
desc.threads = num_threads;
@ -222,13 +222,17 @@ class LaunchXsmmConvOp<CPUDevice, float> {
desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;
desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
desc.options = LIBXSMM_DNN_CONV_OPTION_WU_EXT_FILTER_REDUCE;
desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
if (!CanUseXsmmConv2D(desc, data_format)) {
return false;
}
if (!CanUseXsmmConv2D(desc, data_format)) {
return false;
}
auto input_ptr = input.template flat<float>().data();
auto filter_ptr = filter.template flat<float>().data();
auto output_ptr = output->template flat<float>().data();

View File

@ -548,9 +548,11 @@ template struct functor::TransformFilter<GPUDevice, Eigen::half, int, 4>;
template struct functor::ReverseTransformFilter<GPUDevice, float, 4>;
template struct functor::ReverseTransformFilter<GPUDevice, Eigen::half, 4>;
template struct functor::NHWCToNCHW<GPUDevice, double, 4>;
template struct functor::NHWCToNCHW<GPUDevice, float, 4>;
template struct functor::NHWCToNCHW<GPUDevice, Eigen::half, 4>;
template struct functor::NCHWToNHWC<GPUDevice, double, 4>;
template struct functor::NCHWToNHWC<GPUDevice, float, 4>;
template struct functor::NCHWToNHWC<GPUDevice, Eigen::half, 4>;

View File

@ -18,6 +18,7 @@ limitations under the License.
#include <array>
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/conv_3d.h"
#include "tensorflow/core/kernels/conv_ops_gpu.h"
@ -242,8 +243,11 @@ void DnnPooling3dGradOp<T>::Compute(
}
}
template class DnnPooling3dOp<float>;
template class DnnPooling3dGradOp<float>;
#define DEFINE_DNN_OPS(T) \
template class DnnPooling3dOp<T>; \
template class DnnPooling3dGradOp<T>;
TF_CALL_float(DEFINE_DNN_OPS) TF_CALL_half(DEFINE_DNN_OPS)
#undef DEFINE_DNN_OPS
#endif // GOOGLE_CUDA

View File

@ -24,6 +24,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
@ -46,6 +47,7 @@ limitations under the License.
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
const int kInvalidMaxPoolingIndex = -1;
@ -187,40 +189,6 @@ static void SpatialMaxPoolWithArgMaxHelper(
params.tensor_in_batch, shard_cost, shard);
}
REGISTER_KERNEL_BUILDER(
Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<float>("T"),
MaxPoolingOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
MaxPoolingOp<CPUDevice, Eigen::half>);
#if GOOGLE_CUDA
// Forward declarations for the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T) \
template <> \
void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()( \
const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \
typename TTypes<T, 4>::ConstTensor input, int window_rows, \
int window_cols, int row_stride, int col_stride, \
const Eigen::PaddingType& padding); \
extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>;
DECLARE_GPU_SPEC(float);
#undef DECLARE_GPU_SPEC
} // namespace functor
// Note(jiayq): Currently, the Caffe custom implementation is faster than the
// default Eigen implementation so we are using the custom kernel as the
// default. However, you can explicitly invoke the eigen version using
// kernel_label_map.
REGISTER_KERNEL_BUILDER(Name("MaxPool")
.Device(DEVICE_GPU)
.TypeConstraint<float>("T")
.Label("eigen_tensor"),
MaxPoolingOp<Eigen::GpuDevice, float>);
#endif // GOOGLE_CUDA
// The operation to compute MaxPool gradients.
// It takes three inputs:
// - The original input tensor
@ -237,7 +205,7 @@ class MaxPoolingGradOp : public OpKernel {
errors::InvalidArgument("Invalid data format"));
OP_REQUIRES(
context, data_format_ == FORMAT_NHWC,
errors::InvalidArgument("Default MaxPoolinGradOp only supports NHWC ",
errors::InvalidArgument("Default MaxPoolingGradOp only supports NHWC ",
"on device type ",
DeviceTypeString(context->device_type())));
OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
@ -305,13 +273,6 @@ class MaxPoolingGradOp : public OpKernel {
TensorFormat data_format_;
};
REGISTER_KERNEL_BUILDER(
Name("MaxPoolGrad").Device(DEVICE_CPU).TypeConstraint<float>("T"),
MaxPoolingGradOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPoolGrad").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
MaxPoolingGradOp<CPUDevice, Eigen::half>);
#ifdef GOOGLE_CUDA
template <typename T>
@ -329,13 +290,13 @@ static void MaxPoolingBackwardCustomKernel(
return;
}
MaxPoolBackwardNoMask(
functor::MaxPoolBackwardNoMask<T>()(
tensor_in->flat<T>().data(), params.tensor_in_batch,
params.tensor_in_rows, params.tensor_in_cols, params.depth,
params.out_height, params.out_width, params.window_rows,
params.window_cols, params.row_stride, params.col_stride, params.pad_rows,
params.pad_cols, out_backprop.flat<T>().data(),
output->flat<T>().data(), context->eigen_device<Eigen::GpuDevice>());
params.pad_cols, out_backprop.flat<T>().data(), output->flat<T>().data(),
context->eigen_device<Eigen::GpuDevice>());
}
template <class T>
@ -403,12 +364,252 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
bool use_dnn_;
};
REGISTER_KERNEL_BUILDER(
Name("MaxPoolGrad").Device(DEVICE_GPU).TypeConstraint<float>("T"),
MaxPoolingGradOp<Eigen::GpuDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPoolGrad").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
MaxPoolingGradOp<Eigen::GpuDevice, Eigen::half>);
#endif // GOOGLE_CUDA
// The operation to compute gradient of MaxPool gradients.
// It takes three inputs:
// - The original input tensor
// - The original output tensor
// - Backprop tensor for output gradients
// It produces one output: backprop tensor for output gradient.
template <class Device, class T>
class MaxPoolingGradGradOp : public OpKernel {
public:
explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
: OpKernel(context) {
string data_format;
OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
errors::InvalidArgument("Invalid data format"));
OP_REQUIRES(
context, data_format_ == FORMAT_NHWC,
errors::InvalidArgument(
"Default MaxPoolingGradGradOp only supports NHWC ",
"on device type ", DeviceTypeString(context->device_type())));
OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
OP_REQUIRES(context, ksize_.size() == 4,
errors::InvalidArgument("Sliding window ksize field must "
"specify 4 dimensions"));
OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
OP_REQUIRES(context, stride_.size() == 4,
errors::InvalidArgument("Sliding window strides field must "
"specify 4 dimensions"));
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
errors::Unimplemented(
"Pooling is not yet supported on the batch dimension."));
OP_REQUIRES(
context, ksize_[3] == 1 && stride_[3] == 1,
errors::Unimplemented(
"MaxPoolingGradGrad is not yet supported on the depth dimension."));
}
void Compute(OpKernelContext* context) override {
const Tensor& tensor_in = context->input(0);
const Tensor& tensor_out = context->input(1);
const Tensor& out_grad_backprop = context->input(2);
// For maxpooling, tensor_in should have 4 dimensions.
OP_REQUIRES(context, tensor_in.dims() == 4,
errors::InvalidArgument("tensor_in must be 4-dimensional"));
OP_REQUIRES(context, tensor_out.dims() == 4,
errors::InvalidArgument("tensor_out must be 4-dimensional"));
// For maxpooling, out_grad_backprop should have 4 dimensions.
OP_REQUIRES(
context, out_grad_backprop.dims() == 4,
errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));
PoolParameters params{context, ksize_, stride_,
padding_, FORMAT_NHWC, tensor_in.shape()};
Tensor* output = nullptr;
OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
{2}, 0, tensor_out.shape(), &output));
SpatialMaxPoolGradGrad(context, output, tensor_in, tensor_out,
out_grad_backprop, params, padding_);
}
private:
void SpatialMaxPoolGradGrad(OpKernelContext* context, Tensor* bottom_diff,
const Tensor& tensor_in, const Tensor& tensor_out,
const Tensor& top_diff,
const PoolParameters& params,
const Padding& padding) {
typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
ConstEigenMatrixMap;
typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
EigenMatrixMap;
ConstEigenMatrixMap in_mat(
tensor_in.flat<T>().data(), params.depth,
params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
ConstEigenMatrixMap out_mat(
tensor_out.flat<T>().data(), params.depth,
params.out_width * params.out_height * params.tensor_in_batch);
ConstEigenMatrixMap top_diff_mat(
top_diff.flat<T>().data(), params.depth,
params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
EigenMatrixMap bottom_diff_mat(
bottom_diff->flat<T>().data(), params.depth,
params.out_width * params.out_height * params.tensor_in_batch);
const DeviceBase::CpuWorkerThreads& worker_threads =
*(context->device()->tensorflow_cpu_worker_threads());
// The following code basically does the following:
// 1. Flattens the input, output, top_diff and bottom_diff tensors into
// two dimensional arrays.
// tensor_in_as_matrix:
// depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
// tensor_out_as_matrix:
// depth by (out_width * out_height * tensor_in_batch)
// top_diff_as_matrix:
// depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
// bottom_diff_as_matrix:
// depth by (out_width * out_height * tensor_in_batch)
//
// 2. Walks through the set of columns in the flattened
// tensor_in_as_matrix, tensor_out_as_matrix, top_diff_as_matrix
// and updates the column(s) corresponding to the maximum values in
// tensor_out_as_matrix with the corresponding values in
// top_diff_as_matrix.
auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
int64 start, int64 limit) {
const int32 depth = params.depth;
const int32 in_rows = params.tensor_in_rows;
const int32 in_cols = params.tensor_in_cols;
const int32 pad_rows = params.pad_rows;
const int32 pad_cols = params.pad_cols;
const int32 window_rows = params.window_rows;
const int32 window_cols = params.window_cols;
const int32 row_stride = params.row_stride;
const int32 col_stride = params.col_stride;
const int32 out_height = params.out_height;
const int32 out_width = params.out_width;
{
// Initializes the output grad backprop tensor with 0.
const int32 output_image_size = out_height * out_width * params.depth;
EigenMatrixMap bottom_diff_shard(
bottom_diff_mat.data() + start * output_image_size, 1,
(limit - start) * output_image_size);
bottom_diff_shard.setZero();
}
for (int b = start; b < limit; ++b) {
for (int ph = 0; ph < out_height; ++ph) {
for (int pw = 0; pw < out_width; ++pw) {
// (h_start, h_end) * (w_start, w_end) is the range that the input
// vector projects to.
int h_start = ph * row_stride - pad_rows;
const int h_end = std::min(h_start + window_rows, in_rows);
int w_start = pw * col_stride - pad_cols;
const int w_end = std::min(w_start + window_cols, in_cols);
h_start = std::max(h_start, 0);
w_start = std::max(w_start, 0);
const int out_index = (b * out_height + ph) * out_width + pw;
// Find value corresponding to the input maximum in top_diff.
for (int d = 0; d < depth; ++d) {
const T& output_ref = out_mat.coeffRef(d, out_index);
bool should_stop = false;
for (int h = h_start; h < h_end && !should_stop; ++h) {
for (int w = w_start; w < w_end && !should_stop; ++w) {
const int in_index = (b * in_rows + h) * in_cols + w;
const T& input_ref = in_mat.coeffRef(d, in_index);
if (output_ref == input_ref) {
T& bottom_diff_ref = bottom_diff_mat.coeffRef(d, out_index);
bottom_diff_ref = top_diff_mat.coeffRef(d, in_index);
should_stop = true;
}
}
}
}
}
}
}
};
const int64 shard_cost = params.out_width * params.out_height *
params.depth * params.window_rows *
params.window_cols;
Shard(worker_threads.num_threads, worker_threads.workers,
params.tensor_in_batch, shard_cost, shard);
}
std::vector<int32> ksize_;
std::vector<int32> stride_;
Padding padding_;
TensorFormat data_format_;
};
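To make the grad-of-grad rule above concrete (the value written at each pooled position is the top_diff entry at the first input location in the window whose value equals the pooled maximum), here is a minimal single-window, single-channel toy; it is purely illustrative and not part of this file:
// Illustrative toy: one 1x4 window, depth 1. out = max(in);
// grad_grad_out = top_diff[first argmax of in].
#include <array>
#include <cassert>
int main() {
  std::array<float, 4> in = {0.1f, 0.7f, 0.3f, 0.2f};        // max at index 1
  std::array<float, 4> top_diff = {10.f, 20.f, 30.f, 40.f};  // incoming grad-of-grad
  float out = in[0];
  int argmax = 0;
  for (int i = 1; i < 4; ++i) {
    if (in[i] > out) { out = in[i]; argmax = i; }
  }
  const float grad_grad_out = top_diff[argmax];
  assert(argmax == 1 && grad_grad_out == 20.f);
  return 0;
}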
#ifdef GOOGLE_CUDA
template <class T>
class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
public:
typedef Eigen::GpuDevice Device;
explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
: OpKernel(context) {
string data_format;
OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
errors::InvalidArgument("Invalid data format"));
OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
OP_REQUIRES(context, ksize_.size() == 4,
errors::InvalidArgument("Sliding window ksize field must "
"specify 4 dimensions"));
OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
OP_REQUIRES(context, stride_.size() == 4,
errors::InvalidArgument("Sliding window strides field must "
"specify 4 dimensions"));
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
errors::Unimplemented(
"Pooling is not yet supported on the batch dimension."));
}
void Compute(OpKernelContext* context) override {
const Tensor& tensor_in = context->input(0);
const Tensor& tensor_out = context->input(1);
const Tensor& out_grad_backprop = context->input(2);
// For maxpooling, tensor_in should have 4 dimensions.
OP_REQUIRES(context, tensor_in.dims() == 4,
errors::InvalidArgument("tensor_in must be 4-dimensional"));
OP_REQUIRES(context, tensor_out.dims() == 4,
errors::InvalidArgument("tensor_out must be 4-dimensional"));
// For maxpooling, out_grad_backprop should have 4 dimensions.
OP_REQUIRES(
context, out_grad_backprop.dims() == 4,
errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));
Tensor* output = nullptr;
OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
{2}, 0, tensor_out.shape(), &output));
PoolParameters params{context, ksize_, stride_,
padding_, data_format_, tensor_in.shape()};
functor::MaxPoolGradBackwardNoMask<T>()(
data_format_, tensor_in.flat<T>().data(), tensor_out.flat<T>().data(),
params.tensor_in_batch, params.out_height, params.out_width,
params.depth, params.tensor_in_rows, params.tensor_in_cols,
params.window_rows, params.window_cols, params.row_stride,
params.col_stride, params.pad_rows, params.pad_cols,
out_grad_backprop.flat<T>().data(), output->flat<T>().data(),
context->eigen_device<Eigen::GpuDevice>());
}
private:
std::vector<int32> ksize_;
std::vector<int32> stride_;
Padding padding_;
TensorFormat data_format_;
bool use_dnn_;
};
#endif // GOOGLE_CUDA
@ -565,6 +766,56 @@ class MaxPoolingGradWithArgmaxOp : public OpKernel {
Padding padding_;
};
template <typename Device, typename T>
struct LaunchMaxPoolingGradGradWithArgmax;
template <typename Device, typename T>
class MaxPoolingGradGradWithArgmaxOp : public OpKernel {
public:
explicit MaxPoolingGradGradWithArgmaxOp(OpKernelConstruction* context)
: OpKernel(context) {
OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
OP_REQUIRES(context, ksize_.size() == 4,
errors::InvalidArgument("Sliding window ksize field must "
"specify 4 dimensions"));
OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
OP_REQUIRES(context, stride_.size() == 4,
errors::InvalidArgument("Sliding window stride field must "
"specify 4 dimensions"));
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
errors::Unimplemented(
"Pooling is not yet supported on the batch dimension."));
}
void Compute(OpKernelContext* context) override {
const Tensor& tensor_in = context->input(0);
const Tensor& grad_in = context->input(1);
const Tensor& argmax = context->input(2);
PoolParameters params{context, ksize_, stride_,
padding_, FORMAT_NHWC, tensor_in.shape()};
if (!context->status().ok()) {
return;
}
TensorShape out_shape({params.tensor_in_batch, params.out_height,
params.out_width, params.depth});
Tensor* grad_out = nullptr;
OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
{1}, 0, out_shape, &grad_out));
LaunchMaxPoolingGradGradWithArgmax<Device, T>::launch(
context, params, grad_in, argmax, grad_out);
}
private:
std::vector<int32> ksize_;
std::vector<int32> stride_;
Padding padding_;
};
#if GOOGLE_CUDA
template <typename T>
class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
@ -631,7 +882,7 @@ template <typename T>
struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
static void launch(OpKernelContext* context, const PoolParameters& params,
const Tensor& input, Tensor* output) {
bool status = MaxPoolForwardWithOptionalArgmax(
bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
params.tensor_in_cols, params.depth, params.out_height,
params.out_width, params.window_rows, params.window_cols,
@ -644,18 +895,11 @@ struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
}
};
REGISTER_KERNEL_BUILDER(
Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<float>("T"),
MaxPoolingNoMaskOp<Eigen::GpuDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
MaxPoolingNoMaskOp<Eigen::GpuDevice, Eigen::half>);
template <typename T>
struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
static void launch(OpKernelContext* context, const PoolParameters& params,
const Tensor& input, Tensor* output, Tensor* argmax) {
bool status = MaxPoolForwardWithOptionalArgmax(
bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
params.tensor_in_cols, params.depth, params.out_height,
params.out_width, params.window_rows, params.window_cols,
@ -670,17 +914,6 @@ struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
}
};
REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")
.Device(DEVICE_GPU)
.TypeConstraint<int64>("Targmax")
.TypeConstraint<float>("T"),
MaxPoolingWithArgmaxOp<Eigen::GpuDevice, float>);
REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")
.Device(DEVICE_GPU)
.TypeConstraint<int64>("Targmax")
.TypeConstraint<Eigen::half>("T"),
MaxPoolingWithArgmaxOp<Eigen::GpuDevice, Eigen::half>);
template <typename T>
struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
static void launch(OpKernelContext* context, const PoolParameters& params,
@ -693,30 +926,118 @@ struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
const int top_offset = params.out_height * params.out_width * params.depth;
const int bottom_offset =
params.tensor_in_rows * params.tensor_in_cols * params.depth;
bool status = MaxPoolBackwardWithArgmax(
bool status = functor::MaxPoolBackwardWithArgmax<T>()(
output_size, input_size, grad_in.flat<T>().data(),
reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device());
if (!status) {
context->SetStatus(
errors::Internal("Failed launching MaxPoolForwardWithArgmax"));
errors::Internal("Failed launching MaxPoolBackwardWithArgmax"));
}
}
};
REGISTER_KERNEL_BUILDER(
Name("MaxPoolGradWithArgmax")
.Device(DEVICE_GPU)
.TypeConstraint<float>("T")
.TypeConstraint<int64>("Targmax"),
MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPoolGradWithArgmax")
.Device(DEVICE_GPU)
.TypeConstraint<Eigen::half>("T")
.TypeConstraint<int64>("Targmax"),
MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, Eigen::half>);
template <typename T>
struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
static void launch(OpKernelContext* context, const PoolParameters& params,
const Tensor& grad_in, const Tensor& argmax,
Tensor* grad_out) {
const int input_size = params.tensor_in_batch * params.tensor_in_rows *
params.tensor_in_cols * params.depth;
const int output_size = params.tensor_in_batch * params.out_height *
params.out_width * params.depth;
const int top_offset =
params.tensor_in_rows * params.tensor_in_cols * params.depth;
const int bottom_offset =
params.out_width * params.out_height * params.depth;
bool status = functor::MaxPoolGradBackwardWithArgmax<T>()(
output_size, input_size, grad_in.flat<T>().data(),
reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device());
if (!status) {
context->SetStatus(
errors::Internal("Failed launching MaxPoolGradBackwardWithArgmax"));
}
}
};
#endif // GOOGLE_CUDA
#define REGISTER_MAX_POOL_KERNELS(D, T) \
REGISTER_KERNEL_BUILDER( \
Name("MaxPoolGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
MaxPoolingGradOp<D##Device, T>); \
REGISTER_KERNEL_BUILDER( \
Name("MaxPoolGradGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
MaxPoolingGradGradOp<D##Device, T>);
// The kernels below are implemented only for the CPU device.
#define REGISTER_CPU_ONLY_POOL_KERNELS(T) \
REGISTER_KERNEL_BUILDER( \
Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
MaxPoolingOp<CPUDevice, T>);
TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_ONLY_POOL_KERNELS);
#undef REGISTER_CPU_ONLY_POOL_KERNELS
#define REGISTER_CPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(CPU, T);
TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_MAX_POOL_KERNELS);
#undef REGISTER_CPU_MAX_POOL_KERNELS
#if GOOGLE_CUDA
// Forward declarations for the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T) \
template <> \
void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()( \
const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \
typename TTypes<T, 4>::ConstTensor input, int window_rows, \
int window_cols, int row_stride, int col_stride, \
const Eigen::PaddingType& padding); \
extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>;
TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
#undef DECLARE_GPU_SPEC
} // namespace functor
#define REGISTER_GPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(GPU, T)
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_MAX_POOL_KERNELS);
#undef REGISTER_GPU_MAX_POOL_KERNELS
// The kernels below are currently implemented only for the GPU device.
// Note(jiayq): Currently, the Caffe custom implementation is faster than the
// default Eigen implementation so we are using the custom kernel as the
// default. However, you can explicitly invoke the eigen version using
// kernel_label_map.
#define REGISTER_GPU_ONLY_POOL_KERNELS(T) \
REGISTER_KERNEL_BUILDER(Name("MaxPool") \
.Device(DEVICE_GPU) \
.TypeConstraint<T>("T") \
.Label("eigen_tensor"), \
MaxPoolingOp<GPUDevice, T>); \
REGISTER_KERNEL_BUILDER( \
Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
MaxPoolingNoMaskOp<GPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax") \
.Device(DEVICE_GPU) \
.TypeConstraint<int64>("Targmax") \
.TypeConstraint<T>("T"), \
MaxPoolingWithArgmaxOp<GPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax") \
.Device(DEVICE_GPU) \
.TypeConstraint<T>("T") \
.TypeConstraint<int64>("Targmax"), \
MaxPoolingGradWithArgmaxOp<GPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax") \
.Device(DEVICE_GPU) \
.TypeConstraint<T>("T") \
.TypeConstraint<int64>("Targmax"), \
MaxPoolingGradGradWithArgmaxOp<GPUDevice, T>);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_ONLY_POOL_KERNELS);
#undef REGISTER_GPU_ONLY_POOL_KERNELS
#endif // GOOGLE_CUDA
#undef REGISTER_MAX_POOL_KERNELS
} // namespace tensorflow

View File

@ -199,15 +199,145 @@ __global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff,
}
}
// The parameters to the kernels in the gradient-of-gradient function are as
// follows:
// nthreads: the number of threads, which is equal to the output size. The
// gradient of the MaxPooling gradient w.r.t. the output data has
// dimensions of N*C*Hout*Wout.
// bottom_data: the bottom data of N*H*W*C (or N*C*H*W) items.
// output_data: the output data of N*Hout*Wout*C (or N*C*Hout*Wout) items.
// height, width, pooled_height, pooled_width: the input and output sizes.
// kernel_h, kernel_w: the kernel sizes.
// stride_h, stride_w: the strides.
// pad_t, pad_l: the padding values on the top and left side.
// top_diff: the gradient of the gradient of the output data w.r.t. the
// input data, of size N*H*W*C (or N*C*H*W).
// bottom_diff: the gradient of the gradient w.r.t. output.
template <typename dtype>
__global__ void MaxPoolGradBackwardNoMaskNCHW(
const int nthreads, const dtype* bottom_data, const dtype* output_data,
const int pooled_height, const int pooled_width, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
const dtype* top_diff, dtype* bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// First find out the index to the maximum, since we have no mask.
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
const int hend = min(hstart + kernel_h, height);
const int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
bool should_stop = false;
int maxidx = -1;
const dtype* bottom_data_n = bottom_data + n * channels * height * width;
// Propagate only first value from top_diff corresponding to the maximum.
for (int h = hstart; h < hend && !should_stop; ++h) {
for (int w = wstart; w < wend && !should_stop; ++w) {
int idx = c * height * width + h * width + w;
if (output_data[index] == bottom_data_n[idx]) {
maxidx = idx;
should_stop = true;
}
}
}
// Set the bottom diff (atomic is not necessary). The index could still be
// uninitialized, if all the bottom_data are NaN.
if (maxidx != -1) {
bottom_diff[index] = top_diff[n * channels * height * width + maxidx];
}
}
}
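As a host-side sanity check of the flat-index decomposition used by MaxPoolGradBackwardNoMaskNCHW above (an NCHW index laid out as ((n*C + c)*Hout + ph)*Wout + pw), the hypothetical helper below simply restates the kernel's arithmetic:
#include <cassert>
#include <tuple>
// Recover (n, c, ph, pw) from a flat NCHW output index, exactly as the
// kernel does with % and / on pooled_width, pooled_height and channels.
std::tuple<int, int, int, int> DecodeNCHW(int index, int C, int Hout, int Wout) {
  const int pw = index % Wout;
  const int ph = (index / Wout) % Hout;
  const int c = (index / Wout / Hout) % C;
  const int n = index / Wout / Hout / C;
  return {n, c, ph, pw};
}
int main() {
  const int C = 3, Hout = 4, Wout = 5;
  const int index = ((2 * C + 1) * Hout + 3) * Wout + 4;  // n=2, c=1, ph=3, pw=4
  const auto [n, c, ph, pw] = DecodeNCHW(index, C, Hout, Wout);
  assert(n == 2 && c == 1 && ph == 3 && pw == 4);
  return 0;
}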
template <typename dtype>
__global__ void MaxPoolGradBackwardNoMaskNHWC(
const int nthreads, const dtype* bottom_data, const dtype* output_data,
const int pooled_height, const int pooled_width, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
const dtype* top_diff, dtype* bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// First find out the index to the maximum, since we have no mask.
int n = index;
int c = n % channels;
n /= channels;
int wstart = (n % pooled_width) * stride_w - pad_l;
n /= pooled_width;
int hstart = (n % pooled_height) * stride_h - pad_t;
n /= pooled_height;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
bool should_stop = false;
int maxidx = -1;
const dtype* bottom_data_n = bottom_data + n * height * width * channels;
// Propagate only first value from top_diff corresponding to the maximum.
for (int h = hstart; h < hend && !should_stop; ++h) {
for (int w = wstart; w < wend && !should_stop; ++w) {
int idx = (h * width + w) * channels + c;
if (output_data[index] == bottom_data_n[idx]) {
maxidx = idx;
should_stop = true;
}
}
}
// Set the bottom diff (atomic is not necessary). The index could still be
// uninitialized, if all the bottom_data are NaN.
if (maxidx != -1) {
bottom_diff[index] = top_diff[n * height * width * channels + maxidx];
}
}
}
// The parameters to the kernels in the gradient-of-gradient function are as
// follows:
// nthreads: the number of threads, which is equal to the output size. The
// gradient of the MaxPooling gradient w.r.t. the output data has
// dimensions of N*C*Hout*Wout.
// top_diff: the gradient of the gradient of the output data w.r.t. the
// input data, of size N*H*W*C (or N*C*H*W). As we have stored the
// flattened index of the input entries, the backward function is
// agnostic of the input storage order.
// mask: the output mask of the same size as top_data. It is stored in
// int form, keeping track of the flattened index of the input item that
// produces the max output.
// top_offset: the pre-computed per-image offset of the maxpool input
// gradient. This is equal to H*W*C. We choose to pre-compute this so we
// do not need to compute it every time inside the kernel.
// bottom_offset: the pre-computed per-image offset of the maxpool output.
// This is equal to Hout*Wout*C.
// bottom_diff: the gradient of the gradient w.r.t. output.
// This function relies on CudaAtomicAdd to avoid race conditions. Also, before
// the kernel is run, you will need to make sure that bottom_diff is filled with
// zero first.
template <typename dtype>
__global__ void MaxPoolGradBackward(const int nthreads, const dtype* top_diff,
const int64* mask, const int top_offset,
const int bottom_offset,
dtype* bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int image_id = (index / bottom_offset);
bottom_diff[index] = top_diff[image_id * top_offset + mask[index]];
}
}
#undef CUDA_1D_KERNEL_LOOP
} // namespace
bool MaxPoolForwardWithOptionalArgmax(
const float* bottom_data, const int batch, const int height,
const int width, const int channels, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
float* top_data, int64* mask, const Eigen::GpuDevice& d) {
namespace functor {
template <typename T>
bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
const T* bottom_data, const int batch, const int height, const int width,
const int channels, const int pooled_height, const int pooled_width,
const int kernel_h, const int kernel_w, const int stride_h,
const int stride_w, const int pad_t, const int pad_l, T* top_data,
int64* mask, const Eigen::GpuDevice& d) {
const int kThreadsPerBlock = 1024;
const int output_size = batch * channels * pooled_height * pooled_width;
@ -219,31 +349,13 @@ bool MaxPoolForwardWithOptionalArgmax(
return d.ok();
}
bool MaxPoolForwardWithOptionalArgmax(
const Eigen::half* bottom_data, const int batch, const int height,
const int width, const int channels, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
Eigen::half* top_data, int64* mask, const Eigen::GpuDevice& d) {
const int kThreadsPerBlock = 1024;
const int output_size = batch * channels * pooled_height * pooled_width;
MaxPoolForwardNHWC<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(
output_size, bottom_data, height, width, channels, pooled_height,
pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
top_data, mask);
return d.ok();
}
bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
const int height, const int width,
const int channels, const int pooled_height,
const int pooled_width, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_t, const int pad_l,
const float* top_diff, float* bottom_diff,
const Eigen::GpuDevice& d) {
template <typename T>
bool MaxPoolBackwardNoMask<T>::operator()(
const T* bottom_data, const int batch, const int height, const int width,
const int channels, const int pooled_height, const int pooled_width,
const int kernel_h, const int kernel_w, const int stride_h,
const int stride_w, const int pad_t, const int pad_l, const T* top_diff,
T* bottom_diff, const Eigen::GpuDevice& d) {
const int kThreadsPerBlock = 1024;
const int bottom_size = batch * channels * height * width;
const int top_size = batch * channels * pooled_height * pooled_width;
@ -260,34 +372,11 @@ bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
return d.ok();
}
bool MaxPoolBackwardNoMask(const Eigen::half* bottom_data, const int batch,
const int height, const int width,
const int channels, const int pooled_height,
const int pooled_width, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_t, const int pad_l,
const Eigen::half* top_diff, Eigen::half* bottom_diff,
const Eigen::GpuDevice& d) {
const int kThreadsPerBlock = 1024;
const int bottom_size = batch * channels * height * width;
const int top_size = batch * channels * pooled_height * pooled_width;
SetZero<<<(bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(bottom_size, bottom_diff);
MaxPoolBackwardNoMaskNHWC<<<(top_size + kThreadsPerBlock - 1) /
kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(
top_size, bottom_data, height, width, channels, pooled_height,
pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
top_diff, bottom_diff);
return d.ok();
}
bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
const float* top_diff, const int64* mask,
const int top_offset, const int bottom_offset,
float* bottom_diff, const Eigen::GpuDevice& d) {
template <typename T>
bool MaxPoolBackwardWithArgmax<T>::operator()(
const int output_size, const int input_size, const T* top_diff,
const int64* mask, const int top_offset, const int bottom_offset,
T* bottom_diff, const Eigen::GpuDevice& d) {
const int kThreadsPerBlock = 1024;
SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
@ -297,30 +386,61 @@ bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
return d.ok();
}
bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
const Eigen::half* top_diff, const int64* mask,
const int top_offset, const int bottom_offset,
Eigen::half* bottom_diff,
const Eigen::GpuDevice& d) {
const int kThreadsPerBlock = 1024;
SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(
output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
template <typename T>
bool MaxPoolGradBackwardNoMask<T>::operator()(
TensorFormat data_format, const T* bottom_data, const T* output_data,
const int batch, const int pooled_height, const int pooled_width,
const int channels, const int height, const int width, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w, const int pad_t,
const int pad_l, const T* top_diff, T* bottom_diff,
const Eigen::GpuDevice& d) {
const int num_kernels = batch * channels * pooled_height * pooled_width;
CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d);
if (data_format == FORMAT_NHWC) {
MaxPoolGradBackwardNoMaskNHWC<<<config.block_count, config.thread_per_block,
0, d.stream()>>>(
num_kernels, bottom_data, output_data, pooled_height, pooled_width,
channels, height, width, kernel_h, kernel_w, stride_h, stride_w, pad_t,
pad_l, top_diff, bottom_diff);
} else {
MaxPoolGradBackwardNoMaskNCHW<<<config.block_count, config.thread_per_block,
0, d.stream()>>>(
num_kernels, bottom_data, output_data, pooled_height, pooled_width,
channels, height, width, kernel_h, kernel_w, stride_h, stride_w, pad_t,
pad_l, top_diff, bottom_diff);
}
return d.ok();
}
template <typename T>
bool MaxPoolGradBackwardWithArgmax<T>::operator()(
const int output_size, const int input_size, const T* top_diff,
const int64* mask, const int top_offset, const int bottom_offset,
T* bottom_diff, const Eigen::GpuDevice& d) {
CudaLaunchConfig config = GetCudaLaunchConfig(output_size, d);
MaxPoolGradBackward<<<config.block_count, config.thread_per_block, 0,
d.stream()>>>(output_size, top_diff, mask, top_offset,
bottom_offset, bottom_diff);
return d.ok();
}
typedef Eigen::GpuDevice GPUDevice;
#define DEFINE_GPU_KERNELS(T) \
template struct functor::SpatialMaxPooling<GPUDevice, T>;
#define DEFINE_GPU_KERNELS(T) \
template struct SpatialMaxPooling<GPUDevice, T>; \
template struct MaxPoolForwardWithOptionalArgmax<T>; \
template struct MaxPoolBackwardWithArgmax<T>; \
template struct MaxPoolBackwardNoMask<T>; \
template struct MaxPoolGradBackwardWithArgmax<T>; \
template struct MaxPoolGradBackwardNoMask<T>;
DEFINE_GPU_KERNELS(float)
DEFINE_GPU_KERNELS(Eigen::half)
TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
#undef DEFINE_GPU_KERNELS
} // namespace functor
} // end namespace tensorflow
#endif // GOOGLE_CUDA
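All of the launches above share the same sizing idiom: a fixed 1024-thread block and a grid of ceil(size / threads) blocks, one thread per element. A minimal, self-contained CUDA sketch of that idiom (hypothetical ZeroOut kernel, not part of this change) looks roughly like:

// Hypothetical illustration of the launch pattern used by the kernels above;
// ZeroOut stands in for SetZero / MaxPoolBackward and is not TensorFlow code.
#include <cuda_runtime.h>

__global__ void ZeroOut(int n, float* data) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per element
  if (i < n) data[i] = 0.0f;                      // guard the tail block
}

void LaunchZeroOut(int n, float* device_data, cudaStream_t stream) {
  const int kThreadsPerBlock = 1024;
  const int blocks = (n + kThreadsPerBlock - 1) / kThreadsPerBlock;  // ceil-divide
  ZeroOut<<<blocks, kThreadsPerBlock, 0, stream>>>(n, device_data);
}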
@@ -24,54 +24,62 @@ limitations under the License.
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/tensor_format.h"
namespace tensorflow {
namespace functor {
// Run the forward pass of max pooling, optionally writing the argmax indices
// to the mask array if it is not nullptr; when mask is nullptr, no argmax
// indices are written.
bool MaxPoolForwardWithOptionalArgmax(
const float* bottom_data, const int batch, const int height,
const int width, const int channels, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
float* top_data, int64* mask, const Eigen::GpuDevice& d);
template <typename T>
struct MaxPoolForwardWithOptionalArgmax {
bool operator()(const T* bottom_data, const int batch, const int height,
const int width, const int channels, const int pooled_height,
const int pooled_width, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w,
const int pad_t, const int pad_l, T* top_data, int64* mask,
const Eigen::GpuDevice& d);
};
bool MaxPoolForwardWithOptionalArgmax(
const Eigen::half* bottom_data, const int batch, const int height,
const int width, const int channels, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
Eigen::half* top_data, int64* mask, const Eigen::GpuDevice& d);
template <typename T>
struct MaxPoolBackwardWithArgmax {
bool operator()(const int output_size, const int input_size,
const T* top_diff, const int64* mask, const int top_offset,
const int bottom_offset, T* bottom_diff,
const Eigen::GpuDevice& d);
};
bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
const float* top_diff, const int64* mask,
const int top_offset, const int bottom_offset,
float* bottom_diff, const Eigen::GpuDevice& d);
template <typename T>
struct MaxPoolBackwardNoMask {
bool operator()(const T* bottom_data, const int batch, const int height,
const int width, const int channels, const int pooled_height,
const int pooled_width, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w,
const int pad_t, const int pad_l, const T* top_diff,
T* bottom_diff, const Eigen::GpuDevice& d);
};
bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
const Eigen::half* top_diff, const int64* mask,
const int top_offset, const int bottom_offset,
Eigen::half* bottom_diff,
const Eigen::GpuDevice& d);
template <typename T>
struct MaxPoolGradBackwardWithArgmax {
bool operator()(const int output_size, const int input_size,
const T* top_diff, const int64* mask, const int top_offset,
const int bottom_offset, T* bottom_diff,
const Eigen::GpuDevice& d);
};
bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
const int height, const int width,
const int channels, const int pooled_height,
const int pooled_width, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_t, const int pad_l,
const float* top_diff, float* bottom_diff,
const Eigen::GpuDevice& d);
template <typename T>
struct MaxPoolGradBackwardNoMask {
bool operator()(TensorFormat data_format, const T* bottom_data,
const T* output_data, const int batch,
const int pooled_height, const int pooled_width,
const int channels, const int height, const int width,
const int kernel_h, const int kernel_w, const int stride_h,
const int stride_w, const int pad_t, const int pad_l,
const T* top_diff, T* bottom_diff, const Eigen::GpuDevice& d);
};
bool MaxPoolBackwardNoMask(const Eigen::half* bottom_data, const int batch,
const int height, const int width,
const int channels, const int pooled_height,
const int pooled_width, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_t, const int pad_l,
const Eigen::half* top_diff, Eigen::half* bottom_diff,
const Eigen::GpuDevice& d);
} // namespace functor
} // namespace tensorflow
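A hedged sketch of how a host-side op kernel might invoke one of these functors (the caller name and buffers are assumed; the pointers must be device memory and `d` an Eigen::GpuDevice bound to the op's CUDA stream):

// Assumes the functor declarations above and the Eigen GPU device headers are
// included; this wrapper is illustrative only.
bool RunMaxPoolBackwardWithArgmax(int output_size, int input_size,
                                  const float* top_diff,
                                  const tensorflow::int64* mask, int top_offset,
                                  int bottom_offset, float* bottom_diff,
                                  const Eigen::GpuDevice& d) {
  tensorflow::functor::MaxPoolBackwardWithArgmax<float> backward;
  // Returns d.ok(), i.e. whether the kernel launches were enqueued successfully.
  return backward(output_size, input_size, top_diff, mask, top_offset,
                  bottom_offset, bottom_diff, d);
}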
@@ -0,0 +1,428 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifdef INTEL_MKL
#define EIGEN_USE_THREADS
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/common_shape_fns.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/util/mkl_util.h"
#include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
template <typename Device, typename T>
class MklAvgPoolingOp : public UnaryOp<T> {
public:
explicit MklAvgPoolingOp(OpKernelConstruction* context)
: UnaryOp<T>(context) {
string data_format;
OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
errors::InvalidArgument("Invalid data format"));
OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
OP_REQUIRES(context, ksize_.size() == 4,
errors::InvalidArgument("Sliding window ksize field must "
"specify 4 dimensions"));
OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
OP_REQUIRES(context, stride_.size() == 4,
errors::InvalidArgument("Sliding window stride field must "
"specify 4 dimensions"));
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
errors::Unimplemented("Pooling is not yet supported on the "
"batch dimension."));
}
void Compute(OpKernelContext* context) override {
MklAvgPoolingOpContext mkl_context;
const Tensor& tensor_in = MklGetInput(context, 0);
GetMklShape(context, 0, &mkl_context.input_shape);
bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
if (!input_in_mkl_format)
mkl_context.params.in_dim = tensor_in.dims();
else
mkl_context.params.in_dim = mkl_context.input_shape.GetDimension();
MklPoolParameters pool_params;
if (!input_in_mkl_format) {
pool_params.Init(context, ksize_, stride_, padding_, data_format_,
tensor_in.shape());
} else {
pool_params.Init(context, ksize_, stride_, padding_, data_format_,
&mkl_context.input_shape);
}
// Extract the parameters for the op from the pooling specs
ExtractMklOpParams(context, data_format_, pool_params, &mkl_context.params);
Tensor mkl_tmp_input_buf_tensor_;
mkl_context.MklCreateLayoutsAndPrimitives(context,
&mkl_tmp_input_buf_tensor_);
Tensor workspace_tensor;
void* workspace_buf;
AllocTmpBuffer(context, &workspace_tensor, mkl_context.lt_workspace,
&workspace_buf);
if (mkl_context.convert_input != nullptr) {
if (input_in_mkl_format == false) {
CHECK_EQ(
dnnConversionExecute_F32(
mkl_context.convert_input,
static_cast<void*>(const_cast<T*>(tensor_in.flat<T>().data())),
mkl_context.input_buf),
E_SUCCESS);
CHECK_EQ(dnnDelete_F32(mkl_context.convert_input), E_SUCCESS);
} else {
mkl_context.input_shape.GetConvertedFlatData(
mkl_context.lt_prim_input,
static_cast<void*>(const_cast<T*>(tensor_in.flat<T>().data())),
mkl_context.input_buf);
}
mkl_context.pooling_res[dnnResourceSrc] = mkl_context.input_buf;
} else {
mkl_context.pooling_res[dnnResourceSrc] =
static_cast<void*>(const_cast<T*>(tensor_in.flat<T>().data()));
}
// Declare output tensor and allocate memory
Tensor* output = nullptr;
TensorShape tensor_out_shape;
MklShape mkl_out_shape;
mkl_out_shape.SetMklTensor(true);
mkl_out_shape.SetMklLayout(mkl_context.prim_pooling_fwd, dnnResourceDst);
mkl_out_shape.SetTfLayout(mkl_context.params.in_dim,
mkl_context.params.out_sizes,
mkl_context.params.out_strides);
mkl_out_shape.SetTfDimOrder(mkl_context.params.in_dim, data_format_);
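// The MKL layout may pad or reorder the data, so the TF tensor that backs the
// MKL output is allocated as a flat buffer whose element count is the MKL
// layout's byte size divided by sizeof(T) (see the AddDim call below).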
tensor_out_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
mkl_out_shape.GetMklLayout())) /
sizeof(T));
AllocateOutputSetMklshape(context, 0, &output, tensor_out_shape,
mkl_out_shape);
mkl_context.pooling_res[dnnResourceDst] =
static_cast<void*>(output->flat<T>().data());
mkl_context.pooling_res[dnnResourceWorkspace] = workspace_buf;
CHECK_EQ(
dnnExecute_F32(mkl_context.prim_pooling_fwd, mkl_context.pooling_res),
E_SUCCESS);
mkl_context.MklCleanup();
}
private:
typedef struct {
MklPoolingOpParams params;
MklShape input_shape;
dnnPrimitive_t prim_pooling_fwd, convert_input;
dnnLayout_t lt_user_input, lt_prim_input, lt_workspace;
void* input_buf;
void* pooling_res[dnnResourceNumber];
void MklCreateLayoutsAndPrimitives(OpKernelContext* context,
Tensor* mkl_tmp_input_buf_tensor) {
bool input_in_mkl_format = input_shape.IsMklTensor();
if (!input_in_mkl_format) {
CHECK_EQ(dnnLayoutCreate_F32(&lt_user_input, params.in_dim,
params.in_sizes, params.in_strides),
E_SUCCESS);
} else {
lt_user_input = (dnnLayout_t)input_shape.GetCurLayout();
}
dnnAlgorithm_t algorithm = dnnAlgorithmPoolingAvg;
dnnPrimitiveAttributes_t primAttr = nullptr;
// Create DNN primitives
CHECK_EQ(dnnPoolingCreateForward_F32(
&prim_pooling_fwd, primAttr, algorithm, lt_user_input,
params.kernel_size, params.kernel_stride, params.in_offset,
dnnBorderZerosAsymm),
E_SUCCESS);
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
&lt_prim_input, prim_pooling_fwd, dnnResourceSrc),
E_SUCCESS);
if (!dnnLayoutCompare_F32(lt_user_input, lt_prim_input)) {
CHECK_EQ(dnnConversionCreate_F32(&convert_input, lt_user_input,
lt_prim_input),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_prim_input,
&input_buf);
}
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_workspace, prim_pooling_fwd,
dnnResourceWorkspace),
E_SUCCESS);
}
void MklCleanup() {
bool input_in_mkl_format = input_shape.IsMklTensor();
if (!input_in_mkl_format) {
CHECK_EQ(dnnLayoutDelete_F32(lt_user_input), E_SUCCESS);
}
CHECK_EQ(dnnDelete_F32(prim_pooling_fwd), E_SUCCESS);
CHECK_EQ(dnnLayoutDelete_F32(lt_prim_input), E_SUCCESS);
}
} MklAvgPoolingOpContext;
std::vector<int32> ksize_;
std::vector<int32> stride_;
Padding padding_;
TensorFormat data_format_;
};
//-----------------------------------------------------------------------------
template <class Device, class T>
class MklAvgPoolingGradOp : public OpKernel {
public:
explicit MklAvgPoolingGradOp(OpKernelConstruction* context)
: OpKernel(context) {
string data_format;
OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
errors::InvalidArgument("Invalid data format"));
OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
OP_REQUIRES(context, ksize_.size() == 4,
errors::InvalidArgument("Sliding window ksize field must "
"specify 4 dimensions"));
OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
OP_REQUIRES(context, stride_.size() == 4,
errors::InvalidArgument("Sliding window strides field must "
"specify 4 dimensions"));
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
errors::Unimplemented("Pooling is not yet supported on the "
"batch dimension."));
}
void Compute(OpKernelContext* context) override {
MklAvgPoolingGradOpContext mkl_context;
const Tensor& tensor_in_shape = MklGetInput(context, 0);
const Tensor& out_backprop = MklGetInput(context, 1);
GetMklShape(context, 1, &mkl_context.out_backprop_shape);
bool outbackprop_in_mkl_format =
mkl_context.out_backprop_shape.IsMklTensor();
TensorShape output_shape;
auto shape_vec = tensor_in_shape.vec<int32>();
for (int64 i = 0; i < tensor_in_shape.NumElements(); ++i) {
output_shape.AddDim(shape_vec(i));
}
MklPoolParameters pool_params;
pool_params.Init(context, ksize_, stride_, padding_, data_format_,
output_shape);
// Extract the parameters for the op from the pooling specs
ExtractMklOpParams(context, data_format_, pool_params, &mkl_context.params);
// Tensors needed to create temporary buffers
Tensor outbackprop_buf_tensor;
void* outbackprop_buf;
mkl_context.MklCreateLayoutsAndPrimitives(context);
// Check if outbackprop layout requires conversion.
if (!dnnLayoutCompare_F32(mkl_context.lt_user_outbackprop,
mkl_context.lt_prim_outbackprop)) {
CHECK_EQ(dnnConversionCreate_F32(&mkl_context.convert_outbackprop,
mkl_context.lt_user_outbackprop,
mkl_context.lt_prim_outbackprop),
E_SUCCESS);
AllocTmpBuffer(context, &outbackprop_buf_tensor,
mkl_context.lt_prim_outbackprop, &outbackprop_buf);
if (!outbackprop_in_mkl_format) {
CHECK_EQ(dnnConversionExecute_F32(mkl_context.convert_outbackprop,
static_cast<void*>(const_cast<T*>(
out_backprop.flat<T>().data())),
outbackprop_buf),
E_SUCCESS);
CHECK_EQ(dnnDelete_F32(mkl_context.convert_outbackprop), E_SUCCESS);
} else {
mkl_context.out_backprop_shape.GetConvertedFlatData(
mkl_context.lt_prim_outbackprop,
static_cast<void*>(const_cast<T*>(out_backprop.flat<T>().data())),
outbackprop_buf);
}
mkl_context.pooling_res[dnnResourceDiffDst] = outbackprop_buf;
} else {
mkl_context.pooling_res[dnnResourceDiffDst] =
static_cast<void*>(const_cast<T*>(out_backprop.flat<T>().data()));
}
// Handle workspace requirements.
Tensor workspace_buf_tensor;
void* workspace_buf;
AllocTmpBuffer(context, &workspace_buf_tensor, mkl_context.lt_workspace,
&workspace_buf);
mkl_context.pooling_res[dnnResourceWorkspace] = workspace_buf;
// Handle MKL output tensor setup.
Tensor* output = nullptr;
TensorShape tensor_out_shape;
MklShape mkl_out_shape;
mkl_out_shape.SetMklTensor(true);
mkl_out_shape.SetMklLayout(mkl_context.prim_pooling_bwd,
dnnResourceDiffSrc);
mkl_out_shape.SetTfLayout(mkl_context.params.in_dim,
mkl_context.params.in_sizes,
mkl_context.params.in_strides);
mkl_out_shape.SetTfDimOrder(mkl_context.params.in_dim, data_format_);
tensor_out_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
mkl_out_shape.GetMklLayout())) /
sizeof(T));
AllocateOutputSetMklshape(context, 0, &output, tensor_out_shape,
mkl_out_shape);
// Set output tensor.
mkl_context.pooling_res[dnnResourceDiffSrc] =
static_cast<void*>(output->flat<T>().data());
// Execute primitive.
CHECK_EQ(
dnnExecute_F32(mkl_context.prim_pooling_bwd, mkl_context.pooling_res),
E_SUCCESS);
mkl_context.MklCleanup();
}
private:
typedef struct {
MklPoolingOpParams params;
MklShape out_backprop_shape;
dnnPrimitive_t prim_pooling_bwd, convert_outbackprop;
void* pooling_res[dnnResourceNumber];
dnnLayout_t lt_user_input, lt_user_outbackprop, lt_prim_outbackprop,
lt_workspace;
void MklCreateLayoutsAndPrimitives(OpKernelContext* context) {
const Tensor& tensor_in_shape = MklGetInput(context, 0);
const Tensor& out_backprop = MklGetInput(context, 1);
bool outbackprop_in_mkl_format = out_backprop_shape.IsMklTensor();
if (!outbackprop_in_mkl_format) {
// For avgpooling, tensor_in_shape should have 1 dimension, and 4
// elements.
OP_REQUIRES(
context,
tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4,
errors::InvalidArgument("original input shape must be "
"1-dimensional and 4 elements"));
// For avgpooling, out_backprop should have 4 dimensions.
OP_REQUIRES(context, out_backprop.dims() == 4,
errors::InvalidArgument("out_backprop must be "
"4-dimensional"));
} else {
// Input in MKL format.
OP_REQUIRES(
context, out_backprop.dims() == 2,
errors::InvalidArgument("out_backprop in MKL format must be "
"2-dimensional"));
// For avgpooling, out_backprop should have 4 dimensions.
OP_REQUIRES(context, out_backprop_shape.GetDimension() == 4,
errors::InvalidArgument("out_backprop must be "
"4-dimensional"));
}
// TODO(inteltf): Get the out_backprop layout.
// Do we need to create the layout on every invocation?
if (!outbackprop_in_mkl_format) {
CHECK_EQ(dnnLayoutCreate_F32(&lt_user_outbackprop, params.in_dim,
params.out_sizes, params.out_strides),
E_SUCCESS);
} else {
lt_user_outbackprop = (dnnLayout_t)out_backprop_shape.GetCurLayout();
}
// Create the backward primitive
// Create DNN user layout
CHECK_EQ(dnnLayoutCreate_F32(&lt_user_input, params.in_dim,
params.in_sizes, params.in_strides),
E_SUCCESS);
// Create PoolingBackward primitive
dnnAlgorithm_t algorithm = dnnAlgorithmPoolingAvg;
dnnPrimitiveAttributes_t primAttr = nullptr;
CHECK_EQ(dnnPoolingCreateBackward_F32(
&prim_pooling_bwd, primAttr, algorithm, lt_user_input,
params.kernel_size, params.kernel_stride, params.in_offset,
dnnBorderZerosAsymm),
E_SUCCESS);
// Create expected outbackprop layout from the primitive.
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
&lt_prim_outbackprop, prim_pooling_bwd, dnnResourceDiffDst),
E_SUCCESS);
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_workspace, prim_pooling_bwd,
dnnResourceWorkspace),
E_SUCCESS);
}
void MklCleanup() {
bool outbackprop_in_mkl_format = out_backprop_shape.IsMklTensor();
CHECK_EQ(dnnDelete_F32(prim_pooling_bwd), E_SUCCESS);
CHECK_EQ(dnnLayoutDelete_F32(lt_user_input), E_SUCCESS);
if (!outbackprop_in_mkl_format) {
CHECK_EQ(dnnLayoutDelete_F32(lt_user_outbackprop), E_SUCCESS);
}
CHECK_EQ(dnnLayoutDelete_F32(lt_prim_outbackprop), E_SUCCESS);
CHECK_EQ(dnnLayoutDelete_F32(lt_workspace), E_SUCCESS);
}
} MklAvgPoolingGradOpContext;
std::vector<int32> ksize_;
std::vector<int32> stride_;
Padding padding_;
TensorFormat data_format_;
};
REGISTER_KERNEL_BUILDER(Name("MklAvgPool")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T")
.Label(mkl_layer_registry::kMklLayerLabel),
MklAvgPoolingOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("MklAvgPoolGrad")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T")
.Label(mkl_layer_registry::kMklLayerLabel),
MklAvgPoolingGradOp<CPUDevice, float>);
} // namespace tensorflow
#endif // INTEL_MKL
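Both MKL pooling kernels above rely on the same layout-conversion idiom: compare the user's layout against the layout the primitive prefers, and convert only when they differ. A stripped-down sketch of that idiom, reusing the MKL DNN calls already used above (buffer allocation simplified to malloc in place of AllocTmpBuffer, and the CHECKed error handling reduced to early returns):

// Sketch only: the real kernels allocate the temporary buffer through
// AllocTmpBuffer() and CHECK the dnn* return codes.
#include <cstdlib>
#include "third_party/mkl/include/mkl_dnn.h"

// Returns the buffer the primitive should read from: either the user's buffer
// (layouts already match) or a freshly converted temporary buffer.
void* PrepareInput(dnnLayout_t user_layout, dnnLayout_t prim_layout,
                   void* user_buf, void** tmp_buf) {
  if (dnnLayoutCompare_F32(user_layout, prim_layout)) return user_buf;
  dnnPrimitive_t convert = nullptr;
  if (dnnConversionCreate_F32(&convert, user_layout, prim_layout) != E_SUCCESS)
    return nullptr;
  *tmp_buf = malloc(dnnLayoutGetMemorySize_F32(prim_layout));
  if (dnnConversionExecute_F32(convert, user_buf, *tmp_buf) != E_SUCCESS)
    return nullptr;
  dnnDelete_F32(convert);
  return *tmp_buf;
}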
@@ -0,0 +1,264 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// See docs in ../ops/nn_ops.cc. This OpKernel uses the MKL library: it creates
// MKL layouts and primitives and uses MKL DNN primitives to compute the
// convolution backward-bias gradient.
#ifdef INTEL_MKL
#define USE_EIGEN_TENSOR
#define EIGEN_USE_THREADS
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/use_cudnn.h"
#include "tensorflow/core/util/work_sharder.h"
#include "third_party/mkl/include/mkl_dnn.h"
#include "third_party/mkl/include/mkl_dnn_types.h"
#include "tensorflow/core/util/mkl_util.h"
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
template <typename Device, class T>
class MklConv2DCustomBackpropBiasOp : public OpKernel {
public:
explicit MklConv2DCustomBackpropBiasOp(OpKernelConstruction* context)
: OpKernel(context) {
string data_format;
OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
errors::InvalidArgument("Invalid data format"));
}
~MklConv2DCustomBackpropBiasOp() {}
void Compute(OpKernelContext* context) override {
MklConvBackBiasOpContext mkl_context;
const Tensor& input = MklGetInput(context, 0);
GetMklShape(context, 0, &mkl_context.input_shape);
bool input_is_mkl = mkl_context.input_shape.IsMklTensor();
if (input_is_mkl) {
OP_REQUIRES(
context, mkl_context.input_shape.GetDimension() == 4,
errors::InvalidArgument("Input tensor must be 4-dimensional"));
} else {
OP_REQUIRES(context, input.dims() == 4,
errors::InvalidArgument("input must be 4-dimensional",
input.shape().DebugString()));
}
if (input_is_mkl) {
mkl_context.c_size = mkl_context.input_shape.GetSizes()[MklDims::C];
} else if (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW) {
mkl_context.c_size = GetTensorDim(input, data_format_, 'C');
} else {
errors::InvalidArgument("Unknown format ",
" Format must be either NCHW or NHWC. ");
}
TensorShape output_shape{mkl_context.c_size};
Tensor* bias_backprop = nullptr;
MklShape output_mkl_shape;
output_mkl_shape.SetMklTensor(false);
AllocateOutputSetMklshape(context, 0, &bias_backprop, output_shape,
output_mkl_shape);
mkl_context.in_dims = 4;
if (input_is_mkl) { // get the shape from the mkl shape
mkl_context.in_sizes[MklDims::W] =
mkl_context.input_shape.GetSizes()[MklDims::W];
mkl_context.in_sizes[MklDims::H] =
mkl_context.input_shape.GetSizes()[MklDims::H];
mkl_context.in_sizes[MklDims::C] =
mkl_context.input_shape.GetSizes()[MklDims::C];
mkl_context.in_sizes[MklDims::N] =
mkl_context.input_shape.GetSizes()[MklDims::N];
} else {
mkl_context.in_sizes[MklDims::W] = GetTensorDim(input, data_format_, 'W');
mkl_context.in_sizes[MklDims::H] = GetTensorDim(input, data_format_, 'H');
mkl_context.in_sizes[MklDims::C] = GetTensorDim(input, data_format_, 'C');
mkl_context.in_sizes[MklDims::N] = GetTensorDim(input, data_format_, 'N');
GetStridesFromSizes(data_format_, mkl_context.in_strides,
mkl_context.in_sizes);
}
mkl_context.out_sizes[0] = mkl_context.c_size;
mkl_context.out_strides[0] = 1;
CHECK_EQ(
dnnConvolutionCreateBackwardBias_F32(
&mkl_context.prim_conv_bwdbias, NULL, dnnAlgorithmConvolutionDirect,
mkl_context.in_dims, mkl_context.in_sizes),
E_SUCCESS);
mkl_context.MklCreateInputLayouts(context);
Tensor mkl_tmp_input_buf, mkl_tmp_outbackprop_buf;
mkl_context.MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf);
mkl_context.MklPrepareConvolutionOutputs(context, &mkl_tmp_outbackprop_buf,
bias_backprop);
CHECK_EQ(
dnnExecute_F32(mkl_context.prim_conv_bwdbias, mkl_context.conv_res),
E_SUCCESS);
if (mkl_context.should_convert_output) {
CHECK_EQ(dnnConversionExecute_F32(
mkl_context.convert_outbackprop, mkl_context.outbackprop_buf,
static_cast<void*>(bias_backprop->flat<T>().data())),
E_SUCCESS);
}
// deletes layouts
mkl_context.MklCleanup();
}
private:
typedef struct {
int in_dims;
int c_size;
size_t in_sizes[4];
size_t in_strides[4];
size_t out_sizes[1];
size_t out_strides[1];
size_t filter_sizes[4];
size_t filter_strides[4];
int input_offset[2];
size_t conv_stride[2];
MklShape input_shape;
dnnPrimitive_t prim_conv_bwdbias;
void* conv_res[dnnResourceNumber];
dnnLayout_t lt_input, lt_outbackprop;
bool should_convert_output;
dnnPrimitive_t convert_outbackprop;
void* outbackprop_buf;
// Create MKL dnnLayout_t objects for tensors coming into the layer
void MklCreateInputLayouts(OpKernelContext* context) {
bool input_is_mkl = input_shape.IsMklTensor();
CHECK_EQ(dnnLayoutCreate_F32(&lt_outbackprop, 1, out_sizes, out_strides),
E_SUCCESS);
if (input_is_mkl) {
lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
} else {
CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
E_SUCCESS);
}
}
// Compare incoming output tensor layouts with MKL preferred layouts and
// convert data to the preferred layout if necessary
void MklPrepareConvolutionOutputs(OpKernelContext* context,
Tensor* mkl_tmp_outbackprop_buf,
Tensor* bias_backprop) {
dnnLayout_t mkl_prim_internal_outbackprop = nullptr;
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_prim_internal_outbackprop,
prim_conv_bwdbias,
dnnResourceDiffBias),
E_SUCCESS);
should_convert_output =
!dnnLayoutCompare_F32(lt_outbackprop, mkl_prim_internal_outbackprop);
if (should_convert_output) {
CHECK_EQ(dnnConversionCreate_F32(&convert_outbackprop,
mkl_prim_internal_outbackprop,
lt_outbackprop),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_outbackprop_buf,
mkl_prim_internal_outbackprop, &outbackprop_buf);
conv_res[dnnResourceDiffBias] = outbackprop_buf;
} else {
conv_res[dnnResourceDiffBias] =
static_cast<void*>(const_cast<T*>(bias_backprop->flat<T>().data()));
}
dnnLayoutDelete_F32(mkl_prim_internal_outbackprop);
}
// Compare incoming input tensor layouts with MKL preferred layouts and
// convert data to the preferred layout if necessary
void MklPrepareConvolutionInputs(OpKernelContext* context,
Tensor* mkl_tmp_input_buf) {
dnnLayout_t mkl_prim_internal_input = nullptr;
dnnPrimitive_t mkl_convert_input = nullptr;
void* input_buf = nullptr;
const Tensor& input = MklGetInput(context, 0);
CHECK_EQ(
dnnLayoutCreateFromPrimitive_F32(
&mkl_prim_internal_input, prim_conv_bwdbias, dnnResourceDiffDst),
E_SUCCESS);
if (!dnnLayoutCompare_F32(lt_input, mkl_prim_internal_input)) {
CHECK_EQ(dnnConversionCreate_F32(&mkl_convert_input, lt_input,
mkl_prim_internal_input),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_input_buf, mkl_prim_internal_input,
&input_buf);
CHECK_EQ(dnnConversionExecute_F32(
mkl_convert_input,
static_cast<void*>(const_cast<T*>(input.flat<T>().data())),
input_buf),
E_SUCCESS);
conv_res[dnnResourceDiffDst] = input_buf;
dnnDelete_F32(mkl_convert_input);
} else {
conv_res[dnnResourceDiffDst] =
static_cast<void*>(const_cast<T*>(input.flat<T>().data()));
}
dnnLayoutDelete_F32(mkl_prim_internal_input);
}
// Cleanup member layouts and primitives
void MklCleanup() {
bool input_is_mkl = input_shape.IsMklTensor();
if (!input_is_mkl) dnnLayoutDelete_F32(lt_input);
dnnLayoutDelete_F32(lt_outbackprop);
if (should_convert_output) dnnDelete_F32(convert_outbackprop);
dnnDelete_F32(prim_conv_bwdbias);
}
} MklConvBackBiasOpContext;
TensorFormat data_format_;
TF_DISALLOW_COPY_AND_ASSIGN(MklConv2DCustomBackpropBiasOp);
};
#define REGISTER_CPU_KERNELS(T) \
REGISTER_KERNEL_BUILDER(Name("MklConv2DWithBiasBackpropBias") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.Label(mkl_layer_registry::kMklLayerLabel), \
MklConv2DCustomBackpropBiasOp<CPUDevice, T>);
TF_CALL_float(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
} /* namespace tensorflow */
#endif /* INTEL_MKL */
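For reference, the quantity this kernel produces is the gradient with respect to the convolution bias, i.e. out_backprop summed over every dimension except channels (this mirrors TF's own BiasAddGrad; the MKL backward-bias primitive is assumed to compute the same reduction). A plain C++ sketch for an NHWC buffer:

#include <vector>

// Reference reduction only; the MKL kernel above obtains this result through
// dnnConvolutionCreateBackwardBias_F32 on (possibly converted) MKL layouts.
std::vector<float> BiasBackpropNHWC(const float* out_backprop, int batch,
                                    int height, int width, int channels) {
  std::vector<float> bias_grad(channels, 0.0f);
  for (int n = 0; n < batch; ++n)
    for (int h = 0; h < height; ++h)
      for (int w = 0; w < width; ++w)
        for (int c = 0; c < channels; ++c)
          bias_grad[c] +=
              out_backprop[((n * height + h) * width + w) * channels + c];
  return bias_grad;
}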
@@ -0,0 +1,422 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// See docs in ../ops/nn_ops.cc.
#ifdef INTEL_MKL
#include <algorithm>
#include <vector>
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/conv_grad_ops.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/use_cudnn.h"
#include "tensorflow/core/util/work_sharder.h"
#include "third_party/mkl/include/mkl_dnn.h"
#include "third_party/mkl/include/mkl_dnn_types.h"
#include "tensorflow/core/util/mkl_util.h"
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
template <typename Device, class T>
class MklConv2DCustomBackpropFilterOp : public OpKernel {
public:
explicit MklConv2DCustomBackpropFilterOp(OpKernelConstruction* context)
: OpKernel(context) {
string data_format;
OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
errors::InvalidArgument("Invalid data format"));
OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
int stride_n = GetTensorDim(strides_, data_format_, 'N');
int stride_c = GetTensorDim(strides_, data_format_, 'C');
OP_REQUIRES(
context, (stride_n == 1 && stride_c == 1),
errors::InvalidArgument("Current implementation does not yet support "
"strides in the batch and depth dimensions."));
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
}
void Compute(OpKernelContext* context) override {
MklConv2DGradFilterOpContext mkl_context;
const Tensor& input = MklGetInput(context, 0);
GetMklShape(context, 0, &(mkl_context.input_shape));
bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
const Tensor& filter_sizes = MklGetInput(context, 1);
const Tensor& out_backprop = MklGetInput(context, 2);
GetMklShape(context, 2, &(mkl_context.out_backprop_shape));
bool out_backprop_in_mkl_format =
mkl_context.out_backprop_shape.IsMklTensor();
TensorShape input_shape, filter_shape, out_backprop_shape;
OP_REQUIRES(
context, TensorShapeUtils::IsVector(filter_sizes.shape()),
errors::InvalidArgument(
"Conv2DCustomBackpropFilter: filter_sizes input must be 1-dim, "
"not ",
filter_sizes.dims()));
OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
filter_sizes.vec<int32>(), &filter_shape));
ConvBackpropDimensions backprop_dims;
// Generate shape for input if input is in MKL format.
if (input_in_mkl_format) {
OP_REQUIRES(context, mkl_context.input_shape.GetDimension() == 4,
errors::InvalidArgument(
"Conv2DCustomBackpropFilter: input size must be 4-dim"));
MklSizesToTFSizes(context, data_format_, mkl_context.input_shape,
&input_shape);
} else {
input_shape = input.shape();
}
// Generate shape for out_backprop if it is in MKL format.
if (out_backprop_in_mkl_format) {
OP_REQUIRES(
context, mkl_context.out_backprop_shape.GetDimension() == 4,
errors::InvalidArgument(
"Conv2DCustomBackpropFilter: outbackprop size must be 4-dim"));
MklSizesToTFSizes(context, data_format_, mkl_context.out_backprop_shape,
&out_backprop_shape);
} else {
out_backprop_shape = out_backprop.shape();
}
OP_REQUIRES_OK(context,
ConvBackpropComputeDimensions(
"Conv2DCustomBackpropFilter", /*num_spatial_dims=*/2,
input_shape, filter_shape, out_backprop_shape, strides_,
padding_, data_format_, &backprop_dims));
int64 pad_top, pad_bottom;
int64 pad_left, pad_right;
OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
backprop_dims.spatial_dims[0].input_size,
backprop_dims.spatial_dims[0].filter_size,
backprop_dims.spatial_dims[0].stride, padding_,
&backprop_dims.spatial_dims[0].output_size,
&pad_top, &pad_bottom));
OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
backprop_dims.spatial_dims[1].input_size,
backprop_dims.spatial_dims[1].filter_size,
backprop_dims.spatial_dims[1].stride, padding_,
&backprop_dims.spatial_dims[1].output_size,
&pad_left, &pad_right));
// Create MKL primitives for convolution filter grad
mkl_context.in_dims = input_in_mkl_format
? mkl_context.input_shape.GetDimension()
: input.dims();
mkl_context.out_dims = out_backprop_in_mkl_format
? mkl_context.out_backprop_shape.GetDimension()
: out_backprop.dims();
mkl_context.in_sizes[0] =
static_cast<size_t>(backprop_dims.spatial_dims[1].input_size);
mkl_context.in_sizes[1] =
static_cast<size_t>(backprop_dims.spatial_dims[0].input_size);
mkl_context.in_sizes[2] = static_cast<size_t>(backprop_dims.in_depth);
mkl_context.in_sizes[3] = static_cast<size_t>(backprop_dims.batch_size);
mkl_context.out_sizes[0] =
static_cast<size_t>(backprop_dims.spatial_dims[1].output_size);
mkl_context.out_sizes[1] =
static_cast<size_t>(backprop_dims.spatial_dims[0].output_size);
mkl_context.out_sizes[2] = static_cast<size_t>(backprop_dims.out_depth);
mkl_context.out_sizes[3] = static_cast<size_t>(backprop_dims.batch_size);
mkl_context.input_offsets[0] = static_cast<int>(-pad_left);
mkl_context.input_offsets[1] = static_cast<int>(-pad_top);
mkl_context.conv_strides[0] =
static_cast<size_t>(backprop_dims.spatial_dims[1].stride);
mkl_context.conv_strides[1] =
static_cast<size_t>(backprop_dims.spatial_dims[0].stride);
GetStridesFromSizes(data_format_, mkl_context.in_strides,
mkl_context.in_sizes);
GetStridesFromSizes(data_format_, mkl_context.out_strides,
mkl_context.out_sizes);
// In MKL's dimension ordering, indices 0, 1, 2, and 3 denote
// filter cols, rows, input channels, and output depth/channels.
mkl_context.filter_dims = 4;
mkl_context.filter_sizes[0] = backprop_dims.spatial_dims[1].filter_size;
mkl_context.filter_sizes[1] = backprop_dims.spatial_dims[0].filter_size;
mkl_context.filter_sizes[2] = backprop_dims.in_depth;
mkl_context.filter_sizes[3] = backprop_dims.out_depth;
// We want the filter grad to be in TF layout, so set the strides accordingly.
// Note the TF filter layout is (rows, cols, in_depth, out_depth), with
// out_depth being the innermost (stride-1) dimension.
mkl_context.filter_strides[0] =
backprop_dims.out_depth * backprop_dims.in_depth;
mkl_context.filter_strides[1] = backprop_dims.out_depth *
backprop_dims.in_depth *
backprop_dims.spatial_dims[1].filter_size;
mkl_context.filter_strides[2] = backprop_dims.out_depth;
mkl_context.filter_strides[3] = 1;
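// Worked example (hypothetical sizes): for a TF filter of shape
// (rows=3, cols=3, in_depth=8, out_depth=16) the assignments above give
// filter_strides = {16*8, 16*8*3, 16, 1} = {128, 384, 16, 1}, i.e. out_depth
// is contiguous and rows carry the largest stride, matching the HWIO layout.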
mkl_context.conv_strides[0] = backprop_dims.spatial_dims[1].stride;
mkl_context.conv_strides[1] = backprop_dims.spatial_dims[0].stride;
// Create convolution-grad-filter primitive
CHECK_EQ(dnnConvolutionCreateBackwardFilter_F32(
&mkl_context.prim_conv_bwdfilter, nullptr,
dnnAlgorithmConvolutionDirect, mkl_context.in_dims,
mkl_context.in_sizes, mkl_context.out_sizes,
mkl_context.filter_sizes, mkl_context.conv_strides,
mkl_context.input_offsets, dnnBorderZeros),
E_SUCCESS);
// Create the layouts for entities in received context.
mkl_context.MklCreateInputLayouts(context);
// MKL needs the inputs in its native format, so create temporary tensors
// along with buffers to convert the received tensors.
Tensor mkl_tmp_input_buf_tensor, mkl_tmp_out_backprop_buf_tensor;
// This preparation sets (1) dnnResourceSrc (2) dnnResourceDiffDst
mkl_context.MklPrepareInputs(context, &mkl_tmp_input_buf_tensor,
&mkl_tmp_out_backprop_buf_tensor);
// Final conv-grad-filter should be in TF layout.
Tensor* grad_filter;
mkl_context.grad_filter_shape.SetMklTensor(false);
mkl_context.grad_filter_shape.SetTfLayout(mkl_context.filter_dims,
mkl_context.filter_sizes,
mkl_context.filter_strides);
AllocateOutputSetMklshape(context, 0, &grad_filter, filter_shape,
mkl_context.grad_filter_shape);
// Need to set member variable for TF layout
mkl_context.lt_grad_filter = mkl_context.grad_filter_shape.GetTfLayout();
// MKL conv-grad-filter might produce grad in its internal layout
Tensor mkl_tmp_grad_filter_buf_tensor;
// This preparation sets conversion primitive if required
// and allocates temporary tensor and its buffer without doing conversions.
// Also sets (3) dnnResourceDiffFilter accordingly
mkl_context.MklPrepareGradFilter(context, grad_filter,
&mkl_tmp_grad_filter_buf_tensor);
// After setting all the required dnnResources, ready for execution!
CHECK_EQ(
dnnExecute_F32(mkl_context.prim_conv_bwdfilter, mkl_context.conv_res),
E_SUCCESS);
// Convert grad-filter to TF layout
if (mkl_context.convert_bwdfilter != nullptr) {
void* mkl_buf_convert_grad_filter =
const_cast<void*>(static_cast<const void*>(
mkl_tmp_grad_filter_buf_tensor.flat<T>().data()));
void* mkl_buf_grad_filter = const_cast<void*>(
static_cast<const void*>(grad_filter->flat<T>().data()));
CHECK_EQ(dnnConversionExecute_F32(mkl_context.convert_bwdfilter,
mkl_buf_convert_grad_filter,
mkl_buf_grad_filter),
E_SUCCESS);
}
mkl_context.MklCleanup();
}
private:
typedef struct {
int in_dims;
size_t in_sizes[4];
size_t in_strides[4];
int out_dims;
size_t out_sizes[4];
size_t out_strides[4];
int filter_dims;
size_t filter_sizes[4];
size_t filter_strides[4];
int input_offsets[2];
size_t conv_strides[2];
MklShape input_shape, grad_filter_shape, out_backprop_shape;
dnnPrimitive_t prim_conv_bwdfilter, convert_bwdfilter;
dnnLayout_t lt_input, lt_grad_filter, lt_out_backprop;
void* conv_res[dnnResourceNumber];
void MklCleanup() {
// Clean up member layouts and primitives, except lt_grad_filter,
// which points to the MklShape's TF layout.
bool input_in_mkl_format = input_shape.IsMklTensor();
bool out_backprop_in_mkl_format = out_backprop_shape.IsMklTensor();
if (!input_in_mkl_format) dnnLayoutDelete_F32(lt_input);
if (!out_backprop_in_mkl_format) dnnLayoutDelete_F32(lt_out_backprop);
if (convert_bwdfilter != nullptr) dnnDelete_F32(convert_bwdfilter);
dnnDelete_F32(prim_conv_bwdfilter);
}
// Create MKL dnnLayout_t objects for tensors coming into the layer
void MklCreateInputLayouts(OpKernelContext* context) {
bool input_in_mkl_format = input_shape.IsMklTensor();
if (input_in_mkl_format) {
lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
} else {
CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
E_SUCCESS);
}
bool out_backprop_in_mkl_format = out_backprop_shape.IsMklTensor();
if (out_backprop_in_mkl_format) {
lt_out_backprop =
static_cast<dnnLayout_t>(out_backprop_shape.GetCurLayout());
} else {
CHECK_EQ(dnnLayoutCreate_F32(&lt_out_backprop, out_dims, out_sizes,
out_strides),
E_SUCCESS);
}
}
// Compare incoming tensor layouts with MKL preferred layouts and convert
// data to the preferred layout if necessary
void MklPrepareInputs(OpKernelContext* context,
Tensor* mkl_tmp_input_buf_tensor,
Tensor* mkl_tmp_out_backprop_buf_tensor) {
bool mkl_convert_input, mkl_convert_out_backprop;
dnnPrimitive_t mkl_prim_convert_input, mkl_prim_convert_out_backprop;
dnnLayout_t mkl_lt_internal_input, mkl_lt_internal_out_backprop;
void *mkl_buf_convert_input, *mkl_buf_convert_out_backprop;
mkl_prim_convert_input = nullptr;
mkl_prim_convert_out_backprop = nullptr;
mkl_lt_internal_input = nullptr;
mkl_lt_internal_out_backprop = nullptr;
mkl_buf_convert_input = nullptr;
mkl_buf_convert_out_backprop = nullptr;
// Compare with internal layouts and convert if needed
const Tensor& input = MklGetInput(context, 0);
void* mkl_buf_input =
const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
&mkl_lt_internal_input, prim_conv_bwdfilter, dnnResourceSrc),
E_SUCCESS);
mkl_convert_input =
!dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input);
if (mkl_convert_input) {
CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, lt_input,
mkl_lt_internal_input),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
&mkl_buf_convert_input);
CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
mkl_buf_convert_input),
E_SUCCESS);
dnnDelete_F32(mkl_prim_convert_input);
}
dnnLayoutDelete_F32(mkl_lt_internal_input);
conv_res[dnnResourceSrc] =
(mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
const Tensor& out_backprop = MklGetInput(context, 2);
void* mkl_buf_out_backprop = const_cast<void*>(
static_cast<const void*>(out_backprop.flat<T>().data()));
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_out_backprop,
prim_conv_bwdfilter,
dnnResourceDiffDst),
E_SUCCESS);
mkl_convert_out_backprop =
!dnnLayoutCompare_F32(mkl_lt_internal_out_backprop, lt_out_backprop);
if (mkl_convert_out_backprop) {
CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_out_backprop,
lt_out_backprop,
mkl_lt_internal_out_backprop),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_out_backprop_buf_tensor,
lt_out_backprop, &mkl_buf_convert_out_backprop);
CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_out_backprop,
mkl_buf_out_backprop,
mkl_buf_convert_out_backprop),
E_SUCCESS);
dnnDelete_F32(mkl_prim_convert_out_backprop);
}
dnnLayoutDelete_F32(mkl_lt_internal_out_backprop);
conv_res[dnnResourceDiffDst] = (mkl_convert_out_backprop)
? mkl_buf_convert_out_backprop
: mkl_buf_out_backprop;
}
void MklPrepareGradFilter(OpKernelContext* context, Tensor* grad_filter,
Tensor* mkl_tmp_grad_filter_buf_tensor) {
bool mkl_convert_grad_filter;
dnnLayout_t mkl_lt_internal_grad_filter = nullptr;
void* mkl_buf_convert_grad_filter = nullptr;
void* mkl_buf_grad_filter = const_cast<void*>(
static_cast<const void*>(grad_filter->flat<T>().data()));
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_grad_filter,
prim_conv_bwdfilter,
dnnResourceDiffFilter),
E_SUCCESS);
mkl_convert_grad_filter =
!dnnLayoutCompare_F32(mkl_lt_internal_grad_filter, lt_grad_filter);
if (mkl_convert_grad_filter) {
CHECK_EQ(dnnConversionCreate_F32(&convert_bwdfilter,
mkl_lt_internal_grad_filter,
lt_grad_filter),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_grad_filter_buf_tensor,
mkl_lt_internal_grad_filter,
&mkl_buf_convert_grad_filter);
}
dnnLayoutDelete_F32(mkl_lt_internal_grad_filter);
conv_res[dnnResourceDiffFilter] = (mkl_convert_grad_filter)
? mkl_buf_convert_grad_filter
: mkl_buf_grad_filter;
}
} MklConv2DGradFilterOpContext;
std::vector<int32> strides_;
Padding padding_;
TensorFormat data_format_;
};
#define REGISTER_MKL_FILTER_KERNELS(T) \
REGISTER_KERNEL_BUILDER(Name("MklConv2DBackpropFilter") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.Label(mkl_layer_registry::kMklLayerLabel), \
MklConv2DCustomBackpropFilterOp<CPUDevice, T>);
TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
#undef REGISTER_MKL_FILTER_KERNELS
} // namespace tensorflow
#endif // INTEL_MKL
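TF_CALL_float(m) simply invokes m(float), so for this build the macro block above amounts to a single kernel registration, roughly:

// Approximate expansion of TF_CALL_float(REGISTER_MKL_FILTER_KERNELS):
REGISTER_KERNEL_BUILDER(Name("MklConv2DBackpropFilter")
                            .Device(DEVICE_CPU)
                            .TypeConstraint<float>("T")
                            .Label(mkl_layer_registry::kMklLayerLabel),
                        MklConv2DCustomBackpropFilterOp<CPUDevice, float>);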
@@ -0,0 +1,355 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// See docs in ../ops/nn_ops.cc. This OpKernel uses the MKL library: it creates
// MKL layouts and primitives and uses MKL DNN primitives to compute the
// convolution backward input.
#ifdef INTEL_MKL
#define USE_EIGEN_TENSOR
#define EIGEN_USE_THREADS
#include <algorithm>
#include <vector>
#include "third_party/mkl/include/mkl_dnn.h"
#include "third_party/mkl/include/mkl_dnn_types.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/conv_grad_ops.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/util/mkl_util.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/use_cudnn.h"
#include "tensorflow/core/util/work_sharder.h"
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
template <typename Device, class T>
class MklConv2DCustomBackpropInputOp : public OpKernel {
public:
~MklConv2DCustomBackpropInputOp() {}
explicit MklConv2DCustomBackpropInputOp(OpKernelConstruction* context)
: OpKernel(context) {
string dataformat;
OP_REQUIRES_OK(context, context->GetAttr("data_format", &dataformat));
OP_REQUIRES(context, FormatFromString(dataformat, &data_format),
errors::InvalidArgument("Invalid data format"));
OP_REQUIRES_OK(context, context->GetAttr("strides", &strides));
int stride_n = GetTensorDim(strides, data_format, 'N');
int stride_c = GetTensorDim(strides, data_format, 'C');
OP_REQUIRES(
context, (stride_n == 1 && stride_c == 1),
errors::InvalidArgument("Current implementation does not yet support "
"strides in the batch and depth dimensions."));
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding));
}
void Compute(OpKernelContext* context) override {
MklConvBackInputOpContext mkl_context;
const Tensor& input = MklGetInput(context, 0);
const Tensor& filter = MklGetInput(context, 1);
GetMklShape(context, 1, &(mkl_context.filter_shape));
bool filter_in_mkl_format = mkl_context.filter_shape.IsMklTensor();
const Tensor& out_backprop = MklGetInput(context, 2);
GetMklShape(context, 2, &(mkl_context.outback_shape));
bool outback_in_mkl_format = mkl_context.outback_shape.IsMklTensor();
TensorShape input_shape, filter_shape, outback_shape;
// Generate input shape.
OP_REQUIRES(
context, TensorShapeUtils::IsVector(input.shape()),
errors::InvalidArgument(
"Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
input.dims()));
OP_REQUIRES_OK(
context, TensorShapeUtils::MakeShape(input.vec<int32>(), &input_shape));
// Generate shape for the filter if it is in MKL format.
if (filter_in_mkl_format) {
OP_REQUIRES(context, mkl_context.filter_shape.GetDimension() == 4,
errors::InvalidArgument(
"Conv2DCustomBackpropInput: size must be 4-dim"));
MklSizesToTFSizes(context, data_format, mkl_context.filter_shape,
&filter_shape);
} else {
filter_shape = filter.shape();
}
// Generate shape for out_backprop if it is in MKL format.
if (outback_in_mkl_format) {
OP_REQUIRES(context, mkl_context.outback_shape.GetDimension() == 4,
errors::InvalidArgument(
"Conv2DCustomBackpropInput: size must be 4-dim"));
MklSizesToTFSizes(context, data_format, mkl_context.outback_shape,
&outback_shape);
} else {
outback_shape = out_backprop.shape();
}
ConvBackpropDimensions dims;
OP_REQUIRES_OK(
context,
ConvBackpropComputeDimensions(
"Conv2DCustomBackpropInput", /*num_spatial_dims=*/2, input_shape,
filter_shape, outback_shape, strides, padding, data_format, &dims));
int64 pad_top, pad_bottom;
int64 pad_left, pad_right;
OP_REQUIRES_OK(
context,
GetWindowedOutputSizeVerbose(
dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
dims.spatial_dims[0].stride, padding,
&dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
OP_REQUIRES_OK(
context,
GetWindowedOutputSizeVerbose(
dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
dims.spatial_dims[1].stride, padding,
&dims.spatial_dims[1].output_size, &pad_left, &pad_right));
mkl_context.in_dims = 4;
mkl_context.in_sizes[0] =
static_cast<size_t>(dims.spatial_dims[1].input_size);
mkl_context.in_sizes[1] =
static_cast<size_t>(dims.spatial_dims[0].input_size);
mkl_context.in_sizes[2] = static_cast<size_t>(dims.in_depth);
mkl_context.in_sizes[3] = static_cast<size_t>(dims.batch_size);
mkl_context.out_sizes[0] =
static_cast<size_t>(dims.spatial_dims[1].output_size);
mkl_context.out_sizes[1] =
static_cast<size_t>(dims.spatial_dims[0].output_size);
mkl_context.out_sizes[2] = static_cast<size_t>(dims.out_depth);
mkl_context.out_sizes[3] = static_cast<size_t>(dims.batch_size);
mkl_context.input_offset[0] = static_cast<int>(-pad_left);
mkl_context.input_offset[1] = static_cast<int>(-pad_top);
mkl_context.conv_strides[0] =
static_cast<size_t>(dims.spatial_dims[1].stride);
mkl_context.conv_strides[1] =
static_cast<size_t>(dims.spatial_dims[0].stride);
GetStridesFromSizes(data_format, mkl_context.out_strides,
mkl_context.out_sizes);
GetStridesFromSizes(data_format, mkl_context.in_strides,
mkl_context.in_sizes);
mkl_context.filter_size[0] = dims.spatial_dims[1].filter_size;
mkl_context.filter_size[1] = dims.spatial_dims[0].filter_size;
mkl_context.filter_size[2] = dims.in_depth;
mkl_context.filter_size[3] = dims.out_depth;
mkl_context.filter_stride[0] =
mkl_context.filter_size[2] * mkl_context.filter_size[3];
mkl_context.filter_stride[1] = mkl_context.filter_size[2] *
mkl_context.filter_size[0] *
mkl_context.filter_size[3];
mkl_context.filter_stride[2] = mkl_context.filter_size[3];
mkl_context.filter_stride[3] = 1;
CHECK_EQ(
dnnConvolutionCreateBackwardData_F32(
&mkl_context.prim_bwddata, NULL, dnnAlgorithmConvolutionDirect,
mkl_context.in_dims, mkl_context.in_sizes, mkl_context.out_sizes,
mkl_context.filter_size, mkl_context.conv_strides,
mkl_context.input_offset, dnnBorderZeros),
E_SUCCESS);
// Allocate output tensor and shape
TensorShape mkl_out_shape;
MklShape mklOutputShape;
mklOutputShape.SetMklTensor(true);
mklOutputShape.SetMklLayout(mkl_context.prim_bwddata, dnnResourceDiffSrc);
mklOutputShape.SetTfLayout(mkl_context.in_dims, mkl_context.in_sizes,
mkl_context.in_strides);
// MKL might change the dimension ordering.
// Create mapping to recover the original TF dimension order
mklOutputShape.SetTfDimOrder(mkl_context.in_dims, data_format);
Tensor* in_backprop = nullptr;
mkl_out_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
mklOutputShape.GetMklLayout())) /
sizeof(T));
AllocateOutputSetMklshape(context, 0, &in_backprop, mkl_out_shape,
mklOutputShape);
mkl_context.conv_res[dnnResourceDiffSrc] =
static_cast<void*>(const_cast<T*>(in_backprop->flat<T>().data()));
mkl_context.MklCreateInputLayouts(context);
Tensor mkl_tmp_outbackprop_buf_tensor, mkl_tmp_filter_buf_tensor;
mkl_context.MklPrepareConvolutionInputs(
context, &mkl_tmp_outbackprop_buf_tensor, &mkl_tmp_filter_buf_tensor);
CHECK_EQ(dnnExecute_F32(mkl_context.prim_bwddata, mkl_context.conv_res),
E_SUCCESS);
mkl_context.MklCleanup();
}
private:
typedef struct {
int in_dims;
size_t in_sizes[4];
size_t in_strides[4];
size_t out_sizes[4];
size_t out_strides[4];
int input_offset[2];
size_t filter_size[4];
size_t filter_stride[4];
size_t conv_strides[2];
MklShape filter_shape, outback_shape;
dnnPrimitive_t prim_bwddata;
void* conv_res[dnnResourceNumber];
dnnLayout_t lt_filter, lt_outbackprop;
// Create MKL dnnLayout_t objects for tensors coming into the layer
void MklCreateInputLayouts(OpKernelContext* context) {
bool filter_in_mkl_format = filter_shape.IsMklTensor();
bool outback_in_mkl_format = outback_shape.IsMklTensor();
if (filter_in_mkl_format) {
lt_filter = (dnnLayout_t)filter_shape.GetCurLayout();
} else {
CHECK_EQ(dnnLayoutCreate_F32(&lt_filter, in_dims, filter_size,
filter_stride),
E_SUCCESS);
}
if (outback_in_mkl_format) {
lt_outbackprop = (dnnLayout_t)outback_shape.GetCurLayout();
} else {
CHECK_EQ(dnnLayoutCreate_F32(&lt_outbackprop, in_dims, out_sizes,
out_strides),
E_SUCCESS);
}
}
// Compare incoming input tensor layouts with MKL preferred layouts and
// convert data to the preferred layout if necessary
void MklPrepareConvolutionInputs(OpKernelContext* context,
Tensor* mkl_tmp_outbackprop_buf_tensor,
Tensor* mkl_tmp_filter_buf_tensor) {
dnnPrimitive_t mkl_convert_filter = nullptr,
mkl_convert_outbackprop = nullptr;
void *mkl_filter_buf = nullptr, *mkl_outbackprop_buf = nullptr;
dnnLayout_t mkl_lt_filter_internal = nullptr,
mkl_lt_outbackprop_internal = nullptr;
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
&mkl_lt_filter_internal, prim_bwddata, dnnResourceFilter),
E_SUCCESS);
const Tensor& filter = MklGetInput(context, 1);
CHECK_EQ(
dnnLayoutCreateFromPrimitive_F32(&mkl_lt_outbackprop_internal,
prim_bwddata, dnnResourceDiffDst),
E_SUCCESS);
if (!dnnLayoutCompare_F32(mkl_lt_filter_internal, lt_filter)) {
// Create conversion primitive
CHECK_EQ(dnnConversionCreate_F32(&mkl_convert_filter, lt_filter,
mkl_lt_filter_internal),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_filter_buf_tensor,
mkl_lt_filter_internal, &mkl_filter_buf);
CHECK_EQ(
dnnConversionExecute_F32(
mkl_convert_filter,
static_cast<void*>(const_cast<T*>(filter.flat<T>().data())),
mkl_filter_buf),
E_SUCCESS);
// Assign filter buf to resources[] for convolution.
conv_res[dnnResourceFilter] = mkl_filter_buf;
dnnDelete_F32(mkl_convert_filter);
} else {
// If we do not need any layout conversion for the filter, then we directly
// assign the input filter to resources[].
conv_res[dnnResourceFilter] =
static_cast<void*>(const_cast<T*>(filter.flat<T>().data()));
}
dnnLayoutDelete_F32(mkl_lt_filter_internal);
const Tensor& out_backprop = MklGetInput(context, 2);
// We perform the same layout-conversion steps as above for out_backprop.
if (!dnnLayoutCompare_F32(mkl_lt_outbackprop_internal, lt_outbackprop)) {
CHECK_EQ(
dnnConversionCreate_F32(&mkl_convert_outbackprop, lt_outbackprop,
mkl_lt_outbackprop_internal),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_outbackprop_buf_tensor,
mkl_lt_outbackprop_internal, &mkl_outbackprop_buf);
CHECK_EQ(dnnConversionExecute_F32(mkl_convert_outbackprop,
static_cast<void*>(const_cast<T*>(
out_backprop.flat<T>().data())),
mkl_outbackprop_buf),
E_SUCCESS);
conv_res[dnnResourceDiffDst] = mkl_outbackprop_buf;
dnnDelete_F32(mkl_convert_outbackprop);
} else {
conv_res[dnnResourceDiffDst] =
static_cast<void*>(const_cast<T*>(out_backprop.flat<T>().data()));
}
dnnLayoutDelete_F32(mkl_lt_outbackprop_internal);
}
// Cleanup member layouts and primitives
void MklCleanup() {
bool filter_in_mkl_format = filter_shape.IsMklTensor();
bool outback_in_mkl_format = outback_shape.IsMklTensor();
if (!filter_in_mkl_format) dnnLayoutDelete_F32(lt_filter);
if (!outback_in_mkl_format) dnnLayoutDelete_F32(lt_outbackprop);
dnnDelete_F32(prim_bwddata);
}
} MklConvBackInputOpContext;
std::vector<int32> strides;
Padding padding;
TensorFormat data_format;
};
#define REGISTER_MKL_CPU_KERNELS(T) \
REGISTER_KERNEL_BUILDER(Name("MklConv2DBackpropInput") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.Label(mkl_layer_registry::kMklLayerLabel), \
MklConv2DCustomBackpropInputOp<CPUDevice, T>);
TF_CALL_float(REGISTER_MKL_CPU_KERNELS);
#undef REGISTER_MKL_CPU_KERNELS
} // namespace tensorflow
#endif // INTEL_MKL
@@ -43,7 +43,6 @@ limitations under the License.
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
template <typename Device, typename T, bool biasEnabled>
class MklConv2DOp : public OpKernel {
@@ -70,9 +69,10 @@ class MklConv2DOp : public OpKernel {
}
void Compute(OpKernelContext* context) override {
MklConv2DOpContext mkl_context;
const Tensor& input = MklGetInput(context, 0);
GetMklShape(context, 0, &(mkl_params_.input_shape));
bool input_in_mkl_format = mkl_params_.input_shape.IsMklTensor();
GetMklShape(context, 0, &(mkl_context.input_shape));
bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
const Tensor& filter = MklGetInput(context, 1);
MklShape mkl_filter_shape;
@@ -104,9 +104,9 @@ class MklConv2DOp : public OpKernel {
errors::InvalidArgument("filter too large"));
}
const int64 input_depth = input_in_mkl_format
? mkl_params_.input_shape.GetSizes()[2]
: GetTensorDim(input, data_format_, 'C');
const int64 input_depth =
input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'C')
: GetTensorDim(input, data_format_, 'C');
OP_REQUIRES(context, input_depth == filter.dim_size(2),
errors::InvalidArgument(
"input and filter must have the same depth: ", input_depth,
@@ -116,9 +116,9 @@ class MklConv2DOp : public OpKernel {
// The second dimension for input is rows/height.
// The first dimension for filter is rows/height.
const int64 input_rows_raw = input_in_mkl_format
? mkl_params_.input_shape.GetSizes()[1]
: GetTensorDim(input, data_format_, 'H');
const int64 input_rows_raw =
input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'H')
: GetTensorDim(input, data_format_, 'H');
OP_REQUIRES(
context,
FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()),
@@ -128,9 +128,9 @@ class MklConv2DOp : public OpKernel {
// The third dimension for input is columns/width.
// The second dimension for filter is columns/width.
const int64 input_cols_raw = input_in_mkl_format
? mkl_params_.input_shape.GetSizes()[0]
: GetTensorDim(input, data_format_, 'W');
const int64 input_cols_raw =
input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'W')
: GetTensorDim(input, data_format_, 'W');
OP_REQUIRES(
context,
FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()),
@@ -139,9 +139,9 @@ class MklConv2DOp : public OpKernel {
const int filter_cols = static_cast<int>(filter.dim_size(1));
// The first dimension for input is batch.
const int64 input_batch_raw = input_in_mkl_format
? mkl_params_.input_shape.GetSizes()[3]
: GetTensorDim(input, data_format_, 'N');
const int64 input_batch_raw =
input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'N')
: GetTensorDim(input, data_format_, 'N');
OP_REQUIRES(
context,
FastBoundsCheck(input_batch_raw, std::numeric_limits<int>::max()),
@@ -184,98 +184,105 @@ class MklConv2DOp : public OpKernel {
}
// Create MKL convolution primitives
mkl_params_.in_dims = input_in_mkl_format
? mkl_params_.input_shape.GetDimension()
mkl_context.in_dims = input_in_mkl_format
? mkl_context.input_shape.GetDimension()
: input.dims();
mkl_params_.filter_dims = filter.dims();
mkl_params_.in_sizes[0] = static_cast<size_t>(input_cols);
mkl_params_.in_sizes[1] = static_cast<size_t>(input_rows);
mkl_params_.in_sizes[2] = static_cast<size_t>(input_depth);
mkl_params_.in_sizes[3] = static_cast<size_t>(batch);
mkl_params_.out_sizes[0] = static_cast<size_t>(out_cols);
mkl_params_.out_sizes[1] = static_cast<size_t>(out_rows);
mkl_params_.out_sizes[2] = static_cast<size_t>(out_depth);
mkl_params_.out_sizes[3] = static_cast<size_t>(batch);
mkl_params_.input_offset[0] = static_cast<int>(-pad_cols);
mkl_params_.input_offset[1] = static_cast<int>(-pad_rows);
mkl_params_.conv_stride[0] = static_cast<size_t>(stride_cols);
mkl_params_.conv_stride[1] = static_cast<size_t>(stride_rows);
mkl_context.filter_dims = filter.dims();
GetStridesFromSizes(data_format_, mkl_params_.out_strides,
mkl_params_.out_sizes);
GetStridesFromSizes(data_format_, mkl_params_.in_strides,
mkl_params_.in_sizes);
mkl_context.in_sizes[MklDims::W] = static_cast<size_t>(input_cols);
mkl_context.in_sizes[MklDims::H] = static_cast<size_t>(input_rows);
mkl_context.in_sizes[MklDims::C] = static_cast<size_t>(input_depth);
mkl_context.in_sizes[MklDims::N] = static_cast<size_t>(batch);
mkl_context.out_sizes[MklDims::W] = static_cast<size_t>(out_cols);
mkl_context.out_sizes[MklDims::H] = static_cast<size_t>(out_rows);
mkl_context.out_sizes[MklDims::C] = static_cast<size_t>(out_depth);
mkl_context.out_sizes[MklDims::N] = static_cast<size_t>(batch);
mkl_context.input_offset[0] = static_cast<int>(-pad_cols);
mkl_context.input_offset[1] = static_cast<int>(-pad_rows);
mkl_context.conv_stride[0] = static_cast<size_t>(stride_cols);
mkl_context.conv_stride[1] = static_cast<size_t>(stride_rows);
GetStridesFromSizes(data_format_, mkl_context.out_strides,
mkl_context.out_sizes);
GetStridesFromSizes(data_format_, mkl_context.in_strides,
mkl_context.in_sizes);
// TF filter dimension order (out_depth, in_depth, cols, rows) ->
// MKL filter dimension order (out_depth, in_depth, rows, cols)
mkl_params_.filter_sizes[0] = filter.dim_size(1); // cols
mkl_params_.filter_sizes[1] = filter.dim_size(0); // rows
mkl_params_.filter_sizes[2] = filter.dim_size(2); // in_depth
mkl_params_.filter_sizes[3] = filter.dim_size(3); // out_depth
mkl_context.filter_sizes[0] = filter.dim_size(1); // cols
mkl_context.filter_sizes[1] = filter.dim_size(0); // rows
mkl_context.filter_sizes[2] = filter.dim_size(2); // in_depth
mkl_context.filter_sizes[3] = filter.dim_size(3); // out_depth
// TF filter layout - (rows, cols, in_depth, out_depth)
mkl_params_.filter_strides[0] =
mkl_context.filter_strides[0] =
filter.dim_size(2) * filter.dim_size(3); // cols
mkl_params_.filter_strides[1] =
mkl_context.filter_strides[1] =
filter.dim_size(1) * filter.dim_size(2) * filter.dim_size(3); // rows
mkl_params_.filter_strides[2] = filter.dim_size(3); // in_depth
mkl_params_.filter_strides[3] = 1; // out_depth
mkl_context.filter_strides[2] = filter.dim_size(3); // in_depth
mkl_context.filter_strides[3] = 1; // out_depth
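    // Illustrative worked example (hypothetical shapes, not taken from this
    // change): for a TF filter of shape (rows=3, cols=3, in_depth=16,
    // out_depth=32) the mapping above yields
    //   filter_sizes   = {3, 3, 16, 32}      // {cols, rows, in_depth, out_depth}
    //   filter_strides = {512, 1536, 32, 1}  // out_depth innermost (stride 1)
    // i.e. cols stride = in_depth * out_depth and
    // rows stride = cols * in_depth * out_depth, matching the HWIO layout of
    // the TF filter tensor.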
if (biasEnabled) {
const Tensor& bias = MklGetInput(context, 2);
mkl_params_.bias_sizes[0] = {static_cast<size_t>(bias.dim_size(0))};
mkl_params_.bias_strides[0] = {1};
mkl_context.bias_sizes[0] = {static_cast<size_t>(bias.dim_size(0))};
mkl_context.bias_strides[0] = {1};
}
// Create Convolution Primitive
if (biasEnabled) {
CHECK_EQ(dnnConvolutionCreateForwardBias_F32(
&mkl_prim_convolution_fwd_, nullptr,
dnnAlgorithmConvolutionDirect, mkl_params_.in_dims,
mkl_params_.in_sizes, mkl_params_.out_sizes,
mkl_params_.filter_sizes, mkl_params_.conv_stride,
mkl_params_.input_offset, dnnBorderZeros),
E_SUCCESS);
CHECK_EQ(
dnnConvolutionCreateForwardBias_F32(
&mkl_context.prim_fwd, nullptr, dnnAlgorithmConvolutionDirect,
mkl_context.in_dims, mkl_context.in_sizes, mkl_context.out_sizes,
mkl_context.filter_sizes, mkl_context.conv_stride,
mkl_context.input_offset, dnnBorderZeros),
E_SUCCESS);
} else {
CHECK_EQ(dnnConvolutionCreateForward_F32(
&mkl_prim_convolution_fwd_, nullptr,
dnnAlgorithmConvolutionDirect, mkl_params_.in_dims,
mkl_params_.in_sizes, mkl_params_.out_sizes,
mkl_params_.filter_sizes, mkl_params_.conv_stride,
mkl_params_.input_offset, dnnBorderZeros),
E_SUCCESS);
CHECK_EQ(
dnnConvolutionCreateForward_F32(
&mkl_context.prim_fwd, nullptr, dnnAlgorithmConvolutionDirect,
mkl_context.in_dims, mkl_context.in_sizes, mkl_context.out_sizes,
mkl_context.filter_sizes, mkl_context.conv_stride,
mkl_context.input_offset, dnnBorderZeros),
E_SUCCESS);
}
TensorShape mkl_output_tf_shape;
MklShape mkl_output_mkl_shape;
mkl_output_mkl_shape.SetMklTensor(true);
mkl_output_mkl_shape.SetMklLayout(mkl_prim_convolution_fwd_,
dnnResourceDst);
mkl_output_mkl_shape.SetTfLayout(mkl_params_.in_dims, mkl_params_.out_sizes,
mkl_params_.out_strides);
mkl_output_mkl_shape.SetMklLayout(mkl_context.prim_fwd, dnnResourceDst);
mkl_output_mkl_shape.SetTfLayout(mkl_context.in_dims, mkl_context.out_sizes,
mkl_context.out_strides);
// MKL might change the dimension ordering
// Create mapping to recover the original TF dimension order
mkl_output_mkl_shape.SetTfDimOrder(mkl_context.in_dims, data_format_);
mkl_output_tf_shape.AddDim(
dnnLayoutGetMemorySize_F32(
static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
sizeof(T));
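    // Note: when the output stays in MKL layout, its TF-visible shape is a
    // single flat dimension sized to hold
    // dnnLayoutGetMemorySize_F32(layout) / sizeof(T) elements; the logical
    // NHWC shape travels separately in the MklShape metadata set above.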
AllocateOutputSetMklshape(context, 0, &output, mkl_output_tf_shape,
mkl_output_mkl_shape);
mkl_conv_res_[dnnResourceDst] =
mkl_context.conv_res[dnnResourceDst] =
static_cast<void*>(output->flat<T>().data());
MklCreateInputLayouts(context);
mkl_context.MklCreateInputLayouts(context);
Tensor mkl_tmp_input_buf_tensor, mkl_tmp_filter_buf_tensor,
mkl_tmp_bias_buf_tensor; // Temp tensor used to allocate tmp
// buffers
MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf_tensor,
&mkl_tmp_filter_buf_tensor,
&mkl_tmp_bias_buf_tensor);
mkl_context.MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf_tensor,
&mkl_tmp_filter_buf_tensor,
&mkl_tmp_bias_buf_tensor);
// Execute convolution
CHECK_EQ(dnnExecute_F32(mkl_prim_convolution_fwd_, mkl_conv_res_),
CHECK_EQ(dnnExecute_F32(mkl_context.prim_fwd, mkl_context.conv_res),
E_SUCCESS);
MklCleanup();
mkl_context.MklCleanup();
}
private:
@@ -293,151 +300,141 @@ class MklConv2DOp : public OpKernel {
int input_offset[2];
size_t conv_stride[2];
MklShape input_shape;
} MklConv2DOpParams;
dnnPrimitive_t prim_fwd;
void* conv_res[dnnResourceNumber];
dnnLayout_t lt_filter, lt_bias, lt_input;
// Create MKL dnnLayout_t objects for tensors coming into the layer
void MklCreateInputLayouts(OpKernelContext* context) {
bool input_in_mkl_format = mkl_params_.input_shape.IsMklTensor();
if (input_in_mkl_format) {
mkl_lt_input_ =
static_cast<dnnLayout_t>(mkl_params_.input_shape.GetCurLayout());
} else {
CHECK_EQ(
dnnLayoutCreate_F32(&mkl_lt_input_, mkl_params_.in_dims,
mkl_params_.in_sizes, mkl_params_.in_strides),
E_SUCCESS);
}
CHECK_EQ(dnnLayoutCreate_F32(&mkl_lt_filter_, mkl_params_.filter_dims,
mkl_params_.filter_sizes,
mkl_params_.filter_strides),
E_SUCCESS);
if (biasEnabled) {
CHECK_EQ(dnnLayoutCreate_F32(&mkl_lt_bias_, 1, mkl_params_.bias_sizes,
mkl_params_.bias_strides),
E_SUCCESS);
}
}
// Compare incoming tensor layouts with MKL preferred layouts and convert
// data to the preferred layout if necessary
void MklPrepareConvolutionInputs(OpKernelContext* context,
Tensor* mkl_tmp_input_buf_tensor,
Tensor* mkl_tmp_filter_buf_tensor,
Tensor* mkl_tmp_bias_buf_tensor) {
bool mkl_convert_input, mkl_convert_filter, mkl_convert_bias;
dnnPrimitive_t mkl_prim_convert_filter, mkl_prim_convert_bias,
mkl_prim_convert_input;
dnnLayout_t mkl_lt_internal_filter, mkl_lt_internal_bias,
mkl_lt_internal_input;
void *mkl_buf_convert_input, *mkl_buf_convert_filter, *mkl_buf_convert_bias;
mkl_prim_convert_filter = nullptr;
mkl_prim_convert_bias = nullptr;
mkl_prim_convert_input = nullptr;
mkl_lt_internal_filter = nullptr;
mkl_lt_internal_bias = nullptr;
mkl_lt_internal_input = nullptr;
mkl_buf_convert_input = nullptr;
mkl_buf_convert_filter = nullptr;
mkl_buf_convert_bias = nullptr;
// Compare with internal layouts and convert if needed
const Tensor& input = MklGetInput(context, 0);
void* mkl_buf_input =
const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
CHECK_EQ(
dnnLayoutCreateFromPrimitive_F32(
&mkl_lt_internal_input, mkl_prim_convolution_fwd_, dnnResourceSrc),
E_SUCCESS);
mkl_convert_input =
!dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_input_);
if (mkl_convert_input) {
CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, mkl_lt_input_,
mkl_lt_internal_input),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
&mkl_buf_convert_input);
CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
mkl_buf_convert_input),
E_SUCCESS);
dnnDelete_F32(mkl_prim_convert_input);
}
dnnLayoutDelete_F32(mkl_lt_internal_input);
mkl_conv_res_[dnnResourceSrc] =
(mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
const Tensor& filter = MklGetInput(context, 1);
void* mkl_buf_filter =
const_cast<void*>(static_cast<const void*>(filter.flat<T>().data()));
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_filter,
mkl_prim_convolution_fwd_,
dnnResourceFilter),
E_SUCCESS);
mkl_convert_filter =
!dnnLayoutCompare_F32(mkl_lt_internal_filter, mkl_lt_filter_);
if (mkl_convert_filter) {
CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_filter, mkl_lt_filter_,
mkl_lt_internal_filter),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_filter_buf_tensor, mkl_lt_internal_filter,
&mkl_buf_convert_filter);
CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_filter, mkl_buf_filter,
mkl_buf_convert_filter),
E_SUCCESS);
dnnDelete_F32(mkl_prim_convert_filter);
}
dnnLayoutDelete_F32(mkl_lt_internal_filter);
mkl_conv_res_[dnnResourceFilter] =
(mkl_convert_filter) ? mkl_buf_convert_filter : mkl_buf_filter;
if (biasEnabled) {
const Tensor& bias = MklGetInput(context, 2);
void* mkl_buf_bias =
const_cast<void*>(static_cast<const void*>(bias.flat<T>().data()));
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_bias,
mkl_prim_convolution_fwd_,
dnnResourceBias),
E_SUCCESS);
mkl_convert_bias =
!dnnLayoutCompare_F32(mkl_lt_internal_bias, mkl_lt_bias_);
if (mkl_convert_bias) {
CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_bias, mkl_lt_bias_,
mkl_lt_internal_bias),
// Create MKL dnnLayout_t objects for tensors coming into the layer
void MklCreateInputLayouts(OpKernelContext* context) {
bool input_in_mkl_format = input_shape.IsMklTensor();
if (input_in_mkl_format) {
lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
} else {
CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_bias_buf_tensor, mkl_lt_internal_bias,
&mkl_buf_convert_bias);
CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_bias, mkl_buf_bias,
mkl_buf_convert_bias),
E_SUCCESS);
dnnDelete_F32(mkl_prim_convert_bias);
}
dnnLayoutDelete_F32(mkl_lt_internal_bias);
mkl_conv_res_[dnnResourceBias] =
(mkl_convert_bias) ? mkl_buf_convert_bias : mkl_buf_bias;
CHECK_EQ(dnnLayoutCreate_F32(&lt_filter, filter_dims, filter_sizes,
filter_strides),
E_SUCCESS);
if (biasEnabled) {
CHECK_EQ(dnnLayoutCreate_F32(&lt_bias, 1, bias_sizes, bias_strides),
E_SUCCESS);
}
}
}
void MklCleanup() {
bool input_in_mkl_format = mkl_params_.input_shape.IsMklTensor();
dnnDelete_F32(mkl_prim_convolution_fwd_);
if (!input_in_mkl_format) dnnLayoutDelete_F32(mkl_lt_input_);
dnnLayoutDelete_F32(mkl_lt_filter_);
if (biasEnabled) dnnLayoutDelete_F32(mkl_lt_bias_);
}
// Compare incoming tensor layouts with MKL preferred layouts and convert
// data to the preferred layout if necessary
void MklPrepareConvolutionInputs(OpKernelContext* context,
Tensor* mkl_tmp_input_buf_tensor,
Tensor* mkl_tmp_filter_buf_tensor,
Tensor* mkl_tmp_bias_buf_tensor) {
bool mkl_convert_input, mkl_convert_filter, mkl_convert_bias;
dnnPrimitive_t mkl_prim_convert_filter, mkl_prim_convert_bias,
mkl_prim_convert_input;
dnnLayout_t mkl_lt_internal_filter, mkl_lt_internal_bias,
mkl_lt_internal_input;
void *mkl_buf_convert_input, *mkl_buf_convert_filter,
*mkl_buf_convert_bias;
mkl_prim_convert_filter = nullptr;
mkl_prim_convert_bias = nullptr;
mkl_prim_convert_input = nullptr;
mkl_lt_internal_filter = nullptr;
mkl_lt_internal_bias = nullptr;
mkl_lt_internal_input = nullptr;
mkl_buf_convert_input = nullptr;
mkl_buf_convert_filter = nullptr;
mkl_buf_convert_bias = nullptr;
// Compare with internal layouts and convert if needed
const Tensor& input = MklGetInput(context, 0);
void* mkl_buf_input =
const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_input,
prim_fwd, dnnResourceSrc),
E_SUCCESS);
mkl_convert_input =
!dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input);
if (mkl_convert_input) {
CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, lt_input,
mkl_lt_internal_input),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
&mkl_buf_convert_input);
CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
mkl_buf_convert_input),
E_SUCCESS);
dnnDelete_F32(mkl_prim_convert_input);
}
dnnLayoutDelete_F32(mkl_lt_internal_input);
conv_res[dnnResourceSrc] =
(mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
const Tensor& filter = MklGetInput(context, 1);
void* mkl_buf_filter =
const_cast<void*>(static_cast<const void*>(filter.flat<T>().data()));
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_filter,
prim_fwd, dnnResourceFilter),
E_SUCCESS);
mkl_convert_filter =
!dnnLayoutCompare_F32(mkl_lt_internal_filter, lt_filter);
if (mkl_convert_filter) {
CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_filter, lt_filter,
mkl_lt_internal_filter),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_filter_buf_tensor,
mkl_lt_internal_filter, &mkl_buf_convert_filter);
CHECK_EQ(
dnnConversionExecute_F32(mkl_prim_convert_filter, mkl_buf_filter,
mkl_buf_convert_filter),
E_SUCCESS);
dnnDelete_F32(mkl_prim_convert_filter);
}
dnnLayoutDelete_F32(mkl_lt_internal_filter);
conv_res[dnnResourceFilter] =
(mkl_convert_filter) ? mkl_buf_convert_filter : mkl_buf_filter;
if (biasEnabled) {
const Tensor& bias = MklGetInput(context, 2);
void* mkl_buf_bias =
const_cast<void*>(static_cast<const void*>(bias.flat<T>().data()));
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_bias,
prim_fwd, dnnResourceBias),
E_SUCCESS);
mkl_convert_bias = !dnnLayoutCompare_F32(mkl_lt_internal_bias, lt_bias);
if (mkl_convert_bias) {
CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_bias, lt_bias,
mkl_lt_internal_bias),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_bias_buf_tensor, mkl_lt_internal_bias,
&mkl_buf_convert_bias);
CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_bias, mkl_buf_bias,
mkl_buf_convert_bias),
E_SUCCESS);
dnnDelete_F32(mkl_prim_convert_bias);
}
dnnLayoutDelete_F32(mkl_lt_internal_bias);
conv_res[dnnResourceBias] =
(mkl_convert_bias) ? mkl_buf_convert_bias : mkl_buf_bias;
}
}
void MklCleanup() {
bool input_in_mkl_format = input_shape.IsMklTensor();
dnnDelete_F32(prim_fwd);
if (!input_in_mkl_format) dnnLayoutDelete_F32(lt_input);
dnnLayoutDelete_F32(lt_filter);
if (biasEnabled) dnnLayoutDelete_F32(lt_bias);
}
} MklConv2DOpContext;
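  // A minimal sketch of the convert-if-needed pattern that is repeated above
  // for src, filter and bias; it is not called by the kernel and is shown only
  // to make the pattern explicit. The helper name is hypothetical; the dnn*
  // and AllocTmpBuffer signatures follow their uses in this file.
  static void ConvertIfLayoutsDiffer(OpKernelContext* context,
                                     dnnLayout_t user_layout,
                                     dnnLayout_t internal_layout,
                                     void* user_buf, Tensor* tmp_buf_tensor,
                                     void** res_buf) {
    if (!dnnLayoutCompare_F32(internal_layout, user_layout)) {
      // Layouts differ: convert the user buffer into a temporary buffer that
      // uses the layout preferred by the primitive.
      dnnPrimitive_t convert = nullptr;
      CHECK_EQ(dnnConversionCreate_F32(&convert, user_layout, internal_layout),
               E_SUCCESS);
      AllocTmpBuffer(context, tmp_buf_tensor, internal_layout, res_buf);
      CHECK_EQ(dnnConversionExecute_F32(convert, user_buf, *res_buf),
               E_SUCCESS);
      dnnDelete_F32(convert);
    } else {
      // Same layout: feed the user buffer to the primitive directly.
      *res_buf = user_buf;
    }
  }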
std::vector<int32> strides_;
Padding padding_;
TensorFormat data_format_;
MklConv2DOpParams mkl_params_;
dnnPrimitive_t mkl_prim_convolution_fwd_ = nullptr;
void* mkl_conv_res_[dnnResourceNumber];
dnnLayout_t mkl_lt_filter_ = nullptr, mkl_lt_bias_ = nullptr,
mkl_lt_input_ = nullptr;
};
#define REGISTER_MKL_CPU(T) \


@@ -0,0 +1,506 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// See docs in ../ops/nn_ops.cc.
#ifdef INTEL_MKL
#define EIGEN_USE_THREADS
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/util/mkl_util.h"
#include "tensorflow/core/util/padding.h"
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
// An implementation of MaxPooling (forward).
template <typename Device, typename T>
class MklMaxPoolingOp : public OpKernel {
public:
explicit MklMaxPoolingOp(OpKernelConstruction* context) : OpKernel(context) {
string data_format;
OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
errors::InvalidArgument("Invalid data format"));
OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
OP_REQUIRES(context, ksize_.size() == 4,
errors::InvalidArgument("Sliding window ksize field must "
"specify 4 dimensions"));
OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
OP_REQUIRES(context, stride_.size() == 4,
errors::InvalidArgument("Sliding window stride field must "
"specify 4 dimensions"));
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
errors::Unimplemented("Pooling is not yet supported on the "
"batch dimension."));
workspace_enabled_ = false;
    // We may not get this attribute for this node if it does not go through
    // the graph rewrite pass, so we do not check for errors while retrieving
    // this attribute value.
context->GetAttr("workspace_enabled", &workspace_enabled_);
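    // The returned Status is intentionally ignored, so the default of false
    // set above is kept whenever the attribute is absent.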
}
void Compute(OpKernelContext* context) override {
MklMaxPoolingOpContext mkl_context;
// Get the input tensor
const Tensor& tensor_in = MklGetInput(context, 0);
GetMklShape(context, 0, &mkl_context.input_shape);
bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
mkl_context.params.in_dim = 4;
MklPoolParameters pool_params;
if (input_in_mkl_format == false) {
pool_params.Init(context, ksize_, stride_, padding_, data_format_,
tensor_in.shape());
OP_REQUIRES(
context, (pool_params.depth_window == 1),
errors::Unimplemented("Depthwise max pooling not supported by MKL"));
} else {
pool_params.Init(context, ksize_, stride_, padding_, data_format_,
&mkl_context.input_shape);
}
// Extract the parameters for the op from the pooling specs
ExtractMklOpParams(context, data_format_, pool_params, &mkl_context.params);
mkl_context.MklCreateLayoutsAndPrimitives(context);
// Declare output tensor
TensorShape tensor_out_shape;
MklShape mkl_out_shape;
mkl_out_shape.SetMklTensor(true);
mkl_out_shape.SetMklLayout(mkl_context.prim_pooling_fwd, dnnResourceDst);
mkl_out_shape.SetTfLayout(mkl_context.params.in_dim,
mkl_context.params.out_sizes,
mkl_context.params.out_strides);
mkl_out_shape.SetTfDimOrder(mkl_context.params.in_dim, data_format_);
Tensor* output_tensor = nullptr;
tensor_out_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
mkl_out_shape.GetMklLayout())) /
sizeof(T));
AllocateOutputSetMklshape(context, 0, &output_tensor, tensor_out_shape,
mkl_out_shape);
if (!workspace_enabled_) {
mkl_out_shape.SetMklTensor(false);
}
Tensor* workspace_tensor;
void* workspace_buf = nullptr;
if (workspace_enabled_) {
TensorShape workspace_shape;
workspace_shape.AddDim(
dnnLayoutGetMemorySize_F32(
static_cast<dnnLayout_t>(mkl_context.lt_workspace)) /
sizeof(T));
AllocateOutputSetMklshape(context, 1, &workspace_tensor, workspace_shape,
mkl_out_shape);
mkl_context.pooling_res[dnnResourceWorkspace] = const_cast<void*>(
static_cast<const void*>(workspace_tensor->flat<T>().data()));
} else {
AllocTmpBuffer(context, workspace_tensor, mkl_context.lt_workspace,
&workspace_buf);
mkl_context.pooling_res[dnnResourceWorkspace] = workspace_buf;
}
mkl_context.pooling_res[dnnResourceSrc] =
const_cast<void*>(static_cast<const void*>(tensor_in.flat<T>().data()));
mkl_context.pooling_res[dnnResourceDst] = const_cast<void*>(
static_cast<const void*>(output_tensor->flat<T>().data()));
CHECK_EQ(
dnnExecute_F32(mkl_context.prim_pooling_fwd, mkl_context.pooling_res),
E_SUCCESS);
mkl_context.MklCleanup();
}
private:
typedef struct {
MklPoolingOpParams params;
MklShape input_shape;
void* pooling_res[dnnResourceNumber];
dnnPrimitive_t prim_pooling_fwd;
dnnLayout_t lt_user_input, lt_workspace;
void MklCreateLayoutsAndPrimitives(OpKernelContext* context) {
bool input_in_mkl_format = input_shape.IsMklTensor();
// Create or use existing DNN user layout
if (input_in_mkl_format == false) {
CHECK_EQ(dnnLayoutCreate_F32(&lt_user_input, params.in_dim,
params.in_sizes, params.in_strides),
E_SUCCESS);
} else {
lt_user_input = (dnnLayout_t)input_shape.GetCurLayout();
}
dnnAlgorithm_t algorithm = dnnAlgorithmPoolingMax;
dnnPrimitiveAttributes_t primAttr = nullptr;
// Create DNN primitives
CHECK_EQ(dnnPoolingCreateForward_F32(
&prim_pooling_fwd, primAttr, algorithm, lt_user_input,
params.kernel_size, params.kernel_stride, params.in_offset,
dnnBorderZerosAsymm),
E_SUCCESS);
// Creates layout for the workspace
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_workspace, prim_pooling_fwd,
dnnResourceWorkspace),
E_SUCCESS);
}
void MklCleanup() {
bool input_in_mkl_format = input_shape.IsMklTensor();
CHECK_EQ(dnnDelete_F32(prim_pooling_fwd), E_SUCCESS);
if (!input_in_mkl_format) {
CHECK_EQ(dnnLayoutDelete_F32(lt_user_input), E_SUCCESS);
}
CHECK_EQ(dnnLayoutDelete_F32(lt_workspace), E_SUCCESS);
}
} MklMaxPoolingOpContext;
std::vector<int32> ksize_;
std::vector<int32> stride_;
Padding padding_;
TensorFormat data_format_;
bool workspace_enabled_;
};
// The operation to compute MaxPool gradients.
// It takes three inputs:
// - The original input tensor
// - The original output tensor
// - Backprop tensor for output
// It produces one output: backprop tensor for input.
template <class Device, class T>
class MklMaxPoolingGradOp : public OpKernel {
public:
explicit MklMaxPoolingGradOp(OpKernelConstruction* context)
: OpKernel(context) {
string data_format;
OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
errors::InvalidArgument("Invalid data format"));
OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
OP_REQUIRES(context, ksize_.size() == 4,
errors::InvalidArgument("Sliding window ksize field must "
"specify 4 dimensions"));
OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
OP_REQUIRES(context, stride_.size() == 4,
errors::InvalidArgument("Sliding window strides field must "
"specify 4 dimensions"));
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
errors::Unimplemented(
"Pooling is not yet supported on the batch dimension."));
workspace_enabled_ = false;
    // We may not get this attribute for this node if it does not go through
    // the graph rewrite pass, so we do not check for errors while retrieving
    // this attribute value.
context->GetAttr("workspace_enabled", &workspace_enabled_);
}
void Compute(OpKernelContext* context) override {
MklMaxPoolingGradOpContext mkl_context;
// Input - The original input tensor
const Tensor& tensor_in = MklGetInput(context, 0);
// Output - Backprop tensor for input.
Tensor* output_tensor = nullptr;
GetMklShape(context, 0, &mkl_context.input_shape);
GetMklShape(context, 2, &mkl_context.output_backprop_shape);
bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
if (input_in_mkl_format == false)
mkl_context.params.in_dim = tensor_in.dims();
else
mkl_context.params.in_dim = mkl_context.input_shape.GetDimension();
MklPoolParameters pool_params;
if (input_in_mkl_format == false) {
pool_params.Init(context, ksize_, stride_, padding_, data_format_,
tensor_in.shape());
OP_REQUIRES(
context, (pool_params.depth_window == 1),
errors::Unimplemented("Depthwise max pooling not supported by MKL"));
} else {
pool_params.Init(context, ksize_, stride_, padding_, data_format_,
&mkl_context.input_shape);
}
// Extract the parameters for the op from the pooling specs
ExtractMklOpParams(context, data_format_, pool_params, &mkl_context.params);
mkl_context.MklCreateLayouts(context);
mkl_context.MklCreatePrimitives(context, workspace_enabled_);
mkl_context.MklPrepareInputs(context, workspace_enabled_);
// Create shape for the input back prop output
TensorShape mkl_input_backprop;
MklShape mkl_output_shape;
mkl_output_shape.SetMklTensor(true);
mkl_output_shape.SetMklLayout(mkl_context.prim_pooling_bwd,
dnnResourceDiffSrc);
mkl_output_shape.SetTfLayout(mkl_context.params.in_dim,
mkl_context.params.in_sizes,
mkl_context.params.in_strides);
mkl_output_shape.SetTfDimOrder(mkl_context.params.in_dim, data_format_);
mkl_input_backprop.AddDim(
dnnLayoutGetMemorySize_F32(
static_cast<dnnLayout_t>(mkl_output_shape.GetMklLayout())) /
sizeof(T));
AllocateOutputSetMklshape(context, 0, &output_tensor, mkl_input_backprop,
mkl_output_shape);
mkl_context.pooling_res[dnnResourceDiffSrc] = const_cast<void*>(
static_cast<const void*>(output_tensor->flat<T>().data()));
int64 output_size = output_tensor->NumElements();
for (int64 i = 0; i < output_size; ++i) {
(static_cast<float*>(mkl_context.pooling_res[dnnResourceDiffSrc]))[i] = 0;
}
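    // Note: the zero-fill above assumes T is float, the only type registered
    // for this kernel below. A type-generic equivalent would be
    // std::fill_n(output_tensor->flat<T>().data(), output_size, T(0)).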
CHECK_EQ(
dnnExecute_F32(mkl_context.prim_pooling_bwd, mkl_context.pooling_res),
E_SUCCESS);
mkl_context.MklCleanup(workspace_enabled_);
}
private:
typedef struct {
MklPoolingOpParams params;
MklShape input_shape, output_backprop_shape;
void* pooling_resfwd[dnnResourceNumber];
void* pooling_res[dnnResourceNumber];
dnnPrimitive_t prim_pooling_fwd, prim_pooling_bwd, convert_input,
convert_outbackprop;
dnnLayout_t lt_outbackprop_user, lt_outbackprop_prim, lt_input_user,
lt_input_prim;
void* input_buf;
void* outbackprop_buf;
void MklCreateLayouts(OpKernelContext* context) {
bool input_in_mkl_format = input_shape.IsMklTensor();
bool outbackprop_in_mkl_format = output_backprop_shape.IsMklTensor();
// Create DNN user layout for input and outbackprop or get existing layout
if (input_in_mkl_format == false) {
CHECK_EQ(dnnLayoutCreate_F32(&lt_input_user, params.in_dim,
params.in_sizes, params.in_strides),
E_SUCCESS);
} else {
lt_input_user = (dnnLayout_t)input_shape.GetCurLayout();
}
      // We don't care about the output layout for now, as we can create it
      // from the primitives for the max pooling fwd prop.
if (outbackprop_in_mkl_format == false) {
CHECK_EQ(dnnLayoutCreate_F32(&lt_outbackprop_user, params.in_dim,
params.out_sizes, params.out_strides),
E_SUCCESS);
} else {
lt_outbackprop_user = (dnnLayout_t)output_backprop_shape.GetCurLayout();
}
}
// Create DNN primitives
void MklCreatePrimitives(OpKernelContext* context, bool workspace_enabled) {
dnnAlgorithm_t algorithm = dnnAlgorithmPoolingMax;
dnnPrimitiveAttributes_t primAttr = nullptr;
if (workspace_enabled == false) {
CHECK_EQ(dnnPoolingCreateForward_F32(
&prim_pooling_fwd, primAttr, algorithm, lt_input_user,
params.kernel_size, params.kernel_stride, params.in_offset,
dnnBorderZerosAsymm),
E_SUCCESS);
}
CHECK_EQ(dnnPoolingCreateBackward_F32(
&prim_pooling_bwd, primAttr, algorithm, lt_input_user,
params.kernel_size, params.kernel_stride, params.in_offset,
dnnBorderZerosAsymm),
E_SUCCESS);
// Creates conversions
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
&lt_outbackprop_prim, prim_pooling_bwd, dnnResourceDiffDst),
E_SUCCESS);
// Tensors needed to create temporary buffers
Tensor input_buf_tensor, outbackprop_buf_tensor;
if (workspace_enabled == false) {
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
&lt_input_prim, prim_pooling_fwd, dnnResourceSrc),
E_SUCCESS);
if (!dnnLayoutCompare_F32(lt_input_user, lt_input_prim)) {
CHECK_EQ(dnnConversionCreate_F32(&convert_input, lt_input_user,
lt_input_prim),
E_SUCCESS);
AllocTmpBuffer(context, &input_buf_tensor, lt_input_prim, &input_buf);
}
}
if (!dnnLayoutCompare_F32(lt_outbackprop_user, lt_outbackprop_prim)) {
CHECK_EQ(
dnnConversionCreate_F32(&convert_outbackprop, lt_outbackprop_user,
lt_outbackprop_prim),
E_SUCCESS);
AllocTmpBuffer(context, &outbackprop_buf_tensor, lt_outbackprop_prim,
&outbackprop_buf);
}
}
// Compare incoming tensor layouts with MKL preferred layouts and convert
// data to the preferred layout if necessary
void MklPrepareInputs(OpKernelContext* context, bool workspace_enabled) {
const Tensor& tensor_in = MklGetInput(context, 0);
const Tensor& out_backprop = MklGetInput(context, 2);
bool input_in_mkl_format = input_shape.IsMklTensor();
bool outbackprop_in_mkl_format = output_backprop_shape.IsMklTensor();
void* tmp_output_buf;
Tensor tmp_output_buf_tensor;
void* workspace_buf;
Tensor workspace_buf_tensor;
if (workspace_enabled == false) {
if (convert_input != nullptr) {
if (input_in_mkl_format == false) {
CHECK_EQ(dnnConversionExecute_F32(
convert_input,
const_cast<void*>(static_cast<const void*>(
tensor_in.flat<T>().data())),
input_buf),
E_SUCCESS);
CHECK_EQ(dnnDelete_F32(convert_input), E_SUCCESS);
convert_input = nullptr;
} else {
input_shape.GetConvertedFlatData(
lt_input_prim,
const_cast<void*>(
static_cast<const void*>(tensor_in.flat<T>().data())),
input_buf);
}
pooling_resfwd[dnnResourceSrc] = input_buf;
} else {
pooling_resfwd[dnnResourceSrc] = const_cast<void*>(
static_cast<const void*>(tensor_in.flat<T>().data()));
}
dnnLayout_t lt_workspace;
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
&lt_workspace, prim_pooling_fwd, dnnResourceWorkspace),
E_SUCCESS);
AllocTmpBuffer(context, &workspace_buf_tensor, lt_workspace,
&workspace_buf);
pooling_resfwd[dnnResourceWorkspace] = workspace_buf;
dnnLayoutDelete_F32(lt_workspace);
// We create the layout for max pooling fwd prop tmp output here
AllocTmpBuffer(context, &tmp_output_buf_tensor, lt_outbackprop_prim,
&tmp_output_buf);
pooling_resfwd[dnnResourceDst] = tmp_output_buf;
CHECK_EQ(dnnExecute_F32(prim_pooling_fwd, pooling_resfwd), E_SUCCESS);
pooling_res[dnnResourceWorkspace] =
pooling_resfwd[dnnResourceWorkspace];
} else {
const Tensor& workspace = MklGetInput(context, 3);
pooling_res[dnnResourceWorkspace] = const_cast<void*>(
static_cast<const void*>(workspace.flat<T>().data()));
}
// Out backprop conversions if needed
if (convert_outbackprop != nullptr) {
if (outbackprop_in_mkl_format == false) {
CHECK_EQ(dnnConversionExecute_F32(
convert_outbackprop,
const_cast<void*>(static_cast<const void*>(
out_backprop.flat<T>().data())),
outbackprop_buf),
E_SUCCESS);
CHECK_EQ(dnnDelete_F32(convert_outbackprop), E_SUCCESS);
} else {
output_backprop_shape.GetConvertedFlatData(
lt_outbackprop_prim,
const_cast<void*>(
static_cast<const void*>(out_backprop.flat<T>().data())),
outbackprop_buf);
}
pooling_res[dnnResourceDiffDst] = outbackprop_buf;
} else {
pooling_res[dnnResourceDiffDst] = const_cast<void*>(
static_cast<const void*>(out_backprop.flat<T>().data()));
}
}
void MklCleanup(bool workspace_enabled) {
bool input_in_mkl_format = input_shape.IsMklTensor();
bool outbackprop_in_mkl_format = output_backprop_shape.IsMklTensor();
if (workspace_enabled == false) {
CHECK_EQ(dnnDelete_F32(prim_pooling_fwd), E_SUCCESS);
}
CHECK_EQ(dnnDelete_F32(prim_pooling_bwd), E_SUCCESS);
if (outbackprop_in_mkl_format == false) {
CHECK_EQ(dnnLayoutDelete_F32(lt_outbackprop_user), E_SUCCESS);
}
CHECK_EQ(dnnLayoutDelete_F32(lt_outbackprop_prim), E_SUCCESS);
if (input_in_mkl_format == false) {
CHECK_EQ(dnnLayoutDelete_F32(lt_input_user), E_SUCCESS);
}
if (workspace_enabled == false) {
CHECK_EQ(dnnLayoutDelete_F32(lt_input_prim), E_SUCCESS);
}
}
} MklMaxPoolingGradOpContext;
std::vector<int32> ksize_;
std::vector<int32> stride_;
Padding padding_;
TensorFormat data_format_;
bool workspace_enabled_;
};
REGISTER_KERNEL_BUILDER(Name("MklMaxPool")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T")
.Label(mkl_layer_registry::kMklLayerLabel),
MklMaxPoolingOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("MklMaxPoolGrad")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T")
.Label(mkl_layer_registry::kMklLayerLabel),
MklMaxPoolingGradOp<CPUDevice, float>);
} // namespace tensorflow
#endif // INTEL_MKL


@@ -0,0 +1,150 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifdef INTEL_MKL
#include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
#include <vector>
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/common_shape_fns.h"
namespace tensorflow {
// Initialization for TensorFlow format
void MklPoolParameters::Init(OpKernelContext* context,
const std::vector<int32>& ksize,
const std::vector<int32>& stride, Padding padding,
TensorFormat data_format,
const TensorShape& tensor_in_shape) {
// For maxpooling, tensor_in should have 4 dimensions.
OP_REQUIRES(context, tensor_in_shape.dims() == 4,
errors::InvalidArgument("tensor_in must be 4-dimensional"));
depth = GetTensorDim(tensor_in_shape, data_format, 'C');
tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, 'W');
tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, 'H');
tensor_in_batch = GetTensorDim(tensor_in_shape, data_format, 'N');
Init(context, ksize, stride, padding, data_format);
}
// Initialization for MKL format
void MklPoolParameters::Init(OpKernelContext* context,
const std::vector<int32>& ksize,
const std::vector<int32>& stride, Padding padding,
TensorFormat data_format,
const MklShape* mklInputShape) {
// Get the input sizes
depth = mklInputShape->GetSizes()[2];
tensor_in_cols = mklInputShape->GetSizes()[0];
tensor_in_rows = mklInputShape->GetSizes()[1];
tensor_in_batch = mklInputShape->GetSizes()[3];
Init(context, ksize, stride, padding, data_format);
}
// Common Initialization for TensorFlow and MKL formats
void MklPoolParameters::Init(OpKernelContext* context,
const std::vector<int32>& ksize,
const std::vector<int32>& stride, Padding padding,
TensorFormat data_format) {
// Get the data format
this->data_format = data_format;
// Get the output sizes
window_rows = GetTensorDim(ksize, data_format, 'H');
window_cols = GetTensorDim(ksize, data_format, 'W');
depth_window = GetTensorDim(ksize, data_format, 'C');
// Get the strides
row_stride = GetTensorDim(stride, data_format, 'H');
col_stride = GetTensorDim(stride, data_format, 'W');
depth_stride = GetTensorDim(stride, data_format, 'C');
// We only support 2D pooling across width/height and depthwise
// pooling, not a combination.
OP_REQUIRES(context,
(depth_window == 1 || (window_rows == 1 && window_cols == 1)),
errors::Unimplemented(
"MaxPooling supports exactly one of pooling across depth "
"or pooling across width/height."));
if (depth_window == 1) {
OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
tensor_in_rows, window_rows, row_stride,
padding, &out_height, &pad_top, &pad_bottom));
OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
tensor_in_cols, window_cols, col_stride,
padding, &out_width, &pad_left, &pad_right));
} else {
// Our current version of depthwise max pooling does not support
// any padding, and expects the depth_window to equal the depth
// stride (no overlapping).
OP_REQUIRES(context, depth % depth_window == 0,
errors::Unimplemented("Depthwise max pooling requires the"
" depth window to evenly divide the"
" input depth"));
OP_REQUIRES(context, depth_stride == depth_window,
errors::Unimplemented("Depthwise max pooling requires the"
" depth window to equal the depth"
" stride"));
// The current version of depthwise max is only implemented on CPU.
OP_REQUIRES(context,
(DeviceType(static_cast<Device*>(context->device())
->attributes()
.device_type()) == DeviceType(DEVICE_CPU)),
errors::Unimplemented("Depthwise max pooling is currently "
"only implemented for CPU devices."));
pad_depth = 0;
out_depth = depth / depth_window;
}
}
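// A minimal sketch (illustrative, not used by the code above) of the
// output-size and padding arithmetic that GetWindowedOutputSizeVerbose
// performs for one spatial dimension in the depth_window == 1 branch.
// The function name is hypothetical.
inline void WindowedOutputSizeSketch(int64 in_size, int64 window, int64 stride,
                                     Padding padding, int64* out_size,
                                     int64* pad_before, int64* pad_after) {
  if (padding == VALID) {
    *out_size = (in_size - window + stride) / stride;
    *pad_before = *pad_after = 0;
  } else {  // SAME
    *out_size = (in_size + stride - 1) / stride;
    int64 pad_total = (*out_size - 1) * stride + window - in_size;
    if (pad_total < 0) pad_total = 0;
    *pad_before = pad_total / 2;  // any odd remainder of padding goes after
    *pad_after = pad_total - *pad_before;
  }
}
// For example, in_size = 112, window = 3, stride = 2 with SAME padding gives
// out_size = 56, pad_before = 0, pad_after = 1.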
// Transfers the right parameters for pooling to the op parameters
// Updates context->status if there is an invalid input.
void ExtractMklOpParams(OpKernelContext* context, TensorFormat data_format,
const MklPoolParameters& params,
MklPoolingOpParams* mkl_params) {
mkl_params->in_sizes[0] = params.tensor_in_cols;
mkl_params->in_sizes[1] = params.tensor_in_rows;
mkl_params->in_sizes[2] = params.depth;
mkl_params->in_sizes[3] = params.tensor_in_batch;
GetStridesFromSizes(data_format, mkl_params->in_strides,
mkl_params->in_sizes);
mkl_params->out_sizes[0] = params.out_width;
mkl_params->out_sizes[1] = params.out_height;
mkl_params->out_sizes[2] = params.depth;
mkl_params->out_sizes[3] = params.tensor_in_batch;
GetStridesFromSizes(data_format, mkl_params->out_strides,
mkl_params->out_sizes);
mkl_params->in_offset[0] = -params.pad_left;
mkl_params->in_offset[1] = -params.pad_top;
mkl_params->in_offset[2] = -params.pad_right;
mkl_params->in_offset[3] = -params.pad_bottom;
mkl_params->kernel_stride[0] = params.col_stride;
mkl_params->kernel_stride[1] = params.row_stride;
mkl_params->kernel_size[0] = params.window_cols;
mkl_params->kernel_size[1] = params.window_rows;
}
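// Illustrative example (hypothetical values): for a 112x112 NHWC input with
// batch 8 and depth 64, a 3x3 window, stride 2 and SAME padding
// (pad_left = pad_top = 0, pad_right = pad_bottom = 1, 56x56 output), the
// mapping above fills
//   in_sizes    = {112, 112, 64, 8}   // {W, H, C, N}
//   out_sizes   = {56, 56, 64, 8}
//   in_offset   = {0, 0, -1, -1}      // {-pad_left, -pad_top, -pad_right, -pad_bottom}
//   kernel_size = {3, 3}, kernel_stride = {2, 2}
// Note that MKL orders the spatial entries width-first.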
} // namespace tensorflow
#endif // INTEL_MKL


@@ -0,0 +1,92 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_KERNELS_MKL_POOLING_OPS_COMMON_H_
#define TENSORFLOW_CORE_KERNELS_MKL_POOLING_OPS_COMMON_H_
#ifdef INTEL_MKL
#include <vector>
#include "tensorflow/core/util/mkl_util.h"
#include "tensorflow/core/util/padding.h"
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
struct MklPoolParameters {
int depth;
int tensor_in_cols;
int tensor_in_rows;
int tensor_in_batch;
int window_rows;
int window_cols;
int depth_window;
int row_stride;
int col_stride;
int depth_stride;
int64 out_height;
int64 out_width;
int out_depth;
int64 pad_left;
int64 pad_right;
int64 pad_top;
int64 pad_bottom;
int pad_depth;
TensorFormat data_format;
// Updates context->status if there is an invalid input.
void Init(OpKernelContext* context, const std::vector<int32>& ksize,
const std::vector<int32>& stride, Padding padding,
TensorFormat data_format, const TensorShape& tensor_in_shape);
void Init(OpKernelContext* context, const std::vector<int32>& ksize,
const std::vector<int32>& stride, Padding padding,
TensorFormat data_format, const MklShape* mkl_in_shape);
private:
// Common initialization for TensorFlow and MKL formats
void Init(OpKernelContext* context, const std::vector<int32>& ksize,
const std::vector<int32>& stride, Padding padding,
TensorFormat data_format);
};
//-------------------------------------------------------------------
// Utility functions
typedef struct {
size_t in_dim;
size_t in_sizes[4];
size_t in_strides[4];
size_t out_sizes[4];
size_t out_strides[4];
int in_offset[4];
size_t kernel_stride[2];
size_t kernel_size[2];
} MklPoolingOpParams;
// Transfers the right parameters for pooling to the op parameters
// Updates context->status if there is an invalid input.
void ExtractMklOpParams(OpKernelContext* context, TensorFormat data_format,
const MklPoolParameters& params,
MklPoolingOpParams* mkl_params);
} // namespace tensorflow
#endif // INTEL_MKL
#endif // TENSORFLOW_CORE_KERNELS_MKL_POOLING_OPS_COMMON_H_


@@ -0,0 +1,397 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// See docs in ../ops/nn_ops.cc.
#ifdef INTEL_MKL
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/errors.h"
#include "third_party/mkl/include/mkl_dnn.h"
#include "third_party/mkl/include/mkl_dnn_types.h"
#include "tensorflow/core/platform/default/logging.h"
#include "tensorflow/core/util/mkl_util.h"
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
struct MklReluHelpers {
static void ValidateSameSizeHelper(OpKernelContext* context, const Tensor& g,
const Tensor& a) {
OP_REQUIRES(context, a.IsSameSize(g),
errors::InvalidArgument("g and a must be the same size"));
}
static bool ValidateSameSize(OpKernelContext* context, const Tensor& g,
const Tensor& a) {
ValidateSameSizeHelper(context, g, a);
return context->status().ok();
}
};
template <typename Device, typename T>
class MklReluOp : public OpKernel {
public:
~MklReluOp() {}
explicit MklReluOp(OpKernelConstruction* context) : OpKernel(context) {}
void Compute(OpKernelContext* context) override {
MklReluOpContext mkl_context;
const Tensor& input = MklGetInput(context, 0);
GetMklShape(context, 0, &mkl_context.input_shape);
void* user_i = static_cast<void*>(const_cast<T*>(input.flat<T>().data()));
bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
if (!input_in_mkl_format && !input.dims()) { // handle the case of a scalar
const TensorShape& o_shape = input.shape();
Tensor* out_tensor = nullptr;
mkl_context.output_shape.SetMklTensor(false);
AllocateOutputSetMklshape(context, 0, &out_tensor, o_shape,
mkl_context.output_shape);
void* out_o = static_cast<void*>(out_tensor->flat<T>().data());
(static_cast<T*>(out_o))[0] =
std::max((static_cast<T*>(user_i))[0], static_cast<T>(0));
return;
}
// Generate size, stride for input if input is in MKL format.
if (input_in_mkl_format) {
mkl_context.in_dims = mkl_context.input_shape.GetDimension();
mkl_context.in_sizes = new size_t[mkl_context.in_dims];
mkl_context.in_strides = new size_t[mkl_context.in_dims];
for (int i = 0; i < mkl_context.in_dims; i++) {
mkl_context.in_sizes[i] = mkl_context.input_shape.GetSizes()[i];
mkl_context.in_strides[i] = mkl_context.input_shape.GetStrides()[i];
}
} else {
mkl_context.in_dims = input.dims();
mkl_context.in_sizes = new size_t[mkl_context.in_dims];
mkl_context.in_strides = new size_t[mkl_context.in_dims];
for (int i = 0; i < mkl_context.in_dims; i++) {
mkl_context.in_sizes[i] = input.dim_size((mkl_context.in_dims - 1) - i);
}
mkl_context.in_strides[0] = 1;
for (int i = 1; i < mkl_context.in_dims; i++) {
mkl_context.in_strides[i] =
mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
}
}
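    // Illustrative example (hypothetical shape): for a plain TF input of shape
    // {2, 3, 4, 5}, the loops above reverse the dimensions and build
    // contiguous strides, giving in_sizes = {5, 4, 3, 2} and
    // in_strides = {1, 5, 20, 60}; MKL sizes/strides run from the innermost
    // dimension outwards.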
float negative_slope = 0.0;
mkl_context.MklCreateInputLayouts(context);
CHECK_EQ(dnnReLUCreateForward_F32(&mkl_context.prim_relu_fwd, NULL,
mkl_context.lt_input, negative_slope),
E_SUCCESS);
Tensor* output = nullptr;
if (input_in_mkl_format) {
TensorShape tf_shape;
mkl_context.output_shape.SetMklTensor(true);
mkl_context.output_shape.SetMklLayout(mkl_context.prim_relu_fwd,
dnnResourceDst);
mkl_context.output_shape.SetTfLayout(
mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
mkl_context.output_shape.SetTfDimOrder(
mkl_context.in_dims, mkl_context.input_shape.GetTfToMklDimMap());
tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
mkl_context.output_shape.GetMklLayout())) /
sizeof(T));
AllocateOutputSetMklshape(context, 0, &output, tf_shape,
mkl_context.output_shape);
} else {
const TensorShape& o_shape = input.shape();
mkl_context.output_shape.SetMklTensor(false);
AllocateOutputSetMklshape(context, 0, &output, o_shape,
mkl_context.output_shape);
}
void* user_o = static_cast<void*>(const_cast<T*>(output->flat<T>().data()));
mkl_context.relu_res[dnnResourceDst] = user_o;
mkl_context.relu_res[dnnResourceSrc] = user_i;
CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_fwd, mkl_context.relu_res),
E_SUCCESS);
mkl_context.MklCleanup();
}
private:
typedef struct {
int in_dims;
size_t* in_sizes;
size_t* in_strides;
MklShape input_shape, output_shape;
dnnPrimitive_t prim_relu_fwd = nullptr;
void* relu_res[dnnResourceNumber];
dnnLayout_t lt_input = nullptr;
void MklCleanup() {
bool input_in_mkl_format = input_shape.IsMklTensor();
if (!input_in_mkl_format) {
dnnLayoutDelete_F32(lt_input);
free(in_sizes);
free(in_strides);
}
dnnDelete_F32(prim_relu_fwd);
}
void MklCreateInputLayouts(OpKernelContext* context) {
bool input_in_mkl_format = input_shape.IsMklTensor();
if (!input_in_mkl_format) {
CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
E_SUCCESS);
} else {
lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
}
}
} MklReluOpContext;
};
template <typename Device, typename T>
class MklReluGradOp : public OpKernel {
public:
~MklReluGradOp() {}
explicit MklReluGradOp(OpKernelConstruction* context) : OpKernel(context) {}
void Compute(OpKernelContext* context) override;
private:
typedef struct {
int in_dims;
size_t* in_sizes;
size_t* in_strides;
MklShape input_shape, grad_shape, output_shape;
void* relu_res[dnnResourceNumber];
dnnPrimitive_t prim_relu_bwd;
dnnLayout_t lt_input, lt_grad;
void MklPrepareReluGradInputs(OpKernelContext* context,
Tensor* mkl_tmp_grad_buf_tensor,
Tensor* mkl_tmp_input_buf_tensor) {
dnnPrimitive_t cv_user_to_reluB_input, cv_user_to_reluB_grad;
dnnLayout_t mkl_lt_internal_input, mkl_lt_internal_grad;
const Tensor& g = MklGetInput(context, 0);
const Tensor& a = MklGetInput(context, 1);
void* user_i = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
void* user_g = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
&mkl_lt_internal_grad, prim_relu_bwd, dnnResourceDiffDst),
E_SUCCESS);
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_input,
prim_relu_bwd, dnnResourceSrc),
E_SUCCESS);
if (!dnnLayoutCompare_F32(mkl_lt_internal_grad, lt_grad)) {
AllocTmpBuffer(context, mkl_tmp_grad_buf_tensor, mkl_lt_internal_grad,
&relu_res[dnnResourceDiffDst]);
CHECK_EQ(dnnConversionCreate_F32(&cv_user_to_reluB_grad, lt_grad,
mkl_lt_internal_grad),
E_SUCCESS);
CHECK_EQ(dnnConversionExecute_F32(cv_user_to_reluB_grad, user_g,
relu_res[dnnResourceDiffDst]),
E_SUCCESS);
dnnDelete_F32(cv_user_to_reluB_grad);
} else {
relu_res[dnnResourceDiffDst] = user_g;
}
if (!dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input)) {
AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
&relu_res[dnnResourceSrc]);
CHECK_EQ(dnnConversionCreate_F32(&cv_user_to_reluB_input, lt_input,
mkl_lt_internal_input),
E_SUCCESS);
CHECK_EQ(dnnConversionExecute_F32(cv_user_to_reluB_input, user_i,
relu_res[dnnResourceSrc]),
E_SUCCESS);
dnnDelete_F32(cv_user_to_reluB_input);
} else {
relu_res[dnnResourceSrc] = user_i;
}
dnnLayoutDelete_F32(mkl_lt_internal_input);
dnnLayoutDelete_F32(mkl_lt_internal_grad);
}
void MklCreateInputLayouts(OpKernelContext* context) {
bool grad_is_mkl = grad_shape.IsMklTensor();
bool input_is_mkl = input_shape.IsMklTensor();
if (!input_is_mkl) {
CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
E_SUCCESS);
} else {
lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
}
if (!grad_is_mkl) {
CHECK_EQ(dnnLayoutCreate_F32(&lt_grad, in_dims, in_sizes, in_strides),
E_SUCCESS);
} else {
lt_grad = static_cast<dnnLayout_t>(grad_shape.GetCurLayout());
}
}
void MklCleanup() {
bool grad_is_mkl = grad_shape.IsMklTensor();
bool input_is_mkl = input_shape.IsMklTensor();
dnnDelete_F32(prim_relu_bwd);
if (!input_is_mkl) {
dnnLayoutDelete_F32(lt_input);
free(in_sizes);
free(in_strides);
}
if (!grad_is_mkl) {
dnnLayoutDelete_F32(lt_grad);
}
}
} MklReluGradOpContext;
};
template <typename Device, typename T>
void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
MklReluGradOpContext mkl_context;
const Tensor& g = MklGetInput(context, 0);
const Tensor& a = MklGetInput(context, 1);
void* user_i = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
void* user_g = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
GetMklShape(context, 0, &mkl_context.grad_shape);
GetMklShape(context, 1, &mkl_context.input_shape);
bool grad_is_mkl = mkl_context.grad_shape.IsMklTensor();
bool input_is_mkl = mkl_context.input_shape.IsMklTensor();
if (!input_is_mkl && !grad_is_mkl &&
!MklReluHelpers::ValidateSameSize(context, g, a))
return;
Tensor* output = nullptr;
if (!input_is_mkl && !grad_is_mkl &&
!a.dims()) { // handle the case of a scalar
    // Allocate the output with the same shape as g and handle the scalar case.
const TensorShape& g_shape = g.shape();
mkl_context.output_shape.SetMklTensor(false);
AllocateOutputSetMklshape(context, 0, &output, g_shape,
mkl_context.output_shape);
void* out_o = static_cast<void*>(output->flat<T>().data());
(static_cast<T*>(out_o))[0] =
(static_cast<T*>(user_g))[0] * ((static_cast<T*>(user_i))[0] > 0);
return;
}
// Generate size, stride for input if input/grad is in MKL format.
if (grad_is_mkl || input_is_mkl) {
const MklShape* tmp_mkl_shape =
(grad_is_mkl) ? &mkl_context.grad_shape : &mkl_context.input_shape;
mkl_context.in_dims = tmp_mkl_shape->GetDimension();
mkl_context.in_strides = new size_t[mkl_context.in_dims];
mkl_context.in_sizes = new size_t[mkl_context.in_dims];
for (int i = 0; i < mkl_context.in_dims; i++) {
mkl_context.in_sizes[i] = tmp_mkl_shape->GetSizes()[i];
mkl_context.in_strides[i] = tmp_mkl_shape->GetStrides()[i];
}
} else {
mkl_context.in_dims = g.dims();
mkl_context.in_strides = new size_t[mkl_context.in_dims];
mkl_context.in_sizes = new size_t[mkl_context.in_dims];
for (int i = 0; i < mkl_context.in_dims; i++) {
mkl_context.in_sizes[i] = g.dim_size((mkl_context.in_dims - 1) - i);
}
mkl_context.in_strides[0] = 1;
for (int i = 1; i < mkl_context.in_dims; i++) {
mkl_context.in_strides[i] =
mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
}
}
mkl_context.MklCreateInputLayouts(context);
float negative_slope = 0.0;
CHECK_EQ(dnnReLUCreateBackward_F32(&mkl_context.prim_relu_bwd, NULL,
mkl_context.lt_grad, mkl_context.lt_input,
negative_slope),
E_SUCCESS);
Tensor mkl_tmp_grad_buf_tensor, mkl_tmp_input_buf_tensor;
mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_grad_buf_tensor,
&mkl_tmp_input_buf_tensor);
if (input_is_mkl ||
grad_is_mkl) { /*if grad or input are MKL leave it in MKL*/
TensorShape tf_shape;
mkl_context.output_shape.SetMklTensor(true);
mkl_context.output_shape.SetMklLayout(mkl_context.prim_relu_bwd,
dnnResourceDiffSrc);
mkl_context.output_shape.SetTfLayout(
mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
// If input_is_mkl or grad_is_mkl, then we copy strides and sizes from Mkl
// shape of one that is in MKL layout.
if (grad_is_mkl == true) {
mkl_context.output_shape.SetTfDimOrder(
mkl_context.in_dims, mkl_context.grad_shape.GetTfToMklDimMap());
} else {
mkl_context.output_shape.SetTfDimOrder(
mkl_context.in_dims, mkl_context.input_shape.GetTfToMklDimMap());
}
tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
mkl_context.output_shape.GetMklLayout())) /
sizeof(T));
AllocateOutputSetMklshape(context, 0, &output, tf_shape,
mkl_context.output_shape);
} else {
const TensorShape& o_shape = g.shape();
mkl_context.output_shape.SetMklTensor(false);
AllocateOutputSetMklshape(context, 0, &output, o_shape,
mkl_context.output_shape);
}
mkl_context.relu_res[dnnResourceDiffSrc] =
static_cast<void*>(output->flat<T>().data());
CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, mkl_context.relu_res),
E_SUCCESS);
mkl_context.MklCleanup();
}
/* Register DNN kernels for supported operations and supported types - right now
* it is only Relu and f32*/
#define REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES(type) \
REGISTER_KERNEL_BUILDER(Name("MklRelu") \
.Device(DEVICE_CPU) \
.TypeConstraint<type>("T") \
.Label(mkl_layer_registry::kMklLayerLabel), \
MklReluOp<CPUDevice, type>); \
REGISTER_KERNEL_BUILDER(Name("MklReluGrad") \
.Device(DEVICE_CPU) \
.TypeConstraint<type>("T") \
.Label(mkl_layer_registry::kMklLayerLabel), \
MklReluGradOp<CPUDevice, type>);
TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES);
} // namespace tensorflow
#endif // INTEL_MKL


@@ -67,28 +67,10 @@ class MklToTfOp : public OpKernel {
CHECK_EQ(op_data_type, input_data_type);
CHECK_EQ(op_data_type, output_data_type);
// We need to recreate Tf tensor shape based on sizes and strides.
// Ideally, we should know what the data_format is, but that attribute
  // to this op is not reliable. So below, we rely on sorting logic where
// we sort strides first and then sizes.
TensorShape output_shape;
std::vector<std::pair<int, int>> shape_size;
for (size_t i = 0; i < input_shape.GetDimension(); i++) {
VLOG(1) << "Size: " << input_shape.GetSizes()[i]
<< ", Strides: " << input_shape.GetStrides()[i];
shape_size.push_back(std::make_pair(input_shape.GetSizes()[i],
input_shape.GetStrides()[i]));
}
std::sort(shape_size.begin(), shape_size.end(),
[](std::pair<int, int> a, std::pair<int, int> b) {
return (a.second > b.second) ||
(a.second == b.second && a.first > b.first);
});
for (std::pair<int, int> s_s : shape_size) {
VLOG(1) << "Added dimension: " << s_s.first;
output_shape.AddDim(s_s.first);
// Outermost to innermost dimension
output_shape.AddDim(input_shape.GetSizes()[input_shape.tf_dim_idx(i)]);
}
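// Note: the rewritten loop recovers the TF shape directly from the
// dimension-order map recorded in the MklShape (tf_dim_idx) rather than
// inferring the order by sorting strides as the removed code did.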
// Allocate output tensor.


@@ -14,12 +14,15 @@ limitations under the License.
==============================================================================*/
#define EIGEN_USE_THREADS
#include "tensorflow/core/kernels/pooling_ops_3d.h"
#include <array>
#include "third_party/eigen3/Eigen/Core"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
@ -28,15 +31,64 @@ limitations under the License.
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/work_sharder.h"
#if GOOGLE_CUDA
#include "tensorflow/core/kernels/cudnn_pooling_gpu.h"
#include "tensorflow/core/kernels/pooling_ops_3d_gpu.h"
#endif
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
Pool3dParameters::Pool3dParameters(OpKernelContext* context,
const std::vector<int32>& ksize,
const std::vector<int32>& stride,
Padding padding, TensorFormat data_format,
const TensorShape& tensor_in_shape) {
  // For 3D pooling, tensor_in should have 5 dimensions.
  OP_REQUIRES(context, tensor_in_shape.dims() == 5,
              errors::InvalidArgument("tensor_in must be 5-dimensional"));
this->data_format = data_format;
depth = GetTensorDim(tensor_in_shape, data_format, 'C');
tensor_in_planes = GetTensorDim(tensor_in_shape, data_format, '0');
tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, '1');
tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, '2');
tensor_in_batch = GetTensorDim(tensor_in_shape, data_format, 'N');
window_planes = GetTensorDim(ksize, data_format, '0');
window_rows = GetTensorDim(ksize, data_format, '1');
window_cols = GetTensorDim(ksize, data_format, '2');
depth_window = GetTensorDim(ksize, data_format, 'C');
plane_stride = GetTensorDim(stride, data_format, '0');
row_stride = GetTensorDim(stride, data_format, '1');
col_stride = GetTensorDim(stride, data_format, '2');
depth_stride = GetTensorDim(stride, data_format, 'C');
// We only support 3D pooling across plane/width/height. Depthwise
// pooling is not supported.
OP_REQUIRES(
context, depth_window == 1 && depth_stride == 1,
errors::Unimplemented(
"Pooling3d only supports pooling across plane/width/height."));
OP_REQUIRES_OK(context, GetWindowedOutputSize(tensor_in_planes, window_planes,
plane_stride, padding,
&out_plane, &pad_planes));
OP_REQUIRES_OK(context,
GetWindowedOutputSize(tensor_in_rows, window_rows, row_stride,
padding, &out_height, &pad_rows));
OP_REQUIRES_OK(context,
GetWindowedOutputSize(tensor_in_cols, window_cols, col_stride,
padding, &out_width, &pad_cols));
}
TensorShape Pool3dParameters::forward_output_shape() {
return ShapeFromFormat(data_format, tensor_in_batch,
{{out_plane, out_height, out_width}}, depth);
}
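// A standalone sketch of the per-dimension arithmetic behind the
// GetWindowedOutputSize calls above, using the usual VALID/SAME conventions.
// Illustrative only; it does not call into TensorFlow and the names are local
// to the sketch.
#include <algorithm>
#include <cstdint>

enum class Pad { kValid, kSame };

inline void WindowedOutputSize(int64_t in, int64_t window, int64_t stride,
                               Pad padding, int64_t* out, int64_t* pad_before) {
  if (padding == Pad::kValid) {
    *out = (in - window + stride) / stride;  // == ceil((in - window + 1) / stride)
    *pad_before = 0;
  } else {  // SAME
    *out = (in + stride - 1) / stride;       // == ceil(in / stride)
    const int64_t pad_total =
        std::max<int64_t>((*out - 1) * stride + window - in, 0);
    *pad_before = pad_total / 2;             // the remainder is padded after
  }
}
// e.g. in=10, window=3, stride=2: VALID -> out=4; SAME -> out=5, pad_before=0.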
enum PoolingType { MAX, AVG };
template <typename Device, typename T, PoolingType Type>
@ -147,12 +199,6 @@ class Pooling3DOp : public UnaryOp<T> {
Padding padding_;
TensorFormat data_format_;
};
REGISTER_KERNEL_BUILDER(
Name("AvgPool3D").Device(DEVICE_CPU).TypeConstraint<float>("T"),
Pooling3DOp<CPUDevice, float, AVG>);
REGISTER_KERNEL_BUILDER(
Name("MaxPool3D").Device(DEVICE_CPU).TypeConstraint<float>("T"),
Pooling3DOp<CPUDevice, float, MAX>);
template <typename Device, typename T>
struct LaunchMaxPooling3dGradOp;
@ -331,10 +377,6 @@ class MaxPooling3dGradOp : public OpKernel {
TensorFormat data_format_;
};
REGISTER_KERNEL_BUILDER(
Name("MaxPool3DGrad").Device(DEVICE_CPU).TypeConstraint<float>("T"),
MaxPooling3dGradOp<CPUDevice, float>);
template <typename Device, typename T>
struct LaunchAvgPooling3dGradOp;
@ -499,11 +541,208 @@ class AvgPooling3dGradOp : public OpKernel {
TensorFormat data_format_;
};
REGISTER_KERNEL_BUILDER(Name("AvgPool3DGrad")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T")
.HostMemory("orig_input_shape"),
AvgPooling3dGradOp<CPUDevice, float>);
template <typename Device, typename T>
struct LaunchMaxPooling3dGradGradOp;
template <typename T>
struct LaunchMaxPooling3dGradGradOp<CPUDevice, T> {
static void launch(OpKernelContext* context, const Pool3dParameters& params,
const Tensor& tensor_in, const Tensor& tensor_out,
const Tensor& tensor_top_diff,
Tensor* tensor_bottom_diff) {
OP_REQUIRES(
context, params.data_format == FORMAT_NHWC,
errors::InvalidArgument("Default MaxPooling3dGradGradOp only supports",
"NDHWC on CPU device type"));
typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
ConstEigenMatrixMap;
typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
EigenMatrixMap;
ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
params.tensor_in_planes * params.tensor_in_cols *
params.tensor_in_rows *
params.tensor_in_batch);
ConstEigenMatrixMap out_mat(tensor_out.flat<T>().data(), params.depth,
params.out_plane * params.out_width *
params.out_height * params.tensor_in_batch);
ConstEigenMatrixMap top_diff_mat(
tensor_top_diff.flat<T>().data(), params.depth,
params.tensor_in_planes * params.tensor_in_cols *
params.tensor_in_rows * params.tensor_in_batch);
EigenMatrixMap bottom_diff_mat(
tensor_bottom_diff->flat<T>().data(), params.depth,
params.out_plane * params.out_width * params.out_height *
params.tensor_in_batch);
const DeviceBase::CpuWorkerThreads& worker_threads =
*(context->device()->tensorflow_cpu_worker_threads());
auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
int64 start, int64 limit) {
const int32 depth = params.depth;
const int32 in_planes = params.tensor_in_planes;
const int32 in_rows = params.tensor_in_rows;
const int32 in_cols = params.tensor_in_cols;
const int32 pad_planes = params.pad_planes;
const int32 pad_rows = params.pad_rows;
const int32 pad_cols = params.pad_cols;
const int32 window_planes = params.window_planes;
const int32 window_rows = params.window_rows;
const int32 window_cols = params.window_cols;
const int32 plane_stride = params.plane_stride;
const int32 row_stride = params.row_stride;
const int32 col_stride = params.col_stride;
const int32 out_plane = params.out_plane;
const int32 out_height = params.out_height;
const int32 out_width = params.out_width;
{
// Initializes the output grad backprop tensor with 0.
const int32 output_image_size =
out_plane * out_height * out_width * params.depth;
EigenMatrixMap bottom_diff_shard(
bottom_diff_mat.data() + start * output_image_size, 1,
(limit - start) * output_image_size);
bottom_diff_shard.setZero();
}
for (int b = start; b < limit; ++b) {
for (int pp = 0; pp < out_plane; ++pp) {
for (int ph = 0; ph < out_height; ++ph) {
for (int pw = 0; pw < out_width; ++pw) {
// (p_start, p_end) * (h_start, h_end) * (w_start, w_end) is the
// range that the input vector projects to.
int p_start = pp * plane_stride - pad_planes;
const int p_end = std::min(p_start + window_planes, in_planes);
int h_start = ph * row_stride - pad_rows;
const int h_end = std::min(h_start + window_rows, in_rows);
int w_start = pw * col_stride - pad_cols;
const int w_end = std::min(w_start + window_cols, in_cols);
p_start = std::max(p_start, 0);
h_start = std::max(h_start, 0);
w_start = std::max(w_start, 0);
const int out_index =
((b * out_plane + pp) * out_height + ph) * out_width + pw;
// Find value corresponding to the input maximum in top_diff.
for (int d = 0; d < depth; ++d) {
const T& output_ref = out_mat.coeffRef(d, out_index);
bool should_stop = false;
for (int p = p_start; p < p_end && !should_stop; ++p) {
for (int h = h_start; h < h_end && !should_stop; ++h) {
for (int w = w_start; w < w_end && !should_stop; ++w) {
const int in_index =
((b * in_planes + p) * in_rows + h) * in_cols + w;
const T& input_ref = in_mat.coeffRef(d, in_index);
if (output_ref == input_ref) {
T& bottom_diff_ref =
bottom_diff_mat.coeffRef(d, out_index);
bottom_diff_ref = top_diff_mat.coeffRef(d, in_index);
should_stop = true;
}
}
}
}
}
}
}
}
}
};
const int64 shard_cost =
params.out_plane * params.out_height * params.out_width * params.depth *
params.window_planes * params.window_rows * params.window_cols;
Shard(worker_threads.num_threads, worker_threads.workers,
params.tensor_in_batch, shard_cost, shard);
}
};
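// A 1-D standalone sketch of the inner loops above: for each pooled output
// position, find the first input element equal to the forward max and copy the
// corresponding top_diff value into that output's bottom_diff slot. All names
// are local to the sketch; the real kernel also handles padding and depth.
#include <algorithm>
#include <vector>

inline std::vector<float> MaxPoolGradGrad1D(const std::vector<float>& in,
                                            const std::vector<float>& out,
                                            const std::vector<float>& top_diff,
                                            int window, int stride) {
  std::vector<float> bottom_diff(out.size(), 0.f);
  for (size_t o = 0; o < out.size(); ++o) {
    const int start = static_cast<int>(o) * stride;
    const int end = std::min<int>(start + window, static_cast<int>(in.size()));
    for (int i = start; i < end; ++i) {
      if (in[i] == out[o]) {           // first element matching the forward max
        bottom_diff[o] = top_diff[i];  // propagate its incoming gradient
        break;                         // plays the role of should_stop
      }
    }
  }
  return bottom_diff;
}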
template <class Device, class T>
class MaxPooling3dGradGradOp : public OpKernel {
public:
explicit MaxPooling3dGradGradOp(OpKernelConstruction* context)
: OpKernel(context) {
string data_format;
OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
errors::InvalidArgument("Invalid data format"));
OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
OP_REQUIRES(context, ksize_.size() == 5,
errors::InvalidArgument("Sliding window ksize field must "
"specify 5 dimensions"));
OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
OP_REQUIRES(context, stride_.size() == 5,
errors::InvalidArgument("Sliding window strides field must "
"specify 5 dimensions"));
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
errors::Unimplemented(
"Pooling is not yet supported on the batch dimension."));
const int32 ksize_c = GetTensorDim(ksize_, data_format_, 'C');
const int32 stride_c = GetTensorDim(stride_, data_format_, 'C');
OP_REQUIRES(context, ksize_c == 1 && stride_c == 1,
errors::Unimplemented("MaxPooling3dGradGrad is not yet "
"supported on the depth dimension."));
}
void Compute(OpKernelContext* context) override {
const Tensor& tensor_in = context->input(0);
const Tensor& tensor_out = context->input(1);
const Tensor& out_grad_backprop = context->input(2);
// For maxpooling3d, tensor_in should have 5 dimensions.
OP_REQUIRES(context, tensor_in.dims() == 5,
errors::InvalidArgument("tensor_in must be 5-dimensional"));
OP_REQUIRES(context, tensor_out.dims() == 5,
errors::InvalidArgument("tensor_out must be 5-dimensional"));
// For maxpooling3d, out_grad_backprop should have 5 dimensions.
OP_REQUIRES(
context, out_grad_backprop.dims() == 5,
errors::InvalidArgument("out_grad_backprop must be 5-dimensional"));
Pool3dParameters params{context, ksize_, stride_,
padding_, data_format_, tensor_in.shape()};
Tensor* output = nullptr;
OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
{2}, 0, tensor_out.shape(), &output));
LaunchMaxPooling3dGradGradOp<Device, T>::launch(
context, params, tensor_in, tensor_out, out_grad_backprop, output);
}
private:
std::vector<int32> ksize_;
std::vector<int32> stride_;
Padding padding_;
TensorFormat data_format_;
};
#define REGISTER_KERNELS(D, T) \
REGISTER_KERNEL_BUILDER( \
Name("MaxPool3D").Device(DEVICE_##D).TypeConstraint<T>("T"), \
Pooling3DOp<D##Device, T, MAX>); \
REGISTER_KERNEL_BUILDER(Name("MaxPool3DGrad") \
.Device(DEVICE_##D) \
.TypeConstraint<T>("T") \
.TypeConstraint<T>("TInput"), \
MaxPooling3dGradOp<D##Device, T>); \
REGISTER_KERNEL_BUILDER( \
Name("MaxPool3DGradGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
MaxPooling3dGradGradOp<D##Device, T>); \
REGISTER_KERNEL_BUILDER( \
Name("AvgPool3D").Device(DEVICE_##D).TypeConstraint<T>("T"), \
Pooling3DOp<D##Device, T, AVG>); \
REGISTER_KERNEL_BUILDER(Name("AvgPool3DGrad") \
.Device(DEVICE_##D) \
.TypeConstraint<T>("T") \
.HostMemory("orig_input_shape"), \
AvgPooling3dGradOp<D##Device, T>);
#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T)
TF_CALL_float(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
#if GOOGLE_CUDA
@ -535,13 +774,6 @@ struct LaunchPoolingOp<GPUDevice, T, MAX> {
}
};
REGISTER_KERNEL_BUILDER(
Name("AvgPool3D").Device(DEVICE_GPU).TypeConstraint<float>("T"),
Pooling3DOp<GPUDevice, float, AVG>);
REGISTER_KERNEL_BUILDER(
Name("MaxPool3D").Device(DEVICE_GPU).TypeConstraint<float>("T"),
Pooling3DOp<GPUDevice, float, MAX>);
template <typename T>
struct LaunchMaxPooling3dGradOp<GPUDevice, T> {
static void launch(OpKernelContext* context, const Tensor& tensor_in,
@ -559,10 +791,6 @@ struct LaunchMaxPooling3dGradOp<GPUDevice, T> {
}
};
REGISTER_KERNEL_BUILDER(
Name("MaxPool3DGrad").Device(DEVICE_GPU).TypeConstraint<float>("T"),
MaxPooling3dGradOp<GPUDevice, float>);
template <typename T>
struct LaunchAvgPooling3dGradOp<GPUDevice, T> {
static void launch(OpKernelContext* context,
@ -579,12 +807,36 @@ struct LaunchAvgPooling3dGradOp<GPUDevice, T> {
nullptr, nullptr, output);
}
};
REGISTER_KERNEL_BUILDER(Name("AvgPool3DGrad")
.Device(DEVICE_GPU)
.TypeConstraint<float>("T")
.HostMemory("orig_input_shape"),
AvgPooling3dGradOp<GPUDevice, float>);
template <typename T>
struct LaunchMaxPooling3dGradGradOp<GPUDevice, T> {
static void launch(OpKernelContext* context, const Pool3dParameters& params,
const Tensor& tensor_in, const Tensor& tensor_out,
const Tensor& tensor_top_diff,
Tensor* tensor_bottom_diff) {
bool status = functor::MaxPool3dGradBackward<T>()(
params.data_format, tensor_in.flat<T>().data(),
tensor_out.flat<T>().data(), params.tensor_in_batch, params.out_plane,
params.out_height, params.out_width, params.depth,
params.tensor_in_planes, params.tensor_in_rows, params.tensor_in_cols,
params.window_planes, params.window_rows, params.window_cols,
params.plane_stride, params.row_stride, params.col_stride,
params.pad_planes, params.pad_rows, params.pad_cols,
tensor_top_diff.flat<T>().data(), tensor_bottom_diff->flat<T>().data(),
context->eigen_gpu_device());
if (!status) {
context->SetStatus(
errors::Internal("Failed launching MaxPool3dGradBackward"));
}
}
};
#define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T)
TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS)
#undef REGISTER_GPU_KERNELS
#endif // GOOGLE_CUDA
#undef REGISTER_KERNELS
} // namespace tensorflow

View File

@ -0,0 +1,66 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_KERNELS_POOLING_OPS_3D_H_
#define TENSORFLOW_KERNELS_POOLING_OPS_3D_H_
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
namespace tensorflow {
// A helper class to manage sizes and shapes for 3d pooling operations.
struct Pool3dParameters {
// Updates context->status if there is an invalid input.
Pool3dParameters(OpKernelContext* context, const std::vector<int32>& ksize,
const std::vector<int32>& stride, Padding padding,
TensorFormat data_format,
const TensorShape& tensor_in_shape);
// Returns the shape of the output for "forward" pooling operations.
TensorShape forward_output_shape();
int depth;
int tensor_in_planes;
int tensor_in_cols;
int tensor_in_rows;
int tensor_in_batch;
int window_planes;
int window_cols;
int window_rows;
int depth_window;
int plane_stride;
int col_stride;
int row_stride;
int depth_stride;
int64 out_plane;
int64 out_height;
int64 out_width;
int64 pad_planes;
int64 pad_cols;
int64 pad_rows;
TensorFormat data_format;
};
} // namespace tensorflow
#endif // TENSORFLOW_KERNELS_POOLING_OPS_3D_H_
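// Illustrative only: how the fields declared above map onto a 5-D shape for
// the two 3-D layouts. Format5D and Split5D are simplified stand-ins for
// TensorFlow's TensorFormat/GetTensorDim, not the real API.
#include <array>
#include <cstdint>

enum class Format5D { NDHWC, NCDHW };

struct Dims5D { int64_t n, d, h, w, c; };

inline Dims5D Split5D(const std::array<int64_t, 5>& shape, Format5D f) {
  if (f == Format5D::NDHWC)
    return {shape[0], shape[1], shape[2], shape[3], shape[4]};
  return {shape[0], shape[2], shape[3], shape[4], shape[1]};  // NCDHW
}
// For NDHWC input, tensor_in_planes/rows/cols correspond to d/h/w and depth
// to c; tensor_in_batch is always n.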

View File

@ -0,0 +1,172 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/pooling_ops_3d_gpu.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
#include "tensorflow/core/util/tensor_format.h"
namespace tensorflow {
namespace {
template <typename dtype>
__global__ void MaxPoolGradBackwardNoMaskNCDHW(
const int nthreads, const dtype* bottom_data, const dtype* output_data,
const int pooled_plane, const int pooled_height, const int pooled_width,
const int channels, const int plane, const int height, const int width,
const int kernel_p, const int kernel_h, const int kernel_w,
const int stride_p, const int stride_h, const int stride_w, const int pad_p,
const int pad_t, const int pad_l, const dtype* top_diff,
dtype* bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// First find out the index to the maximum, since we have no mask.
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int pp = (index / pooled_width / pooled_height) % pooled_plane;
int c = (index / pooled_width / pooled_height / pooled_plane) % channels;
int n = (index / pooled_width / pooled_height / pooled_plane / channels);
int pstart = pp * stride_p - pad_p;
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
const int pend = min(pstart + kernel_p, plane);
const int hend = min(hstart + kernel_h, height);
const int wend = min(wstart + kernel_w, width);
pstart = max(pstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
bool should_stop = false;
int maxidx = -1;
const dtype* bottom_data_n =
bottom_data + n * channels * plane * height * width;
// Propagate only first value from top_diff corresponding to the maximum.
for (int p = pstart; p < pend && !should_stop; ++p) {
for (int h = hstart; h < hend && !should_stop; ++h) {
for (int w = wstart; w < wend && !should_stop; ++w) {
int idx = c * plane * height * width + (p * height + h) * width + w;
if (output_data[index] == bottom_data_n[idx]) {
maxidx = idx;
should_stop = true;
}
}
}
}
// Set the bottom diff (atomic is not necessary). The index could still be
// uninitialized if all of the bottom_data are NaN.
if (maxidx != -1) {
bottom_diff[index] =
top_diff[n * channels * plane * height * width + maxidx];
}
}
}
template <typename dtype>
__global__ void MaxPoolGradBackwardNoMaskNDHWC(
const int nthreads, const dtype* bottom_data, const dtype* output_data,
const int pooled_plane, const int pooled_height, const int pooled_width,
const int channels, const int plane, const int height, const int width,
const int kernel_p, const int kernel_h, const int kernel_w,
const int stride_p, const int stride_h, const int stride_w, const int pad_p,
const int pad_t, const int pad_l, const dtype* top_diff,
dtype* bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// First find out the index to the maximum, since we have no mask.
int n = index;
int c = n % channels;
n /= channels;
int wstart = (n % pooled_width) * stride_w - pad_l;
int wend = min(wstart + kernel_w, width);
wstart = max(wstart, 0);
n /= pooled_width;
int hstart = (n % pooled_height) * stride_h - pad_t;
int hend = min(hstart + kernel_h, height);
hstart = max(hstart, 0);
n /= pooled_height;
int pstart = (n % pooled_plane) * stride_p - pad_p;
int pend = min(pstart + kernel_p, plane);
pstart = max(pstart, 0);
n /= pooled_plane;
bool should_stop = false;
int maxidx = -1;
const dtype* bottom_data_n =
bottom_data + n * plane * height * width * channels;
// Propagate only first value from top_diff corresponding to the maximum.
for (int p = pstart; p < pend && !should_stop; ++p) {
for (int h = hstart; h < hend && !should_stop; ++h) {
for (int w = wstart; w < wend && !should_stop; ++w) {
int idx = ((p * height + h) * width + w) * channels + c;
if (output_data[index] == bottom_data_n[idx]) {
maxidx = idx;
should_stop = true;
}
}
}
}
// Set the bottom diff (atomic is not necessary). The index could still be
// uninitialized if all of the bottom_data are NaN.
if (maxidx != -1) {
bottom_diff[index] =
top_diff[n * plane * height * width * channels + maxidx];
}
}
}
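// A host-side standalone sketch of the index decomposition at the top of the
// NDHWC kernel above: a flat offset into an output laid out as
// [N, Dout, Hout, Wout, C] is peeled apart innermost dimension first.
// Names are local to the sketch.
#include <cstdint>

struct NdhwcIndex { int64_t n, p, h, w, c; };

inline NdhwcIndex DecodeNdhwc(int64_t index, int64_t pooled_plane,
                              int64_t pooled_height, int64_t pooled_width,
                              int64_t channels) {
  NdhwcIndex out;
  out.c = index % channels;      index /= channels;
  out.w = index % pooled_width;  index /= pooled_width;
  out.h = index % pooled_height; index /= pooled_height;
  out.p = index % pooled_plane;  index /= pooled_plane;
  out.n = index;                 // whatever remains is the batch index
  return out;
}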
} // namespace
namespace functor {
template <typename T>
bool MaxPool3dGradBackward<T>::operator()(
TensorFormat data_format, const T* bottom_data, const T* output_data,
const int batch, const int pooled_plane, const int pooled_height,
const int pooled_width, const int channels, const int plane,
const int height, const int width, const int kernel_p, const int kernel_h,
const int kernel_w, const int stride_p, const int stride_h,
const int stride_w, const int pad_p, const int pad_t, const int pad_l,
const T* top_diff, T* bottom_diff, const Eigen::GpuDevice& d) {
int num_kernels =
batch * channels * pooled_plane * pooled_height * pooled_width;
CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d);
if (data_format == FORMAT_NHWC) {
MaxPoolGradBackwardNoMaskNDHWC<<<config.block_count,
config.thread_per_block, 0, d.stream()>>>(
num_kernels, bottom_data, output_data, pooled_plane, pooled_height,
pooled_width, channels, plane, height, width, kernel_p, kernel_h,
kernel_w, stride_p, stride_h, stride_w, pad_p, pad_t, pad_l, top_diff,
bottom_diff);
} else {
MaxPoolGradBackwardNoMaskNCDHW<<<config.block_count,
config.thread_per_block, 0, d.stream()>>>(
num_kernels, bottom_data, output_data, pooled_plane, pooled_height,
pooled_width, channels, plane, height, width, kernel_p, kernel_h,
kernel_w, stride_p, stride_h, stride_w, pad_p, pad_t, pad_l, top_diff,
bottom_diff);
}
return d.ok();
}
} // namespace functor
#define DEFINE_GPU_SPECS(T) template struct functor::MaxPool3dGradBackward<T>;
TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
#undef DEFINE_GPU_SPECS
} // namespace tensorflow
#endif // GOOGLE_CUDA

View File

@ -0,0 +1,48 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if !GOOGLE_CUDA
#error This file must only be included when building with Cuda support
#endif
#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_GPU_H_
#define TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_GPU_H_
#define EIGEN_USE_GPU
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/tensor_format.h"
namespace tensorflow {
namespace functor {
template <typename T>
struct MaxPool3dGradBackward {
bool operator()(TensorFormat data_format, const T* bottom_data,
const T* output_data, const int batch, const int pooled_plane,
const int pooled_height, const int pooled_width,
const int channels, const int plane, const int height,
const int width, const int kernel_p, const int kernel_h,
const int kernel_w, const int stride_p, const int stride_h,
const int stride_w, const int pad_p, const int pad_t,
const int pad_l, const T* top_diff, T* bottom_diff,
const Eigen::GpuDevice& d);
};
} // namespace functor
} // namespace tensorflow
#endif  // TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_GPU_H_

View File

@ -17,6 +17,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#if GOOGLE_CUDA
@ -127,8 +128,7 @@ namespace functor {
typename TTypes<T, 4>::Tensor out); \
extern template struct TransformDepth<GPUDevice, T, Eigen::DenseIndex>;
DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(Eigen::half);
TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC)
#undef DECLARE_GPU_SPEC
} // namespace functor
@ -373,10 +373,11 @@ void DnnPoolingGradOp<T>::Compute(
}
}
template class DnnPoolingOp<Eigen::half>;
template class DnnPoolingOp<float>;
template class DnnPoolingGradOp<Eigen::half>;
template class DnnPoolingGradOp<float>;
#define DEFINE_DNN_OPS(T) \
template class DnnPoolingOp<T>; \
template class DnnPoolingGradOp<T>;
TF_CALL_GPU_NUMBER_TYPES(DEFINE_DNN_OPS)
#undef DEFINE_DNN_OPS
#endif // GOOGLE_CUDA

View File

@ -303,10 +303,7 @@ class RandomGammaOp : public OpKernel {
&samples_shape));
}
const int64 num_samples = samples_shape.num_elements();
OP_REQUIRES(ctx, num_samples > 0,
errors::InvalidArgument(
"Input shape should have non-zero element count, got: ",
num_samples));
if (num_samples == 0) return;
samples_shape.AppendShape(alpha_t.shape());
// Allocate output samples.

View File

@ -837,15 +837,6 @@ class SparseMatMul {
};
#ifdef TENSORFLOW_USE_LIBXSMM
#ifdef EXTRA_CACHE_LOGGING
static tensorflow::mutex global_cache_stats_lock;
static int total_num_entries_outstanding GUARDED_BY(global_cache_stats_lock) =
0;
static int total_num_entries_in_cache GUARDED_BY(global_cache_stats_lock) = 0;
#endif // EXTRA_CACHE_LOGGING
static const int max_entries_per_graph_node = 40;
template <typename TL, typename TR>
class LibxsmmSparseMatMul {
typedef Eigen::Tensor<TL, 2, Eigen::RowMajor> MatrixL;
@ -861,7 +852,6 @@ class LibxsmmSparseMatMul {
MatrixMapR;
public:
#if 1
// This structure contains a set of libxsmm kernels for sizes that have been
// encountered previously by this operator so that libxsmm does not need to
// reallocate its scratchpad memory each time (which hurts performance
@ -880,181 +870,57 @@ class LibxsmmSparseMatMul {
// useful (it is an empty struct right now)
typename SparseMatMul<TL, TR>::TensorInfoCache
non_libxsmm_cache; // Currently not used
TF_DISALLOW_COPY_AND_ASSIGN(TensorInfoCacheEntry);
~TensorInfoCacheEntry() {
#ifdef EXTRA_CACHE_LOGGING
LOG(INFO) << "Deleting tensor cache entry at " << (void*)this;
#endif // EXTRA_CACHE_LOGGING
libxsmm_spmdm_destroy(&handle);
}
};
// protects entries; invariant: entries is a valid std::list.
// protects entries; invariant: entries is a valid std::multimap
tensorflow::mutex lock;
// Because there could be multiple matrix multiplies with the same sizes
// going on at the same time, we need to allow multiple cache entries for a
// given set of parameters. Taking and returning entries is used to make
// sure the same cache entry is not used from two threads at a time.
using entries_map_type = std::list<std::pair<
std::tuple<int, int, int, int>,
std::unique_ptr<TensorInfoCacheEntry>>>; // multimap in LRU order
entries_map_type entries GUARDED_BY(
lock); // MRU element at end so reverse search will find it first
int num_entries_outstanding GUARDED_BY(lock);
std::multimap<std::tuple<int, int, int, int>,
std::unique_ptr<TensorInfoCacheEntry>>
entries GUARDED_BY(lock);
TensorInfoCache() : lock(), entries(), num_entries_outstanding(0) {}
TensorInfoCache() : lock(), entries() {}
// Look up and remove first entry with these parameters, creating one if
// there isn't one
std::unique_ptr<TensorInfoCacheEntry> take_cache_entry(int M, int K, int N,
int max_threads)
#ifdef EXTRA_CACHE_LOGGING
LOCKS_EXCLUDED(lock, global_cache_stats_lock)
#else
LOCKS_EXCLUDED(lock)
#endif
{
LOCKS_EXCLUDED(lock) {
tensorflow::mutex_lock ml(lock);
#ifdef EXTRA_CACHE_LOGGING
tensorflow::mutex_lock ml2(global_cache_stats_lock);
#endif
auto key = std::make_tuple(M, K, N, max_threads);
auto it_rev =
std::find_if(entries.rbegin(), entries.rend(),
[&](const typename entries_map_type::value_type& e) {
return e.first == key;
});
auto it =
(it_rev == entries.rend() ? entries.end() : std::next(it_rev).base());
auto it = entries.find(key);
if (it != entries.end()) {
auto val = std::move(it->second);
entries.erase(it);
++num_entries_outstanding;
#ifdef EXTRA_CACHE_LOGGING
++total_num_entries_outstanding;
--total_num_entries_in_cache;
LOG(INFO) << "Used existing cache entry at " << (void*)val.get()
<< " for " << M << "x" << K << "x" << N << " max_threads "
<< max_threads
<< ", num_entries_outstanding = " << num_entries_outstanding
<< ", new cache size = " << entries.size()
<< ", total num_entries_outstanding = "
<< total_num_entries_outstanding
<< ", total cache size = " << total_num_entries_in_cache;
#endif
return val;
} else {
while (!entries.empty() &&
entries.size() + num_entries_outstanding + 1 >
max_entries_per_graph_node) {
#ifdef EXTRA_CACHE_LOGGING
LOG(INFO) << "Removing old cache entry at "
<< (void*)entries.front().second.get();
#endif
entries.pop_front();
}
std::unique_ptr<TensorInfoCacheEntry> e{
new TensorInfoCacheEntry{M, K, N, max_threads, {}, nullptr}};
// setup scoped allocator, which uses cpu_allocator() for this scope
const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator;
libxsmm_spmdm_init(M, N, K, max_threads, &e->handle, &e->output_csr);
++num_entries_outstanding;
#ifdef EXTRA_CACHE_LOGGING
++total_num_entries_outstanding;
LOG(INFO) << "Created cache entry at " << (void*)e.get() << " for " << M
<< "x" << K << "x" << N << " max_threads " << max_threads
<< ", num_entries_outstanding = " << num_entries_outstanding
<< ", new cache size = " << entries.size()
<< ", total num_entries_outstanding = "
<< total_num_entries_outstanding
<< ", total cache size = " << total_num_entries_in_cache;
#endif
return e;
}
}
// Add a cache entry with certain parameters
void return_cache_entry(std::unique_ptr<TensorInfoCacheEntry> e)
#ifdef EXTRA_CACHE_LOGGING
LOCKS_EXCLUDED(lock, global_cache_stats_lock)
#else
LOCKS_EXCLUDED(lock)
#endif
{
LOCKS_EXCLUDED(lock) {
tensorflow::mutex_lock ml(lock);
#ifdef EXTRA_CACHE_LOGGING
tensorflow::mutex_lock ml2(global_cache_stats_lock);
#endif
auto key = std::make_tuple(e->M, e->K, e->N, e->max_threads);
--num_entries_outstanding;
#ifdef EXTRA_CACHE_LOGGING
--total_num_entries_outstanding;
LOG(INFO) << "Returned cache entry at " << (void*)e.get() << " for "
<< e->M << "x" << e->K << "x" << e->N << " max_threads "
<< e->max_threads
<< ", num_entries_outstanding = " << num_entries_outstanding
<< ", prev cache size = " << entries.size()
<< ", total num_entries_outstanding = "
<< total_num_entries_outstanding
<< ", total cache size = " << total_num_entries_in_cache;
#endif
entries.push_back(std::make_pair(key, std::move(e)));
#ifdef EXTRA_CACHE_LOGGING
++total_num_entries_in_cache;
#endif
entries.insert(std::make_pair(key, std::move(e)));
}
~TensorInfoCache() {
tensorflow::mutex_lock ml(lock);
#ifdef EXTRA_CACHE_LOGGING
tensorflow::mutex_lock ml2(global_cache_stats_lock);
LOG(INFO) << "Deleting TensorInfoCache, cache size = " << entries.size()
<< ", total num_entries_outstanding = "
<< total_num_entries_outstanding
<< ", total cache size = " << total_num_entries_in_cache;
#endif
CHECK_EQ(num_entries_outstanding, 0);
for (auto& p : entries) {
libxsmm_spmdm_destroy(&p.second->handle);
}
entries.clear();
}
private:
TF_DISALLOW_COPY_AND_ASSIGN(TensorInfoCache);
};
#else
// This structure contains a set of libxsmm kernels for sizes that have been
// encountered previously by this operator so that libxsmm does not need to
// reallocate its scratchpad memory each time (which hurts performance
// substantially).
struct TensorInfoCache {
struct TensorInfoCacheEntry {
// Parameters for kernel
int M;
int K;
int N;
int max_threads;
// libxsmm handle and matrix data
libxsmm_spmdm_handle handle;
libxsmm_CSR_sparseslice* output_csr;
// Chain to non-libxsmm implementation's cache in case that ever becomes
// useful (it is an empty struct right now)
typename SparseMatMul<TL, TR>::TensorInfoCache
non_libxsmm_cache; // Currently not used
};
TensorInfoCache() {}
// Look up and remove first entry with these parameters, creating one if
// there isn't one
std::unique_ptr<TensorInfoCacheEntry> take_cache_entry(int M, int K, int N,
int max_threads) {
std::unique_ptr<TensorInfoCacheEntry> e{
new TensorInfoCacheEntry{M, K, N, max_threads, {}, nullptr}};
libxsmm_spmdm_init(M, N, K, max_threads, &e->handle, &e->output_csr);
return e;
}
// Add a cache entry with certain parameters
void return_cache_entry(std::unique_ptr<TensorInfoCacheEntry> e) {
libxsmm_spmdm_destroy(&e->handle);
}
private:
TF_DISALLOW_COPY_AND_ASSIGN(TensorInfoCache);
};
#endif
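// A standalone sketch of the take/return pattern the multimap cache above
// implements: entries are checked out under a mutex so two threads never share
// one at a time, and are inserted back when the caller is done. Entry is a
// placeholder type, not the libxsmm handle.
#include <map>
#include <memory>
#include <mutex>
#include <tuple>

struct Entry { int m, k, n, max_threads; };

class TakeReturnCache {
 public:
  std::unique_ptr<Entry> Take(int m, int k, int n, int max_threads) {
    std::lock_guard<std::mutex> lock(mu_);
    auto key = std::make_tuple(m, k, n, max_threads);
    auto it = entries_.find(key);
    if (it != entries_.end()) {
      std::unique_ptr<Entry> e = std::move(it->second);
      entries_.erase(it);  // checked out: nobody else can take this entry
      return e;
    }
    return std::unique_ptr<Entry>(new Entry{m, k, n, max_threads});
  }
  void Return(std::unique_ptr<Entry> e) {
    std::lock_guard<std::mutex> lock(mu_);
    entries_.emplace(std::make_tuple(e->m, e->k, e->n, e->max_threads),
                     std::move(e));
  }

 private:
  std::mutex mu_;
  std::multimap<std::tuple<int, int, int, int>, std::unique_ptr<Entry>>
      entries_;
};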
// Perform matrix multiplication of "left" and "right", and store the result
// in *"output".
@ -1479,21 +1345,21 @@ inline void SparseMatMul<TL, TR>::ComputeBlockSizes(
template <typename F>
void do_on_all_threads(const DeviceBase::CpuWorkerThreads* thread_pool,
ptrdiff_t max_thread_count, const F& f) {
const F& f) {
int num_threads = thread_pool->num_threads;
if (num_threads == 0) {
LOG(FATAL) << "Have 0 threads in thread pool";
} else if (num_threads == 1) {
f(0, 1);
f(0);
} else {
BlockingCounter counter(num_threads - 1);
for (int i = 1; i < num_threads; ++i) {
thread_pool->workers->Schedule([&, i]() {
f(i, num_threads);
f(i);
counter.DecrementCount();
});
}
f(0, num_threads);
f(0);
counter.Wait();
}
}
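// A standalone sketch of the fan-out/join shape of do_on_all_threads above,
// using std::thread instead of the TF worker pool: shard 0 runs on the calling
// thread, the rest are spawned, and join() plays the role of the
// BlockingCounter. Illustrative only.
#include <functional>
#include <thread>
#include <vector>

inline void DoOnAllThreads(int num_threads, const std::function<void(int)>& f) {
  if (num_threads <= 1) {
    f(0);
    return;
  }
  std::vector<std::thread> workers;
  for (int i = 1; i < num_threads; ++i) workers.emplace_back(f, i);
  f(0);                              // run shard 0 inline, as the code above does
  for (auto& t : workers) t.join();  // wait for the remaining shards
}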
@ -1522,24 +1388,21 @@ void wrapper_libxsmm_spmdm_createSparseSlice_generic_thread(
void wrapper_libxsmm_spmdm_compute_generic_thread(
empty_type_wrapper<bfloat16>, const libxsmm_spmdm_handle* handle,
char transA, char transB, libxsmm_CSR_sparseslice* A_sparse,
const bfloat16* B, char transC, float* C, int block_id, int tid,
int nthreads) {
const uint16 alpha = 1;
const uint16 beta = 0;
char transA, char transB, const bfloat16* alpha,
libxsmm_CSR_sparseslice* A_sparse, const bfloat16* B, char transC,
const bfloat16* beta, float* C, int block_id, int tid, int nthreads) {
return libxsmm_spmdm_compute_bfloat16_thread(
handle, transA, transB, &alpha, A_sparse,
reinterpret_cast<const uint16*>(B), transC, &beta, C, block_id, tid,
nthreads);
handle, transA, transB, reinterpret_cast<const uint16*>(alpha), A_sparse,
reinterpret_cast<const uint16*>(B), transC,
reinterpret_cast<const uint16*>(beta), C, block_id, tid, nthreads);
}
void wrapper_libxsmm_spmdm_compute_generic_thread(
empty_type_wrapper<float>, const libxsmm_spmdm_handle* handle, char transA,
char transB, libxsmm_CSR_sparseslice* A_sparse, const float* B, char transC,
float* C, int block_id, int tid, int nthreads) {
const float alpha = 1.f;
const float beta = 0.f;
return libxsmm_spmdm_compute_fp32_thread(handle, transA, transB, &alpha,
A_sparse, B, transC, &beta, C,
char transB, const float* alpha, libxsmm_CSR_sparseslice* A_sparse,
const float* B, char transC, const float* beta, float* C, int block_id,
int tid, int nthreads) {
return libxsmm_spmdm_compute_fp32_thread(handle, transA, transB, alpha,
A_sparse, B, transC, beta, C,
block_id, tid, nthreads);
}
@ -1590,13 +1453,11 @@ inline void LibxsmmSparseMatMul<TL, TR>::Compute(
const int left_dim1 = transpose_left ? left.dimension(0) : left.dimension(1);
const int right_dim0 = right.dimension(0);
const int right_dim1 = right.dimension(1);
const int output_dim0 =
transpose_output ? output->dimension(1) : output->dimension(0);
const int output_dim1 =
transpose_output ? output->dimension(0) : output->dimension(1);
CHECK_EQ(left_dim1, right_dim0);
CHECK_EQ(left_dim0, output_dim0);
CHECK_EQ(right_dim1, output_dim1);
CHECK_EQ(left_dim0,
(transpose_output ? output->dimension(1) : output->dimension(0)));
CHECK_EQ(right_dim1,
(transpose_output ? output->dimension(0) : output->dimension(1)));
if (left_dim0 < 32 || left_dim1 < 32 || right_dim1 < 32) {
// Causes problems in libxsmm
SparseMatMul<TL, TR>::Compute(
@ -1614,50 +1475,42 @@ inline void LibxsmmSparseMatMul<TL, TR>::Compute(
// Convert the left matrix to compressed sparse row (CSR) format
ptrdiff_t total_num_creation_blocks =
libxsmm_spmdm_get_num_createSparseSlice_blocks(&entry->handle);
ptrdiff_t total_num_mult_blocks =
libxsmm_spmdm_get_num_compute_blocks(&entry->handle);
bool use_libxsmm =
!(total_num_creation_blocks + total_num_mult_blocks < num_threads &&
!transpose_left && !transpose_output);
if (!use_libxsmm) {
// Avoid some performance issues in libxsmm (FIXME)
cache->return_cache_entry(std::move(entry));
SparseMatMul<TL, TR>::Compute(
nullptr /* Assumes no cached data for fallback */, left, right,
transpose_left, thread_pool, transpose_output, output);
return;
}
std::atomic<int> cur_create_block_number;
cur_create_block_number.store(0);
do_on_all_threads(thread_pool, total_num_creation_blocks,
[&](int i, int actual_num_threads) {
PinnedToCurrentCPU pin;
while (true) {
int work_item = cur_create_block_number.fetch_add(1);
if (work_item >= total_num_creation_blocks) break;
wrapper_libxsmm_spmdm_createSparseSlice_generic_thread(
empty_type_wrapper<TL>{}, &entry->handle,
(transpose_left ? 'T' : 'N'), left_data,
entry->output_csr, work_item, i,
actual_num_threads);
}
});
do_on_all_threads(thread_pool, [&](int i) {
PinnedToCurrentCPU pin;
while (true) {
int work_item = cur_create_block_number.fetch_add(1);
if (work_item >= total_num_creation_blocks) break;
wrapper_libxsmm_spmdm_createSparseSlice_generic_thread(
empty_type_wrapper<TL>{}, &entry->handle,
(transpose_left ? 'T' : 'N'), left_data, entry->output_csr, work_item,
i, num_threads);
}
});
// Do matrix-matrix multiplication
  // TODO(jewillco): libxsmm doesn't support beta != 1 yet -- remove this
  // memset once a release includes beta handling.
memset(output_data, 0, left_dim0 * right_dim1 * sizeof(TR));
ptrdiff_t total_num_mult_blocks =
libxsmm_spmdm_get_num_compute_blocks(&entry->handle);
std::atomic<int> cur_mult_block_number;
cur_mult_block_number.store(0);
do_on_all_threads(
thread_pool, total_num_mult_blocks, [&](int i, int actual_num_threads) {
PinnedToCurrentCPU pin;
while (true) {
int work_item = cur_mult_block_number.fetch_add(1);
if (work_item >= total_num_mult_blocks) break;
wrapper_libxsmm_spmdm_compute_generic_thread(
empty_type_wrapper<TL>{}, &entry->handle,
(transpose_left ? 'T' : 'N'), 'N', entry->output_csr, right_data,
(transpose_output ? 'T' : 'N'), output_data, work_item, i,
actual_num_threads);
}
});
do_on_all_threads(thread_pool, [&](int i) {
PinnedToCurrentCPU pin;
while (true) {
int work_item = cur_mult_block_number.fetch_add(1);
if (work_item >= total_num_mult_blocks) break;
const TL alpha(1.0); // Stored in a variable so we can get a pointer
const TL beta(0.0); // Stored in a variable so we can get a pointer
wrapper_libxsmm_spmdm_compute_generic_thread(
empty_type_wrapper<TL>{}, &entry->handle,
(transpose_left ? 'T' : 'N'), 'N', &alpha, entry->output_csr,
right_data, (transpose_output ? 'T' : 'N'), &beta, output_data,
work_item, i, num_threads);
}
});
// Put handle + CSR storage back into cache
cache->return_cache_entry(std::move(entry));
}
@ -1803,17 +1656,15 @@ inline void SparseMatMul<TL, TR>::Compute(
SparseMatMulOp<TA, TB, LibxsmmSparseMatMul>);
#endif
REGISTER_SPARSE_MATMUL(bfloat16, bfloat16);
REGISTER_SPARSE_MATMUL(float, bfloat16);
REGISTER_SPARSE_MATMUL(bfloat16, float);
#ifdef TENSORFLOW_USE_LIBXSMM
REGISTER_SPARSE_MATMUL_LIBXSMM(bfloat16, bfloat16);
REGISTER_SPARSE_MATMUL_LIBXSMM(float, float);
#else
REGISTER_SPARSE_MATMUL(bfloat16, bfloat16);
REGISTER_SPARSE_MATMUL(float, float);
#endif

View File

@ -255,13 +255,12 @@ EIGEN_STRONG_INLINE Packet8d pbroadcast_second<Packet8d>(const Packet8d& a_in) {
}
template <>
EIGEN_STRONG_INLINE Packet8d pbroadcast_third<Packet8d>(const Packet8d& a_in) {
Packet2d a = _mm256_extractf128_pd(_mm512_castpd512_pd256(a_in), 1);
Packet2d a = _mm512_extractf32x4_ps(a_in, 1);
return _mm512_broadcastsd_pd(a);
}
template <>
EIGEN_STRONG_INLINE Packet8d pbroadcast_fourth<Packet8d>(const Packet8d& a_in) {
Packet2d a =
_mm_permute_pd(_mm256_extractf128_pd(_mm512_castpd512_pd256(a_in), 1), 3);
Packet2d a = _mm_permute_pd(_mm512_extractf32x4_ps(a_in, 1), 3);
return _mm512_broadcastsd_pd(a);
}
template <>
@ -418,17 +417,14 @@ EIGEN_STRONG_INLINE Packet8f pbroadcast_fourth<Packet8f>(const Packet8f& a) {
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_l(const Packet16f& from) {
return _mm512_castsi512_ps(_mm512_slli_epi32(
_mm512_cvtepu16_epi32(_mm512_castsi512_si256(_mm512_castps_si512(from))),
16));
return _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm512_castsi512_si256(from)),
16);
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_u(const Packet16f& from) {
return _mm512_castsi512_ps(
_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_castpd_si256(
_mm512_extractf64x4_pd(_mm512_castps_pd(from), 1))),
16));
return _mm512_slli_epi32(
_mm512_cvtepu16_epi32(_mm512_extractf64x4_pd(from, 1)), 16);
}
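// A scalar standalone sketch of what the packet helpers above do lane by lane:
// a bfloat16 value is the upper 16 bits of an IEEE float, so expanding it is a
// 16-bit left shift into the high half of a 32-bit word followed by a bit
// reinterpretation. Illustrative only.
#include <cstdint>
#include <cstring>

inline float ExpandBfloat16(uint16_t b) {
  const uint32_t bits = static_cast<uint32_t>(b) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));  // reinterpret the bits as a float
  return f;
}
// e.g. ExpandBfloat16(0x3F80) == 1.0f, ExpandBfloat16(0x4000) == 2.0f.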
#endif

View File

@ -26,14 +26,18 @@ void dummy_xsmm_conv2d_ensure_file_is_not_empty(void);
#include "tensorflow/core/kernels/xsmm_conv2d.h"
#include <stdlib.h>
#include <cstring>
#if 0
#include <omp.h>
#endif
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/lib/core/blocking_counter.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "libxsmm_main.h" // TODO(bsteiner): API to avoid incl. header from src/
#include "include/libxsmm_cpuid.h"
#include "libxsmm_dnn_handle.h"
#include "libxsmm_malloc.h"
#include "include/libxsmm_malloc.h"
namespace tensorflow {
@ -59,10 +63,6 @@ bool CanUseXsmmConv2D(const libxsmm_dnn_conv_desc& desc,
VLOG(1) << "Cannot use XSMM convolutions: unsupported format!";
return false;
}
if (desc.pad_h_in != 0 || desc.pad_w_in != 0) {
VLOG(1) << "Cannot use XSMM convolutions: unsupported padding!";
return false;
}
if (desc.K % VECTOR_SIZE != 0) {
VLOG(1) << "Cannot use XSMM convolutions: output features count not"
" divisible by vector size!";
@ -72,7 +72,6 @@ bool CanUseXsmmConv2D(const libxsmm_dnn_conv_desc& desc,
return true;
}
typedef Eigen::ThreadPoolDevice CPUDevice;
namespace functor {
@ -83,25 +82,34 @@ static void chk_libxsmm_err(libxsmm_dnn_err_t status, string msg) {
}
}
LIBXSMM_INLINE void copy_RSCK_to_custom(const float* rsck, float *kcrs, int R, int S, int C, int K,int blocksifm, int blocksofm, int ifmblock,int ofmblock, int start, int end)
{
LIBXSMM_VLA_DECL(4, const float, input, rsck, S, C,K);
LIBXSMM_VLA_DECL(6, float, output, kcrs, blocksifm,R,S,ifmblock, ofmblock);
int r, s, k,c, v1,v2;
for (k = start; k < end ; k++ ) {
for(c = 0; c < blocksifm;c++){
for ( r = 0; r < R; r++ ) {
for ( s = 0; s < S; s++ ){
for ( v1 = c*ifmblock; v1 < std::min(C,(c+1)*ifmblock) ; v1++ ) {
for ( v2 = k*ofmblock; v2 < std::min(K, (k+1)*ofmblock); v2++ )
LIBXSMM_VLA_ACCESS(6, output, k,c, r, s,v1- c*ifmblock,v2-k*ofmblock, blocksifm, R, S,ifmblock,ofmblock) = LIBXSMM_VLA_ACCESS(4, input, r, s, v1, v2, S, C, K);
for ( v2 = K; v2 < (k+1)*ofmblock ; v2++ )
LIBXSMM_VLA_ACCESS(6, output, k,c, r, s,v1- c*ifmblock,v2-k*ofmblock, blocksifm, R, S,ifmblock,ofmblock) = 0.0f;
}
for ( v1 = C; v1 < (c+1)*ifmblock ; v1++ ) {
for ( v2 = k*ofmblock; v2 < (k+1)*ofmblock; v2++ )
LIBXSMM_VLA_ACCESS(6, output, k,c, r, s,v1- c*ifmblock,v2-k*ofmblock, blocksifm, R, S,ifmblock,ofmblock) = 0.0f;
LIBXSMM_INLINE void copy_RSCK_to_custom(const float* rsck, float* kcrs, int R,
int S, int C, int K, int blocksifm,
int blocksofm, int ifmblock,
int ofmblock, int start, int end) {
LIBXSMM_VLA_DECL(4, const float, input, rsck, S, C, K);
LIBXSMM_VLA_DECL(6, float, output, kcrs, blocksifm, R, S, ifmblock, ofmblock);
int r, s, k, c, v1, v2;
for (k = start; k < end; k++) {
for (c = 0; c < blocksifm; c++) {
for (r = 0; r < R; r++) {
for (s = 0; s < S; s++) {
for (v1 = c * ifmblock; v1 < std::min(C, (c + 1) * ifmblock); v1++) {
for (v2 = k * ofmblock; v2 < std::min(K, (k + 1) * ofmblock); v2++)
LIBXSMM_VLA_ACCESS(6, output, k, c, r, s, v1 - c * ifmblock,
v2 - k * ofmblock, blocksifm, R, S, ifmblock,
ofmblock) =
LIBXSMM_VLA_ACCESS(4, input, r, s, v1, v2, S, C, K);
for (v2 = K; v2 < (k + 1) * ofmblock; v2++)
LIBXSMM_VLA_ACCESS(6, output, k, c, r, s, v1 - c * ifmblock,
v2 - k * ofmblock, blocksifm, R, S, ifmblock,
ofmblock) = 0.0f;
}
for (v1 = C; v1 < (c + 1) * ifmblock; v1++) {
for (v2 = k * ofmblock; v2 < (k + 1) * ofmblock; v2++)
LIBXSMM_VLA_ACCESS(6, output, k, c, r, s, v1 - c * ifmblock,
v2 - k * ofmblock, blocksifm, R, S, ifmblock,
ofmblock) = 0.0f;
}
}
}
@ -109,35 +117,28 @@ LIBXSMM_INLINE void copy_RSCK_to_custom(const float* rsck, float *kcrs, int R, i
}
}
class libxsmm_dnn_conv_desc_wrap {
public:
const libxsmm_dnn_conv_desc d;
class libxsmm_dnn_conv_desc_wrap{
public:
const libxsmm_dnn_conv_desc d;
libxsmm_dnn_conv_desc_wrap(const libxsmm_dnn_conv_desc &d_) : d(d_){
}
bool operator==(const libxsmm_dnn_conv_desc_wrap &w) const{
return( d.N == w.d.N &&
d.C == w.d.C &&
d.H == w.d.H &&
d.W == w.d.W &&
d.K == w.d.K &&
d.R == w.d.R &&
d.S == w.d.S &&
d.u == w.d.u &&
d.v == w.d.v &&
d.pad_h_in == w.d.pad_h_in &&
d.pad_w_in == w.d.pad_w_in
);
}
libxsmm_dnn_conv_desc_wrap(const libxsmm_dnn_conv_desc& d_) : d(d_) {}
bool operator==(const libxsmm_dnn_conv_desc_wrap& w) const {
return (d.N == w.d.N && d.C == w.d.C && d.H == w.d.H && d.W == w.d.W &&
d.K == w.d.K && d.R == w.d.R && d.S == w.d.S && d.u == w.d.u &&
d.v == w.d.v && d.pad_h == w.d.pad_h && d.pad_w == w.d.pad_w);
}
};
struct HashFunction{
std::size_t operator()(const libxsmm_dnn_conv_desc_wrap & w) const{
struct HashFunction {
std::size_t operator()(const libxsmm_dnn_conv_desc_wrap& w) const {
// unsigned char ptr[sizeof(&w.d)];
// memcpy(ptr, (unsigned char *)&w.d, sizeof(&w.d))
//
/*
std::ostringstream N,C,H,W,K,R,S,u,v,padh,padw;
N << w.d.N; C << w.d.C;
H << w.d.H; W << w.d.W;
K << w.d.K; R << w.d.R;
@ -152,59 +153,71 @@ struct HashFunction{
+ S.str() + u.str()\
+ v.str() + padh.str()\
+ padw.str();
return ( std::hash<std::string>()(out_));
//
//
*/
return (std::hash<unsigned long long>()((unsigned long long)&(w.d)));
}
};
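// A standalone sketch of a content-based hash for a descriptor-like key, as an
// alternative to hashing the descriptor's address (or building a string) as
// HashFunction above does. Desc and its field set are placeholders, not the
// libxsmm descriptor.
#include <cstddef>
#include <functional>

struct Desc { int N, C, H, W, K, R, S, u, v, pad_h, pad_w; };

struct DescHash {
  std::size_t operator()(const Desc& d) const {
    std::size_t h = 0;
    for (int field : {d.N, d.C, d.H, d.W, d.K, d.R, d.S, d.u, d.v,
                      d.pad_h, d.pad_w}) {
      // Standard hash-combine step (boost-style mixing constant).
      h ^= std::hash<int>()(field) + 0x9e3779b9 + (h << 6) + (h >> 2);
    }
    return h;
  }
};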
class handles{
public:
libxsmm_dnn_layer* find( const libxsmm_dnn_conv_desc_wrap &w) {
std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*,
HashFunction>::iterator i = libxsmm_handles.find(w);
if (i == libxsmm_handles.end()){
libxsmm_dnn_err_t status;
libxsmm_dnn_layer* libxsmm_handle =
libxsmm_dnn_create_conv_layer(w.d, &status);
chk_libxsmm_err(status, "Create handle");
libxsmm_handles.insert(std::make_pair(w, libxsmm_handle));
return libxsmm_handle;
}
else
return i->second;
}
~handles(){
std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*,
HashFunction>::iterator i;
for (i= libxsmm_handles.begin(); i != libxsmm_handles.end(); i++)
class handles {
public:
libxsmm_dnn_layer* find(const libxsmm_dnn_conv_desc_wrap& w) {
std::unordered_map<libxsmm_dnn_conv_desc_wrap, libxsmm_dnn_layer*,
HashFunction>::iterator i = libxsmm_handles.find(w);
if (i == libxsmm_handles.end()) {
libxsmm_dnn_err_t status;
libxsmm_dnn_layer* libxsmm_handle =
libxsmm_dnn_create_conv_layer(w.d, &status);
chk_libxsmm_err(status, "Create handle");
libxsmm_handles.insert(std::make_pair(w, libxsmm_handle));
return libxsmm_handle;
} else
return i->second;
}
~handles() {
std::unordered_map<libxsmm_dnn_conv_desc_wrap, libxsmm_dnn_layer*,
HashFunction>::iterator i;
for (i = libxsmm_handles.begin(); i != libxsmm_handles.end(); i++)
chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(i->second),
"Destroy handle");
}
private:
std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*, HashFunction> libxsmm_handles;
"Destroy handle");
}
private:
std::unordered_map<libxsmm_dnn_conv_desc_wrap, libxsmm_dnn_layer*,
HashFunction>
libxsmm_handles;
};
static handles libxsmm_handles;
//#define LIBXSMM_DETAILED_TIMING
template <typename InputPtr, typename FilterPtr, typename OutputPtr>
static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
const libxsmm_dnn_conv_desc& desc,
libxsmm_dnn_compute_kind kind, InputPtr input,
FilterPtr filter, OutputPtr output) {
libxsmm_dnn_compute_kind kind,
InputPtr input, FilterPtr filter,
OutputPtr output) {
#if defined(LIBXSMM_DETAILED_TIMING)
unsigned long long l_tick1, l_tick2, l_tick3, l_tick4, l_tick5, l_tick6,
l_tick7, l_tick8, l_tick9, l_tick10;
l_tick1 = libxsmm_timer_tick();
#endif
// setup scoped allocator, which adopts the allocator from the context
const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator(*ctx);
libxsmm_dnn_err_t status;
libxsmm_dnn_layer* libxsmm_handle;
libxsmm_dnn_conv_desc_wrap w(desc);
void* scratch;
if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD)
libxsmm_handle = libxsmm_handles.find(w);
else {
libxsmm_handle = libxsmm_dnn_create_conv_layer(desc, &status);
chk_libxsmm_err(status, "Create handle");
}
// if(kind == LIBXSMM_DNN_COMPUTE_KIND_FWD)
libxsmm_handle = libxsmm_handles.find(w);
// else{
// libxsmm_handle = libxsmm_dnn_create_conv_layer(desc, &status);
// chk_libxsmm_err(status, "Create handle");
//}
status = libxsmm_dnn_get_codegen_success(libxsmm_handle, kind);
if (status == LIBXSMM_DNN_WARN_FALLBACK) {
chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
@ -217,100 +230,168 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
libxsmm_dnn_buffer* libxsmm_output;
libxsmm_dnn_filter* libxsmm_filter;
/*
const DeviceBase::CpuWorkerThreads* worker_threads =
ctx->device()->tensorflow_cpu_worker_threads();
int num_threads = worker_threads->num_threads;
*/
#if defined(LIBXSMM_DETAILED_TIMING)
l_tick2 = libxsmm_timer_tick();
#endif
int ifmblock = (libxsmm_handle->ifmblock);
int ofmblock = (libxsmm_handle->ofmblock);
int blocksifm = desc.C%ifmblock ==0 ? desc.C/ifmblock :desc.C/ifmblock + 1;
int blocksofm = desc.K%ofmblock ==0 ? desc.K/ofmblock :desc.K/ofmblock + 1;
float *native_filter = (float*)libxsmm_aligned_scratch(
blocksofm*blocksifm*desc.R*desc.S*ifmblock*ofmblock*sizeof(float),
2097152);
int blocksifm =
desc.C % ifmblock == 0 ? desc.C / ifmblock : desc.C / ifmblock + 1;
int blocksofm =
desc.K % ofmblock == 0 ? desc.K / ofmblock : desc.K / ofmblock + 1;
float* native_filter =
(float*)libxsmm_aligned_scratch(blocksofm * blocksifm * desc.R * desc.S *
ifmblock * ofmblock * sizeof(float),
2097152);
const DeviceBase::CpuWorkerThreads* worker_threads =
ctx->device()->tensorflow_cpu_worker_threads();
int num_threads = worker_threads->num_threads;
if(blocksofm > num_threads){
int work = blocksofm;
BlockingCounter count(num_threads);
for (int i = 0; i < num_threads; ++i) {
#if 1
if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD ||
kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
if (blocksofm > num_threads) {
int work = blocksofm;
BlockingCounter count(num_threads);
for (int i = 0; i < num_threads; ++i) {
worker_threads->workers->Schedule([=, &count]() {
int start = work/num_threads*i;
int end = (start + work/num_threads) > work ? work: start + work/num_threads;
copy_RSCK_to_custom(filter, native_filter, desc.R, desc.S,desc.C, desc.K,blocksifm,blocksofm,ifmblock,ofmblock,start, end);
count.DecrementCount();
int start = work / num_threads * i;
int end = (start + work / num_threads) > work
? work
: start + work / num_threads;
copy_RSCK_to_custom(filter, native_filter, desc.R, desc.S, desc.C,
desc.K, blocksifm, blocksofm, ifmblock, ofmblock,
start, end);
count.DecrementCount();
});
}
count.Wait();
}
else{
}
count.Wait();
} else {
int work = blocksofm;
int num_threads = work;
int work = blocksofm;
int num_threads = work;
BlockingCounter count(num_threads);
for (int i = 0; i < num_threads; ++i) {
BlockingCounter count(num_threads);
for (int i = 0; i < num_threads; ++i) {
worker_threads->workers->Schedule([=, &count]() {
int start = i;
int end = i+1;
copy_RSCK_to_custom(filter, native_filter, desc.R, desc.S,desc.C, desc.K,blocksifm,blocksofm,ifmblock,ofmblock, start, end);
count.DecrementCount();
int start = i;
int end = i + 1;
copy_RSCK_to_custom(filter, native_filter, desc.R, desc.S, desc.C,
desc.K, blocksifm, blocksofm, ifmblock, ofmblock,
start, end);
count.DecrementCount();
});
}
count.Wait();
}
count.Wait();
}
// Added: for weight update
else if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
libxsmm_filter =
libxsmm_dnn_link_filter(libxsmm_handle, LIBXSMM_DNN_FILTER, filter,
LIBXSMM_DNN_TENSOR_FORMAT_RSCK_PTR, &status);
chk_libxsmm_err(status,
"Link filter"); // weight update is in RSCK as
// filter should be returned in RSCK
// format
}
#else
memset(native_filter, 0,
blocksofm * blocksifm * desc.R * desc.S * ifmblock * ofmblock *
sizeof(float));
#endif
libxsmm_input = libxsmm_dnn_link_buffer(
libxsmm_handle, LIBXSMM_DNN_INPUT, input, LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
#if defined(LIBXSMM_DETAILED_TIMING)
l_tick3 = libxsmm_timer_tick();
#endif
libxsmm_input =
libxsmm_dnn_link_buffer(libxsmm_handle, LIBXSMM_DNN_INPUT, input,
LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
chk_libxsmm_err(status, "Link input buffer");
libxsmm_output = libxsmm_dnn_link_buffer(
libxsmm_handle, LIBXSMM_DNN_OUTPUT, output, LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
libxsmm_output =
libxsmm_dnn_link_buffer(libxsmm_handle, LIBXSMM_DNN_OUTPUT, output,
LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
chk_libxsmm_err(status, "Link output buffer");
libxsmm_filter = libxsmm_dnn_link_filter(
libxsmm_handle, LIBXSMM_DNN_FILTER, native_filter, LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status);
chk_libxsmm_err(status, "Link filter");
chk_libxsmm_err(libxsmm_dnn_zero_buffer(libxsmm_output), "Zero output");
if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD) {
chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT),
"Bind input forward");
chk_libxsmm_err(
libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT),
"Bind output forward");
chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_REGULAR_FILTER),
"Bind filter forward");
} else {
chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input, LIBXSMM_DNN_GRADIENT_INPUT),
"Bind input backward");
chk_libxsmm_err(
libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output, LIBXSMM_DNN_GRADIENT_OUTPUT),
"Bind output backward");
chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_REGULAR_FILTER),
"Bind filter backward");
if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD ||
kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
libxsmm_filter = libxsmm_dnn_link_filter(
libxsmm_handle, LIBXSMM_DNN_FILTER, native_filter,
LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status);
chk_libxsmm_err(status, "Link filter");
}
if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD) {
chk_libxsmm_err(libxsmm_dnn_zero_buffer(libxsmm_output), "Zero output");
chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input,
LIBXSMM_DNN_REGULAR_INPUT),
"Bind input forward");
chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output,
LIBXSMM_DNN_REGULAR_OUTPUT),
"Bind output forward");
chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter,
LIBXSMM_DNN_REGULAR_FILTER),
"Bind filter forward");
} else if (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
chk_libxsmm_err(libxsmm_dnn_zero_buffer(libxsmm_input), "Zero input");
chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input,
LIBXSMM_DNN_GRADIENT_INPUT),
"Bind input backward");
chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output,
LIBXSMM_DNN_GRADIENT_OUTPUT),
"Bind output backward");
chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter,
LIBXSMM_DNN_REGULAR_FILTER),
"Bind filter backward");
} else if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
chk_libxsmm_err(libxsmm_dnn_zero_filter(libxsmm_filter), "Zero filter");
chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input,
LIBXSMM_DNN_REGULAR_INPUT),
"Bind input weight udpate");
chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output,
LIBXSMM_DNN_GRADIENT_OUTPUT),
"Bind output weight update");
chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter,
LIBXSMM_DNN_GRADIENT_FILTER),
"Bind filter weight update");
} else {
/* shouldn't happen */
}
#if defined(LIBXSMM_DETAILED_TIMING)
l_tick4 = libxsmm_timer_tick();
#endif
/* bind scratch */
scratch = (void*)libxsmm_aligned_scratch( libxsmm_dnn_get_scratch_size( libxsmm_handle, kind, &status ), 2097152);
chk_libxsmm_err( status, "scratch allocation" );
chk_libxsmm_err( libxsmm_dnn_bind_scratch( libxsmm_handle, kind, scratch ), "binding scratch" );
scratch = (void*)libxsmm_aligned_scratch(
libxsmm_dnn_get_scratch_size(libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL,
&status),
2097152);
chk_libxsmm_err(status, "scratch allocation");
chk_libxsmm_err(libxsmm_dnn_bind_scratch(
libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch),
"binding scratch");
#if defined(LIBXSMM_DETAILED_TIMING)
l_tick5 = libxsmm_timer_tick();
#endif
if (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
libxsmm_dnn_transpose_filter(libxsmm_handle, LIBXSMM_DNN_FILTER);
}
#if defined(LIBXSMM_DETAILED_TIMING)
l_tick6 = libxsmm_timer_tick();
#endif
#if 1
BlockingCounter counter(num_threads);
for (int i = 0; i < num_threads; ++i) {
worker_threads->workers->Schedule([=, &counter]() {
chk_libxsmm_err(libxsmm_dnn_execute_st(libxsmm_handle, kind, 0, i),
@ -319,28 +400,97 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
});
}
counter.Wait();
#else
#pragma omp parallel
{
chk_libxsmm_err(
libxsmm_dnn_execute_st(libxsmm_handle, kind, 0, omp_get_thread_num()),
"Worker");
}
#endif
#if defined(LIBXSMM_DETAILED_TIMING)
l_tick7 = libxsmm_timer_tick();
#endif
if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
libxsmm_dnn_reduce_wu_filters(libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER);
}
#if defined(LIBXSMM_DETAILED_TIMING)
l_tick8 = libxsmm_timer_tick();
#endif
/* clean up */
chk_libxsmm_err( libxsmm_dnn_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ), "release scratch" );
chk_libxsmm_err(
libxsmm_dnn_release_scratch(libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL),
"release scratch");
if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD) {
chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ), "release input" );
chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ), "release output" );
chk_libxsmm_err( libxsmm_dnn_release_filter( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ), "release filter" );
chk_libxsmm_err(
libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT),
"release input");
chk_libxsmm_err(
libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT),
"release output");
chk_libxsmm_err(
libxsmm_dnn_release_filter(libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER),
"release filter");
} else if (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
chk_libxsmm_err(
libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT),
"release input");
chk_libxsmm_err(
libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT),
"release output");
chk_libxsmm_err(
libxsmm_dnn_release_filter(libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER),
"release filter");
} else if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
chk_libxsmm_err(
libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT),
"release input");
chk_libxsmm_err(
libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT),
"release output");
chk_libxsmm_err(
libxsmm_dnn_release_filter(libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER),
"release filter");
} else {
chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT ), "release input" );
chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ), "release output" );
chk_libxsmm_err( libxsmm_dnn_release_filter( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ), "release filter" );
/* shouldn't happen */
}
chk_libxsmm_err(libxsmm_dnn_destroy_buffer(libxsmm_input), "Destroy input");
chk_libxsmm_err(libxsmm_dnn_destroy_buffer(libxsmm_output), "Destroy output");
chk_libxsmm_err(libxsmm_dnn_destroy_filter(libxsmm_filter), "Destroy filter");
if(kind != LIBXSMM_DNN_COMPUTE_KIND_FWD)
chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
"Destroy handle");
#if defined(LIBXSMM_DETAILED_TIMING)
l_tick9 = libxsmm_timer_tick();
#endif
// if(kind != LIBXSMM_DNN_COMPUTE_KIND_FWD)
// chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
// "Destroy handle");
libxsmm_free(native_filter);
libxsmm_free(scratch);
#if defined(LIBXSMM_DETAILED_TIMING)
l_tick10 = libxsmm_timer_tick();
printf(
"time for convolution (%i, %i, %i, %i, %i): %f, %f, %f, %f, %f, %f, %f, "
"%f, %f, %f\n",
desc.N, desc.C, desc.K, desc.R, desc.S,
libxsmm_timer_duration(l_tick1, l_tick2),
libxsmm_timer_duration(l_tick2, l_tick3),
libxsmm_timer_duration(l_tick3, l_tick4),
libxsmm_timer_duration(l_tick4, l_tick5),
libxsmm_timer_duration(l_tick5, l_tick6),
libxsmm_timer_duration(l_tick6, l_tick7),
libxsmm_timer_duration(l_tick7, l_tick8),
libxsmm_timer_duration(l_tick8, l_tick9),
libxsmm_timer_duration(l_tick9, l_tick10),
libxsmm_timer_duration(l_tick1, l_tick10));
#endif
return true; // Succeeded
}
@ -348,8 +498,8 @@ template <typename T>
struct XsmmFwdConv2D<CPUDevice, T> {
bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
const T* input, const T* filter, T* output) {
return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_FWD, input,
filter, output);
return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_FWD,
input, filter, output);
}
};
@ -357,8 +507,8 @@ template <typename T>
struct XsmmBkwInputConv2D<CPUDevice, T> {
bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
T* input, const T* filter, const T* output) {
return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_BWD, input,
filter, output);
return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_BWD,
input, filter, output);
}
};
@ -366,8 +516,8 @@ template <typename T>
struct XsmmBkwFilterConv2D<CPUDevice, T> {
bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
const T* input, T* filter, const T* output) {
return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_UPD, input,
filter, output);
return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_UPD,
input, filter, output);
}
};

View File

@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/core/lib/io/inputbuffer.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/logging.h"
namespace tensorflow {
namespace io {
@ -43,25 +44,26 @@ Status InputBuffer::FillBuffer() {
Status InputBuffer::ReadLine(string* result) {
result->clear();
int i;
Status s;
for (i = 0;; i++) {
if (pos_ == limit_) {
// Get more data into buffer
s = FillBuffer();
if (limit_ == buf_) {
break;
do {
size_t buf_remain = limit_ - pos_;
char* newline = static_cast<char*>(memchr(pos_, '\n', buf_remain));
if (newline != nullptr) {
size_t result_len = newline - pos_;
result->append(pos_, result_len);
pos_ = newline + 1;
if (!result->empty() && result->back() == '\r') {
result->resize(result->size() - 1);
}
}
char c = *pos_++;
if (c == '\n') {
// We don't append the '\n' to *result
return Status::OK();
}
// We don't append '\r' to *result
if (c != '\r') {
*result += c;
}
if (buf_remain > 0) result->append(pos_, buf_remain);
// Get more data into buffer
s = FillBuffer();
DCHECK_EQ(pos_, buf_);
} while (limit_ != buf_);
if (!result->empty() && result->back() == '\r') {
result->resize(result->size() - 1);
}
if (errors::IsOutOfRange(s) && !result->empty()) {
return Status::OK();
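
The rewritten `ReadLine` above scans the buffered bytes for the next `'\n'` with `memchr` and appends whole chunks, refilling the buffer only when no newline is found, instead of copying one character at a time. A minimal Python sketch of the same buffered scanning loop, for illustration only (the `fill_buffer` callback and names are hypothetical, not TensorFlow APIs):

```python
def read_line(buf, fill_buffer):
    """Buffered line reading in the style of the memchr-based loop above.

    buf: bytearray of unread data; fill_buffer() returns the next chunk
    (empty at end of input). Both names are illustrative placeholders.
    """
    result = bytearray()
    while True:
        newline = buf.find(b'\n')       # analogous to memchr(pos_, '\n', buf_remain)
        if newline != -1:
            result += buf[:newline]     # append everything before the newline
            del buf[:newline + 1]       # and skip past it
            break
        result += buf                   # no newline: take the whole buffer
        buf[:] = fill_buffer()          # then refill
        if not buf:                     # end of input
            break
    if result.endswith(b'\r'):          # drop a trailing '\r' (CRLF input)
        del result[-1:]
    return bytes(result)

# A line split across two buffer refills:
chunks = iter([bytearray(b'rld\r\nnext'), bytearray(b'')])
buf = bytearray(b'hello wo')
print(read_line(buf, lambda: next(chunks)))  # b'hello world'
```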

View File

@ -1323,6 +1323,11 @@ Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
If `indices` is a permutation and `len(indices) == params.shape[0]` then
this operation will permute `params` accordingly.
`validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
`indices` are always validated to be within range. If assigned to GPU,
out-of-bound indices result in unspecified behavior (currently the result is
`0`, but this may become an error in the future).
<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
<img style="width:100%" src="../../images/Gather.png" alt>
</div>
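
As a usage note (not part of the change above), the permutation behavior described here can be seen with a small TF 1.x snippet; the CPU validation and GPU out-of-range cases are as documented above and are not exercised here:

```python
import tensorflow as tf

params = tf.constant([[1, 2], [3, 4], [5, 6]])
indices = tf.constant([2, 0, 1])  # a permutation of range(params.shape[0])

# Output shape is indices.shape + params.shape[1:], i.e. (3, 2); with a
# permutation, gather simply reorders the rows of params.
gathered = tf.gather(params, indices)

with tf.Session() as sess:
    print(sess.run(gathered))  # [[5 6], [1 2], [3 4]]
```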

View File

@ -181,4 +181,35 @@ Status MaxPoolGrad(const AttrSlice& attrs, FunctionDef* g) {
}
REGISTER_OP_GRADIENT("MaxPool", MaxPoolGrad);
Status MaxPoolGradGrad(const AttrSlice& attrs, FunctionDef* g) {
// clang-format off
*g = FDH::Define(
// Arg defs
{"input: T", "grad: T"},
// Ret val defs
{"output: T"},
// Attr defs
{"T: {float, half} = DT_FLOAT",
"ksize: list(int) >= 4",
"strides: list(int) >= 4",
GetPaddingAttrString()},
// Nodes
{
// Invoke MaxPool again to recompute the outputs (removed by CSE?).
{{"maxpool"}, "MaxPool", {"input"},
/*Attrs=*/{{"T", "$T"},
{"ksize", "$ksize"},
{"strides", "$strides"},
{"padding", "$padding"}}},
{{"output"}, "MaxPoolGradGrad", {"input", "maxpool", "grad"},
/*Attrs=*/{{"T", "$T"},
{"ksize", "$ksize"},
{"strides", "$strides"},
{"padding", "$padding"}}}
});
// clang-format on
return Status::OK();
}
REGISTER_OP_GRADIENT("MaxPoolGrad", MaxPoolGradGrad);
} // end namespace tensorflow
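
The `MaxPoolGradGrad` gradient registered above is what lets TensorFlow differentiate through a `MaxPoolGrad` node, i.e. take second-order gradients of max pooling. A short TF 1.x sketch of the intended use, assuming the matching Python-level gradient for `MaxPoolGrad` is also registered (that registration is outside this hunk):

```python
import tensorflow as tf

x = tf.random_normal([1, 4, 4, 1])
y = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

# The first derivative inserts a MaxPoolGrad node into the graph.
dx = tf.gradients(y, x)[0]
# Differentiating that node again is what MaxPoolGradGrad enables; this line
# assumes the corresponding Python gradient registration exists.
ddx = tf.gradients(dx, x)[0]

with tf.Session() as sess:
    print(sess.run(ddx).shape)  # (1, 4, 4, 1)
```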

View File

@ -89,7 +89,7 @@ REGISTER_OP("AvgPool")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
.Attr("T: {float, half, double}")
.Attr("T: realnumbertype")
.SetShapeFn(shape_inference::AvgPoolShape)
.Doc(R"doc(
Performs average pooling on the input.
@ -117,7 +117,7 @@ REGISTER_OP("AvgPoolGrad")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
.Attr("T: {float, half, double}")
.Attr("T: realnumbertype")
.SetShapeFn([](InferenceContext* c) {
// NOTE(mrry): We could in principle work out the shape from the
// gradients and the attrs, but if we do not know orig_input_shape
@ -1186,15 +1186,16 @@ data_format: The data format of the input and output data. With the
)doc");
REGISTER_OP("MaxPool3DGrad")
.Input("orig_input: float")
.Input("orig_output: float")
.Input("orig_input: TInput")
.Input("orig_output: TInput")
.Input("grad: T")
.Output("output: T")
.Attr("ksize: list(int) >= 5 ")
.Attr("strides: list(int) >= 5")
.Attr(GetPaddingAttrString())
.Attr(GetConvnet3dDataFormatAttrString())
.Attr("T: numbertype")
.Attr("T: numbertype = DT_FLOAT")
.Attr("TInput: numbertype = DT_FLOAT")
.SetShapeFn([](InferenceContext* c) {
return UnchangedShapeWithRank(c, 5);
})
@ -1216,6 +1217,44 @@ data_format: The data format of the input and output data. With the
[batch, in_channels, in_depth, in_height, in_width].
)doc");
REGISTER_OP("MaxPool3DGradGrad")
.Input("orig_input: T")
.Input("orig_output: T")
.Input("grad: T")
.Output("output: T")
.Attr("ksize: list(int) >= 5 ")
.Attr("strides: list(int) >= 5")
.Attr(GetPaddingAttrString())
.Attr(GetConvnet3dDataFormatAttrString())
.Attr("T: realnumbertype")
.SetShapeFn([](InferenceContext* c) {
TF_RETURN_IF_ERROR(shape_inference::Pool3DShape(c));
ShapeHandle unused;
// Validate 'orig_input' is the same shape as 'grad'
TF_RETURN_IF_ERROR(c->Merge(c->input(0), c->input(2), &unused));
// Validate 'orig_output' is same shape as 'output'
TF_RETURN_IF_ERROR(c->Merge(c->input(1), c->output(0), &unused));
return Status::OK();
})
.Doc(R"doc(
Computes second-order gradients of the maxpooling function.
ksize: 1-D tensor of length 5. The size of the window for each dimension of
the input tensor. Must have `ksize[0] = ksize[4] = 1`.
strides: 1-D tensor of length 5. The stride of the sliding window for each
dimension of `input`. Must have `strides[0] = strides[4] = 1`.
padding: The type of padding algorithm to use.
orig_input: The original input tensor.
orig_output: The original output tensor.
grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
output: Gradients of gradients w.r.t. the input to `max_pool`.
data_format: The data format of the input and output data. With the
default format "NDHWC", the data is stored in the order of:
[batch, in_depth, in_height, in_width, in_channels].
Alternatively, the format could be "NCDHW", the data storage order is:
[batch, in_channels, in_depth, in_height, in_width].
)doc");
// --------------------------------------------------------------------------
REGISTER_OP("L2Loss")
@ -1303,7 +1342,7 @@ output: The gradients for LRN.
// --------------------------------------------------------------------------
REGISTER_OP("MaxPool")
.Attr("T: {float, half} = DT_FLOAT")
.Attr("T: realnumbertype = DT_FLOAT")
.Attr("ksize: list(int) >= 4")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
@ -1336,7 +1375,7 @@ REGISTER_OP("MaxPoolGrad")
.Input("orig_output: T")
.Input("grad: T")
.Output("output: T")
.Attr("T: {float, half} = DT_FLOAT")
.Attr("T: realnumbertype = DT_FLOAT")
.SetShapeFn([](InferenceContext* c) {
return UnchangedShapeWithRank(c, 4);
})
@ -1358,6 +1397,43 @@ grad: 4-D. Gradients w.r.t. the output of `max_pool`.
output: Gradients w.r.t. the input to `max_pool`.
)doc");
REGISTER_OP("MaxPoolGradGrad")
.Attr("ksize: list(int) >= 4")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
.Input("orig_input: T")
.Input("orig_output: T")
.Input("grad: T")
.Output("output: T")
.Attr("T: realnumbertype")
.SetShapeFn([](InferenceContext* c) {
TF_RETURN_IF_ERROR(shape_inference::MaxPoolShape(c));
ShapeHandle unused;
// Validate 'orig_input' is the same shape as 'grad'
TF_RETURN_IF_ERROR(c->Merge(c->input(0), c->input(2), &unused));
// Validate 'orig_output' is same shape as 'output'
TF_RETURN_IF_ERROR(c->Merge(c->input(1), c->output(0), &unused));
return Status::OK();
})
.Doc(R"doc(
Computes second-order gradients of the maxpooling function.
ksize: The size of the window for each dimension of the input tensor.
strides: The stride of the sliding window for each dimension of the
input tensor.
padding: The type of padding algorithm to use.
data_format: Specify the data format of the input and output data. With the
default format "NHWC", the data is stored in the order of:
[batch, in_height, in_width, in_channels].
Alternatively, the format could be "NCHW", the data storage order of:
[batch, in_channels, in_height, in_width].
orig_input: The original input tensor.
orig_output: The original output tensor.
grad: 4-D. Gradients of gradients w.r.t. the input of `max_pool`.
output: Gradients of gradients w.r.t. the input to `max_pool`.
)doc");
REGISTER_OP("MaxPoolWithArgmax")
.Attr("ksize: list(int) >= 4")
.Attr("strides: list(int) >= 4")
@ -1366,7 +1442,7 @@ REGISTER_OP("MaxPoolWithArgmax")
.Input("input: T")
.Output("output: T")
.Output("argmax: Targmax")
.Attr("T: {float, half} = DT_FLOAT")
.Attr("T: realnumbertype")
.SetShapeFn([](InferenceContext* c) {
TF_RETURN_IF_ERROR(shape_inference::MaxPoolShape(c));
c->set_output(1, c->output(0));
@ -1397,7 +1473,7 @@ REGISTER_OP("MaxPoolGradWithArgmax")
.Input("grad: T")
.Input("argmax: Targmax")
.Output("output: T")
.Attr("T: {float, half} = DT_FLOAT")
.Attr("T: realnumbertype")
.SetShapeFn([](InferenceContext* c) {
return UnchangedShapeWithRank(c, 4);
})
@ -1415,6 +1491,39 @@ argmax: The indices of the maximum values chosen for each output of `max_pool`.
output: Gradients w.r.t. the input of `max_pool`.
)doc");
REGISTER_OP("MaxPoolGradGradWithArgmax")
.Attr("ksize: list(int) >= 4")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr("Targmax: {int32, int64}")
.Input("input: T")
.Input("grad: T")
.Input("argmax: Targmax")
.Output("output: T")
.Attr("T: realnumbertype")
.SetShapeFn([](InferenceContext* c) {
TF_RETURN_IF_ERROR(shape_inference::MaxPoolShape(c));
ShapeHandle unused;
// Validate 'orig_input' is the same shape as 'grad'
TF_RETURN_IF_ERROR(c->Merge(c->input(0), c->input(1), &unused));
// Validate 'argmax' is same shape as 'output'
TF_RETURN_IF_ERROR(c->Merge(c->input(2), c->output(0), &unused));
return Status::OK();
})
.Doc(R"doc(
Computes second-order gradients of the maxpooling function.
ksize: The size of the window for each dimension of the input tensor.
strides: The stride of the sliding window for each dimension of the
input tensor.
padding: The type of padding algorithm to use.
input: The original input.
grad: 4-D with shape `[batch, height, width, channels]`. Gradients w.r.t. the
input of `max_pool`.
argmax: The indices of the maximum values chosen for each output of `max_pool`.
output: Gradients of gradients w.r.t. the input of `max_pool`.
)doc");
// --------------------------------------------------------------------------
REGISTER_OP("Dilation2D")
@ -2517,7 +2626,10 @@ REGISTER_OP("MklConv2D")
.Attr(GetConvnetDataFormatAttrString())
.SetShapeFn(shape_inference::Conv2DShape)
.Doc(R"doc(
MKL version of Conv2D
MKL version of Conv2D operator. Uses MKL DNN APIs to perform 2D convolution.
NOTE: Do not invoke this operator directly in Python. The graph rewrite pass is
expected to invoke these operators.
)doc");
REGISTER_OP("MklConv2DWithBias")
@ -2533,14 +2645,216 @@ REGISTER_OP("MklConv2DWithBias")
.Attr("strides: list(int)")
.Attr("use_cudnn_on_gpu: bool = true")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString());
.Attr(GetConvnetDataFormatAttrString())
.Doc(R"doc(
MKL version of Conv2D and BiasAdd operator. Uses MKL DNN APIs to perform
2D convolution and add bias to the output of the convolution.
NOTE: Do not invoke this operator directly in Python. The graph rewrite pass is
expected to invoke these operators.
)doc");
REGISTER_OP("MklConv2DBackpropFilter")
.Input("input: T")
.Input("mkl_input: uint8")
.Input("filter_sizes: int32")
.Input("mkl_filter_size: uint8")
.Input("out_backprop: T")
.Input("mkl_out_backprop: uint8")
.Output("output: T")
.Output("mkl_output: uint8")
.Attr("T: {half, float, double}")
.Attr("strides: list(int)")
.Attr("use_cudnn_on_gpu: bool = true")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
.SetShapeFn([](InferenceContext* c) {
return InputTensorShapeOrUnknown(c, 2 /* input_idx */, 4 /* ndims */);
})
.Doc(R"doc(
MKL version of Conv2DBackpropFilter. Uses MKL DNN APIs to compute the
gradients of convolution with respect to the filter.
NOTE: Do not invoke this operator directly in Python. The graph rewrite pass is
expected to invoke these operators.
)doc");
REGISTER_OP("MklConv2DWithBiasBackpropBias")
.Input("out_backprop: T")
.Input("mkl_out_backprop: uint8")
.Output("output: T")
.Output("mkl_output: uint8")
.Attr("T: {half, float, double}")
.Attr("strides: list(int)")
.Attr(GetConvnetDataFormatAttrString())
.Doc(R"doc(
MKL version of Conv2DBackpropBias. Uses MKL DNN APIs to compute the
gradients of convolution with respect to the bias.
NOTE: Do not invoke this operator directly in Python. The graph rewrite pass is
expected to invoke these operators.
)doc");
REGISTER_OP("MklConv2DBackpropInput")
.Input("input_sizes: int32")
.Input("mkl_input_sizes: uint8")
.Input("filter: T")
.Input("mkl_filter: uint8")
.Input("out_backprop: T")
.Input("mkl_out_backprop: uint8")
.Output("output: T")
.Output("mkl_output: uint8")
.Attr("T: {half, float, double}")
.Attr("strides: list(int)")
.Attr("use_cudnn_on_gpu: bool = true")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
.SetShapeFn([](InferenceContext* c) {
return InputTensorShapeOrUnknown(c, 0 /* input_idx */, 4 /* ndims */);
})
.Doc(R"doc(
MKL version of Convolution2D backward input. Uses MKL DNN APIs to compute the
gradients of convolution with respect to the input.
NOTE: Do not invoke this operator directly in Python. The graph rewrite pass is
expected to invoke these operators.
)doc");
REGISTER_OP("MklRelu")
.Input("features: T")
.Input("mkl_features: uint8")
.Output("activations: T")
.Output("mkl_activations: uint8")
.Attr("T: realnumbertype")
.SetShapeFn(shape_inference::UnchangedShape)
.Doc(R"doc(
MKL version of Relu operator. Uses MKL DNN APIs to implement Relu operator.
NOTE: Do not invoke this operator directly in Python. The graph rewrite pass is
expected to invoke these operators.
)doc");
REGISTER_OP("MklReluGrad")
.Input("gradients: T")
.Input("mkl_gradients: uint8")
.Input("features: T")
.Input("mkl_features: uint8")
.Output("backprops: T")
.Output("mkl_backprops: uint8")
.Attr("T: realnumbertype")
.SetShapeFn(shape_inference::MergeBothInputsShapeFn)
.Doc(R"doc(
MKL version of ReluGrad operator. Uses MKL DNN APIs to compute rectified
linear gradients for Relu operation.
NOTE: Do not invoke this operator directly in Python. The graph rewrite pass is
expected to invoke these operators.
)doc");
REGISTER_OP("MklMaxPool")
.Attr("T: {float, half} = DT_FLOAT")
.Attr("ksize: list(int) >= 4")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
.Attr("workspace_enabled: bool = false")
.Input("input: T")
.Input("mkl_input: uint8")
.Output("output: T")
.Output("mkl_output: uint8")
.Output("workspace: T")
.Output("mkl_workspace: uint8")
.SetShapeFn(shape_inference::MaxPoolShape)
.Doc(R"doc(
MKL version of MaxPool operator. Uses MKL DNN APIs to perform max pooling
on the input.
NOTE: Do not invoke this operator directly in Python. The graph rewrite pass is
expected to invoke these operators.
)doc");
REGISTER_OP("MklMaxPoolGrad")
.Attr("T: {float, half} = DT_FLOAT")
.Attr("ksize: list(int) >= 4")
.Attr("strides: list(int) >= 4")
.Attr("workspace_enabled: bool = false")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
.Input("orig_input: T")
.Input("mkl_orig_input: uint8")
.Input("orig_output: T")
.Input("mkl_orig_output: uint8")
.Input("grad: T")
.Input("mkl_grad: uint8")
.Input("workspace: T")
.Input("mkl_workspace: uint8")
.Output("output: T")
.Output("mkl_output: uint8")
.SetShapeFn([](InferenceContext* c) {
return UnchangedShapeWithRank(c, 4);
})
.Doc(R"doc(
MKL version of MaxPoolGrad. Uses MKL DNN APIs to compute gradients of
MaxPool operator.
NOTE: Do not invoke this operator directly in Python. The graph rewrite pass is
expected to invoke these operators.
)doc");
REGISTER_OP("MklAvgPool")
.Input("value: T")
.Input("mkl_input: uint8")
.Output("output: T")
.Output("mkl_output: uint8")
.Attr("ksize: list(int) >= 4")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
.Attr("T: {float, half, double}")
.SetShapeFn(shape_inference::AvgPoolShape)
.Doc(R"doc(
MKL version of AvgPool operator. Uses MKL DNN APIs to perform average pooling
on the input.
NOTE: Do not invoke this operator directly in Python. The graph rewrite pass is
expected to invoke these operators.
)doc");
REGISTER_OP("MklAvgPoolGrad")
.Input("orig_input_shape: int32")
.Input("mkl_orig_input: uint8")
.Input("grad: T")
.Input("mkl_grad: uint8")
.Output("output: T")
.Output("mkl_output: uint8")
.Attr("ksize: list(int) >= 4")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
.Attr("T: {float, half, double}")
.SetShapeFn([](InferenceContext* c) {
return InputTensorShapeOrUnknown(c, 0 /* input_idx */, 4 /* ndims */);
})
.Doc(R"doc(
MKL version of AvgPoolGrad operator. Uses MKL DNN APIs to compute gradients
of AvgPool function.
NOTE: Do not invoke this operator directly in Python. The graph rewrite pass is
expected to invoke these operators.
)doc");
REGISTER_OP("MklToTf")
.Input("input: T")
.Input("mkl_input: uint8")
.Output("output: T")
.Attr("T: {half, float, double}")
.Attr(GetConvnetDataFormatAttrString());
.Attr(GetConvnetDataFormatAttrString())
.Doc(R"doc(
MKL operator to convert a tensor from MKL layout to TensorFlow layout.
NOTE: Do not invoke this operator directly in Python. The graph rewrite pass is
expected to invoke these operators.
)doc");
#endif // INTEL_MKL
} // namespace tensorflow

View File

@ -25842,6 +25842,59 @@ op {
summary: "Computes the sum along segments of a tensor."
description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such\nthat `segment_ids[j...] == i`. Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\nrange of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>"
}
op {
name: "UnsortedSegmentSum"
input_arg {
name: "data"
type_attr: "T"
}
input_arg {
name: "segment_ids"
description: "A tensor whose shape is a prefix of `data.shape`."
type_attr: "Tindices"
}
input_arg {
name: "num_segments"
type: DT_INT32
}
output_arg {
name: "output"
description: "Has same shape as data, except for the first `segment_ids.rank`\ndimensions, which are replaced with a single dimension which has size\n`num_segments`."
type_attr: "T"
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
type: DT_INT64
type: DT_INT32
type: DT_UINT8
type: DT_UINT16
type: DT_INT16
type: DT_INT8
type: DT_QINT8
type: DT_QUINT8
type: DT_QINT32
type: DT_HALF
}
}
}
attr {
name: "Tindices"
type: "type"
allowed_values {
list {
type: DT_INT32
type: DT_INT64
}
}
}
summary: "Computes the max along segments of a tensor."
description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\sum_j data_j\\\\) where sum is over `j` such\nthat `segment_ids[j] == i`. Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\n range of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>"
}
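
For reference, the semantics spelled out in this entry, in TF 1.x Python:

```python
import tensorflow as tf

data = tf.constant([1.0, 2.0, 3.0, 4.0])
segment_ids = tf.constant([0, 2, 0, 2])  # need not be sorted or contiguous

# output[i] = sum of data[j] for all j with segment_ids[j] == i;
# segments with no entries (here segment 1) come out as 0.
out = tf.unsorted_segment_sum(data, segment_ids, num_segments=3)

with tf.Session() as sess:
    print(sess.run(out))  # [4. 0. 6.]
```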
op {
name: "Unstage"
output_arg {

View File

@ -67,11 +67,8 @@ int GetXCR0EAX() {
#endif
// Structure for basic CPUID info
struct CPUIDInfo {
string vendor_str;
int family;
int model_num;
class CPUIDInfo {
public:
CPUIDInfo()
: have_adx_(0),
have_aes_(0),
@ -120,9 +117,9 @@ struct CPUIDInfo {
// Get vendor string (issue CPUID with eax = 0)
GETCPUID(eax, ebx, ecx, edx, 0, 0);
cpuid->vendor_str.append(reinterpret_cast<char *>(&ebx), 4);
cpuid->vendor_str.append(reinterpret_cast<char *>(&edx), 4);
cpuid->vendor_str.append(reinterpret_cast<char *>(&ecx), 4);
cpuid->vendor_str_.append(reinterpret_cast<char *>(&ebx), 4);
cpuid->vendor_str_.append(reinterpret_cast<char *>(&edx), 4);
cpuid->vendor_str_.append(reinterpret_cast<char *>(&ecx), 4);
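
The append order (ebx, edx, ecx) is what reassembles the 12-byte CPUID vendor id correctly; a one-liner showing the idea with the Intel register contents:

```python
# On an Intel CPU, CPUID leaf 0 returns "Genu" in ebx, "ineI" in edx and
# "ntel" in ecx, so appending in ebx, edx, ecx order yields the vendor id.
ebx, edx, ecx = b"Genu", b"ineI", b"ntel"
print((ebx + edx + ecx).decode())  # GenuineIntel
```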
// To get general information and extended features we send eax = 1 and
// ecx = 0 to cpuid. The response is returned in eax, ebx, ecx and edx.
@ -130,8 +127,8 @@ struct CPUIDInfo {
// Volume 2A: Instruction Set Reference, A-M CPUID).
GETCPUID(eax, ebx, ecx, edx, 1, 0);
cpuid->model_num = static_cast<int>((eax >> 4) & 0xf);
cpuid->family = static_cast<int>((eax >> 8) & 0xf);
cpuid->model_num_ = static_cast<int>((eax >> 4) & 0xf);
cpuid->family_ = static_cast<int>((eax >> 8) & 0xf);
cpuid->have_aes_ = (ecx >> 25) & 0x1;
cpuid->have_cmov_ = (edx >> 15) & 0x1;
@ -253,6 +250,10 @@ struct CPUIDInfo {
return false;
}
string vendor_str() const { return vendor_str_; }
int family() const { return family_; }
int model_num() { return model_num_; }
private:
int highest_eax_;
int have_adx_ : 1;
@ -292,6 +293,9 @@ struct CPUIDInfo {
int have_sse4_2_ : 1;
int have_ssse3_ : 1;
int have_hypervisor_ : 1;
string vendor_str_;
int family_;
int model_num_;
};
std::once_flag cpuid_once_flag;
@ -317,7 +321,7 @@ bool TestCPUFeature(CPUFeature feature) {
std::string CPUVendorIDString() {
#ifdef PLATFORM_IS_X86
InitCPUIDInfo();
return cpuid->vendor_str;
return cpuid->vendor_str();
#else
return "";
#endif
@ -326,7 +330,7 @@ std::string CPUVendorIDString() {
int CPUFamily() {
#ifdef PLATFORM_IS_X86
InitCPUIDInfo();
return cpuid->family;
return cpuid->family();
#else
return 0;
#endif
@ -335,7 +339,7 @@ int CPUFamily() {
int CPUModelNum() {
#ifdef PLATFORM_IS_X86
InitCPUIDInfo();
return cpuid->model_num;
return cpuid->model_num();
#else
return 0;
#endif

View File

@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifdef TENSORFLOW_USE_JEMALLOC
#include "jemalloc/jemalloc.h"
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@ -53,16 +57,55 @@ int NumSchedulableCPUs() {
}
void* AlignedMalloc(size_t size, int minimum_alignment) {
#ifdef TENSORFLOW_USE_JEMALLOC
void* ptr = NULL;
// posix_memalign requires that the requested alignment be at least
// sizeof(void*). In this case, fall back on malloc which should return
// memory aligned to at least the size of a pointer.
const int required_alignment = sizeof(void*);
if (minimum_alignment < required_alignment) return Malloc(size);
int err = jemalloc_posix_memalign(&ptr, minimum_alignment, size);
if (err != 0) {
return NULL;
} else {
return ptr;
}
#else
return _aligned_malloc(size, minimum_alignment);
#endif
}
void AlignedFree(void* aligned_memory) { _aligned_free(aligned_memory); }
void AlignedFree(void* aligned_memory) {
#ifdef TENSORFLOW_USE_JEMALLOC
jemalloc_free(aligned_memory);
#else
_aligned_free(aligned_memory);
#endif
}
void* Malloc(size_t size) { return ::malloc(size); }
void* Malloc(size_t size) {
#ifdef TENSORFLOW_USE_JEMALLOC
return jemalloc_malloc(size);
#else
return malloc(size);
#endif
}
void* Realloc(void* ptr, size_t size) { return ::realloc(ptr, size); }
void* Realloc(void* ptr, size_t size) {
#ifdef TENSORFLOW_USE_JEMALLOC
return jemalloc_realloc(ptr, size);
#else
return realloc(ptr, size);
#endif
}
void Free(void* ptr) { ::free(ptr); }
void Free(void* ptr) {
#ifdef TENSORFLOW_USE_JEMALLOC
return jemalloc_free(ptr);
#else
return free(ptr);
#endif
}
void MallocExtension_ReleaseToSystem(std::size_t num_bytes) {
// No-op.

View File

@ -16,6 +16,10 @@ limitations under the License.
#ifndef TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
#define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
#ifdef INTEL_MKL
#include <string>
#include <vector>
#include "third_party/mkl/include/mkl_dnn.h"
#include "third_party/mkl/include/mkl_dnn_types.h"
#include "third_party/mkl/include/mkl_service.h"
@ -40,6 +44,8 @@ namespace tensorflow {
// MKL operation, and did not go through a conversion to a standard
// Tensorflow tensor.
typedef enum { W = 0, H = 1, C = 2, N = 3 } MklDims;
class MklShape {
public:
MklShape() {}
@ -50,12 +56,15 @@ class MklShape {
if (strides_) delete[] strides_;
if (mklLayout_) CHECK_EQ(dnnLayoutDelete_F32(mklLayout_), E_SUCCESS);
if (tfLayout_) CHECK_EQ(dnnLayoutDelete_F32(tfLayout_), E_SUCCESS);
if (tf_to_mkl_dim_map_) delete[] tf_to_mkl_dim_map_;
}
const bool IsMklTensor() const { return isMklTensor_; }
void SetMklTensor(const bool isMklTensor) { isMklTensor_ = isMklTensor; }
void SetDimensions(const size_t dimension) { dimension_ = dimension; }
void SetMklLayout(const void* primitive, size_t resourceType) {
CHECK_EQ(
dnnLayoutCreateFromPrimitive_F32(&mklLayout_, (dnnPrimitive_t)primitive,
@ -66,7 +75,8 @@ class MklShape {
void SetTfLayout(const size_t dimension, const size_t* sizes,
const size_t* strides) {
dimension_ = dimension;
    if (dimension > 0) {  // MKL doesn't support dimension 0
    if (dimension > 0) {  // MKL doesn't support zero dimension tensors
sizes_ = new size_t[dimension];
strides_ = new size_t[dimension];
@ -79,6 +89,45 @@ class MklShape {
}
}
// Default case - MKL dim ordering is opposite of TF dim ordering
// MKL -> (DIMS-1)...0 where (DIMS-1) is outermost dim and 0 is innermost dim
// TF -> 0...(DIMS-1) where 0 is outermost dim and (DIMS-1) is innermost dim
// For layers that rely on data_format semantics (conv, pooling etc.)
// or operate only on certain dimensions (relu, concat, split etc.),
// Mkl APIs might require us to reorder these dimensions. In such cases,
// kernels should explicitly set this map
void SetTfDimOrder(const size_t dimension) {
CHECK(dimension == dimension_);
if (tf_to_mkl_dim_map_ == nullptr) {
tf_to_mkl_dim_map_ = new size_t[dimension];
}
for (size_t ii = 0; ii < dimension; ii++) {
tf_to_mkl_dim_map_[ii] = dimension - (ii + 1);
}
}
void SetTfDimOrder(const size_t dimension, const size_t* tf_to_mkl_dim_map) {
CHECK(dimension == dimension_);
if (tf_to_mkl_dim_map_ == nullptr) {
tf_to_mkl_dim_map_ = new size_t[dimension];
}
for (size_t ii = 0; ii < dimension; ii++) {
tf_to_mkl_dim_map_[ii] = tf_to_mkl_dim_map[ii];
}
}
void SetTfDimOrder(const size_t dimension, TensorFormat data_format) {
CHECK_EQ(dimension, 4);
CHECK(dimension == dimension_);
if (tf_to_mkl_dim_map_ == nullptr) {
tf_to_mkl_dim_map_ = new size_t[dimension];
}
tf_to_mkl_dim_map_[GetTensorDimIndex<2>(data_format, 'W')] = MklDims::W;
tf_to_mkl_dim_map_[GetTensorDimIndex<2>(data_format, 'H')] = MklDims::H;
tf_to_mkl_dim_map_[GetTensorDimIndex<2>(data_format, 'C')] = MklDims::C;
tf_to_mkl_dim_map_[GetTensorDimIndex<2>(data_format, 'N')] = MklDims::N;
}
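
A plain-Python sketch (not a TensorFlow API) of the map this overload builds, assuming the `MklDims` order `W=0, H=1, C=2, N=3` defined above:

```python
MKL_DIMS = {"W": 0, "H": 1, "C": 2, "N": 3}  # mirrors the MklDims enum

def tf_to_mkl_dim_map(data_format):
    # For each TF dimension index, record which MKL dimension it refers to.
    return [MKL_DIMS[d] for d in data_format]

print(tf_to_mkl_dim_map("NHWC"))  # [3, 1, 0, 2]
print(tf_to_mkl_dim_map("NCHW"))  # [3, 2, 1, 0]
```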
const dnnLayout_t GetMklLayout() const { return mklLayout_; }
const dnnLayout_t GetTfLayout() const { return tfLayout_; }
const dnnLayout_t GetCurLayout() const {
@ -86,7 +135,10 @@ class MklShape {
}
size_t GetDimension() const { return dimension_; }
const size_t* GetSizes() const { return sizes_; }
int64 dim_size(int index) const { return sizes_[index]; }
const size_t* GetStrides() const { return strides_; }
const size_t* GetTfToMklDimMap() const { return tf_to_mkl_dim_map_; }
size_t tf_dim_idx(int index) const { return tf_to_mkl_dim_map_[index]; }
void GetConvertedFlatData(dnnLayout_t targetLayout, void* input,
void* output) const {
@ -107,21 +159,23 @@ class MklShape {
// The data is serialized in this order
// isMklTensor_
// dimension_
// sizes
// strides
// sizes_
// strides_
// mklLayout_
// tfLayout_
// tf_to_mkl_dim_map_
#define SIZE_OF_MKL_DNN_BUF \
(dnnLayoutSerializationBufferSize_F32()) // Size of buffer needed to
// serialize dnn_layout pointer
// Size of buffer to hold the serialized object, the size is computed as follows
// sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes) + sizeof(strides)
// sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes_) + sizeof(strides_)
// + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer)
// + sizeof(tf_to_mkl_dim_map_)
#define SIZE_OF_MKL_SERIAL_DATA(dims) \
(2 * sizeof(size_t) + 2 * dims * sizeof(size_t) + 2 * SIZE_OF_MKL_DNN_BUF)
(2 * sizeof(size_t) + 3 * dims * sizeof(size_t) + 2 * SIZE_OF_MKL_DNN_BUF)
// First we need to define some macro for offsets into the serial buffer where
// different elements of Mklshape is written/read from
@ -140,6 +194,9 @@ class MklShape {
(STRIDES_OFFSET(dims) + dims * sizeof(size_t)) // Location of mklLayout_
#define TF_LAYOUT_OFFSET(dims) \
(MKL_LAYOUT_OFFSET(dims) + SIZE_OF_MKL_DNN_BUF) // Location of tfLayout_
// Location of tf_to_mkl_dim_map_
#define TF_TO_MKL_DIM_MAP_OFFSET(dims) \
(TF_LAYOUT_OFFSET(dims) + SIZE_OF_MKL_DNN_BUF)
// TODO(agramesh1) make sure to create a const to share with rewrite pass
// for min size of MKL metadata tensor.
@ -156,11 +213,14 @@ class MklShape {
<< "Bufsize too small in DeSerialize";
sizes_ = new size_t[dimension_];
strides_ = new size_t[dimension_];
tf_to_mkl_dim_map_ = new size_t[dimension_];
for (int i = 0; i < dimension_; i++) {
sizes_[i] =
reinterpret_cast<const size_t*>(buf + SIZES_OFFSET(dimension_))[i];
strides_[i] = reinterpret_cast<const size_t*>(
buf + STRIDES_OFFSET(dimension_))[i];
tf_to_mkl_dim_map_[i] = reinterpret_cast<const size_t*>(
buf + TF_TO_MKL_DIM_MAP_OFFSET(dimension_))[i];
}
CHECK_EQ(dnnLayoutDeserialize_F32(&mklLayout_,
buf + MKL_LAYOUT_OFFSET(dimension_)),
@ -183,6 +243,9 @@ class MklShape {
sizes_[i];
reinterpret_cast<size_t*>(buf + STRIDES_OFFSET(dimension_))[i] =
strides_[i];
reinterpret_cast<size_t*>(buf +
TF_TO_MKL_DIM_MAP_OFFSET(dimension_))[i] =
tf_to_mkl_dim_map_[i];
}
CHECK_EQ(dnnLayoutSerialize_F32(mklLayout_,
buf + MKL_LAYOUT_OFFSET(dimension_)),
@ -202,6 +265,8 @@ class MklShape {
size_t dimension_ = 0;
size_t* sizes_ = nullptr; // Required by MKL for conversions
size_t* strides_ = nullptr; // Required by MKL for conversions
// TF dimension corresponding to this MKL dimension
size_t* tf_to_mkl_dim_map_ = nullptr;
};
int inline GetTensorDataIndex(int n) {
@ -275,18 +340,78 @@ inline void GetStridesFromSizes(TensorFormat data_format, size_t* strides,
}
}
inline void MklSizesToTFSizes(OpKernelContext* context,
TensorFormat data_format_,
const MklShape& mklshape, TensorShape* tfshape) {
size_t tf_dim = mklshape.GetDimension();
const size_t* tf_sizes = mklshape.GetSizes();
// TODO(agramesh1): check if this constraint is applicable in other cases
// (besides BackpropInput, BackpropFilter).
OP_REQUIRES(context, tf_dim == 4,
errors::InvalidArgument("MKLSizesToTFSizes: size must be 4-dim"));
std::vector<int32> sizes;
sizes.push_back(tf_sizes[3]);
if (data_format_ == FORMAT_NHWC) {
sizes.push_back(tf_sizes[1]);
sizes.push_back(tf_sizes[0]);
sizes.push_back(tf_sizes[2]);
} else {
sizes.push_back(tf_sizes[2]);
sizes.push_back(tf_sizes[1]);
sizes.push_back(tf_sizes[0]);
}
OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(sizes, tfshape));
}
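
A minimal sketch of the size reordering this helper performs (plain Python, hypothetical function name), assuming the MKL sizes arrive in `[W, H, C, N]` order per the `MklDims` enum:

```python
def mkl_sizes_to_tf_sizes(mkl_sizes, data_format="NHWC"):
    # mkl_sizes is ordered [W, H, C, N], innermost to outermost.
    w, h, c, n = mkl_sizes
    if data_format == "NHWC":
        return [n, h, w, c]
    return [n, c, h, w]  # NCHW

print(mkl_sizes_to_tf_sizes([7, 5, 3, 2], "NHWC"))  # [2, 5, 7, 3]
print(mkl_sizes_to_tf_sizes([7, 5, 3, 2], "NCHW"))  # [2, 3, 5, 7]
```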
inline int32 GetMklTensorDimIndex(char dimension) {
switch (dimension) {
case 'N':
return MklDims::N;
case 'C':
return MklDims::C;
case 'H':
return MklDims::H;
case 'W':
return MklDims::W;
default:
LOG(FATAL) << "Invalid dimension: " << dimension;
return -1; // Avoid compiler warning about missing return value
}
}
inline int64 GetMklTensorDim(const MklShape& mklshape, char dimension) {
int index = GetMklTensorDimIndex(dimension);
CHECK(index >= 0 && index < mklshape.GetDimension())
<< "Invalid index from the dimension: " << index << ", " << dimension;
return mklshape.dim_size(index);
}
namespace mkl_layer_registry {
static const char* kMklLayerLabel = "MklLayer";
static const string kMklLayerLabelPattern = "label='MklLayer'";
static const char* kMklLayerLabelPattern = "label='MklLayer'";
// Check whether opname is registered as MKL-compliant in the registry.
// Check whether opname with type T is registered as MKL-compliant.
//
// @input: name of the op
// @input: T datatype to be used for checking op
// @return: true if opname is registered as Mkl layer op
static inline bool IsMklLayer(const std::string& op_name) {
static inline bool IsMklLayer(const std::string& op_name, DataType T) {
string kernel = KernelsRegisteredForOp(op_name);
return kernel.find(kMklLayerLabelPattern) != string::npos;
// Currently, MKL only supports float type for ops. So we check if
// the type is float. Actually, we should query kernel registration and
// find out if op is supported for type T. But there is no API to query
// kernel registration using name and type.
bool result =
(kernel.find(kMklLayerLabelPattern) != string::npos) && (T == DT_FLOAT);
if (result == true) {
VLOG(1) << "mkl_layer_registry::" << op_name << " is " << kMklLayerLabel;
}
return result;
}
} // namespace mkl_layer_registry

View File

@ -12,9 +12,8 @@ we do not have timelines for these features.
### Improve non-Python language support
* Improve C++ API for graph construction and gradients
* Java language support
* Go language support
* Support for adding gradient computation for graphs constructed in other
languages (C++, Java, Go etc.)
### Making TensorFlow easier to use
* High-level APIs

View File

@ -229,7 +229,7 @@ do the following to run it from Python :
```python
import tensorflow as tf
zero_out_module = tf.load_op_library('zero_out.so')
zero_out_module = tf.load_op_library('./zero_out.so')
with tf.Session(''):
zero_out_module.zero_out([[1, 2], [3, 4]]).eval()
@ -243,14 +243,13 @@ named `ZeroOut` in the C++ files, the python function will be called `zero_out`.
To make the op available as a regular function `import`-able from a Python
module, it may be useful to have the `load_op_library` call in a Python source
file as follows (see [zero_out_op_1.py](https://www.tensorflow.org/code/tensorflow/examples/adding_an_op/zero_out_op_1.py))
:
file as follows:
```python
import tensorflow as tf
_zero_out_module = tf.load_op_library('zero_out_op_kernel_1.so')
zero_out = _zero_out_module.zero_out
zero_out_module = tf.load_op_library('./zero_out.so')
zero_out = zero_out_module.zero_out
```
## Verify that the op works
@ -264,7 +263,7 @@ import tensorflow as tf
class ZeroOutTest(tf.test.TestCase):
def testZeroOut(self):
zero_out_module = tf.load_op_library('zero_out.so')
zero_out_module = tf.load_op_library('./zero_out.so')
with self.test_session():
result = zero_out_module.zero_out([5, 4, 3, 2, 1])
self.assertAllEqual(result.eval(), [5, 0, 0, 0, 0])

View File

@ -71,7 +71,7 @@ is a constant. Like all TensorFlow constants, it takes no inputs, and it outputs
a value it stores internally. We can create two floating point Tensors `node1`
and `node2` as follows:
```python
node1 = tf.constant(3.0, tf.float32)
node1 = tf.constant(3.0, dtype=tf.float32)
node2 = tf.constant(4.0) # also tf.float32 implicitly
print(node1, node2)
```
@ -110,7 +110,7 @@ print("sess.run(node3): ",sess.run(node3))
```
The last two print statements produce
```
node3: Tensor("Add_2:0", shape=(), dtype=float32)
node3: Tensor("Add:0", shape=(), dtype=float32)
sess.run(node3): 7.0
```
@ -173,8 +173,8 @@ initial value:
```python
W = tf.Variable([.3], tf.float32)
b = tf.Variable([-.3], tf.float32)
W = tf.Variable([.3], dtype=tf.float32)
b = tf.Variable([-.3], dtype=tf.float32)
x = tf.placeholder(tf.float32)
linear_model = W * x + b
```
@ -294,8 +294,8 @@ import numpy as np
import tensorflow as tf
# Model parameters
W = tf.Variable([.3], tf.float32)
b = tf.Variable([-.3], tf.float32)
W = tf.Variable([.3], dtype=tf.float32)
b = tf.Variable([-.3], dtype=tf.float32)
# Model input and output
x = tf.placeholder(tf.float32)
linear_model = W * x + b

View File

@ -27,7 +27,7 @@ https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/learn/w
# Mac OS X
$ sudo easy_install pip
$ sudo easy_install --upgrade six
```
```
2. Use `pip` to install pandas:

View File

@ -32,9 +32,7 @@ on API >= 14 devices.
(https://arxiv.org/abs/1610.07629) to restyle the camera preview image
to that of a number of different artists.
<img src="sample_images/classify1.jpg" width="30%">
<img src="sample_images/stylize1.jpg" width="30%">
<img src="sample_images/detect1.jpg" width="30%">
<img src="sample_images/classify1.jpg" width="30%"><img src="sample_images/stylize1.jpg" width="30%"><img src="sample_images/detect1.jpg" width="30%">
## Prebuilt APK:
@ -83,7 +81,7 @@ instead.
Bazel is the primary build system for TensorFlow. To build with Bazel,
it and the Android NDK and SDK must be installed on your system.
1. Get the recommended Bazel version listed in [os_setup.html](https://www.tensorflow.org/versions/master/get_started/os_setup.html#source)
1. Install the latest version of Bazel as per the instructions [on the Bazel website](https://bazel.build/versions/master/docs/install.html).
2. The Android NDK is required to build the native (C/C++) TensorFlow code.
The current recommended version is 12b, which may be found
[here](https://developer.android.com/ndk/downloads/older_releases.html#ndk-12b-downloads).
@ -96,7 +94,7 @@ it and the Android NDK and SDK must be installed on your system.
##### Edit WORKSPACE
The Android entries in [`<workspace_root>/WORKSPACE`](../../../WORKSPACE#L2-L13)
The Android entries in [`<workspace_root>/WORKSPACE`](../../../WORKSPACE#L19-L32)
must be uncommented with the paths filled in appropriately depending on where
you installed the NDK and SDK. Otherwise an error such as:
"The external label '//external:android/sdk' is not bound to anything" will

View File

@ -67,7 +67,7 @@ apply plugin: 'com.android.application'
android {
compileSdkVersion 23
buildToolsVersion "25.0.1"
buildToolsVersion "25.0.2"
lintOptions {
abortOnError false

View File

@ -278,7 +278,7 @@
" tensor = n.attr['value'].tensor\n",
" size = len(tensor.tensor_content)\n",
" if size > max_const_size:\n",
" tensor.tensor_content = bytes(\"<stripped %d bytes>\"%size)\n",
" tensor.tensor_content = tf.compat.as_bytes(\"<stripped %d bytes>\"%size)\n",
" return strip_def\n",
" \n",
"def rename_nodes(graph_def, rename_func):\n",

View File

@ -21,7 +21,6 @@ import os
import numpy as np
import tensorflow as tf
from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
tf.logging.set_verbosity(tf.logging.INFO)
@ -41,18 +40,15 @@ def main(unused_argv):
"accuracy":
tf.contrib.learn.MetricSpec(
metric_fn=tf.contrib.metrics.streaming_accuracy,
prediction_key=
tf.contrib.learn.prediction_key.PredictionKey.CLASSES),
prediction_key="classes"),
"precision":
tf.contrib.learn.MetricSpec(
metric_fn=tf.contrib.metrics.streaming_precision,
prediction_key=
tf.contrib.learn.prediction_key.PredictionKey.CLASSES),
prediction_key="classes"),
"recall":
tf.contrib.learn.MetricSpec(
metric_fn=tf.contrib.metrics.streaming_recall,
prediction_key=
tf.contrib.learn.prediction_key.PredictionKey.CLASSES)
prediction_key="classes")
}
validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
test_set.data,
@ -66,26 +62,6 @@ def main(unused_argv):
# Specify that all features have real-value data
feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]
validation_metrics = {
"accuracy": MetricSpec(
metric_fn=tf.contrib.metrics.streaming_accuracy,
prediction_key="classes"),
"recall": MetricSpec(
metric_fn=tf.contrib.metrics.streaming_recall,
prediction_key="classes"),
"precision": MetricSpec(
metric_fn=tf.contrib.metrics.streaming_precision,
prediction_key="classes")
}
validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
test_set.data,
test_set.target,
every_n_steps=50,
metrics=validation_metrics,
early_stopping_metric="loss",
early_stopping_metric_minimize=True,
early_stopping_rounds=200)
# Build 3 layer DNN with 10, 20, 10 units respectively.
classifier = tf.contrib.learn.DNNClassifier(
feature_columns=feature_columns,

View File

@ -1,16 +1,18 @@
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package tensorflow is a Go binding to TensorFlow.
//

View File

@ -1,16 +1,18 @@
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package tensorflow_test
@ -26,8 +28,8 @@ import (
"os"
"path/filepath"
tf "github.com/tensorflow/tensorflow/tensorflow/go"
"github.com/tensorflow/tensorflow/tensorflow/go/op"
tf "github.com/tensorflow/tensorflow/tensorflow/go"
)
func Example() {

View File

@ -1,16 +1,18 @@
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package internal generates Go source code with functions for TensorFlow operations.
//
@ -156,12 +158,12 @@ func makeOutputList(op *tf.Operation, start int, output string) ([]tf.Output, in
`))
tmplOp = template.Must(template.New("op").Funcs(template.FuncMap{
"MakeComment": makeComment,
"GoType": goType,
"CamelCase": camelCase,
"Identifier": identifier,
"IsListArg": isListArg,
"IsListAttr": isListAttr,
"MakeComment": makeComment,
"GoType": goType,
"CamelCase": camelCase,
"Identifier": identifier,
"IsListArg": isListArg,
"IsListAttr": isListAttr,
"StripLeadingColon": stripLeadingColon,
}).Parse(`
{{if .OptionalAttrs -}}

View File

@ -1,16 +1,18 @@
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package internal

View File

@ -1,17 +1,18 @@
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package internal
// #cgo LDFLAGS: -ltensorflow

View File

@ -1,16 +1,18 @@
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
//go:generate sh generate.sh

View File

@ -1,16 +1,18 @@
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package tensorflow

View File

@ -1,16 +1,18 @@
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package tensorflow

View File

@ -1,16 +1,18 @@
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package tensorflow

View File

@ -1,16 +1,18 @@
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
//go:generate go generate ../genop
//go:generate go run ../genop/main.go -outfile wrappers.go
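For context on the two `//go:generate` directives just above: they are ordinary comments that the `go generate` tool scans for and executes on demand; the build itself never runs them. Below is a minimal sketch of the same mechanism in an unrelated, hypothetical package (the `color` package and the `stringer` generator are standard Go illustrations, not part of this commit):

```go
// Package color illustrates the go:generate pattern used for wrappers.go:
// the directive below is a plain comment, executed only when a developer
// runs `go generate` for this package.
package color

// `go generate ./...` runs stringer (golang.org/x/tools/cmd/stringer),
// which writes type_string.go containing a String() method for Type.
//go:generate stringer -type=Type

// Type is a small enum; its String() method comes from the generated file.
type Type int

const (
	Red Type = iota
	Green
	Blue
)
```

In the same way, running `go generate` for this package re-executes ../genop/main.go to refresh the generated op wrappers in wrappers.go.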

@@ -1,16 +1,18 @@
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package op defines functions for adding TensorFlow operations to a Graph.
//
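To make that doc comment concrete, here is a minimal usage sketch of the `op` package following the publicly documented graph-building pattern; the program, variable names, and constant string are illustrative and not part of this diff:

```go
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	// The op package builds graph nodes through a Scope, which accumulates
	// errors instead of returning one from every call.
	s := op.NewScope()
	hello := op.Const(s, "Hello from the TensorFlow Go bindings")

	// Finalize returns the underlying graph and any accumulated error.
	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}

	// Execute the graph with the core tensorflow package.
	sess, err := tf.NewSession(graph, &tf.SessionOptions{})
	if err != nil {
		panic(err)
	}
	defer sess.Close()

	out, err := sess.Run(nil, []tf.Output{hello}, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(out[0].Value())
}
```

The design point worth noting is that the Scope collects construction errors, so `Finalize` is the single place where graph-building failures surface.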

@@ -1,16 +1,18 @@
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Tests for the generated code of some operations.

@@ -1,16 +1,18 @@
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package op

@@ -1,16 +1,18 @@
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package op

@@ -1,16 +1,18 @@
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package tensorflow

@@ -1,16 +1,18 @@
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package tensorflow

@@ -1,16 +1,18 @@
// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package tensorflow

@@ -1,16 +1,18 @@
// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package tensorflow

Some files were not shown because too many files have changed in this diff.