From 71896cc7e5bd3d1b8b5bb615eac7bebf86fa998c Mon Sep 17 00:00:00 2001 From: Raghuraman Krishnamoorthi Date: Wed, 3 Jan 2018 19:06:54 -0800 Subject: [PATCH] Merge changes from github. PiperOrigin-RevId: 180746153 --- LICENSE | 2 +- README.md | 4 +- tensorflow/cc/gradients/math_grad.cc | 18 - tensorflow/cc/gradients/math_grad_test.cc | 8 - tensorflow/cc/ops/while_loop.cc | 2 +- tensorflow/compiler/tf2xla/kernels/BUILD | 3 + .../compiler/tf2xla/kernels/while_op.cc | 2 +- .../xla/service/cpu/cpu_runtime_avx.cc | 4 +- .../xla/service/cpu/cpu_runtime_avx.h | 11 +- .../xla/service/cpu/cpu_runtime_neon.cc | 4 +- .../xla/service/cpu/cpu_runtime_neon.h | 12 +- .../xla/service/cpu/cpu_runtime_sse4_1.cc | 4 +- .../xla/service/cpu/cpu_runtime_sse4_1.h | 13 +- .../xla/service/cpu/llvm_ir_runtime.cc | 14 +- .../xla/service/cpu/simple_orc_jit.cc | 12 +- tensorflow/contrib/cmake/CMakeLists.txt | 4 +- tensorflow/contrib/cmake/make.sh | 26 + tensorflow/contrib/cmake/python_modules.txt | 28 +- tensorflow/contrib/cmake/tf_python.cmake | 33 +- .../estimator/python/gan_estimator_impl.py | 2 +- .../contrib/layers/python/layers/layers.py | 27 +- .../libsvm/kernels/decode_libsvm_op.cc | 27 +- tensorflow/contrib/lite/model.cc | 3 +- .../lite/models/smartreply/g3doc/README.md | 4 +- tensorflow/contrib/predictor/BUILD | 1 + .../predictor/core_estimator_predictor.py | 4 +- .../contrib/predictor/predictor_factories.py | 4 +- .../predictor/predictor_factories_test.py | 24 + .../contrib/signal/python/ops/mfcc_ops.py | 2 +- tensorflow/contrib/slim/README.md | 2 +- .../contrib/slim/python/slim/learning.py | 3 +- tensorflow/contrib/tensor_forest/BUILD | 1 - .../tensor_forest/client/random_forest.py | 3 +- tensorflow/contrib/training/BUILD | 1 - tensorflow/core/BUILD | 3 +- .../core/common_runtime/gpu/gpu_device.cc | 4 + tensorflow/core/kernels/BUILD | 11 +- tensorflow/core/kernels/constant_op.cc | 34 - .../core/kernels/conv_grad_filter_ops.cc | 8 + tensorflow/core/kernels/conv_ops_gpu_3.cu.cc | 667 +++++++++++++----- tensorflow/core/kernels/data/dataset.h | 2 +- tensorflow/core/kernels/fill_functor.cc | 44 ++ tensorflow/core/kernels/fill_functor.cu.cc | 115 +++ .../core/kernels/fused_batch_norm_op.cc | 17 + .../core/kernels/fused_batch_norm_op.cu.cc | 7 + tensorflow/core/kernels/fused_batch_norm_op.h | 6 + tensorflow/core/kernels/matrix_inverse_op.cc | 2 +- tensorflow/core/kernels/pooling_ops_common.cc | 6 + tensorflow/core/kernels/record_input_op.cc | 1 + tensorflow/core/kernels/where_op.cc | 2 +- tensorflow/core/ops/bitwise_ops.cc | 2 +- tensorflow/core/platform/default/logging.cc | 4 +- tensorflow/core/platform/default/logging.h | 4 + tensorflow/core/platform/s3/BUILD | 19 + tensorflow/core/platform/s3/aws_logging.cc | 121 ++++ tensorflow/core/platform/s3/aws_logging.h | 68 ++ tensorflow/core/platform/s3/s3_file_system.cc | 10 + tensorflow/docs_src/get_started/estimator.md | 4 +- .../performance/datasets_performance.md | 2 +- tensorflow/docs_src/programmers_guide/faq.md | 2 +- .../docs_src/programmers_guide/graphs.md | 2 +- .../docs_src/tutorials/audio_recognition.md | 2 +- .../examples/label_image/label_image.py | 6 +- tensorflow/java/BUILD | 40 +- tensorflow/java/src/gen/cc/java_defs.h | 273 +++++++ tensorflow/java/src/gen/cc/op_gen_main.cc | 8 +- tensorflow/java/src/gen/cc/op_generator.cc | 2 + tensorflow/java/src/gen/cc/op_generator.h | 2 + tensorflow/java/src/gen/gen_ops.bzl | 2 +- tensorflow/python/data/ops/dataset_ops.py | 2 +- .../python/debug/examples/examples_test.sh | 3 + 
tensorflow/python/estimator/estimator.py | 3 +- tensorflow/python/estimator/estimator_test.py | 16 +- tensorflow/python/estimator/training.py | 10 +- .../python/kernel_tests/conv_ops_test.py | 14 + .../python/kernel_tests/pooling_ops_test.py | 22 + tensorflow/python/ops/bitwise_ops_test.py | 31 + tensorflow/python/ops/gradients_impl.py | 47 +- tensorflow/python/ops/gradients_test.py | 39 + tensorflow/python/ops/image_ops_impl.py | 553 ++++++++------- tensorflow/python/ops/image_ops_test.py | 35 + .../python/ops/nn_fused_batchnorm_test.py | 53 ++ .../python/training/learning_rate_decay.py | 111 ++- .../training/learning_rate_decay_test.py | 76 +- tensorflow/python/training/training.py | 1 + .../tools/api/golden/tensorflow.train.pbtxt | 6 +- tensorflow/tools/ci_build/Dockerfile.android | 2 +- tensorflow/tools/ci_build/Dockerfile.cpu | 2 +- tensorflow/tools/ci_build/Dockerfile.cpu.mpi | 2 +- tensorflow/tools/ci_build/Dockerfile.hadoop | 2 +- tensorflow/tools/ci_build/Dockerfile.pi | 2 +- .../tools/ci_build/Dockerfile.pi-python3 | 2 +- tensorflow/tools/ci_build/ci_sanity.sh | 4 +- .../tools/ci_build/install/install_bazel.sh | 2 +- .../install/install_python3.6_pip_packages.sh | 19 + tensorflow/tools/compatibility/BUILD | 11 +- tensorflow/tools/compatibility/tf_upgrade.py | 483 ++++++++++++- .../tools/compatibility/tf_upgrade_test.py | 5 +- tensorflow/tools/docker/Dockerfile.devel | 2 +- tensorflow/tools/docker/Dockerfile.devel-gpu | 2 +- tensorflow/tools/pip_package/setup.py | 13 +- tensorflow/workspace.bzl | 8 +- third_party/gpus/cuda/BUILD.tpl | 1 + 103 files changed, 2654 insertions(+), 744 deletions(-) create mode 100755 tensorflow/contrib/cmake/make.sh create mode 100644 tensorflow/core/kernels/fill_functor.cu.cc create mode 100644 tensorflow/core/platform/s3/aws_logging.cc create mode 100644 tensorflow/core/platform/s3/aws_logging.h create mode 100644 tensorflow/java/src/gen/cc/java_defs.h diff --git a/LICENSE b/LICENSE index 15ae4214045..4862420c023 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright 2017 The TensorFlow Authors. All rights reserved. +Copyright 2018 The TensorFlow Authors. All rights reserved. Apache License Version 2.0, January 2004 diff --git a/README.md b/README.md index 2c4afb0b554..de7a94e581e 100644 --- a/README.md +++ b/README.md @@ -46,8 +46,8 @@ packages on Linux, Mac, and Windows. 
**Individual whl files** -* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/)) -* Linux GPU: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/42/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/)) +* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 
3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/)) / [Python 3.6](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp36-cp36m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=cpu-slave/)) +* Linux GPU: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/42/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/)) / [Python 3.6](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp36-cp36m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/)) * Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/)) * Windows CPU-only: [Python 3.5 
64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/)) * Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly_gpu-1.head-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly_gpu-1.head-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=36/)) diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index afd92fbf488..52c177212a8 100644 --- a/tensorflow/cc/gradients/math_grad.cc +++ b/tensorflow/cc/gradients/math_grad.cc @@ -763,24 +763,6 @@ Status LgammaGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("Lgamma", LgammaGrad); -Status SelectGrad(const Scope& scope, const Operation& op, - const std::vector& grad_inputs, - std::vector* grad_outputs) { - auto comparator = op.input(0); - auto x = op.input(1); - auto zeros = ZerosLike(scope, x); - auto grad = grad_inputs[0]; - - auto gx_1 = Where3(scope, comparator, grad, zeros); - auto gx_2 = Where3(scope, comparator, zeros, grad); - - grad_outputs->push_back(NoGradient()); - grad_outputs->push_back(gx_1); - grad_outputs->push_back(gx_2); - return scope.status(); -} -REGISTER_GRADIENT_OP("Select", SelectGrad); - Status MinOrMaxGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc index 7f076960b57..1b4c7c26880 100644 --- a/tensorflow/cc/gradients/math_grad_test.cc +++ b/tensorflow/cc/gradients/math_grad_test.cc @@ -904,13 +904,5 @@ TEST_F(NaryGradTest, Prod) { RunTest({x}, {x_shape}, {y}, {y_shape}); } -TEST_F(NaryGradTest, Select) { - TensorShape shape({3, 4}); - auto x1 = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); - auto x2 = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); - auto y = Where3(scope_, Greater(scope_, x1, x2), x1, x2); - RunTest({x1, x2}, {shape, shape}, {y}, {shape}); -} - } // namespace } // namespace tensorflow diff --git a/tensorflow/cc/ops/while_loop.cc b/tensorflow/cc/ops/while_loop.cc index e0251efb2a4..d1c918d464b 100644 --- a/tensorflow/cc/ops/while_loop.cc +++ b/tensorflow/cc/ops/while_loop.cc @@ -116,7 +116,7 @@ Status CreateCond(const Scope& scope, const CondGraphBuilderFn& cond, return Status::OK(); } -// Create the bdoy subgraph defined by `body`. `outputs` must be non-null and +// Create the body subgraph defined by `body`. `outputs` must be non-null and // empty. 
Status CreateBody(const Scope& scope, const BodyGraphBuilderFn& body, const std::vector& inputs, diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index 0dd95de0ce3..5e1b01878b7 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -4,6 +4,7 @@ package( default_visibility = ["//tensorflow/compiler/tf2xla:internal"], ) +load("//tensorflow:tensorflow.bzl", "tf_copts") load("//tensorflow:tensorflow.bzl", "tf_kernel_library") tf_kernel_library( @@ -167,6 +168,7 @@ tf_kernel_library( cc_library( name = "index_ops_kernel_argmax_float_1d", srcs = ["index_ops_kernel_argmax_float_1d.cc"], + copts = tf_copts(), visibility = ["//visibility:public"], deps = [ "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry", @@ -179,6 +181,7 @@ cc_library( cc_library( name = "index_ops_kernel_argmax_float_2d", srcs = ["index_ops_kernel_argmax_float_2d.cc"], + copts = tf_copts(), visibility = ["//visibility:public"], deps = [ "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry", diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc index 07a565e34b8..ee466520dd1 100644 --- a/tensorflow/compiler/tf2xla/kernels/while_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc @@ -39,7 +39,7 @@ Status MakeXlaCompilerArgumentsFromInputs( *has_uninitialized_vars = false; *has_tensor_arrays = false; for (int i = 0; i < ctx->num_inputs(); ++i) { - VLOG(2) << " Input " << i + VLOG(2) << " Input " << i << " type: " << DataTypeString(ctx->input_type(i)) << " shape: " << ctx->InputShape(i).DebugString(); XlaCompiler::Argument& arg = (*args)[i]; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc index 181deedde71..b1c1142e8d9 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc @@ -19,7 +19,7 @@ limitations under the License. #include "third_party/eigen3/Eigen/Core" -#ifdef __AVX__ +#ifdef TF_XLA_HAS_AVX xla::cpu::runtime::V8F32AVX __xla_cpu_runtime_ExpV8F32AVX( xla::cpu::runtime::V8F32AVX x) { return Eigen::internal::pexp(x); @@ -29,7 +29,7 @@ xla::cpu::runtime::V8F32AVX __xla_cpu_runtime_LogV8F32AVX( xla::cpu::runtime::V8F32AVX x) { return Eigen::internal::plog(x); } -#endif // __AVX__ +#endif // TF_XLA_HAS_AVX namespace xla { namespace cpu { diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h index 74ae6d00c91..e5c782f93f5 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h @@ -24,6 +24,11 @@ limitations under the License. #include "tensorflow/core/platform/macros.h" +#if defined(__AVX__) +#include +#define TF_XLA_HAS_AVX +#endif + namespace xla { namespace cpu { namespace runtime { @@ -31,14 +36,16 @@ namespace runtime { extern const char *const kExpV8F32AVXSymbolName; extern const char *const kLogV8F32AVXSymbolName; -typedef float V8F32AVX __attribute__((__vector_size__(32))); +#ifdef TF_XLA_HAS_AVX +typedef __m256 V8F32AVX; +#endif } // namespace runtime } // namespace cpu } // namespace xla extern "C" { -#ifdef __AVX__ +#ifdef TF_XLA_HAS_AVX // The following functions are vectorized versions of a selection of libm // library functions. // References to these functions are created by the LLVM vectorizer. 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.cc index abe792b2787..8099b722f10 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.cc @@ -19,7 +19,7 @@ limitations under the License. #include "third_party/eigen3/Eigen/Core" -#ifdef __ARM_NEON__ +#ifdef TF_XLA_HAS_NEON xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_ExpV4F32NEON( xla::cpu::runtime::V4F32NEON x) { @@ -32,7 +32,7 @@ xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_LogV4F32NEON( return Eigen::internal::plog(p); } -#endif // __ARM_NEON__ +#endif // TF_XLA_HAS_NEON namespace xla { namespace cpu { diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h index 645a43858fb..2f5d1a872aa 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h @@ -27,6 +27,7 @@ limitations under the License. // __attribute__((__vector_size__(*))). Unfortunately, the typedef for the ARM // NEON SIMD types is not portable, so the type has to come from #include +#define TF_XLA_HAS_NEON #endif // __ARM_NEON__ namespace xla { @@ -36,12 +37,9 @@ namespace runtime { extern const char *const kExpV4F32NEONSymbolName; extern const char *const kLogV4F32NEONSymbolName; -#ifdef __ARM_NEON__ +#ifdef TF_XLA_HAS_NEON typedef float32x4_t V4F32NEON; -#else -// On non-ARM platforms ensure the declaration is present -struct V4F32NEON; -#endif // __ARM_NEON__ +#endif // TF_XLA_HAS_NEON } // namespace runtime } // namespace cpu @@ -49,7 +47,7 @@ struct V4F32NEON; extern "C" { -#ifdef __ARM_NEON__ +#ifdef TF_XLA_HAS_NEON // The following functions are vectorized versions of a selection of libm // library functions. // References to these functions are created by the LLVM vectorizer. @@ -58,7 +56,7 @@ xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_ExpV4F32NEON( xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_LogV4F32NEON( xla::cpu::runtime::V4F32NEON x); -#endif // __ARM_NEON__ +#endif // TF_XLA_HAS_NEON } #endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_NEON_H_ diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc index a9a45db5a42..d8ecf231cc8 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc @@ -19,7 +19,7 @@ limitations under the License. #include "third_party/eigen3/Eigen/Core" -#ifdef __SSE4_1__ +#ifdef TF_XLA_HAS_SSE4_1 xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_ExpV4F32SSE( xla::cpu::runtime::V4F32SSE x) { @@ -33,7 +33,7 @@ xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_LogV4F32SSE( return Eigen::internal::plog(p); } -#endif // __SSE4_1__ +#endif // TF_XLA_HAS_SSE4_1 namespace xla { namespace cpu { diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h index 80ca4243a2c..aeb1eda23f7 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h @@ -24,6 +24,13 @@ limitations under the License. #include "tensorflow/core/platform/macros.h" +// MSVC does not have __SSE4_1__ macro. Eigen enables EIGEN_VECTORIZE_SSE4_1 +// when __AVX__ is defined, we should do the same. 
+#if defined(__SSE4_1__) || (defined(_MSC_VER) && defined(__AVX__)) +#include +#define TF_XLA_HAS_SSE4_1 +#endif + namespace xla { namespace cpu { namespace runtime { @@ -31,7 +38,9 @@ namespace runtime { extern const char *const kExpV4F32SSESymbolName; extern const char *const kLogV4F32SSESymbolName; -typedef float V4F32SSE __attribute__((__vector_size__(16))); +#ifdef TF_XLA_HAS_SSE4_1 +typedef __m128 V4F32SSE; +#endif } // namespace runtime } // namespace cpu @@ -39,7 +48,7 @@ typedef float V4F32SSE __attribute__((__vector_size__(16))); extern "C" { -#ifdef __SSE4_1__ +#ifdef TF_XLA_HAS_SSE4_1 // The following functions are vectorized versions of a selection of libm // library functions. // References to these functions are created by the LLVM vectorizer. diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc index 0f71258ff07..0336fa61312 100644 --- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc @@ -64,14 +64,14 @@ llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module, &ir_builder), llvm::ConstantFP::get(vector_type, 9.0), &ir_builder); - std::array numerator_coeffs( - {-2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f, - 5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f, - 4.89352455891786e-03f}); + std::array numerator_coeffs{ + -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f, + 5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f, + 4.89352455891786e-03f}; - std::array denominator_coeffs( - {1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f, - 4.89352518554385e-03f}); + std::array denominator_coeffs{ + 1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f, + 4.89352518554385e-03f}; llvm::Value* input_squared = ir_builder.CreateFMul(input_clamped, input_clamped); diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc index 43ab2ec524e..5403bf48b74 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc @@ -103,17 +103,17 @@ llvm::StringRef GetHostCpuName() { CompilerFunctor::VectorIntrinsics GetAvailableIntrinsics() { CompilerFunctor::VectorIntrinsics intrinsics; -#ifdef __SSE4_1__ +#ifdef TF_XLA_HAS_SSE4_1 intrinsics.sse_intrinsics = true; #else intrinsics.sse_intrinsics = false; #endif -#ifdef __AVX__ +#ifdef TF_XLA_HAS_AVX intrinsics.avx_intrinsics = true; #else intrinsics.avx_intrinsics = false; #endif -#ifdef __ARM_NEON__ +#ifdef TF_XLA_HAS_NEON intrinsics.neon_intrinsics = true; #else intrinsics.neon_intrinsics = false; @@ -215,15 +215,15 @@ bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64); -#ifdef __ARM_NEON__ +#ifdef TF_XLA_HAS_NEON REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32NEON); REGISTER_CPU_RUNTIME_SYMBOL(LogV4F32NEON); #endif -#ifdef __SSE4_1__ +#ifdef TF_XLA_HAS_SSE4_1 REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32SSE); REGISTER_CPU_RUNTIME_SYMBOL(LogV4F32SSE); #endif -#ifdef __AVX__ +#ifdef TF_XLA_HAS_AVX REGISTER_CPU_RUNTIME_SYMBOL(ExpV8F32AVX); REGISTER_CPU_RUNTIME_SYMBOL(LogV8F32AVX); #endif diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt index 2c4b7486d5d..817e96f5da0 100644 --- 
a/tensorflow/contrib/cmake/CMakeLists.txt +++ b/tensorflow/contrib/cmake/CMakeLists.txt @@ -407,7 +407,9 @@ endif() # Let's get to work! include(tf_core_framework.cmake) -include(tf_stream_executor.cmake) +if (tensorflow_ENABLE_GPU) + include(tf_stream_executor.cmake) +endif() include(tf_core_cpu.cmake) include(tf_core_ops.cmake) diff --git a/tensorflow/contrib/cmake/make.sh b/tensorflow/contrib/cmake/make.sh new file mode 100755 index 00000000000..eed3c34aba1 --- /dev/null +++ b/tensorflow/contrib/cmake/make.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +( +cd "$(dirname "$0")" +mkdir -p _build + +( +cd _build +rm -rf -- * +cmake .. +) +) diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt index 92edce77dff..bd052fb8b67 100644 --- a/tensorflow/contrib/cmake/python_modules.txt +++ b/tensorflow/contrib/cmake/python_modules.txt @@ -86,9 +86,7 @@ tensorflow/python/ops/distributions tensorflow/python/ops/linalg tensorflow/python/ops/losses tensorflow/python/platform -tensorflow/python/platform/default -tensorflow/python/platform/summary -tensorflow/python/profiler/ +tensorflow/python/profiler tensorflow/python/profiler/internal tensorflow/python/saved_model tensorflow/python/summary @@ -115,8 +113,6 @@ tensorflow/contrib/batching/kernels tensorflow/contrib/batching/python tensorflow/contrib/batching/python/ops tensorflow/contrib/bayesflow -tensorflow/contrib/bayesflow/examples -tensorflow/contrib/bayesflow/examples/reinforce_simple tensorflow/contrib/bayesflow/python tensorflow/contrib/bayesflow/python/ops tensorflow/contrib/boosted_trees @@ -212,16 +208,6 @@ tensorflow/contrib/input_pipeline/python/ops tensorflow/contrib/integrate tensorflow/contrib/integrate/python tensorflow/contrib/integrate/python/ops -tensorflow/contrib/ios_examples -tensorflow/contrib/ios_examples/benchmark -tensorflow/contrib/ios_examples/benchmark/benchmark.xcodeproj -tensorflow/contrib/ios_examples/benchmark/data -tensorflow/contrib/ios_examples/camera -tensorflow/contrib/ios_examples/camera/camera_example.xcodeproj -tensorflow/contrib/ios_examples/camera/en.lproj -tensorflow/contrib/ios_examples/simple -tensorflow/contrib/ios_examples/simple/data -tensorflow/contrib/ios_examples/simple/tf_ios_makefile_example.xcodeproj tensorflow/contrib/keras tensorflow/contrib/keras/api tensorflow/contrib/keras/api/keras @@ -276,9 +262,6 @@ tensorflow/contrib/layers/python/ops tensorflow/contrib/learn tensorflow/contrib/learn/python tensorflow/contrib/learn/python/learn -tensorflow/contrib/learn/python/learn/dataframe -tensorflow/contrib/learn/python/learn/dataframe/queues -tensorflow/contrib/learn/python/learn/dataframe/transforms tensorflow/contrib/learn/python/learn/datasets tensorflow/contrib/learn/python/learn/datasets/data tensorflow/contrib/learn/python/learn/estimators @@ -301,6 +284,9 @@ 
tensorflow/contrib/linear_optimizer/kernels tensorflow/contrib/linear_optimizer/kernels/g3doc tensorflow/contrib/linear_optimizer/python tensorflow/contrib/linear_optimizer/python/ops +# TODO(drpngx): Fix failing imports +# tensorflow/contrib/lite/python +# tensorflow/contrib/lite/toco/python tensorflow/contrib/lookup tensorflow/contrib/losses tensorflow/contrib/losses/python @@ -314,7 +300,6 @@ tensorflow/contrib/memory_stats/python tensorflow/contrib/memory_stats/python/ops tensorflow/contrib/meta_graph_transform tensorflow/contrib/metrics -tensorflow/contrib/metrics/ops tensorflow/contrib/metrics/python tensorflow/contrib/metrics/python/metrics tensorflow/contrib/metrics/python/ops @@ -346,7 +331,6 @@ tensorflow/contrib/pi_examples/label_image tensorflow/contrib/pi_examples/label_image/data tensorflow/contrib/periodic_resample tensorflow/contrib/periodic_resample/python -tensorflow/contrib/periodic_resample/python/kernels tensorflow/contrib/periodic_resample/python/ops tensorflow/contrib/predictor tensorflow/contrib/quantization @@ -411,13 +395,9 @@ tensorflow/contrib/tensorboard/plugins tensorflow/contrib/tensorboard/plugins/projector tensorflow/contrib/tensor_forest tensorflow/contrib/tensor_forest/client -tensorflow/contrib/tensor_forest/core -tensorflow/contrib/tensor_forest/core/ops -tensorflow/contrib/tensor_forest/data tensorflow/contrib/tensor_forest/hybrid tensorflow/contrib/tensor_forest/hybrid/core tensorflow/contrib/tensor_forest/hybrid/core/ops -tensorflow/contrib/tensor_forest/hybrid/ops tensorflow/contrib/tensor_forest/hybrid/python tensorflow/contrib/tensor_forest/hybrid/python/layers tensorflow/contrib/tensor_forest/hybrid/python/models diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake index 8db6929e31a..207c7d16f8c 100755 --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -126,10 +126,15 @@ STRING(REGEX REPLACE ";" "\\\\;" python_protos "${python_protos}") STRING(REGEX REPLACE "\n" ";" python_protos "${python_protos}") foreach(python_proto ${python_protos}) - file(GLOB_RECURSE tf_python_protos_src RELATIVE ${tensorflow_source_dir} - "${tensorflow_source_dir}/${python_proto}/*.proto" - ) - list(APPEND tf_python_protos_srcs ${tf_python_protos_src}) + if(NOT python_proto MATCHES "\#") + if(NOT EXISTS "${tensorflow_source_dir}/${python_proto}") + message(SEND_ERROR "Python proto directory not found: ${python_proto}") + endif() + file(GLOB_RECURSE tf_python_protos_src RELATIVE ${tensorflow_source_dir} + "${tensorflow_source_dir}/${python_proto}/*.proto" + ) + list(APPEND tf_python_protos_srcs ${tf_python_protos_src}) + endif() endforeach(python_proto) RELATIVE_PROTOBUF_GENERATE_PYTHON( @@ -142,10 +147,15 @@ STRING(REGEX REPLACE ";" "\\\\;" python_protos_cc "${python_protos_cc}") STRING(REGEX REPLACE "\n" ";" python_protos_cc "${python_protos_cc}") foreach(python_proto_cc ${python_protos_cc}) - file(GLOB_RECURSE tf_python_protos_cc_src RELATIVE ${tensorflow_source_dir} - "${tensorflow_source_dir}/${python_proto_cc}/*.proto" - ) - list(APPEND tf_python_protos_cc_srcs ${tf_python_protos_cc_src}) + if(NOT python_proto_cc MATCHES "\#") + if(NOT EXISTS "${tensorflow_source_dir}/${python_proto_cc}") + message(SEND_ERROR "Python proto CC directory not found: ${python_proto_cc}") + endif() + file(GLOB_RECURSE tf_python_protos_cc_src RELATIVE ${tensorflow_source_dir} + "${tensorflow_source_dir}/${python_proto_cc}/*.proto" + ) + list(APPEND tf_python_protos_cc_srcs ${tf_python_protos_cc_src}) 
+ endif() endforeach(python_proto_cc) RELATIVE_PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS @@ -199,7 +209,12 @@ STRING(REGEX REPLACE ";" "\\\\;" python_modules "${python_modules}") STRING(REGEX REPLACE "\n" ";" python_modules "${python_modules}") foreach(python_module ${python_modules}) - add_python_module(${python_module}) + if(NOT python_module MATCHES "\#") + if(NOT EXISTS "${tensorflow_source_dir}/${python_module}") + message(SEND_ERROR "Python module not found: ${python_module}") + endif() + add_python_module(${python_module}) + endif() endforeach(python_module) add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py index d3dca3d9e75..0d51c282a89 100644 --- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py +++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py @@ -96,7 +96,7 @@ class GANEstimator(estimator.Estimator): # Generate samples from generator. predictions = np.array([ x for x in gan_estimator.predict(predict_input_fn)]) - ``` + ``` """ def __init__(self, diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py index 0d25a098525..62bc1ab15d0 100644 --- a/tensorflow/contrib/layers/python/layers/layers.py +++ b/tensorflow/contrib/layers/python/layers/layers.py @@ -2674,16 +2674,25 @@ def spatial_softmax(features, indexing='ij') pos_x = array_ops.reshape(pos_x, [height * width]) pos_y = array_ops.reshape(pos_y, [height * width]) + if temperature is None: - temperature_collections = utils.get_variable_collections( - variables_collections, 'temperature') - temperature = variables.model_variable( - 'temperature', - shape=(), - dtype=dtypes.float32, - initializer=init_ops.ones_initializer(), - collections=temperature_collections, - trainable=trainable) + temp_initializer = init_ops.ones_initializer() + else: + temp_initializer = init_ops.constant_initializer(temperature) + + if not trainable: + temp_collections = None + else: + temp_collections = utils.get_variable_collections( + variables_collections, 'temperature') + + temperature = variables.model_variable( + 'temperature', + shape=(), + dtype=dtypes.float32, + initializer=temp_initializer, + collections=temp_collections, + trainable=trainable) if data_format == 'NCHW': features = array_ops.reshape(features, [-1, height * width]) else: diff --git a/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc b/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc index 27ce55d5687..616240fda8f 100644 --- a/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc +++ b/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc @@ -22,10 +22,6 @@ limitations under the License. 
#include "tensorflow/core/lib/strings/str_util.h" namespace tensorflow { -namespace { -template -bool ConvertHelper(const string& s, T* value); -} template class DecodeLibsvmOp : public OpKernel { @@ -57,7 +53,7 @@ class DecodeLibsvmOp : public OpKernel { "]: \"", input_flat(i), "\"")); Tlabel label_value; OP_REQUIRES( - ctx, ConvertHelper(entries[0], &label_value), + ctx, strings::SafeStringToNumeric(entries[0], &label_value), errors::InvalidArgument("Label format incorrect: ", entries[0])); label(i) = label_value; for (int j = 1; j < entries.size(); j++) { @@ -74,7 +70,7 @@ class DecodeLibsvmOp : public OpKernel { "Feature index should be >= 0, got ", feature_index)); T feature_value; OP_REQUIRES( - ctx, ConvertHelper(pair[1], &feature_value), + ctx, strings::SafeStringToNumeric(pair[1], &feature_value), errors::InvalidArgument("Feature format incorrect: ", entries[j])); out_values.emplace_back(feature_value); out_indices.emplace_back(std::pair(i, feature_index)); @@ -128,25 +124,6 @@ class DecodeLibsvmOp : public OpKernel { int64 num_features_; }; -namespace { -template <> -bool ConvertHelper(const string& s, float* value) { - return strings::safe_strtof(s.c_str(), value); -} -template <> -bool ConvertHelper(const string& s, double* value) { - return strings::safe_strtod(s.c_str(), value); -} -template <> -bool ConvertHelper(const string& s, int32* value) { - return strings::safe_strto32(s.c_str(), value); -} -template <> -bool ConvertHelper(const string& s, int64* value) { - return strings::safe_strto64(s.c_str(), value); -} -} // namespace - #define REGISTER_KERNEL(type) \ REGISTER_KERNEL_BUILDER(Name("DecodeLibsvm") \ .Device(DEVICE_CPU) \ diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc index fc80bb8e87b..bc3a73ad297 100644 --- a/tensorflow/contrib/lite/model.cc +++ b/tensorflow/contrib/lite/model.cc @@ -80,8 +80,7 @@ FlatBufferModel::FlatBufferModel(const char* filename, bool mmap_file, } else { allocation_ = new FileCopyAllocation(filename, error_reporter); } - if (!allocation_->valid()) return; - if (!CheckModelIdentifier()) return; + if (!allocation_->valid() || !CheckModelIdentifier()) return; model_ = VerifyAndGetModel(allocation_->base(), allocation_->bytes()); } diff --git a/tensorflow/contrib/lite/models/smartreply/g3doc/README.md b/tensorflow/contrib/lite/models/smartreply/g3doc/README.md index cab5dcca43a..a6d75648b3f 100644 --- a/tensorflow/contrib/lite/models/smartreply/g3doc/README.md +++ b/tensorflow/contrib/lite/models/smartreply/g3doc/README.md @@ -137,8 +137,8 @@ Following are the ops supported for using On-Device Smart Reply model: * **HASHTABLE_LOOKUP** - This is a custom op that uses label id from predict op and looks up the - response text from the given label id. + This is an op inside TensorFlow Lite that uses label id from predict op and + looks up the response text from the given label id. 
## Further Information diff --git a/tensorflow/contrib/predictor/BUILD b/tensorflow/contrib/predictor/BUILD index d7c3d6c3be5..a80f060b91d 100644 --- a/tensorflow/contrib/predictor/BUILD +++ b/tensorflow/contrib/predictor/BUILD @@ -144,6 +144,7 @@ py_test( tags = ["no_pip"], deps = [ ":predictor_factories", + ":testing_common", ], ) diff --git a/tensorflow/contrib/predictor/core_estimator_predictor.py b/tensorflow/contrib/predictor/core_estimator_predictor.py index bd5174aef81..d78d94c2699 100644 --- a/tensorflow/contrib/predictor/core_estimator_predictor.py +++ b/tensorflow/contrib/predictor/core_estimator_predictor.py @@ -68,10 +68,10 @@ class CoreEstimatorPredictor(predictor.Predictor): serving_input_receiver = serving_input_receiver_fn() signature_def = _get_signature_def( serving_input_receiver, estimator, output_key) - checkpoint_path = estimator.model_dir + checkpoint_dir = estimator.model_dir self._session = monitored_session.MonitoredSession( session_creator=monitored_session.ChiefSessionCreator( - checkpoint_filename_with_path=checkpoint_path)) + checkpoint_dir=checkpoint_dir)) feed_tensor_info = signature_def.inputs self._feed_tensors = {k: self._graph.get_tensor_by_name(v.name) diff --git a/tensorflow/contrib/predictor/predictor_factories.py b/tensorflow/contrib/predictor/predictor_factories.py index 9485187c5d5..04b5d5bdf15 100644 --- a/tensorflow/contrib/predictor/predictor_factories.py +++ b/tensorflow/contrib/predictor/predictor_factories.py @@ -21,6 +21,8 @@ from __future__ import print_function from tensorflow.contrib.predictor import contrib_estimator_predictor from tensorflow.contrib.predictor import core_estimator_predictor from tensorflow.contrib.predictor import saved_model_predictor + +from tensorflow.contrib.learn.python.learn.estimators import estimator as contrib_estimator from tensorflow.python.estimator import estimator as core_estimator @@ -85,7 +87,7 @@ def from_estimator(estimator, TypeError: if `estimator` is a contrib `Estimator` instead of a core `Estimator`. """ - if isinstance(estimator, estimator.Estimator): + if isinstance(estimator, contrib_estimator.Estimator): raise TypeError('Espected estimator to be of type ' 'tf.python.estimator.Estimator, but got type ' 'tf.contrib.learn.Estimator. 
You likely want to call ' diff --git a/tensorflow/contrib/predictor/predictor_factories_test.py b/tensorflow/contrib/predictor/predictor_factories_test.py index 60ffeec6536..e8443e718d1 100644 --- a/tensorflow/contrib/predictor/predictor_factories_test.py +++ b/tensorflow/contrib/predictor/predictor_factories_test.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib.predictor import predictor_factories +from tensorflow.contrib.predictor import testing_common from tensorflow.python.platform import test MODEL_DIR_NAME = 'contrib/predictor/test_export_dir' @@ -46,6 +47,29 @@ class PredictorFactoriesTest(test.TestCase): with self.assertRaisesRegexp(RuntimeError, bad_tags_regex): predictor_factories.from_saved_model(self._export_dir, tags='bad_tag') + def testFromContribEstimator(self): + estimator = testing_common.get_arithmetic_estimator(core=False) + input_fn = testing_common.get_arithmetic_input_fn(core=False) + predictor_factories.from_contrib_estimator(estimator, input_fn, + output_alternative_key='sum') + + def testFromContribEstimatorWithCoreEstimatorRaises(self): + estimator = testing_common.get_arithmetic_estimator(core=True) + input_fn = testing_common.get_arithmetic_input_fn(core=True) + with self.assertRaises(TypeError): + predictor_factories.from_contrib_estimator(estimator, input_fn) + + def testFromCoreEstimator(self): + estimator = testing_common.get_arithmetic_estimator(core=True) + input_fn = testing_common.get_arithmetic_input_fn(core=True) + predictor_factories.from_estimator(estimator, input_fn) + + def testFromCoreEstimatorWithContribEstimatorRaises(self): + estimator = testing_common.get_arithmetic_estimator(core=False) + input_fn = testing_common.get_arithmetic_input_fn(core=False) + with self.assertRaises(TypeError): + predictor_factories.from_estimator(estimator, input_fn) + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/signal/python/ops/mfcc_ops.py b/tensorflow/contrib/signal/python/ops/mfcc_ops.py index 7bc7b57cd4f..6cef95f7425 100644 --- a/tensorflow/contrib/signal/python/ops/mfcc_ops.py +++ b/tensorflow/contrib/signal/python/ops/mfcc_ops.py @@ -50,7 +50,7 @@ def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None): # A 1024-point STFT with frames of 64 ms and 75% overlap. stfts = tf.contrib.signal.stft(pcm, frame_length=1024, frame_step=256, fft_length=1024) - spectrograms = tf.abs(stft) + spectrograms = tf.abs(stfts) # Warp the linear scale spectrograms into the mel-scale. num_spectrogram_bins = stfts.shape[-1].value diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md index dc92ae0c859..c7a54cb9a2e 100644 --- a/tensorflow/contrib/slim/README.md +++ b/tensorflow/contrib/slim/README.md @@ -676,7 +676,7 @@ file were implicitly obtained from each provided variable's `var.op.name`. This works well when the variable names in the checkpoint file match those in the graph. However, sometimes, we want to restore a model from a checkpoint -whose variables have different names those in the current graph. In this case, +whose variables have different names to those in the current graph. In this case, we must provide the `Saver` a dictionary that maps from each checkpoint variable name to each graph variable. 
Consider the following example where the checkpoint variables names are obtained via a simple function: diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py index def00b76184..54362c87b56 100644 --- a/tensorflow/contrib/slim/python/slim/learning.py +++ b/tensorflow/contrib/slim/python/slim/learning.py @@ -753,9 +753,10 @@ def train(train_op, if logdir: sv.start_standard_services(sess) elif startup_delay_steps > 0: + # (use sys.maxsize because sys.maxint doesn't exist in Python 3) _wait_for_step(sess, global_step, min(startup_delay_steps, number_of_steps or - sys.maxint)) + sys.maxsize)) threads = sv.start_queue_runners(sess) logging.info('Starting Queues.') if is_chief and sync_optimizer is not None: diff --git a/tensorflow/contrib/tensor_forest/BUILD b/tensorflow/contrib/tensor_forest/BUILD index dc6cc674e62..58a7fa095d8 100644 --- a/tensorflow/contrib/tensor_forest/BUILD +++ b/tensorflow/contrib/tensor_forest/BUILD @@ -530,7 +530,6 @@ py_library( srcs_version = "PY2AND3", deps = [ ":client_lib", - "//tensorflow/contrib/framework:framework_py", "//tensorflow/contrib/layers:layers_py", "//tensorflow/contrib/learn", "//tensorflow/python:array_ops", diff --git a/tensorflow/contrib/tensor_forest/client/random_forest.py b/tensorflow/contrib/tensor_forest/client/random_forest.py index 807c8398439..cddd62851be 100644 --- a/tensorflow/contrib/tensor_forest/client/random_forest.py +++ b/tensorflow/contrib/tensor_forest/client/random_forest.py @@ -17,7 +17,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib import framework as contrib_framework from tensorflow.contrib import layers from tensorflow.contrib.learn.python.learn.estimators import estimator @@ -190,7 +189,7 @@ def get_model_fn(params, features, labels, input_weights=weights, num_trainers=num_trainers, trainer_id=trainer_id), - state_ops.assign_add(contrib_framework.get_global_step(), 1)) + state_ops.assign_add(training_util.get_global_step(), 1)) # Put weights back in if weights is not None: diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD index 6139c1d5838..cccaa2b833e 100644 --- a/tensorflow/contrib/training/BUILD +++ b/tensorflow/contrib/training/BUILD @@ -26,7 +26,6 @@ py_library( "python/training/resample.py", "python/training/sampling_ops.py", "python/training/sequence_queueing_state_saver.py", - "python/training/sgdr_learning_rate_decay.py", "python/training/training.py", "python/training/tuner.py", ], diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index b8ff44bc4b6..d032cf9aa70 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -79,6 +79,7 @@ load( "if_linux_x86_64", "if_mobile", "if_not_mobile", + "if_windows", "if_not_windows", "tf_copts", "tf_cc_test", @@ -562,7 +563,7 @@ cc_library( "platform/prefetch.h", "platform/thread_annotations.h", "platform/types.h", - ], + ] + if_windows(["platform/windows/integral_types.h"]), visibility = ["//visibility:public"], deps = [ diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc index 1390810c288..f7b733aa00f 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -761,6 +761,10 @@ int64 MinSystemMemory(int64 available_memory) { // builds in windows); because in non-opt builds more system memory // is necessary. 
min_system_memory *= 2; +#endif +#if defined(NVIDIA_TEGRA) + // 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM and Video RAM + min_system_memory = 1<<30; #endif return min_system_memory; } diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index a1b62179f58..0183975fdba 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -194,10 +194,9 @@ cc_library( ], ) -cc_library( +tf_kernel_library( name = "fill_functor", - srcs = ["fill_functor.cc"], - hdrs = ["fill_functor.h"], + prefix = "fill_functor", deps = [ "//tensorflow/core:framework", "//third_party/eigen3", @@ -3043,6 +3042,7 @@ tf_kernel_library( "//conditions:default": [], }), hdrs = [ + "fill_functor.h", "conv_grad_ops.h", "deep_conv2d.h", "gemm_functors.h", @@ -3067,6 +3067,7 @@ tf_kernel_library( ":conv_2d", ":conv_3d", ":image_resizer_state", + ":fill_functor", ":ops_util", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", @@ -3173,7 +3174,9 @@ tf_kernel_library( tf_kernel_library( name = "fused_batch_norm_op", prefix = "fused_batch_norm_op", - deps = NN_DEPS, + deps = NN_DEPS + [ + ":fill_functor", + ], ) tf_kernel_library( diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc index c485d02e232..c8bfb268592 100644 --- a/tensorflow/core/kernels/constant_op.cc +++ b/tensorflow/core/kernels/constant_op.cc @@ -151,18 +151,6 @@ typedef Eigen::GpuDevice GPUDevice; typedef Eigen::SyclDevice SYCLDevice; #endif // TENSORFLOW_USE_SYCL -namespace functor { - -// Partial specialization of FillFunctor. -template -struct FillFunctor { - void operator()(const CPUDevice& d, typename TTypes::Flat out, - typename TTypes::ConstScalar in) { - out.device(d) = out.constant(in()); - } -}; - -} // end namespace functor template class FillOp : public OpKernel { @@ -191,28 +179,6 @@ class FillOp : public OpKernel { } }; -#ifdef TENSORFLOW_USE_SYCL - -namespace functor { -// Partial specialization of FillFunctor. -template -struct FillFunctor { - void operator()(const SYCLDevice& d, typename TTypes::Flat out, - typename TTypes::ConstScalar in) { -#if !defined(EIGEN_HAS_INDEX_LIST) - Eigen::array rank1{1}; -#else - Eigen::IndexList > rank1; -#endif - const int size = out.dimension(0); - Eigen::array broadcast_dims{size}; - - To32Bit(out).device(d) = in.reshape(rank1).broadcast(broadcast_dims); - } -}; -} // namespace functor -#endif // TENSORFLOW_USE_SYCL - #define REGISTER_KERNEL(D, TYPE) \ REGISTER_KERNEL_BUILDER(Name("Fill") \ .Device(DEVICE_##D) \ diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc index 1791c510966..5e4feb25848 100644 --- a/tensorflow/core/kernels/conv_grad_filter_ops.cc +++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc @@ -30,6 +30,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_slice.h" #include "tensorflow/core/kernels/conv_2d.h" +#include "tensorflow/core/kernels/fill_functor.h" #ifdef TENSORFLOW_USE_LIBXSMM #include "tensorflow/core/kernels/xsmm_conv2d.h" #endif @@ -595,6 +596,13 @@ class Conv2DSlowBackpropFilterOp : public OpKernel { if (filter_shape.num_elements() == 0) { return; } + // If input is empty, set gradients to zero. 
+ if (input.shape().num_elements() == 0) { + functor::SetZeroFunctor f; + f(context->eigen_device(), filter_backprop->flat()); + return; + } + // For now we take the stride from the second and third dimensions only (we // do not support striding on the batch or depth dimension). diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc index 9a00a091bd2..172deea1da1 100644 --- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc +++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc @@ -19,12 +19,15 @@ limitations under the License. #include #include +#include +#include #include "cuda/include/cuda.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/conv_2d.h" #include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/util/tensor_format.h" +#include "tensorflow/core/lib/math/math_util.h" namespace tensorflow { @@ -223,185 +226,136 @@ __global__ void SwapDimension1And2InTensor3Simple(int nthreads, const T* input, // Use shared memory tiles to swap dimension-1 and dimension-2 of a 3D tensor, // where dimensions are zero-based: output[i][j][k] = input[i][k][j]. // -// Each thread block operates on a single tile, a square of dimensions TileSize -// x TileSize. We require that the thread block's X dimension equals TileSize, -// and its Y dimension equals NumSubTiles. +// Each thread block operates on a single tile, a rectangle of dimensions +// TileSizeI x TileSizeJ. // -// For best performance, you should probably set TileSize equal to the number of -// threads in a warp (32 in nvidia GPUs). With a TileSize of 32, NumSubTiles == -// 4 or 8 seems to get the best performance on K40 GPUs. -template -__global__ void SwapDimension1And2InTensor3UsingTiles(const T* input, - Dimension<3> input_dims, - T* output) { - // One extra line in the inner dimension to avoid share memory bank conflict. - __shared__ T shared_memory_tile[TileSize][TileSize + 1]; - - static_assert(TileSize % NumSubTiles == 0, - "TileSize must be divisible by NumSubTiles"); - eigen_assert(blockDim.x == TileSize); - eigen_assert(blockDim.y == NumSubTiles); +// In general, for best performance, you should probably set TileSizeI, +// TileSizeJ equal to the number of threads in a warp (32 in nvidia GPUs). +// With a TileSizeI, TileSizeJ of 32, NumThreads of 128 or 256 seems to get +// the best performance on K40 GPUs. +template +__global__ void SwapDimension1And2InTensor3UsingTiles( + const T* __restrict__ input, Dimension<3> input_dims, + T* __restrict__ output) { + eigen_assert(blockDim.x == NumThreads); + eigen_assert(blockDim.y == 1); eigen_assert(blockDim.z == 1); eigen_assert(gridDim.y == 1); eigen_assert(gridDim.z == 1); - // We break down the tile into NumSubTiles groups, so each thread processes - // kSubTileSize elements (except at the edges of the input). - const int kSubTileSize = TileSize / NumSubTiles; + constexpr int ReadRowPerPass = NumThreads / TileSizeJ; + constexpr int WriteRowPerPass = NumThreads / TileSizeI; + // One extra line in the inner dimension to avoid share memory bank conflict. 
+ __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1]; int x = threadIdx.x; Dimension<3> output_dims = { - input_dims[0], - input_dims[2], - input_dims[1], + input_dims[0], input_dims[2], input_dims[1], }; Dimension<3> input_dims_in_tiles = { - input_dims[0], - (input_dims[1] + TileSize - 1) / TileSize, - (input_dims[2] + TileSize - 1) / TileSize, + input_dims[0], (input_dims[1] + TileSizeI - 1) / TileSizeI, + (input_dims[2] + TileSizeJ - 1) / TileSizeJ, }; Index<3> input_tile_index = FlatToTensorIndex(blockIdx.x, input_dims_in_tiles); Index<3> input_tile_origin = { - input_tile_index[0], - input_tile_index[1] * TileSize, - input_tile_index[2] * TileSize, + input_tile_index[0], input_tile_index[1] * TileSizeI, + input_tile_index[2] * TileSizeJ, }; int input_origin_flat_index = TensorIndexToFlat(input_tile_origin, input_dims); - int tile_width = TileSize; + bool full_tile = true; + int tile_width = TileSizeJ; + // Only the last row or column may not have the full size. if (input_tile_index[2] == input_dims_in_tiles[2] - 1) { - tile_width = input_dims[2] - (input_dims_in_tiles[2] - 1) * TileSize; + tile_width = input_dims[2] - (input_dims_in_tiles[2] - 1) * TileSizeJ; + full_tile &= false; } - int tile_height = TileSize; + + int tile_height = TileSizeI; + if (input_tile_index[1] == input_dims_in_tiles[1] - 1) { - tile_height = input_dims[1] - (input_dims_in_tiles[1] - 1) * TileSize; + tile_height = input_dims[1] - (input_dims_in_tiles[1] - 1) * TileSizeI; + full_tile &= false; } - int input_flat_index = input_origin_flat_index + x; - int y_start = static_cast(threadIdx.y) * kSubTileSize; + // Calculate effective thread number. This ensures that we use the largest + // number of threads available to form a regular thread block with no + // trailing incomplete lines. + constexpr int in_effective_thread_num = NumThreads / TileSizeJ * TileSizeJ; - // Load the data from input memory to the shared memory tile. - if (x < tile_width) { - int y_end = min(y_start + kSubTileSize, tile_height); - for (int y = y_start; y < y_end; y++) { - shared_memory_tile[y][x] = maybe_conj::run( - input[input_flat_index + y * input_dims[2]]); + if (x < in_effective_thread_num) { + // Orient the logical thread block with respect to the input array. + // ie. align the contiguous dimension of thread blocks with the contiguous + // dimension of the input array. 
+ int ti = x / TileSizeJ; + int tj = x % TileSizeJ; + int input_index = input_origin_flat_index + ti * input_dims[2] + tj; + int input_increment = ReadRowPerPass * input_dims[2]; + + if (full_tile) { +#pragma unroll + for (int i_loc = ti; i_loc < (TileSizeI); i_loc += ReadRowPerPass) { + shared_memory_tile[i_loc][tj] = + maybe_conj::run(input[input_index]); + input_index += input_increment; + } + } else { + if (tj < tile_width) { + for (int i_loc = ti; i_loc < (tile_height); i_loc += ReadRowPerPass) { + shared_memory_tile[i_loc][tj] = + maybe_conj::run(input[input_index]); + input_index += input_increment; + } + } } } __syncthreads(); Index<3> output_tile_index = { - input_tile_index[0], - input_tile_index[2], - input_tile_index[1], + input_tile_index[0], input_tile_index[2], input_tile_index[1], }; Index<3> output_tile_origin = { - output_tile_index[0], - output_tile_index[1] * TileSize, - output_tile_index[2] * TileSize, + output_tile_index[0], output_tile_index[1] * TileSizeJ, + output_tile_index[2] * TileSizeI, }; int output_origin_flat_index = TensorIndexToFlat(output_tile_origin, output_dims); - int output_flat_index = output_origin_flat_index + x; + constexpr int out_effective_thread_num = NumThreads / TileSizeI * TileSizeI; - // Load the data from the shared memory tile to the output memory. - if (x < tile_height) { - int y_end = min(y_start + kSubTileSize, tile_width); - for (int y = y_start; y < y_end; y++) { - output[output_flat_index + y * output_dims[2]] = shared_memory_tile[x][y]; - } - } -} + if (x < out_effective_thread_num) { + // Re-orient the logical thread block with respect to the output array. + // ie. align the contiguous dimension of thread blocks with contiguous + // dimension of the output array. + int ti = x / TileSizeI; + int tj = x % TileSizeI; + int output_index = output_origin_flat_index + ti * output_dims[2] + tj; + int output_increment = WriteRowPerPass * output_dims[2]; -// Use shared memory tiles to swap dimension-1 and dimension-2 of a 3D tensor -// when only one of the dimension sizes is smaller than 16, -// where dimensions are zero-based: output[i][j][k] = input[i][k][j]. -// -// small_dim = the_smaller_dimension_size -// large_dim = the_larger_dimension_size -// tile_num_per_block = blockDim.x -// kTileLength = small_dim -// -// Each thread block operates on a single rectangle tile, where its width is -// kTileLength (we currently set it to 64) and its height is small_dim, -// We set the thread block's X dimension to be tile_num_per_block, and its Y -// and Z to be one. -template -__global__ void SwapDimension1And2InTensor3SmallDim(const T* input, - int batch_per_block, - Dimension<3> input_dims, - T* output) { - // TODO(yangzihao) avoid share memory bank conflict. - __shared__ T shared_memory_tile[ShmemSize]; - - eigen_assert(blockDim.y == 1); - eigen_assert(blockDim.z == 1); - eigen_assert(gridDim.z == 1); - - int block_offset = blockIdx.x * blockDim.x; - - int x = threadIdx.x; - int tile_height = blockDim.x; - - // Get tile height, width, and thread/block origin indices. - int small_dim = SmallDim2 ? input_dims[2] : input_dims[1]; - int large_dim = SmallDim2 ? input_dims[1] : input_dims[2]; - - int global_offset = small_dim * large_dim * (blockIdx.y * batch_per_block) + - (SmallDim2 ? 
block_offset * small_dim : block_offset); - if (global_offset >= (input_dims[0] * input_dims[1] * input_dims[2])) return; - - for (int batch = 0; batch < batch_per_block; ++batch) { - int block_origin_idx = - small_dim * large_dim * (blockIdx.y * batch_per_block + batch); - int thread_origin_idx = - block_origin_idx + - (SmallDim2 ? block_offset * small_dim : block_offset) + x; - - if (block_offset + blockDim.x > large_dim) { - tile_height = large_dim - block_offset; - } - - __syncthreads(); - - // Load a continuous memory region to shared memory tile. - if (x < tile_height) { - for (int y = 0; y < small_dim; y++) { - int shmem_index = - SmallDim2 ? (x + y * tile_height) : (x * small_dim + y); - shared_memory_tile[shmem_index] = maybe_conj::run( - ldg(input + thread_origin_idx + - y * (SmallDim2 ? tile_height : large_dim))); + if (full_tile) { +#pragma unroll + for (int i_loc = ti; i_loc < (TileSizeJ); i_loc += WriteRowPerPass) { + output[output_index] = shared_memory_tile[tj][i_loc]; + output_index += output_increment; } - } - - __syncthreads(); - - // Get block origin index for output array. - int output_block_offset = block_origin_idx; - int output_block_idx = SmallDim2 ? block_offset : block_offset * small_dim; - int output_block_origin_idx = output_block_offset + output_block_idx; - - // Store the transposed memory region in shared memory to device. - if (x < tile_height) { - for (int y = 0; y < small_dim; y++) { - int output_idx = output_block_origin_idx + x + - y * (SmallDim2 ? large_dim : tile_height); - int shmem_index = - SmallDim2 ? (x * small_dim + y) : (x + y * tile_height); - output[output_idx] = shared_memory_tile[shmem_index]; + } else { + if (tj < tile_height) { + for (int i_loc = ti; i_loc < (tile_width); i_loc += WriteRowPerPass) { + output[output_index] = shared_memory_tile[tj][i_loc]; + output_index += output_increment; + } } } } @@ -548,6 +502,380 @@ struct PadInput { } }; +// We want std::equal_to and std::greater, but they're not constexpr until +// C++14. +struct EqualTo { + constexpr bool operator()(int a, int b) const { return a == b; } +}; + +struct GreaterThan { + constexpr bool operator()(int a, int b) const { return a > b; } +}; + +// For each data type, the tile size posibility frontier denotes the tile size +// combinations that consume the most computational resources constrained by +// - number of threads per SM limit, +// - limit on size of the short dimension (<=15) due to the definition of +// narrow matrix, +// - shared memory limit and +// - some experimentally determined, type-specific constraint on the product of +// two side lengths to increase grid-level parallelism. +// +// A tile size combination lies on the frontier if and only if one or more +// constraint mentioned above is hit. Tile size combinations lying outside this +// frontier are either not possible, or are slower than the alternatives. +// +// It is instrumental to consider, for each data type, two subsets of the +// corresponding frontier: +// - long side frontier: the union of the biggest tile size combination for +// each legal long side len. +// - non long side frontier: the frontier set minus the long side frontier. +// +// TileSizePossibilityFrontierCheck defines the frontier using only the long +// side frontier tile size combinations (since one can easily extrapolate +// the entire frontier from this subset). It serves as a utility function +// to help us determine where a tile size combination of interest lies with +// resepect to the frontier. 
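// Editor's note (illustrative, not part of the original patch): reading the
// table below for 4-byte element types, the long side frontier is
// (32, 15), (64, 15), (128, 15), (256, 8), (512, 4), (1024, 2). Thus
// (256, 8) lies on the long side frontier, (256, 9) lies outside the
// frontier, (256, 7) lies on the non long side frontier (doubling the long
// side to (512, 7) would fall outside), and (128, 5) lies strictly within
// the frontier.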
+template +constexpr bool TileSizePossibilityFrontierCheck(int TileLongSide, + int TileShortSide, + int size_of_t, Op op) { + // clang-format off + return (size_of_t == 16 && ((TileLongSide == 32 && op(TileShortSide, 4)) || + (TileLongSide == 64 && op(TileShortSide, 4)) || + (TileLongSide == 128 && op(TileShortSide, 4)) || + (TileLongSide == 256 && op(TileShortSide, 2)))) || + (size_of_t == 8 && ((TileLongSide == 32 && op(TileShortSide, 15)) || + (TileLongSide == 64 && op(TileShortSide, 15)) || + (TileLongSide == 128 && op(TileShortSide, 8)) || + (TileLongSide == 256 && op(TileShortSide, 4)) || + (TileLongSide == 512 && op(TileShortSide, 2)))) || + (size_of_t == 4 && ((TileLongSide == 32 && op(TileShortSide, 15)) || + (TileLongSide == 64 && op(TileShortSide, 15)) || + (TileLongSide == 128 && op(TileShortSide, 15)) || + (TileLongSide == 256 && op(TileShortSide, 8)) || + (TileLongSide == 512 && op(TileShortSide, 4)) || + (TileLongSide == 1024 && op(TileShortSide, 2)))) || + (size_of_t == 2 && ((TileLongSide == 32 && op(TileShortSide, 15)) || + (TileLongSide == 64 && op(TileShortSide, 15)) || + (TileLongSide == 128 && op(TileShortSide, 15)) || + (TileLongSide == 256 && op(TileShortSide, 8)) || + (TileLongSide == 512 && op(TileShortSide, 4)) || + (TileLongSide == 1024 && op(TileShortSide, 2)))) || + (size_of_t == 1 && ((TileLongSide == 32 && op(TileShortSide, 15)) || + (TileLongSide == 64 && op(TileShortSide, 15)) || + (TileLongSide == 128 && op(TileShortSide, 15)) || + (TileLongSide == 256 && op(TileShortSide, 8)) || + (TileLongSide == 512 && op(TileShortSide, 4)) || + (TileLongSide == 1024 && op(TileShortSide, 2)))); + // clang-format on +} + +constexpr bool TileSizeOnLongSideFrontier(int TileLongSide, int TileShortSide, + int size_of_t) { + return TileSizePossibilityFrontierCheck(TileLongSide, TileShortSide, + size_of_t, EqualTo()); +} +constexpr bool TileSizeOutsideFrontier(int TileLongSide, int TileShortSide, + int size_of_t) { + return TileSizePossibilityFrontierCheck(TileLongSide, TileShortSide, + size_of_t, GreaterThan()); +} +constexpr bool TileSizeOnNonLongSideFrontier(int TileLongSide, + int TileShortSide, int size_of_t) { + // For a tile size combination (longside, shortside), lying on the frontier + // implies that (longside, shortside) is on or within the frontier but + // (longside*2, shortside) or (longside, shortside+1) is not. With the above + // critereon, we simply need to use !TileSizeOnLongSideFrontier to ensure that + // it is not on the long side frontier. + return !TileSizeOutsideFrontier(TileLongSide, TileShortSide, size_of_t) && + (TileSizeOutsideFrontier(TileLongSide * 2, TileShortSide, size_of_t) || + TileSizeOutsideFrontier(TileLongSide, TileShortSide + 1, + size_of_t)) && + !TileSizeOnLongSideFrontier(TileLongSide, TileShortSide, size_of_t); +} + +// Helper function to launch a batch narrow matirx transpose kernel. +template +void LaunchBatchNarrowMatrixTransposeKernel( + const GPUDevice& d, int tile_size_i, int tile_size_j, int total_tiles_count, + const T* input, const Dimension<3>& input_dims, T* output) { + constexpr int NumThreads = TileLongSide; + if (tile_size_i <= TileLongSide && tile_size_j <= TileShortSide) { + SwapDimension1And2InTensor3UsingTiles + <<>>(input, input_dims, + output); + } else { + SwapDimension1And2InTensor3UsingTiles + <<>>(input, input_dims, + output); + } +} + +// Recursive template function to search, in a trial-and-error manner, for the +// minimum tile size configuration satisfying the requested tile side lengths. 
+// An important invariant of this search procedure is that for an unsatisfied +// request, we always try doubling the long side len first, and only after +// the request is satisfied for the long side len do we begin incrementing +// the short side len. +// +// We have three specializations of this search function depending on where the +// current tile size combination lies with respect to the frontier. +// - It lies within the frontier. If request is not satisfied, for the next tile +// size combination, we first try doubling the long side len and if that does +// not work, we then increment the short side len. +// - It lies on the non long side frontier. If the request is not satisfied, we +// can only increment the short side len. +// - It lies on the long side frontier. We launch the kernel without checking if +// the request is satisfied or not. +template +struct BatchNarrowMatrixTransposeDispatcher { + static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j, + int total_tiles_count, const T* input, + const Dimension<3>& input_dims, T* output) { + static_assert( + (TileLongSide & (TileLongSide - 1)) == 0, + "The length of the longer side of the tile is always a power of 2."); + bool request_satisfied = max(tile_size_i, tile_size_j) <= TileLongSide && + min(tile_size_i, tile_size_j) <= TileShortSide; + + if (request_satisfied) { + LaunchBatchNarrowMatrixTransposeKernel( + d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims, + output); + return; + } + + // If the execution reaches here, then the kernel was not launched; we then + // determine whether it is the long side or the short side that falls short + // of the request and increase that parameter accordingly. + const bool long_side_request_not_satisfied = + max(tile_size_i, tile_size_j) > TileLongSide; + + if (long_side_request_not_satisfied) { + BatchNarrowMatrixTransposeDispatcher< + T, TileLongSide * 2, TileShortSide>::DoIt(d, tile_size_i, tile_size_j, + total_tiles_count, input, + input_dims, output); + } else { + BatchNarrowMatrixTransposeDispatcher< + T, TileLongSide, TileShortSide + 1>::DoIt(d, tile_size_i, tile_size_j, + total_tiles_count, input, + input_dims, output); + } + } +}; + +template +struct BatchNarrowMatrixTransposeDispatcher< + T, TileLongSide, TileShortSide, + typename std::enable_if::type> { + static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j, + int total_tiles_count, const T* input, + const Dimension<3>& input_dims, T* output) { + static_assert( + (TileLongSide & (TileLongSide - 1)) == 0, + "The length of the longer side of the tile is always a power of 2."); + bool request_satisfied = max(tile_size_i, tile_size_j) <= TileLongSide && + min(tile_size_i, tile_size_j) <= TileShortSide; + + if (request_satisfied) { + LaunchBatchNarrowMatrixTransposeKernel( + d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims, + output); + return; + } + + // If the execution reaches here, then the kernel was not launched; since + // we are on the non long side frontier, we increment the short dimension + // and try again. 
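    // Editor's note (illustrative, not part of the original patch):
    // assuming the search starts from the smallest combination (32, 2), a
    // request of tile_size_i x tile_size_j = 128 x 5 for a 4-byte type
    // walks (32, 2) -> (64, 2) -> (128, 2) by doubling the long side in
    // the primary template, then (128, 3) -> (128, 4) -> (128, 5) by short
    // side increments of the same form as the one below, at which point
    // the kernel is launched.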
+ BatchNarrowMatrixTransposeDispatcher< + T, TileLongSide, TileShortSide + 1>::DoIt(d, tile_size_i, tile_size_j, + total_tiles_count, input, + input_dims, output); + } +}; + +template +struct BatchNarrowMatrixTransposeDispatcher< + T, TileLongSide, TileShortSide, + typename std::enable_if::type> { + static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j, + int total_tiles_count, const T* input, + const Dimension<3>& input_dims, T* output) { + static_assert( + (TileLongSide & (TileLongSide - 1)) == 0, + "The length of the longer side of the tile is always a power of 2."); + + LaunchBatchNarrowMatrixTransposeKernel( + d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims, + output); + } +}; + +// This function tries to recover, in a brute force way, the frontier defined in +// TileSizePossibilityFrontierCheck as a vector of tile size combinations lying +// on the long side frontier. This vector is sufficient to determine the entire +// frontier. +// +// Note that if one changes the frontier definition in +// TileSizePossibilityFrontierCheck and forgets to set the largest short +// side len of the largest legal long side len to 2, this function will fail +// and crash the program. +template +const std::vector>& GetTileSizesFrontier() { + static_assert( + SizeOfT <= 16, + "Currently, only data types of sizes 16 bytes or less are supported."); + static_assert((SizeOfT & (SizeOfT - 1)) == 0, + "Data types must have sizes that are powers of 2."); + + // Expensive work to populate sizes, lazily run in a thread-safe + // manner the first time GetTileSizesFrontier is called. + static auto* frontier = [] { + auto* frontier = new std::vector>(); + const int kMaxLongSideLen = 1024; + const int kMaxShortSideLen = 15; + for (int long_side = 32; long_side <= kMaxLongSideLen; long_side *= 2) { + for (int short_side = 2; short_side <= kMaxShortSideLen; + short_side += 1) { + if (TileSizeOnLongSideFrontier(long_side, short_side, SizeOfT)) { + // The current combination lies on the frontier, thus we + // add it to the frontier definition. + frontier->push_back(std::make_pair(long_side, short_side)); + + // The long side length is the largest one allowed iff its + // corresponding short side length is 2. + if (short_side == 2) return frontier; + + // We have exhausted all the possibilities in the frontier + // with the given long side length. + break; + } + } + } + LOG(FATAL) + << "The corresponding short side length of the largest long side " + "length has to be 2."; + }(); + return *frontier; +} + +// Helper structs to help determine which data type to use given the size of +// the matrix data type. A transpose of elements of size N will use a kernel +// which operates on an array of TransposeElemType::type. +template +struct TransposeElemType; +template <> +struct TransposeElemType<1> { + using type = uint8; +}; +template <> +struct TransposeElemType<2> { + using type = uint16; +}; +template <> +struct TransposeElemType<4> { + using type = uint32; +}; +template <> +struct TransposeElemType<8> { + using type = uint64; +}; +template <> +struct TransposeElemType<16> { + using type = float4; +}; + +// A helper function to make RunSwapDimension1And2InTensor3 concise. This +// helper function looks at the data type and input matrix sizes and decides +// the thread numbers and tile sizes to use. 
+template +void SwapDimension1And2InTensor3WithNarrowMatrices( + const GPUDevice& d, const T* input, const Dimension<3>& input_dims, + T* output, const int kMinDimensionToUseTiles) { + // Get available tile sizes here for the data type requested: + const auto& tile_spec = GetTileSizesFrontier(); + + int tile_long_side_len = 0; + int tile_short_side_len = 0; + float lowest_cost = std::numeric_limits::max(); + int data_long_side = max(input_dims[1], input_dims[2]); + + for (auto tile_size_pair : tile_spec) { + int proposed_tile_long_side_len = tile_size_pair.first; + + // Number of threads that will not be doing anything useful when reading + // the matrix because the thread block size is bigger than the data block + // size. + int num_wasted_threads = + data_long_side - MathUtil::FloorOfRatio( + data_long_side, proposed_tile_long_side_len) * + proposed_tile_long_side_len; + + int num_full_tiles = MathUtil::FloorOfRatio( + data_long_side, proposed_tile_long_side_len); + + float cost = 0; + + // However, if we can execute two or more full tiles, then we gladly + // accept any number of wasted threads and ignore its cost. + if (num_full_tiles <= 1) cost = num_wasted_threads; + + // Using less than or equal to here because given the same cost, we + // would like to launch as many threads as possible. + if (cost <= lowest_cost) { + tile_long_side_len = proposed_tile_long_side_len; + tile_short_side_len = tile_size_pair.second; + lowest_cost = cost; + } + } + + // Request tile sizes such that the longer side of threadblock aligns with + // the longer side of input data block to maximize read throughput. + // The ideal tile shape is one where the length of the shorter side of the + // tile is equal to the length of the shorter side of the input matrix. + int requested_tile_size_i = input_dims[1] >= kMinDimensionToUseTiles + ? tile_long_side_len + : input_dims[1]; + int requested_tile_size_j = input_dims[1] >= kMinDimensionToUseTiles + ? input_dims[2] + : tile_long_side_len; + + // Truncate the shorter size requested according to the manual limit set in + // tile_spec to make sure that we do not launch configurations violating + // hardware limits. + requested_tile_size_i = requested_tile_size_i == tile_long_side_len + ? tile_long_side_len + : min(requested_tile_size_i, tile_short_side_len); + requested_tile_size_j = requested_tile_size_j == tile_long_side_len + ? tile_long_side_len + : min(requested_tile_size_j, tile_short_side_len); + + Dimension<3> input_dims_in_tiles = { + input_dims[0], + MathUtil::CeilOfRatio(input_dims[1], requested_tile_size_i), + MathUtil::CeilOfRatio(input_dims[2], requested_tile_size_j), + }; + + int total_tiles_count = + input_dims_in_tiles[0] * input_dims_in_tiles[1] * input_dims_in_tiles[2]; + + using ElemType = typename TransposeElemType::type; + static_assert(alignof(T) >= alignof(ElemType), "Unexpected data alignment."); + BatchNarrowMatrixTransposeDispatcher::DoIt( + d, requested_tile_size_i, requested_tile_size_j, total_tiles_count, + reinterpret_cast(input), input_dims, + reinterpret_cast(output)); +} + // Launch the GPU kernel that would swap dimension-1 and dimension-2 in a // 3D tensor. It looks at the shape of the incoming data, and decides the best // strategy to launch. @@ -558,60 +886,33 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input, // If one dimension is trivial, use SmallDim kernel for swapping. // Otherwise, the trivial swapping relying on the ldg cache is more efficient. 
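  // Editor's note (illustrative, not part of the original patch): with the
  // thresholds below, a per-batch 100 x 100 matrix takes the 32 x 32 tiled
  // kernel, a 300 x 8 matrix takes the narrow-matrix path, and a 20 x 8
  // matrix falls through to the simple element-wise kernel. Inside the
  // narrow-matrix path, a long data side of 300 selects a long tile side
  // of 128, the largest candidate that still covers two full tiles
  // (cost 0); a long tile side of 256 would leave 300 - 256 = 44 trailing
  // threads with no useful work (cost 44).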
static const int kMinDimensionToUseTiles = 16; - bool use_tiles = (input_dims[1] >= kMinDimensionToUseTiles && - input_dims[2] >= kMinDimensionToUseTiles); - bool use_small_dim = ((input_dims[1] >= kMinDimensionToUseTiles && - input_dims[2] < kMinDimensionToUseTiles)) || - ((input_dims[1] < kMinDimensionToUseTiles && - input_dims[2] >= kMinDimensionToUseTiles)); - static const int NumSubTiles = 8; + static const int kMinDimensionToUseRectTiles = 96; - if (use_tiles) { - static const int TileSize = 32; - Dimension<3> input_dims_in_tiles = { - input_dims[0], - (input_dims[1] + TileSize - 1) / TileSize, - (input_dims[2] + TileSize - 1) / TileSize, - }; - int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] * - input_dims_in_tiles[2]; - // We get best performance when TileSize is the number of threads in a warp + bool large_matrix = input_dims[1] >= kMinDimensionToUseTiles && + input_dims[2] >= kMinDimensionToUseTiles; + bool narrow_matrix = input_dims[1] >= kMinDimensionToUseRectTiles || + input_dims[2] >= kMinDimensionToUseRectTiles; + if (large_matrix) { + // We get best performance when kTileSize is the number of threads in a warp // (32 on our GPUs) and NumSubTiles is 8, so our block size is 8 * 32 = 256 // threads. - SwapDimension1And2InTensor3UsingTiles - <<>>( - input, input_dims, output); - } else if (use_small_dim) { - // When only one of the dimensions is smaller than kMinDimensionToUseTiles, - // we use one block to process a rectangle region with the size of - // kTileLength * small_dim. We found that when set kTileLength to 64 on - // TitanX Maxwell GPU, it achieves the best performance. - // large_dim - // +---------------...--------+ - // | | | | - // small_dim | | ... | | - // | | | | - // +--------------...---------+ - // \----- ------/ \- -/ - // V V - // kTileLength(tile_height) tile_height - static const int kTileLength = 64; - static const int kGridDimY = 65535; - int large_dim = std::max(input_dims[2], input_dims[1]); - int tile_num_per_block = (large_dim + kTileLength - 1) / kTileLength; - int grid_dim_y = std::min(input_dims[0], kGridDimY); - int batch_per_block = (input_dims[0] + grid_dim_y - 1) / grid_dim_y; - if (input_dims[2] < input_dims[1]) { - SwapDimension1And2InTensor3SmallDim< - T, kTileLength * kMinDimensionToUseTiles, true, conjugate> - <<>>(input, batch_per_block, input_dims, output); - } else { - SwapDimension1And2InTensor3SmallDim< - T, kTileLength * kMinDimensionToUseTiles, false, conjugate> - <<>>(input, batch_per_block, input_dims, output); - } + constexpr int kTileSize = 32; + constexpr int kNumThreads = 256; + + Dimension<3> input_dims_in_tiles = { + input_dims[0], MathUtil::CeilOfRatio(input_dims[1], kTileSize), + MathUtil::CeilOfRatio(input_dims[2], kTileSize), + }; + + int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] * + input_dims_in_tiles[2]; + SwapDimension1And2InTensor3UsingTiles + <<>>(input, input_dims, + output); + + } else if (narrow_matrix) { + SwapDimension1And2InTensor3WithNarrowMatrices(d, input, input_dims, output, + kMinDimensionToUseTiles); } else { int total_element_count = input_dims[0] * input_dims[1] * input_dims[2]; CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d); diff --git a/tensorflow/core/kernels/data/dataset.h b/tensorflow/core/kernels/data/dataset.h index cf4722af059..42df4b0340b 100644 --- a/tensorflow/core/kernels/data/dataset.h +++ b/tensorflow/core/kernels/data/dataset.h @@ -112,7 +112,7 @@ class GraphDefBuilderWrapper { Status AddTensor(const Tensor& 
val, Node** output) { AddTensorInternal(val, output); if (*output == nullptr) { - return errors::Internal("AddTesor: Failed to build Const op."); + return errors::Internal("AddTensor: Failed to build Const op."); } return Status::OK(); } diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc index ea0cc139f3d..35d9693f541 100644 --- a/tensorflow/core/kernels/fill_functor.cc +++ b/tensorflow/core/kernels/fill_functor.cc @@ -19,6 +19,7 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/variant_encode_decode.h" @@ -74,6 +75,7 @@ DEFINE_SETZERO_SYCL(int32); DEFINE_SETZERO_SYCL(int64); #undef DEFINE_SETZERO_SYCL #endif // TENSORFLOW_USE_SYCL + template void SetOneFunctor::operator()( const Eigen::ThreadPoolDevice& d, typename TTypes::Flat out) { @@ -112,5 +114,47 @@ DEFINE_SETONE_SYCL(double); #undef DEFINE_SETONE_SYCL #endif // TENSORFLOW_USE_SYCL +template +struct FillFunctor { + void operator()(const Eigen::ThreadPoolDevice& d, typename TTypes::Flat out, + typename TTypes::ConstScalar in) { + out.device(d) = out.constant(in()); + } +}; + +// Explicit instantiations. +#define DEFINE_FILL_CPU(T) \ + template struct FillFunctor; + +TF_CALL_ALL_TYPES(DEFINE_FILL_CPU); +DEFINE_FILL_CPU(quint8); +DEFINE_FILL_CPU(quint16); +#undef DEFINE_FILL_CPU + +#ifdef TENSORFLOW_USE_SYCL +template +struct FillFunctor { + void operator()(const Eigen::SyclDevice& d, typename TTypes::Flat out, + typename TTypes::ConstScalar in) { +#if !defined(EIGEN_HAS_INDEX_LIST) + Eigen::array rank1{1}; +#else + Eigen::IndexList > rank1; +#endif + const int size = out.dimension(0); + Eigen::array broadcast_dims{size}; + + To32Bit(out).device(d) = in.reshape(rank1).broadcast(broadcast_dims); + } +}; + +#define DEFINE_FILL_SYCL(T) \ + template struct FillFunctor; +DEFINE_FILL_SYCL(float); +DEFINE_FILL_SYCL(double); +TF_CALL_INTEGRAL_TYPES(DEFINE_FILL_SYCL) +#undef DEFINE_FILL_SYCL +#endif // TENSORFLOW_USE_SYCL + } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/fill_functor.cu.cc b/tensorflow/core/kernels/fill_functor.cu.cc new file mode 100644 index 00000000000..49beb499af2 --- /dev/null +++ b/tensorflow/core/kernels/fill_functor.cu.cc @@ -0,0 +1,115 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/platform/types.h" + +namespace Eigen { +namespace internal { + +template +struct scalar_const_op { + typedef typename packet_traits::type Packet; + + const T* val; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + scalar_const_op(const scalar_const_op& x) + : val(x.val) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_const_op(const T* v) : val(v) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()() const { + return *val; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp() const { + return internal::pset1(*val); + } +}; + +template +struct functor_traits > { + enum { + Cost = 1, + PacketAccess = packet_traits::Vectorizable, + IsRepeatable = true + }; +}; + +} // end namespace internal +} // end namespace Eigen + +namespace tensorflow { + +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; + +// Partial specialization FillFunctor +template +struct FillFunctor { + void operator()(const GPUDevice& d, typename TTypes::Flat out, + typename TTypes::ConstScalar in) { + Eigen::internal::scalar_const_op f(in.data()); + To32Bit(out).device(d) = To32Bit(out).nullaryExpr(f); + } +}; + +#define DEFINE_FILL_GPU(T) template struct FillFunctor; +TF_CALL_REAL_NUMBER_TYPES(DEFINE_FILL_GPU); +TF_CALL_bfloat16(DEFINE_FILL_GPU); +TF_CALL_bool(DEFINE_FILL_GPU); +#undef DEFINE_FILL_GPU + +// Partial specialization of FillFunctor. +template +struct SetZeroFunctor { + void operator()(const GPUDevice& d, typename TTypes::Flat out) { + To32Bit(out).device(d) = To32Bit(out).constant(T(0)); + } +}; + +#define DEFINE_SETZERO_GPU(T) template struct SetZeroFunctor; +TF_CALL_NUMBER_TYPES(DEFINE_SETZERO_GPU); +TF_CALL_bfloat16(DEFINE_SETZERO_GPU); +TF_CALL_bool(DEFINE_SETZERO_GPU); +#undef DEFINE_SETZERO_GPU + +// Partial specialization of FillFunctor. +template +struct SetOneFunctor { + void operator()(const GPUDevice& d, typename TTypes::Flat out) { + To32Bit(out).device(d) = To32Bit(out).constant(T(1)); + } +}; + +#define DEFINE_SETONE_GPU(T) template struct SetOneFunctor; +TF_CALL_NUMBER_TYPES(DEFINE_SETONE_GPU); +TF_CALL_bfloat16(DEFINE_SETONE_GPU); +TF_CALL_bool(DEFINE_SETONE_GPU); +#undef DEFINE_SETONE_GPU + +} // end namespace functor +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc index 09ba092f404..7ccaef948c4 100644 --- a/tensorflow/core/kernels/fused_batch_norm_op.cc +++ b/tensorflow/core/kernels/fused_batch_norm_op.cc @@ -27,6 +27,7 @@ limitations under the License. 
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/fill_functor.h" #include "tensorflow/core/kernels/fused_batch_norm_op.h" #include "tensorflow/core/util/tensor_format.h" @@ -239,6 +240,14 @@ struct FusedBatchNorm { << " offset shape: " << offset.shape().DebugString() << " tensor format: " << tensor_format; + // If input is empty, return NaN mean/variance + if (x.shape().num_elements() == 0) { + functor::SetNanFunctor f; + f(context->eigen_device(), batch_mean->flat()); + f(context->eigen_device(), batch_var->flat()); + return; + } + Tensor x_maybe_transformed = x; Tensor x_transformed; Tensor y_transformed; @@ -656,6 +665,14 @@ class FusedBatchNormGradOp : public OpKernel { context, context->allocate_output(4, TensorShape({}), &placeholder_2)); FillZeros(placeholder_2); + // If input is empty, set gradients w.r.t scale/offset to zero. + if (x.shape().num_elements() == 0) { + functor::SetZeroFunctor f; + f(context->eigen_device(), scale_backprop->flat()); + f(context->eigen_device(), offset_backprop->flat()); + return; + } + if (is_training_) { functor::FusedBatchNormGrad()( context, y_backprop, x, scale, saved_mean_or_pop_mean, diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc index dc956066ecf..a8484390b92 100644 --- a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc +++ b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc @@ -65,8 +65,15 @@ void InvVarianceToVariance::operator()(const Eigen::GpuDevice& d, epsilon, sample_size, variance); } +template +void SetNanFunctor::operator()(const Eigen::GpuDevice& d, + typename TTypes::Flat out) { + To32Bit(out).device(d) = To32Bit(out).constant(Eigen::NumTraits::quiet_NaN()); +} + template class VarianceToInvVariance; template class InvVarianceToVariance; +template class SetNanFunctor; } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/fused_batch_norm_op.h b/tensorflow/core/kernels/fused_batch_norm_op.h index 3af104bf954..d6c68df9861 100644 --- a/tensorflow/core/kernels/fused_batch_norm_op.h +++ b/tensorflow/core/kernels/fused_batch_norm_op.h @@ -49,6 +49,12 @@ struct InvVarianceToVariance { int channels, T* variance); }; +// This function sets a GPU tensor to NaNs. +template +struct SetNanFunctor { + void operator()(const Eigen::GpuDevice& d, typename TTypes::Flat out); +}; + #endif // GOOGLE_CUDA // Functor used by FusedBatchNormGradOp to do the computations when diff --git a/tensorflow/core/kernels/matrix_inverse_op.cc b/tensorflow/core/kernels/matrix_inverse_op.cc index c61a091c7b7..52afdd15ba6 100644 --- a/tensorflow/core/kernels/matrix_inverse_op.cc +++ b/tensorflow/core/kernels/matrix_inverse_op.cc @@ -210,7 +210,7 @@ class MatrixInverseOpGpu : public AsyncOpKernel { done); } } else { - // For large matrices, we wompute the inverse of each matrix in the batch + // For large matrices, we compute the inverse of each matrix in the batch // sequentially. Here we use the cuSolver methods GETRF/GETRS because they // are MUCH faster than their batched cuBlas equivalents for large // matrices. 
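Editor's note: the FillFunctor, SetZeroFunctor and SetNanFunctor specializations introduced above, and the empty-input guards added to the convolution-gradient and fused batch norm kernels, largely reduce to one Eigen idiom: assign a constant (nullary) expression to the output's flat view, routed through the target device. The standalone sketch below shows that idiom on Eigen's default single-threaded device; it is an editorial illustration, not part of the patch, and assumes only the unsupported Eigen Tensor module. The kernels themselves route the identical expression through .device(d) with a ThreadPoolDevice, SyclDevice or GpuDevice.

// fill_pattern_sketch.cc -- editorial illustration only.
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 1> out(8);
  // FillFunctor: broadcast a scalar value into the whole output.
  out = out.constant(3.0f);
  // SetZeroFunctor: same expression with T(0).
  out = out.constant(0.0f);
  // SetNanFunctor: same expression with a quiet NaN, as used for the
  // mean/variance outputs of an empty batch.
  out = out.constant(Eigen::NumTraits<float>::quiet_NaN());
  std::cout << out << std::endl;  // prints eight NaN values
  return 0;
}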
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc index ac90f67ce0b..6a52a15c931 100644 --- a/tensorflow/core/kernels/pooling_ops_common.cc +++ b/tensorflow/core/kernels/pooling_ops_common.cc @@ -147,6 +147,9 @@ void DnnPoolingOp::Compute( Tensor* tensor_out = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, tensor_out_shape, &tensor_out)); + if (tensor_in.shape().num_elements() == 0) { + return; + } PoolParameters params{context, size, stride, padding, data_format, tensor_in.shape()}; @@ -247,6 +250,9 @@ void DnnPoolingGradOp::Compute( Tensor* input_backprop = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, tensor_in_shape, &input_backprop)); + if (tensor_in_shape.num_elements() == 0) { + return; + } PoolParameters params{context, size, stride, padding, data_format, tensor_in_shape}; diff --git a/tensorflow/core/kernels/record_input_op.cc b/tensorflow/core/kernels/record_input_op.cc index 1448feb6f63..0c053490a0a 100644 --- a/tensorflow/core/kernels/record_input_op.cc +++ b/tensorflow/core/kernels/record_input_op.cc @@ -38,6 +38,7 @@ class RecordInputOp : public OpKernel { GETATTR(int64, batch_size); GETATTR(string, compression_type); #undef GETATTR + OP_REQUIRES_OK(ctx, ctx->GetAttr("compression_type", &compression_type)); RecordYielder::Options yopts; yopts.file_pattern = file_pattern; diff --git a/tensorflow/core/kernels/where_op.cc b/tensorflow/core/kernels/where_op.cc index de0c66ad238..f92c4ed17af 100644 --- a/tensorflow/core/kernels/where_op.cc +++ b/tensorflow/core/kernels/where_op.cc @@ -131,7 +131,7 @@ class WhereCPUOp : public OpKernel { OP_REQUIRES( context, input.dtype() != DT_HALF, errors::Unimplemented("No WhereOp available for float16/half type on " - "GPU; dying in CPU WhereOp to avoid silently " + "CPU; dying in CPU WhereOp to avoid silently " "creating costly copies from device.")); const int input_dims = input.dims(); diff --git a/tensorflow/core/ops/bitwise_ops.cc b/tensorflow/core/ops/bitwise_ops.cc index 2889953bdbc..1d28516888b 100644 --- a/tensorflow/core/ops/bitwise_ops.cc +++ b/tensorflow/core/ops/bitwise_ops.cc @@ -38,7 +38,7 @@ computation is performed on the underlying representation of x. .Output("z: T") \ .SetIsCommutative() \ .Attr("T: {int8, int16, int32, int64, uint8, uint16, uint32, uint64}") \ - .SetShapeFn(shape_inference::UnchangedShape) + .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn) REGISTER_OP("PopulationCount") .Input("x: T") diff --git a/tensorflow/core/platform/default/logging.cc b/tensorflow/core/platform/default/logging.cc index ebdd4b624aa..82bd69f9ca4 100644 --- a/tensorflow/core/platform/default/logging.cc +++ b/tensorflow/core/platform/default/logging.cc @@ -114,6 +114,8 @@ int64 LogLevelStrToInt(const char* tf_env_var_val) { return level; } +} // namespace + int64 MinLogLevelFromEnv() { const char* tf_env_var_val = getenv("TF_CPP_MIN_LOG_LEVEL"); return LogLevelStrToInt(tf_env_var_val); @@ -124,8 +126,6 @@ int64 MinVLogLevelFromEnv() { return LogLevelStrToInt(tf_env_var_val); } -} // namespace - LogMessage::~LogMessage() { // Read the min log level once during the first call to logging. 
static int64 min_log_level = MinLogLevelFromEnv(); diff --git a/tensorflow/core/platform/default/logging.h b/tensorflow/core/platform/default/logging.h index d5f7350cdd8..40c260f2366 100644 --- a/tensorflow/core/platform/default/logging.h +++ b/tensorflow/core/platform/default/logging.h @@ -305,6 +305,10 @@ T&& CheckNotNull(const char* file, int line, const char* exprtext, T&& t) { return std::forward(t); } +int64 MinLogLevelFromEnv(); + +int64 MinVLogLevelFromEnv(); + } // namespace internal } // namespace tensorflow diff --git a/tensorflow/core/platform/s3/BUILD b/tensorflow/core/platform/s3/BUILD index b7bc1a11d65..2cd5f877c9f 100644 --- a/tensorflow/core/platform/s3/BUILD +++ b/tensorflow/core/platform/s3/BUILD @@ -28,6 +28,8 @@ filegroup( tf_cc_binary( name = "s3_file_system.so", srcs = [ + "aws_logging.cc", + "aws_logging.h", "s3_crypto.cc", "s3_crypto.h", "s3_file_system.cc", @@ -66,6 +68,22 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "aws_logging", + srcs = [ + "aws_logging.cc", + ], + hdrs = [ + "aws_logging.h", + ], + deps = [ + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "@aws//:aws", + ], + alwayslink = 1, +) + cc_library( name = "s3_file_system", srcs = [ @@ -75,6 +93,7 @@ cc_library( "s3_file_system.h", ], deps = [ + ":aws_logging", ":s3_crypto", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", diff --git a/tensorflow/core/platform/s3/aws_logging.cc b/tensorflow/core/platform/s3/aws_logging.cc new file mode 100644 index 00000000000..41b854d6343 --- /dev/null +++ b/tensorflow/core/platform/s3/aws_logging.cc @@ -0,0 +1,121 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/core/platform/s3/aws_logging.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mutex.h" + +#include +#include +#include + +#include + +namespace tensorflow { + +AWSLogSystem::AWSLogSystem(Aws::Utils::Logging::LogLevel log_level) + : log_level_(log_level) {} + +void AWSLogSystem::Log(Aws::Utils::Logging::LogLevel log_level, const char* tag, + const char* format, ...) 
{ + std::va_list args; + va_start(args, format); + + const string s = strings::Printf(format, args); + + va_end(args); + + LogMessage(log_level, s); +} + +void AWSLogSystem::LogStream(Aws::Utils::Logging::LogLevel log_level, + const char* tag, + const Aws::OStringStream& message_stream) { + LogMessage(log_level, message_stream.rdbuf()->str().c_str()); +} + +void AWSLogSystem::LogMessage(Aws::Utils::Logging::LogLevel log_level, + const std::string& message) { + switch (log_level) { + case Aws::Utils::Logging::LogLevel::Info: + LOG(INFO) << message; + break; + case Aws::Utils::Logging::LogLevel::Warn: + LOG(WARNING) << message; + break; + case Aws::Utils::Logging::LogLevel::Error: + LOG(ERROR) << message; + break; + case Aws::Utils::Logging::LogLevel::Fatal: + LOG(FATAL) << message; + break; + default: + LOG(ERROR) << message; + break; + } +} + +namespace { +static const char* kAWSLoggingTag = "AWSLogging"; + +Aws::Utils::Logging::LogLevel ParseLogLevelFromEnv() { + Aws::Utils::Logging::LogLevel log_level = Aws::Utils::Logging::LogLevel::Info; + + const int64_t level = tensorflow::internal::MinLogLevelFromEnv(); + + switch (level) { + case INFO: + log_level = Aws::Utils::Logging::LogLevel::Info; + break; + case WARNING: + log_level = Aws::Utils::Logging::LogLevel::Warn; + break; + case ERROR: + log_level = Aws::Utils::Logging::LogLevel::Error; + break; + case FATAL: + log_level = Aws::Utils::Logging::LogLevel::Fatal; + break; + default: + log_level = Aws::Utils::Logging::LogLevel::Info; + break; + } + + return log_level; +} +} + +static bool initialized = false; +static mutex s3_logging_mutex(LINKER_INITIALIZED); +void AWSLogSystem::InitializeAWSLogging() { + std::lock_guard s3_logging_lock(s3_logging_mutex); + if (!initialized) { + Aws::Utils::Logging::InitializeAWSLogging( + Aws::MakeShared(kAWSLoggingTag, ParseLogLevelFromEnv())); + initialized = true; + return; + } +} + +void AWSLogSystem::ShutdownAWSLogging() { + std::lock_guard s3_logging_lock(s3_logging_mutex); + if (initialized) { + Aws::Utils::Logging::ShutdownAWSLogging(); + initialized = false; + return; + } +} + +} // namespace tensorflow diff --git a/tensorflow/core/platform/s3/aws_logging.h b/tensorflow/core/platform/s3/aws_logging.h new file mode 100644 index 00000000000..b0da8f3c835 --- /dev/null +++ b/tensorflow/core/platform/s3/aws_logging.h @@ -0,0 +1,68 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_S3_S3_LOGGING_H_ +#define TENSORFLOW_CONTRIB_S3_S3_LOGGING_H_ + +#include +#include + +#include +#include +#include "tensorflow/core/platform/default/logging.h" + +namespace tensorflow { + +class AWSLogSystem : public Aws::Utils::Logging::LogSystemInterface { + public: + static void InitializeAWSLogging(); + static void ShutdownAWSLogging(); + + explicit AWSLogSystem(Aws::Utils::Logging::LogLevel log_level); + virtual ~AWSLogSystem() = default; + + // Gets the currently configured log level. + virtual Aws::Utils::Logging::LogLevel GetLogLevel(void) const override { + return log_level_; + } + + // Set a new log level. This has the immediate effect of changing the log. + void SetLogLevel(Aws::Utils::Logging::LogLevel log_level) { + log_level_.store(log_level); + } + + // Does a printf style output to ProcessFormattedStatement. Don't use this, + // it's unsafe. See LogStream. + // Since non-static C++ methods have an implicit this argument, + // TF_PRINTF_ATTRIBUTE should be counted from two (vs. one). + virtual void Log(Aws::Utils::Logging::LogLevel log_level, const char* tag, + const char* format, ...) override TF_PRINTF_ATTRIBUTE(4, 5); + + // Writes the stream to ProcessFormattedStatement. + virtual void LogStream(Aws::Utils::Logging::LogLevel log_level, + const char* tag, + const Aws::OStringStream& messageStream) override; + + private: + void LogMessage(Aws::Utils::Logging::LogLevel log_level, + const string& message); + std::atomic log_level_; + + TF_DISALLOW_COPY_AND_ASSIGN(AWSLogSystem); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CONTRIB_S3_S3_LOGGING_H_ diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc index 682ad97eec3..397f26ec0bf 100644 --- a/tensorflow/core/platform/s3/s3_file_system.cc +++ b/tensorflow/core/platform/s3/s3_file_system.cc @@ -15,10 +15,13 @@ limitations under the License. #include "tensorflow/core/platform/s3/s3_file_system.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/s3/aws_logging.h" #include "tensorflow/core/platform/s3/s3_crypto.h" #include #include +#include +#include #include #include #include @@ -33,6 +36,7 @@ limitations under the License. 
namespace tensorflow { +namespace { static const char* kS3FileSystemAllocationTag = "S3FileSystemAllocation"; static const size_t kS3ReadAppendableFileBufferSize = 1024 * 1024; static const int kS3GetChildrenMaxKeys = 100; @@ -226,7 +230,11 @@ class S3ReadOnlyMemoryRegion : public ReadOnlyMemoryRegion { uint64 length_; }; +} // namespace + S3FileSystem::S3FileSystem() { + AWSLogSystem::InitializeAWSLogging(); + Aws::SDKOptions options; options.cryptoOptions.sha256Factory_create_fn = []() { return Aws::MakeShared(S3CryptoAllocationTag); @@ -240,6 +248,8 @@ S3FileSystem::S3FileSystem() { S3FileSystem::~S3FileSystem() { Aws::SDKOptions options; Aws::ShutdownAPI(options); + + AWSLogSystem::ShutdownAWSLogging(); } Status S3FileSystem::NewRandomAccessFile( diff --git a/tensorflow/docs_src/get_started/estimator.md b/tensorflow/docs_src/get_started/estimator.md index 790de6679b0..f5a8419d2f2 100644 --- a/tensorflow/docs_src/get_started/estimator.md +++ b/tensorflow/docs_src/get_started/estimator.md @@ -232,10 +232,10 @@ data and target values for the training set, respectively, and `test_set.data` and `test_set.target` contain feature data and target values for the test set. Later on, in -["Fit the DNNClassifier to the Iris Training Data,"](#fit-dnnclassifier) +["Fit the DNNClassifier to the Iris Training Data,"](#fit_the_dnnclassifier_to_the_iris_training_data) you'll use `training_set.data` and `training_set.target` to train your model, and in -["Evaluate Model Accuracy,"](#evaluate-accuracy) you'll use `test_set.data` and +["Evaluate Model Accuracy,"](#evaluate_model_accuracy) you'll use `test_set.data` and `test_set.target`. But first, you'll construct your model in the next section. ## Construct a Deep Neural Network Classifier diff --git a/tensorflow/docs_src/performance/datasets_performance.md b/tensorflow/docs_src/performance/datasets_performance.md index 5efe9719aab..dd55849f8e4 100644 --- a/tensorflow/docs_src/performance/datasets_performance.md +++ b/tensorflow/docs_src/performance/datasets_performance.md @@ -177,7 +177,7 @@ dataset = dataset.batch(batch_size=FLAGS.batch_size) to: ``` -dataset = dataset.apply(tf.data.contrib.map_and_batch( +dataset = dataset.apply(tf.contrib.data.map_and_batch( map_func=parse_fn, batch_size=FLAGS.batch_size)) ``` diff --git a/tensorflow/docs_src/programmers_guide/faq.md b/tensorflow/docs_src/programmers_guide/faq.md index 67ed0a9a607..809d74248ea 100644 --- a/tensorflow/docs_src/programmers_guide/faq.md +++ b/tensorflow/docs_src/programmers_guide/faq.md @@ -300,7 +300,7 @@ functions, methods, and properties. We also adhere to the [Google Python style guide](https://google.github.io/styleguide/pyguide.html). The TensorFlow C++ code base adheres to the -[Google C++ style guide](http://google.github.io/styleguide/cppguide.html). +[Google C++ style guide](https://google.github.io/styleguide/cppguide.html). (* With one exception: we use 2-space indentation instead of 4-space indentation.) diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md index 984058297f9..2b4896c3810 100644 --- a/tensorflow/docs_src/programmers_guide/graphs.md +++ b/tensorflow/docs_src/programmers_guide/graphs.md @@ -487,7 +487,7 @@ subgraph inside. ![](../images/mnist_deep.png) For more information about visualizing your TensorFlow application with -TensorBoard, see the [TensorBoard tutorial](TODO). +TensorBoard, see the [TensorBoard tutorial](../get_started/summaries_and_tensorboard.md). 
## Programming with multiple graphs diff --git a/tensorflow/docs_src/tutorials/audio_recognition.md b/tensorflow/docs_src/tutorials/audio_recognition.md index 336f4d9c18b..7d79f433c41 100644 --- a/tensorflow/docs_src/tutorials/audio_recognition.md +++ b/tensorflow/docs_src/tutorials/audio_recognition.md @@ -246,7 +246,7 @@ results as in your server testing. The demo app updates its UI list of results automatically based on the labels text file you copy into assets alongside your frozen graph, which means you can easily try out different models without needing to make any code changes. You -will need to updaye `LABEL_FILENAME` and `MODEL_FILENAME` to point to the files +will need to update `LABEL_FILENAME` and `MODEL_FILENAME` to point to the files you've added if you change the paths though. ## How does this Model Work? diff --git a/tensorflow/examples/label_image/label_image.py b/tensorflow/examples/label_image/label_image.py index 39d09813375..d62b73384c4 100644 --- a/tensorflow/examples/label_image/label_image.py +++ b/tensorflow/examples/label_image/label_image.py @@ -51,7 +51,7 @@ def read_tensor_from_image_file(file_name, input_height=299, input_width=299, image_reader = tf.image.decode_jpeg(file_reader, channels = 3, name='jpeg_reader') float_caster = tf.cast(image_reader, tf.float32) - dims_expander = tf.expand_dims(float_caster, 0); + dims_expander = tf.expand_dims(float_caster, 0) resized = tf.image.resize_bilinear(dims_expander, [input_height, input_width]) normalized = tf.divide(tf.subtract(resized, [input_mean]), [input_std]) sess = tf.Session() @@ -118,8 +118,8 @@ if __name__ == "__main__": input_name = "import/" + input_layer output_name = "import/" + output_layer - input_operation = graph.get_operation_by_name(input_name); - output_operation = graph.get_operation_by_name(output_name); + input_operation = graph.get_operation_by_name(input_name) + output_operation = graph.get_operation_by_name(output_name) with tf.Session(graph=graph) as sess: results = sess.run(output_operation.outputs[0], diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD index c0563da06d9..bd60f151de2 100644 --- a/tensorflow/java/BUILD +++ b/tensorflow/java/BUILD @@ -97,10 +97,27 @@ tf_java_op_gen_srcjar( # file before making it an executable. See tf_java_op_gen_srcjar(). 
cc_library( name = "java_op_gen_tool", - srcs = glob([ - "src/gen/cc/*.h", - "src/gen/cc/*.cc", - ]), + srcs = [ + "src/gen/cc/op_gen_main.cc", + ], + copts = tf_copts(), + deps = [ + ":java_op_gen_lib", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + ], +) + +cc_library( + name = "java_op_gen_lib", + srcs = [ + "src/gen/cc/op_generator.cc", + ], + hdrs = [ + "src/gen/cc/java_defs.h", + "src/gen/cc/op_generator.h", + ], copts = tf_copts(), deps = [ "//tensorflow/core:framework", @@ -280,21 +297,6 @@ tf_java_test( ], ) -#java_test( -# name = "OperatorProcessorTest", -# size = "small", -# srcs = ["src/test/java/org/tensorflow/processor/OperatorProcessorTest.java"], -# javacopts = JAVACOPTS, -# resources = [":processor_test_resources"], -# test_class = "org.tensorflow.processor.OperatorProcessorTest", -# deps = [ -# ":processor_library", -# "//third_party/java/junit", -# "@com_google_testing_compile", -# "@com_google_truth", -# ], -#) - filegroup( name = "processor_test_resources", srcs = glob([ diff --git a/tensorflow/java/src/gen/cc/java_defs.h b/tensorflow/java/src/gen/cc/java_defs.h new file mode 100644 index 00000000000..615cdc165b3 --- /dev/null +++ b/tensorflow/java/src/gen/cc/java_defs.h @@ -0,0 +1,273 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_JAVA_SRC_GEN_CC_JAVA_DEFS_H_ +#define TENSORFLOW_JAVA_SRC_GEN_CC_JAVA_DEFS_H_ + +#include +#include +#include + +#include "tensorflow/core/platform/env.h" + +namespace tensorflow { +namespace java { + +// An enumeration of different modifiers commonly used in Java +enum Modifier { + PUBLIC = (1 << 0), + PROTECTED = (1 << 1), + PRIVATE = (1 << 2), + STATIC = (1 << 3), + FINAL = (1 << 4), +}; + +class Annotation; + +// A definition of any kind of Java type (classes, interfaces...) +// +// Note that most of the data fields of this class are only useful in specific +// contexts and are not required in many cases. For example, annotations and +// supertypes are only useful when declaring a type. 
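// Editor's note (illustrative usage, not part of the original patch; the
// class, package and op names are made up): the factory methods below
// compose fluently. For example,
//   Type::ListOf(Type::Class("Operand", "org.tensorflow"))
// models java.util.List<Operand>, and
//   Type::Class("Add", "org.tensorflow.op.math")
//       .add_parameter(Type::Generic("T"))
//       .add_supertype(Type::Class("PrimitiveOp", "org.tensorflow.op"))
// describes a generic class Add<T> whose superclass is PrimitiveOp.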
+class Type { + public: + enum Kind { + PRIMITIVE, CLASS, INTERFACE, ENUM, GENERIC, ANNOTATION + }; + static const Type Byte() { + return Type(Type::PRIMITIVE, "byte"); + } + static const Type Char() { + return Type(Type::PRIMITIVE, "char"); + } + static const Type Short() { + return Type(Type::PRIMITIVE, "short"); + } + static const Type Int() { + return Type(Type::PRIMITIVE, "int"); + } + static const Type Long() { + return Type(Type::PRIMITIVE, "long"); + } + static const Type Float() { + return Type(Type::PRIMITIVE, "float"); + } + static const Type Double() { + return Type(Type::PRIMITIVE, "double"); + } + static const Type Boolean() { + return Type(Type::PRIMITIVE, "boolean"); + } + static const Type Void() { + // For simplicity, we consider 'void' as a primitive type, like the Java + // Reflection API does + return Type(Type::PRIMITIVE, "void"); + } + static Type Class(const string& name, const string& package = "") { + return Type(Type::CLASS, name, package); + } + static Type Interface(const string& name, const string& package = "") { + return Type(Type::INTERFACE, name, package); + } + static Type Enum(const string& name, const string& package = "") { + return Type(Type::ENUM, name, package); + } + static Type Generic(const string& name = "") { + return Type(Type::GENERIC, name); + } + static Type ClassOf(const Type& type) { + return Class("Class").add_parameter(type); + } + static Type ListOf(const Type& type) { + return Interface("List", "java.util").add_parameter(type); + } + static Type IterableOf(const Type& type) { + return Interface("Iterable").add_parameter(type); + } + const Kind& kind() const { return kind_; } + const string& name() const { return name_; } + const string& package() const { return package_; } + const string& description() const { return description_; } + Type& description(const string& description) { + description_ = description; + return *this; + } + const std::vector& parameters() const { return parameters_; } + Type& add_parameter(const Type& parameter) { + parameters_.push_back(parameter); + return *this; + } + const std::vector& annotations() const { return annotations_; } + Type& add_annotation(const Annotation& annotation) { + annotations_.push_back(annotation); + return *this; + } + const std::deque& supertypes() const { return supertypes_; } + Type& add_supertype(const Type& type) { + if (type.kind_ == CLASS) { + supertypes_.push_front(type); // keep superclass at the front of the list + } else if (type.kind_ == INTERFACE) { + supertypes_.push_back(type); + } + return *this; + } + // Returns true if "type" is of a known collection type (only a few for now) + bool IsCollection() const { + return name_ == "List" || name_ == "Iterable"; + } + // Returns true if this instance is a wildcard () + bool IsWildcard() const { + return kind_ == GENERIC && name_.empty(); + } + + protected: + Type(Kind kind, const string& name, const string& package = "") + : kind_(kind), name_(name), package_(package) {} + + private: + Kind kind_; + string name_; + string package_; + string description_; + std::vector parameters_; + std::vector annotations_; + std::deque supertypes_; +}; + +// Definition of a Java annotation +// +// This class only defines the usage of an annotation in a specific context, +// giving optionally a set of attributes to initialize. 
+class Annotation : public Type { + public: + static Annotation Create(const string& type_name, const string& pkg = "") { + return Annotation(type_name, pkg); + } + const string& attributes() const { return attributes_; } + Annotation& attributes(const string& attributes) { + attributes_ = attributes; + return *this; + } + + private: + string attributes_; + + Annotation(const string& name, const string& package) + : Type(Kind::ANNOTATION, name, package) {} +}; + +// A definition of a Java variable +// +// This class declares an instance of a type, such as a class field or a +// method argument, which can be documented. +class Variable { + public: + static Variable Create(const string& name, const Type& type) { + return Variable(name, type, false); + } + static Variable Varargs(const string& name, const Type& type) { + return Variable(name, type, true); + } + const string& name() const { return name_; } + const Type& type() const { return type_; } + bool variadic() const { return variadic_; } + const string& description() const { return description_; } + Variable& description(const string& description) { + description_ = description; + return *this; + } + private: + string name_; + Type type_; + bool variadic_; + string description_; + + Variable(const string& name, const Type& type, bool variadic) + : name_(name), type_(type), variadic_(variadic) {} +}; + +// A definition of a Java class method +// +// This class defines the signature of a method, including its name, return +// type and arguments. +class Method { + public: + static Method Create(const string& name, const Type& return_type) { + return Method(name, return_type, false); + } + static Method ConstructorFor(const Type& clazz) { + return Method(clazz.name(), clazz, true); + } + bool constructor() const { return constructor_; } + const string& name() const { return name_; } + const Type& return_type() const { return return_type_; } + const string& description() const { return description_; } + Method& description(const string& description) { + description_ = description; + return *this; + } + const string& return_description() const { return return_description_; } + Method& return_description(const string& description) { + return_description_ = description; + return *this; + } + const std::vector& arguments() const { return arguments_; } + Method& add_arguments(const std::vector& args) { + arguments_.insert(arguments_.cend(), args.cbegin(), args.cend()); + return *this; + } + Method& add_argument(const Variable& var) { + arguments_.push_back(var); + return *this; + } + const std::vector& annotations() const { return annotations_; } + Method& add_annotation(const Annotation& annotation) { + annotations_.push_back(annotation); + return *this; + } + + private: + string name_; + Type return_type_; + bool constructor_; + string description_; + string return_description_; + std::vector arguments_; + std::vector annotations_; + + Method(const string& name, const Type& return_type, bool constructor) + : name_(name), return_type_(return_type), constructor_(constructor) {} +}; + +// A piece of code to read from a file. 
+class Snippet { + public: + static Snippet Create(const string& fname, Env* env = Env::Default()) { + return Snippet(fname, env); + } + const string& data() const { return data_; } + + private: + string data_; + + Snippet(const string& fname, Env* env) { + TF_CHECK_OK(ReadFileToString(env, fname, &data_)); + } +}; + +} // namespace java +} // namespace tensorflow + +#endif // TENSORFLOW_JAVA_SRC_GEN_CC_JAVA_DEFS_H_ diff --git a/tensorflow/java/src/gen/cc/op_gen_main.cc b/tensorflow/java/src/gen/cc/op_gen_main.cc index a7c66dda893..bea99f3d7f6 100644 --- a/tensorflow/java/src/gen/cc/op_gen_main.cc +++ b/tensorflow/java/src/gen/cc/op_gen_main.cc @@ -25,7 +25,7 @@ #include "tensorflow/java/src/gen/cc/op_generator.h" namespace tensorflow { -namespace op_gen { +namespace java { const char kUsageHeader[] = "\n\nGenerator of operation wrappers in Java.\n\n" @@ -51,7 +51,7 @@ const char kUsageHeader[] = "Finally, the '--base_package' overrides the default parent package " "under which the generated subpackage and classes are to be located.\n\n"; -} // namespace op_gen +} // namespace java } // namespace tensorflow int main(int argc, char* argv[]) { @@ -67,13 +67,13 @@ int main(int argc, char* argv[]) { tensorflow::Flag( "base_package", &base_package, "Package parent to the generated subpackage and classes")}; - tensorflow::string usage = tensorflow::op_gen::kUsageHeader; + tensorflow::string usage = tensorflow::java::kUsageHeader; usage += tensorflow::Flags::Usage(argv[0], flag_list); bool parsed_flags_ok = tensorflow::Flags::Parse(&argc, argv, flag_list); tensorflow::port::InitMain(usage.c_str(), &argc, &argv); QCHECK(parsed_flags_ok && !lib_name.empty() && !output_dir.empty()) << usage; - tensorflow::OpGenerator generator; + tensorflow::java::OpGenerator generator; tensorflow::OpList ops; tensorflow::OpRegistry::Global()->Export(true, &ops); tensorflow::Status status = diff --git a/tensorflow/java/src/gen/cc/op_generator.cc b/tensorflow/java/src/gen/cc/op_generator.cc index df130c32e6a..def06baf2db 100644 --- a/tensorflow/java/src/gen/cc/op_generator.cc +++ b/tensorflow/java/src/gen/cc/op_generator.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/java/src/gen/cc/op_generator.h" namespace tensorflow { +namespace java { namespace { string CamelCase(const string& str, char delimiter, bool upper) { @@ -63,4 +64,5 @@ Status OpGenerator::Run(const OpList& ops, const string& lib_name, return Status::OK(); } +} // namespace java } // namespace tensorflow diff --git a/tensorflow/java/src/gen/cc/op_generator.h b/tensorflow/java/src/gen/cc/op_generator.h index eec1082b516..4b55ed3ed94 100644 --- a/tensorflow/java/src/gen/cc/op_generator.h +++ b/tensorflow/java/src/gen/cc/op_generator.h @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/core/platform/env.h" namespace tensorflow { +namespace java { /// \brief A generator of Java operation wrappers. /// @@ -46,6 +47,7 @@ class OpGenerator { Env* env; }; +} // namespace java } // namespace tensorflow #endif // TENSORFLOW_JAVA_SRC_GEN_CC_OP_GENERATOR_H_ diff --git a/tensorflow/java/src/gen/gen_ops.bzl b/tensorflow/java/src/gen/gen_ops.bzl index 28f0908ec4a..a6650fc4ea0 100644 --- a/tensorflow/java/src/gen/gen_ops.bzl +++ b/tensorflow/java/src/gen/gen_ops.bzl @@ -52,7 +52,7 @@ def tf_java_op_gen_srcjar(name, # Generate a source archive containing generated code for these ops. 
gen_srcjar = out_dir + name + ".srcjar" - gen_cmds += ["$(location @local_jdk//:jar) cMf $(location :" + gen_srcjar + ") -C $(@D) ."] + gen_cmds += ["$(location @local_jdk//:jar) cMf $(location :" + gen_srcjar + ") -C $(@D) src"] gen_tools += ["@local_jdk//:jar"] + ["@local_jdk//:jdk"] gen_tools += tf_binary_additional_srcs() native.genrule( diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index eba9637bdce..2fbcda1e6bd 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -803,7 +803,7 @@ class Dataset(object): ```python # Preprocess 4 files concurrently, and interleave blocks of 16 records from # each file. - filenames = ["/var/data/file1.txt", "/var/data/file2.txt", ..."] + filenames = ["/var/data/file1.txt", "/var/data/file2.txt", ...] dataset = (Dataset.from_tensor_slices(filenames) .interleave(lambda x: TextLineDataset(x).map(parse_fn, num_parallel_calls=1), diff --git a/tensorflow/python/debug/examples/examples_test.sh b/tensorflow/python/debug/examples/examples_test.sh index 25916f1903c..2df6c0b6a27 100755 --- a/tensorflow/python/debug/examples/examples_test.sh +++ b/tensorflow/python/debug/examples/examples_test.sh @@ -23,6 +23,9 @@ set -e +# Filter out LOG(INFO) +export TF_CPP_MIN_LOG_LEVEL=1 + IS_VIRTUALENV=0 PYTHON_BIN_PATH="" while true; do diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py index c72d37b4420..bf175cbe01e 100644 --- a/tensorflow/python/estimator/estimator.py +++ b/tensorflow/python/estimator/estimator.py @@ -688,9 +688,10 @@ class Estimator(object): Raises: ValueError: if input_fn takes invalid arguments. """ - del mode # unused input_fn_args = util.fn_args(input_fn) kwargs = {} + if 'mode' in input_fn_args: + kwargs['mode'] = mode if 'params' in input_fn_args: kwargs['params'] = self.params if 'config' in input_fn_args: diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py index 58d0cb0018e..ed1676a92de 100644 --- a/tensorflow/python/estimator/estimator_test.py +++ b/tensorflow/python/estimator/estimator_test.py @@ -420,6 +420,7 @@ class EstimatorTrainTest(test.TestCase): self.assertEqual(1, model_fn_call_count[0]) def test_callable_input_fn(self): + expected_mode = model_fn_lib.ModeKeys.TRAIN expected_params = {'batch_size': 10} expected_config = run_config.RunConfig().replace(tf_random_seed=4321) input_fn_call_count = [0] @@ -432,8 +433,9 @@ class EstimatorTrainTest(test.TestCase): class InputFn(object): - def __call__(self, params, config): + def __call__(self, mode, params, config): input_fn_call_count[0] += 1 + test_self.assertEqual(expected_mode, mode) test_self.assertEqual(expected_params, params) test_self.assertEqual(4321, config.tf_random_seed) return dummy_input_fn() @@ -446,6 +448,7 @@ class EstimatorTrainTest(test.TestCase): self.assertEqual(1, input_fn_call_count[0]) def test_input_fn_args(self): + expected_mode = model_fn_lib.ModeKeys.TRAIN expected_params = {'batch_size': 10} expected_config = run_config.RunConfig().replace(tf_random_seed=4321) input_fn_call_count = [0] @@ -454,8 +457,9 @@ class EstimatorTrainTest(test.TestCase): del params, config return model_fn_global_step_incrementer(features, labels, mode) - def _input_fn(params, config): + def _input_fn(mode, params, config): input_fn_call_count[0] += 1 + self.assertEqual(expected_mode, mode) self.assertEqual(expected_params, params) self.assertEqual(4321, config.tf_random_seed) return 
dummy_input_fn() @@ -992,6 +996,7 @@ class EstimatorDatasetIntegrationTest(test.TestCase): class EstimatorEvaluateTest(test.TestCase): def test_input_fn_args(self): + expected_mode = model_fn_lib.ModeKeys.EVAL expected_params = {'batch_size': 10} expected_config = run_config.RunConfig().replace(tf_random_seed=4321) input_fn_call_count = [0] @@ -1000,8 +1005,9 @@ class EstimatorEvaluateTest(test.TestCase): del params, config return model_fn_global_step_incrementer(features, labels, mode) - def _input_fn(params, config): + def _input_fn(mode, params, config): input_fn_call_count[0] += 1 + self.assertEqual(expected_mode, mode) self.assertEqual(expected_params, params) self.assertEqual(4321, config.tf_random_seed) return dummy_input_fn() @@ -1265,6 +1271,7 @@ class EstimatorEvaluateTest(test.TestCase): class EstimatorPredictTest(test.TestCase): def test_input_fn_args(self): + expected_mode = model_fn_lib.ModeKeys.PREDICT expected_params = {'batch_size': 10} expected_config = run_config.RunConfig().replace(tf_random_seed=4321) input_fn_call_count = [0] @@ -1277,8 +1284,9 @@ class EstimatorPredictTest(test.TestCase): train_op=state_ops.assign_add(training.get_global_step(), 1), predictions=constant_op.constant([[10.]])) - def _input_fn(params, config): + def _input_fn(mode, params, config): input_fn_call_count[0] += 1 + self.assertEqual(expected_mode, mode) self.assertEqual(expected_params, params) self.assertEqual(4321, config.tf_random_seed) return dummy_input_fn() diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py index f789bcbbac1..52fb1d39ae2 100644 --- a/tensorflow/python/estimator/training.py +++ b/tensorflow/python/estimator/training.py @@ -677,11 +677,19 @@ class _TrainingExecutor(object): 'RunConfig or set the TF_CONFIG environment variable.') logging.info('Start Tensorflow server.') + + if config.session_config is None: + session_config=config_pb2.ConfigProto(log_device_placement=False) + else: + session_config=config_pb2.ConfigProto( + log_device_placement=False, + gpu_options=config.session_config.gpu_options) + server = server_lib.Server( config.cluster_spec, job_name=config.task_type, task_index=config.task_id, - config=config_pb2.ConfigProto(log_device_placement=False), + config=session_config, start=False) server.start() return server diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py index a7cbc76b879..3e9bd3dade6 100644 --- a/tensorflow/python/kernel_tests/conv_ops_test.py +++ b/tensorflow/python/kernel_tests/conv_ops_test.py @@ -796,6 +796,20 @@ class Conv2DTest(test.TestCase): data_format=data_format, use_gpu=use_gpu) + @test_util.run_in_graph_and_eager_modes() + def testConv2DBackpropFilterWithEmptyInput(self): + expected = [0, 0, 0, 0] + for (data_format, use_gpu) in GetTestConfigs(): + self._RunAndVerifyBackpropFilter( + input_sizes=[0, 2, 3, 1], + filter_sizes=[2, 2, 1, 1], + output_sizes=[0, 1, 2, 1], + strides=[1, 1], + padding="VALID", + expected=expected, + data_format=data_format, + use_gpu=use_gpu) + @test_util.run_in_graph_and_eager_modes() def testConv2D2x2Depth3ValidBackpropFilter(self): expected = [ diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py index 6be8997cabd..5c0ea8ec8ed 100644 --- a/tensorflow/python/kernel_tests/pooling_ops_test.py +++ b/tensorflow/python/kernel_tests/pooling_ops_test.py @@ -361,6 +361,16 @@ class PoolingTest(test.TestCase): expected=expected_output, use_gpu=use_gpu) 
+ def _testAvgPoolEmptyInput(self, use_gpu): + self._VerifyValues( + nn_ops.avg_pool, + input_sizes=[0, 8, 8, 8], + ksize=[1, 3, 3, 1], + strides=[1, 2, 2, 1], + padding="SAME", + expected=[], + use_gpu=use_gpu) + def testAvgPooling(self): for use_gpu in True, False: self._testAvgPoolValidPadding(use_gpu) @@ -371,6 +381,7 @@ class PoolingTest(test.TestCase): self._testAvgPoolSamePadding4(use_gpu) self._testAvgPoolSamePaddingPacket4(use_gpu) self._testAvgPoolSamePaddingPacket8(use_gpu) + self._testAvgPoolEmptyInput(use_gpu) def _testMaxPoolValidPadding(self, use_gpu): expected_output = [13.0, 14.0, 15.0] @@ -543,6 +554,16 @@ class PoolingTest(test.TestCase): use_gpu=use_gpu, v2=v2) + def _testMaxPoolEmptyInput(self, use_gpu): + self._VerifyValues( + gen_nn_ops._max_pool_v2, + input_sizes=[0, 8, 8, 8], + ksize=[1, 3, 3, 1], + strides=[1, 2, 2, 1], + padding="SAME", + expected=[], + use_gpu=use_gpu) + def testMaxPooling(self): for use_gpu in True, False: self._testMaxPoolValidPadding(use_gpu) @@ -551,6 +572,7 @@ class PoolingTest(test.TestCase): self._testMaxPoolValidPaddingUnevenStride(use_gpu) self._testMaxPoolSamePaddingPacket4(use_gpu) self._testMaxPoolSamePaddingPacket8(use_gpu) + self._testMaxPoolEmptyInput(use_gpu) # Tests for DepthwiseMaxPooling on CPU only. def testDepthwiseMaxPool1x1DepthWindow1(self): diff --git a/tensorflow/python/ops/bitwise_ops_test.py b/tensorflow/python/ops/bitwise_ops_test.py index 75eb100a90f..f9b025b787e 100644 --- a/tensorflow/python/ops/bitwise_ops_test.py +++ b/tensorflow/python/ops/bitwise_ops_test.py @@ -135,5 +135,36 @@ class BitwiseOpTest(test_util.TensorFlowTestCase): bitwise_ops.right_shift(lhs, rhs)]) + def testShapeInference(self): + dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64, + dtypes.uint8, dtypes.uint16] + + with self.test_session(use_gpu=True) as sess: + for dtype in dtype_list: + lhs = constant_op.constant([[0], [3], [5]], dtype=dtype) + rhs = constant_op.constant([[1, 2, 4]], dtype=dtype) + + and_tensor = bitwise_ops.bitwise_and(lhs, rhs) + or_tensor = bitwise_ops.bitwise_or(lhs, rhs) + xor_tensor = bitwise_ops.bitwise_xor(lhs, rhs) + ls_tensor = bitwise_ops.left_shift(lhs, rhs) + rs_tensor = bitwise_ops.right_shift(lhs, rhs) + + and_result, or_result, xor_result, ls_result, rs_result = sess.run( + [and_tensor, or_tensor, xor_tensor, ls_tensor, rs_tensor]) + + # Compare shape inference with result + self.assertAllEqual(and_tensor.get_shape().as_list(), and_result.shape) + self.assertAllEqual(and_tensor.get_shape().as_list(), [3, 3]) + self.assertAllEqual(or_tensor.get_shape().as_list(), or_result.shape) + self.assertAllEqual(or_tensor.get_shape().as_list(), [3, 3]) + self.assertAllEqual(xor_tensor.get_shape().as_list(), xor_result.shape) + self.assertAllEqual(xor_tensor.get_shape().as_list(), [3, 3]) + self.assertAllEqual(ls_tensor.get_shape().as_list(), ls_result.shape) + self.assertAllEqual(ls_tensor.get_shape().as_list(), [3, 3]) + self.assertAllEqual(rs_tensor.get_shape().as_list(), rs_result.shape) + self.assertAllEqual(rs_tensor.get_shape().as_list(), [3, 3]) + + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py index f5fdb12b2c8..20c7a9fd662 100644 --- a/tensorflow/python/ops/gradients_impl.py +++ b/tensorflow/python/ops/gradients_impl.py @@ -977,9 +977,7 @@ def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False, `hessians()` adds ops to the graph to output the Hessian matrix of `ys` with respect to 
`xs`. It returns a list of `Tensor` of length `len(xs)` - where each tensor is the Hessian of `sum(ys)`. This function currently - only supports evaluating the Hessian with respect to (a list of) one- - dimensional tensors. + where each tensor is the Hessian of `sum(ys)`. The Hessian is a matrix of second-order partial derivatives of a scalar tensor (see https://en.wikipedia.org/wiki/Hessian_matrix for more details). @@ -1005,31 +1003,32 @@ def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False, 'colocate_gradients_with_ops': colocate_gradients_with_ops, 'gate_gradients': gate_gradients, 'aggregation_method': aggregation_method - } + } # Compute first-order derivatives and iterate for each x in xs. hessians = [] _gradients = gradients(ys, xs, **kwargs) - for i, _gradient, x in zip(range(len(xs)), _gradients, xs): - # Ensure that x is a vector. - check_rank = check_ops.assert_rank( - x, 1, message='Cannot compute Hessian because element %d of `xs` does ' - 'not have rank one.' % i - ) - with ops.control_dependencies([check_rank]): - # Declare an iterator and tensor array loop variables for the gradients. - n = array_ops.size(x) - loop_vars = [ + for gradient, x in zip(_gradients, xs): + # change shape to one-dimension without graph branching + gradient = array_ops.reshape(gradient, [-1]) + + # Declare an iterator and tensor array loop variables for the gradients. + n = array_ops.size(x) + loop_vars = [ array_ops.constant(0, dtypes.int32), tensor_array_ops.TensorArray(x.dtype, n) - ] - # Iterate over all elements of the gradient and compute second order - # derivatives. - _, hessian = control_flow_ops.while_loop( - lambda j, _: j < n, - lambda j, result: (j + 1, - result.write(j, gradients(_gradient[j], x)[0])), - loop_vars - ) + ] + # Iterate over all elements of the gradient and compute second order + # derivatives. + _, hessian = control_flow_ops.while_loop( + lambda j, _: j < n, + lambda j, result: (j + 1, + result.write(j, gradients(gradient[j], x)[0])), + loop_vars + ) - hessians.append(hessian.stack()) + _shape = array_ops.shape(x) + _reshaped_hessian = array_ops.reshape( + hessian.stack(), array_ops.concat((_shape, _shape), 0) + ) + hessians.append(_reshaped_hessian) return hessians diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py index 1211b2e9230..7432c6b02cb 100644 --- a/tensorflow/python/ops/gradients_test.py +++ b/tensorflow/python/ops/gradients_test.py @@ -621,6 +621,45 @@ class HessianTest(test_util.TensorFlowTestCase): with self.assertRaises(ValueError): gradients.hessians(x, x) + def testHessian2D_square_matrix(self): + # Manually compute the Hessian explicitly for a low-dimensional problem + # and check that `hessian` matches. 
Specifically, the Hessian of + # f(x) = 1/2 * x^T * x is H = constant (block identity matrix) + m = 3 + rng = np.random.RandomState([1, 2, 3]) + x_value = rng.randn(m, m).astype("float32") + with self.test_session(use_gpu=True): + x = constant_op.constant(x_value) + x_square = math_ops.reduce_sum( + math_ops.matmul(array_ops.transpose(x), x) * 0.5 + ) + hess = gradients.hessians(x_square, x)[0] + hess_actual = hess.eval() + hess_value = np.bmat([ + [elem*np.ones((m, m)) for elem in vec] + for vec in np.eye(m) + ]).astype("float32") + self.assertAllEqual((m, m, m, m), hess_actual.shape) + self.assertAllClose(hess_value, hess_actual.reshape((m * m, m * m))) + + def testHessian2D_non_square_matrix(self): + m = 3 + n = 4 + rng = np.random.RandomState([1, 2, 3]) + x_value = rng.randn(m, n).astype("float32") + with self.test_session(use_gpu=True): + x = constant_op.constant(x_value) + x_square = math_ops.reduce_sum( + math_ops.matmul(array_ops.transpose(x), x) * 0.5 + ) + hess = gradients.hessians(x_square, x)[0] + hess_actual = hess.eval() + hess_value = np.bmat([ + [elem*np.ones((n, n)) for elem in vec] + for vec in np.eye(m) + ]).astype("float32") + self.assertAllEqual((m, n, m, n), hess_actual.shape) + self.assertAllClose(hess_value, hess_actual.reshape((m * n, m * n))) @test_util.with_c_api class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index 7506ab653bd..7f494db1a1a 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -219,15 +219,17 @@ def random_flip_up_down(image, seed=None): Raises: ValueError: if the shape of `image` not supported. """ - image = ops.convert_to_tensor(image, name='image') - image = control_flow_ops.with_dependencies( - _Check3DImage(image, require_static=False), image) - uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) - mirror_cond = math_ops.less(uniform_random, .5) - result = control_flow_ops.cond(mirror_cond, - lambda: array_ops.reverse(image, [0]), - lambda: image) - return fix_image_flip_shape(image, result) + with ops.name_scope(None, 'random_flip_up_down', [image]) as scope: + image = ops.convert_to_tensor(image, name='image') + image = control_flow_ops.with_dependencies( + _Check3DImage(image, require_static=False), image) + uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) + mirror_cond = math_ops.less(uniform_random, .5) + result = control_flow_ops.cond(mirror_cond, + lambda: array_ops.reverse(image, [0]), + lambda: image, + name=scope) + return fix_image_flip_shape(image, result) def random_flip_left_right(image, seed=None): @@ -248,15 +250,19 @@ def random_flip_left_right(image, seed=None): Raises: ValueError: if the shape of `image` not supported. 
""" - image = ops.convert_to_tensor(image, name='image') - image = control_flow_ops.with_dependencies( - _Check3DImage(image, require_static=False), image) - uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) - mirror_cond = math_ops.less(uniform_random, .5) - result = control_flow_ops.cond(mirror_cond, - lambda: array_ops.reverse(image, [1]), - lambda: image) - return fix_image_flip_shape(image, result) + with ops.name_scope(None, 'random_flip_left_right', [image]) as scope: + image = ops.convert_to_tensor(image, name='image') + image = control_flow_ops.with_dependencies( + _Check3DImage(image, require_static=False), image) + uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) + mirror_cond = math_ops.less(uniform_random, .5) + result = control_flow_ops.cond(mirror_cond, + lambda: array_ops.reverse(image, [1]), + lambda: image, + name=scope) + print('scope: ' + scope) + print('result name: ' + result.name) + return fix_image_flip_shape(image, result) def flip_left_right(image): @@ -276,10 +282,12 @@ def flip_left_right(image): Raises: ValueError: if the shape of `image` not supported. """ - image = ops.convert_to_tensor(image, name='image') - image = control_flow_ops.with_dependencies( - _Check3DImage(image, require_static=False), image) - return fix_image_flip_shape(image, array_ops.reverse(image, [1])) + with ops.name_scope(None, 'flip_left_right', [image]) as scope: + image = ops.convert_to_tensor(image, name='image') + image = control_flow_ops.with_dependencies( + _Check3DImage(image, require_static=False), image) + return fix_image_flip_shape(image, + array_ops.reverse(image, [1], name=scope)) def flip_up_down(image): @@ -299,10 +307,12 @@ def flip_up_down(image): Raises: ValueError: if the shape of `image` not supported. """ - image = ops.convert_to_tensor(image, name='image') - image = control_flow_ops.with_dependencies( - _Check3DImage(image, require_static=False), image) - return fix_image_flip_shape(image, array_ops.reverse(image, [0])) + with ops.name_scope(None, 'flip_up_down', [image]) as scope: + image = ops.convert_to_tensor(image, name='image') + image = control_flow_ops.with_dependencies( + _Check3DImage(image, require_static=False), image) + return fix_image_flip_shape(image, + array_ops.reverse(image, [0], name=scope)) def rot90(image, k=1, name=None): @@ -356,10 +366,11 @@ def transpose_image(image): Raises: ValueError: if the shape of `image` not supported. 
""" - image = ops.convert_to_tensor(image, name='image') - image = control_flow_ops.with_dependencies( - _Check3DImage(image, require_static=False), image) - return array_ops.transpose(image, [1, 0, 2], name='transpose_image') + with ops.name_scope(None, 'transpose_image', [image]) as scope: + image = ops.convert_to_tensor(image, name='image') + image = control_flow_ops.with_dependencies( + _Check3DImage(image, require_static=False), image) + return array_ops.transpose(image, [1, 0, 2], name=scope) def central_crop(image, central_fraction): @@ -386,33 +397,34 @@ def central_crop(image, central_fraction): Returns: 3-D float Tensor """ - image = ops.convert_to_tensor(image, name='image') - if central_fraction <= 0.0 or central_fraction > 1.0: - raise ValueError('central_fraction must be within (0, 1]') - if central_fraction == 1.0: + with ops.name_scope(None, 'central_crop', [image]): + image = ops.convert_to_tensor(image, name='image') + if central_fraction <= 0.0 or central_fraction > 1.0: + raise ValueError('central_fraction must be within (0, 1]') + if central_fraction == 1.0: + return image + + image = control_flow_ops.with_dependencies( + _Check3DImage(image, require_static=False), image) + + img_shape = array_ops.shape(image) + depth = image.get_shape()[2] + img_h = math_ops.to_double(img_shape[0]) + img_w = math_ops.to_double(img_shape[1]) + bbox_h_start = math_ops.to_int32((img_h - img_h * central_fraction) / 2) + bbox_w_start = math_ops.to_int32((img_w - img_w * central_fraction) / 2) + + bbox_h_size = img_shape[0] - bbox_h_start * 2 + bbox_w_size = img_shape[1] - bbox_w_start * 2 + + bbox_begin = array_ops.stack([bbox_h_start, bbox_w_start, 0]) + bbox_size = array_ops.stack([bbox_h_size, bbox_w_size, -1]) + image = array_ops.slice(image, bbox_begin, bbox_size) + + # The first two dimensions are dynamic and unknown. + image.set_shape([None, None, depth]) return image - image = control_flow_ops.with_dependencies( - _Check3DImage(image, require_static=False), image) - - img_shape = array_ops.shape(image) - depth = image.get_shape()[2] - img_h = math_ops.to_double(img_shape[0]) - img_w = math_ops.to_double(img_shape[1]) - bbox_h_start = math_ops.to_int32((img_h - img_h * central_fraction) / 2) - bbox_w_start = math_ops.to_int32((img_w - img_w * central_fraction) / 2) - - bbox_h_size = img_shape[0] - bbox_h_start * 2 - bbox_w_size = img_shape[1] - bbox_w_start * 2 - - bbox_begin = array_ops.stack([bbox_h_start, bbox_w_start, 0]) - bbox_size = array_ops.stack([bbox_h_size, bbox_w_size, -1]) - image = array_ops.slice(image, bbox_begin, bbox_size) - - # The first two dimensions are dynamic and unknown. - image.set_shape([None, None, depth]) - return image - def pad_to_bounding_box(image, offset_height, offset_width, target_height, target_width): @@ -444,53 +456,54 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height, `target_*` arguments, or either `offset_height` or `offset_width` is negative. 
""" - image = ops.convert_to_tensor(image, name='image') + with ops.name_scope(None, 'pad_to_bounding_box', [image]): + image = ops.convert_to_tensor(image, name='image') - is_batch = True - image_shape = image.get_shape() - if image_shape.ndims == 3: - is_batch = False - image = array_ops.expand_dims(image, 0) - elif image_shape.ndims is None: - is_batch = False - image = array_ops.expand_dims(image, 0) - image.set_shape([None] * 4) - elif image_shape.ndims != 4: - raise ValueError('\'image\' must have either 3 or 4 dimensions.') + is_batch = True + image_shape = image.get_shape() + if image_shape.ndims == 3: + is_batch = False + image = array_ops.expand_dims(image, 0) + elif image_shape.ndims is None: + is_batch = False + image = array_ops.expand_dims(image, 0) + image.set_shape([None] * 4) + elif image_shape.ndims != 4: + raise ValueError('\'image\' must have either 3 or 4 dimensions.') - assert_ops = _CheckAtLeast3DImage(image, require_static=False) + assert_ops = _CheckAtLeast3DImage(image, require_static=False) + batch, height, width, depth = _ImageDimensions(image, rank=4) - batch, height, width, depth = _ImageDimensions(image, rank=4) + after_padding_width = target_width - offset_width - width - after_padding_width = target_width - offset_width - width - after_padding_height = target_height - offset_height - height + after_padding_height = target_height - offset_height - height - assert_ops += _assert(offset_height >= 0, ValueError, - 'offset_height must be >= 0') - assert_ops += _assert(offset_width >= 0, ValueError, - 'offset_width must be >= 0') - assert_ops += _assert(after_padding_width >= 0, ValueError, - 'width must be <= target - offset') - assert_ops += _assert(after_padding_height >= 0, ValueError, - 'height must be <= target - offset') - image = control_flow_ops.with_dependencies(assert_ops, image) + assert_ops += _assert(offset_height >= 0, ValueError, + 'offset_height must be >= 0') + assert_ops += _assert(offset_width >= 0, ValueError, + 'offset_width must be >= 0') + assert_ops += _assert(after_padding_width >= 0, ValueError, + 'width must be <= target - offset') + assert_ops += _assert(after_padding_height >= 0, ValueError, + 'height must be <= target - offset') + image = control_flow_ops.with_dependencies(assert_ops, image) - # Do not pad on the depth dimensions. - paddings = array_ops.reshape( - array_ops.stack([ - 0, 0, offset_height, after_padding_height, offset_width, - after_padding_width, 0, 0 - ]), [4, 2]) - padded = array_ops.pad(image, paddings) + # Do not pad on the depth dimensions. + paddings = array_ops.reshape( + array_ops.stack([ + 0, 0, offset_height, after_padding_height, offset_width, + after_padding_width, 0, 0 + ]), [4, 2]) + padded = array_ops.pad(image, paddings) - padded_shape = [None if _is_tensor(i) else i - for i in [batch, target_height, target_width, depth]] - padded.set_shape(padded_shape) + padded_shape = [None if _is_tensor(i) else i + for i in [batch, target_height, target_width, depth]] + padded.set_shape(padded_shape) - if not is_batch: - padded = array_ops.squeeze(padded, squeeze_dims=[0]) + if not is_batch: + padded = array_ops.squeeze(padded, squeeze_dims=[0]) - return padded + return padded def crop_to_bounding_box(image, offset_height, offset_width, target_height, @@ -523,51 +536,52 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height, `target_*` arguments, or either `offset_height` or `offset_width` is negative, or either `target_height` or `target_width` is not positive. 
""" - image = ops.convert_to_tensor(image, name='image') + with ops.name_scope(None, 'crop_to_bounding_box', [image]): + image = ops.convert_to_tensor(image, name='image') - is_batch = True - image_shape = image.get_shape() - if image_shape.ndims == 3: - is_batch = False - image = array_ops.expand_dims(image, 0) - elif image_shape.ndims is None: - is_batch = False - image = array_ops.expand_dims(image, 0) - image.set_shape([None] * 4) - elif image_shape.ndims != 4: - raise ValueError('\'image\' must have either 3 or 4 dimensions.') + is_batch = True + image_shape = image.get_shape() + if image_shape.ndims == 3: + is_batch = False + image = array_ops.expand_dims(image, 0) + elif image_shape.ndims is None: + is_batch = False + image = array_ops.expand_dims(image, 0) + image.set_shape([None] * 4) + elif image_shape.ndims != 4: + raise ValueError('\'image\' must have either 3 or 4 dimensions.') - assert_ops = _CheckAtLeast3DImage(image, require_static=False) + assert_ops = _CheckAtLeast3DImage(image, require_static=False) - batch, height, width, depth = _ImageDimensions(image, rank=4) + batch, height, width, depth = _ImageDimensions(image, rank=4) - assert_ops += _assert(offset_width >= 0, ValueError, - 'offset_width must be >= 0.') - assert_ops += _assert(offset_height >= 0, ValueError, - 'offset_height must be >= 0.') - assert_ops += _assert(target_width > 0, ValueError, - 'target_width must be > 0.') - assert_ops += _assert(target_height > 0, ValueError, - 'target_height must be > 0.') - assert_ops += _assert(width >= (target_width + offset_width), ValueError, - 'width must be >= target + offset.') - assert_ops += _assert(height >= (target_height + offset_height), ValueError, - 'height must be >= target + offset.') - image = control_flow_ops.with_dependencies(assert_ops, image) + assert_ops += _assert(offset_width >= 0, ValueError, + 'offset_width must be >= 0.') + assert_ops += _assert(offset_height >= 0, ValueError, + 'offset_height must be >= 0.') + assert_ops += _assert(target_width > 0, ValueError, + 'target_width must be > 0.') + assert_ops += _assert(target_height > 0, ValueError, + 'target_height must be > 0.') + assert_ops += _assert(width >= (target_width + offset_width), ValueError, + 'width must be >= target + offset.') + assert_ops += _assert(height >= (target_height + offset_height), ValueError, + 'height must be >= target + offset.') + image = control_flow_ops.with_dependencies(assert_ops, image) - cropped = array_ops.slice( - image, - array_ops.stack([0, offset_height, offset_width, 0]), - array_ops.stack([-1, target_height, target_width, -1])) + cropped = array_ops.slice( + image, + array_ops.stack([0, offset_height, offset_width, 0]), + array_ops.stack([-1, target_height, target_width, -1])) - cropped_shape = [None if _is_tensor(i) else i - for i in [batch, target_height, target_width, depth]] - cropped.set_shape(cropped_shape) + cropped_shape = [None if _is_tensor(i) else i + for i in [batch, target_height, target_width, depth]] + cropped.set_shape(cropped_shape) - if not is_batch: - cropped = array_ops.squeeze(cropped, squeeze_dims=[0]) + if not is_batch: + cropped = array_ops.squeeze(cropped, squeeze_dims=[0]) - return cropped + return cropped def resize_image_with_crop_or_pad(image, target_height, target_width): @@ -598,88 +612,90 @@ def resize_image_with_crop_or_pad(image, target_height, target_width): If `images` was 3-D, a 3-D float Tensor of shape `[new_height, new_width, channels]`. 
""" - image = ops.convert_to_tensor(image, name='image') - image_shape = image.get_shape() - is_batch = True - if image_shape.ndims == 3: - is_batch = False - image = array_ops.expand_dims(image, 0) - elif image_shape.ndims is None: - is_batch = False - image = array_ops.expand_dims(image, 0) - image.set_shape([None] * 4) - elif image_shape.ndims != 4: - raise ValueError('\'image\' must have either 3 or 4 dimensions.') + with ops.name_scope(None, 'resize_image_with_crop_or_pad', [image]): + image = ops.convert_to_tensor(image, name='image') + image_shape = image.get_shape() + is_batch = True + if image_shape.ndims == 3: + is_batch = False + image = array_ops.expand_dims(image, 0) + elif image_shape.ndims is None: + is_batch = False + image = array_ops.expand_dims(image, 0) + image.set_shape([None] * 4) + elif image_shape.ndims != 4: + raise ValueError('\'image\' must have either 3 or 4 dimensions.') - assert_ops = _CheckAtLeast3DImage(image, require_static=False) - assert_ops += _assert(target_width > 0, ValueError, - 'target_width must be > 0.') - assert_ops += _assert(target_height > 0, ValueError, - 'target_height must be > 0.') + assert_ops = _CheckAtLeast3DImage(image, require_static=False) + assert_ops += _assert(target_width > 0, ValueError, + 'target_width must be > 0.') + assert_ops += _assert(target_height > 0, ValueError, + 'target_height must be > 0.') - image = control_flow_ops.with_dependencies(assert_ops, image) - # `crop_to_bounding_box` and `pad_to_bounding_box` have their own checks. - # Make sure our checks come first, so that error messages are clearer. - if _is_tensor(target_height): - target_height = control_flow_ops.with_dependencies( - assert_ops, target_height) - if _is_tensor(target_width): - target_width = control_flow_ops.with_dependencies(assert_ops, target_width) + image = control_flow_ops.with_dependencies(assert_ops, image) + # `crop_to_bounding_box` and `pad_to_bounding_box` have their own checks. + # Make sure our checks come first, so that error messages are clearer. 
+ if _is_tensor(target_height): + target_height = control_flow_ops.with_dependencies( + assert_ops, target_height) + if _is_tensor(target_width): + target_width = control_flow_ops.with_dependencies( + assert_ops, target_width) - def max_(x, y): - if _is_tensor(x) or _is_tensor(y): - return math_ops.maximum(x, y) - else: - return max(x, y) + def max_(x, y): + if _is_tensor(x) or _is_tensor(y): + return math_ops.maximum(x, y) + else: + return max(x, y) - def min_(x, y): - if _is_tensor(x) or _is_tensor(y): - return math_ops.minimum(x, y) - else: - return min(x, y) + def min_(x, y): + if _is_tensor(x) or _is_tensor(y): + return math_ops.minimum(x, y) + else: + return min(x, y) - def equal_(x, y): - if _is_tensor(x) or _is_tensor(y): - return math_ops.equal(x, y) - else: - return x == y + def equal_(x, y): + if _is_tensor(x) or _is_tensor(y): + return math_ops.equal(x, y) + else: + return x == y - _, height, width, _ = _ImageDimensions(image, rank=4) - width_diff = target_width - width - offset_crop_width = max_(-width_diff // 2, 0) - offset_pad_width = max_(width_diff // 2, 0) + _, height, width, _ = _ImageDimensions(image, rank=4) + width_diff = target_width - width + offset_crop_width = max_(-width_diff // 2, 0) + offset_pad_width = max_(width_diff // 2, 0) - height_diff = target_height - height - offset_crop_height = max_(-height_diff // 2, 0) - offset_pad_height = max_(height_diff // 2, 0) + height_diff = target_height - height + offset_crop_height = max_(-height_diff // 2, 0) + offset_pad_height = max_(height_diff // 2, 0) - # Maybe crop if needed. - cropped = crop_to_bounding_box(image, offset_crop_height, offset_crop_width, - min_(target_height, height), - min_(target_width, width)) + # Maybe crop if needed. + cropped = crop_to_bounding_box(image, offset_crop_height, offset_crop_width, + min_(target_height, height), + min_(target_width, width)) - # Maybe pad if needed. - resized = pad_to_bounding_box(cropped, offset_pad_height, offset_pad_width, - target_height, target_width) + # Maybe pad if needed. + resized = pad_to_bounding_box(cropped, offset_pad_height, offset_pad_width, + target_height, target_width) - # In theory all the checks below are redundant. - if resized.get_shape().ndims is None: - raise ValueError('resized contains no shape.') + # In theory all the checks below are redundant. + if resized.get_shape().ndims is None: + raise ValueError('resized contains no shape.') - _, resized_height, resized_width, _ = _ImageDimensions(resized, rank=4) + _, resized_height, resized_width, _ = _ImageDimensions(resized, rank=4) - assert_ops = [] - assert_ops += _assert(equal_(resized_height, target_height), ValueError, - 'resized height is not correct.') - assert_ops += _assert(equal_(resized_width, target_width), ValueError, - 'resized width is not correct.') + assert_ops = [] + assert_ops += _assert(equal_(resized_height, target_height), ValueError, + 'resized height is not correct.') + assert_ops += _assert(equal_(resized_width, target_width), ValueError, + 'resized width is not correct.') - resized = control_flow_ops.with_dependencies(assert_ops, resized) + resized = control_flow_ops.with_dependencies(assert_ops, resized) - if not is_batch: - resized = array_ops.squeeze(resized, squeeze_dims=[0]) + if not is_batch: + resized = array_ops.squeeze(resized, squeeze_dims=[0]) - return resized + return resized class ResizeMethod(object): @@ -736,66 +752,68 @@ def resize_images(images, If `images` was 3-D, a 3-D float Tensor of shape `[new_height, new_width, channels]`. 
""" - images = ops.convert_to_tensor(images, name='images') - if images.get_shape().ndims is None: - raise ValueError('\'images\' contains no shape.') - # TODO(shlens): Migrate this functionality to the underlying Op's. - is_batch = True - if images.get_shape().ndims == 3: - is_batch = False - images = array_ops.expand_dims(images, 0) - elif images.get_shape().ndims != 4: - raise ValueError('\'images\' must have either 3 or 4 dimensions.') + with ops.name_scope(None, 'resize_images', [images, size]): + images = ops.convert_to_tensor(images, name='images') + if images.get_shape().ndims is None: + raise ValueError('\'images\' contains no shape.') + # TODO(shlens): Migrate this functionality to the underlying Op's. + is_batch = True + if images.get_shape().ndims == 3: + is_batch = False + images = array_ops.expand_dims(images, 0) + elif images.get_shape().ndims != 4: + raise ValueError('\'images\' must have either 3 or 4 dimensions.') - _, height, width, _ = images.get_shape().as_list() + _, height, width, _ = images.get_shape().as_list() - try: - size = ops.convert_to_tensor(size, dtypes.int32, name='size') - except (TypeError, ValueError): - raise ValueError('\'size\' must be a 1-D int32 Tensor') - if not size.get_shape().is_compatible_with([2]): - raise ValueError('\'size\' must be a 1-D Tensor of 2 elements: ' - 'new_height, new_width') - size_const_as_shape = tensor_util.constant_value_as_shape(size) - new_height_const = size_const_as_shape[0].value - new_width_const = size_const_as_shape[1].value + try: + size = ops.convert_to_tensor(size, dtypes.int32, name='size') + except (TypeError, ValueError): + raise ValueError('\'size\' must be a 1-D int32 Tensor') + if not size.get_shape().is_compatible_with([2]): + raise ValueError('\'size\' must be a 1-D Tensor of 2 elements: ' + 'new_height, new_width') + size_const_as_shape = tensor_util.constant_value_as_shape(size) + new_height_const = size_const_as_shape[0].value + new_width_const = size_const_as_shape[1].value + + # If we can determine that the height and width will be unmodified by this + # transformation, we avoid performing the resize. + if all(x is not None + for x in [new_width_const, width, new_height_const, height]) and ( + width == new_width_const and height == new_height_const): + if not is_batch: + images = array_ops.squeeze(images, squeeze_dims=[0]) + return images + + if method == ResizeMethod.BILINEAR: + images = gen_image_ops.resize_bilinear(images, + size, + align_corners=align_corners) + elif method == ResizeMethod.NEAREST_NEIGHBOR: + images = gen_image_ops.resize_nearest_neighbor(images, + size, + align_corners= + align_corners) + elif method == ResizeMethod.BICUBIC: + images = gen_image_ops.resize_bicubic(images, + size, + align_corners=align_corners) + elif method == ResizeMethod.AREA: + images = gen_image_ops.resize_area(images, + size, + align_corners=align_corners) + else: + raise ValueError('Resize method is not implemented.') + + # NOTE(mrry): The shape functions for the resize ops cannot unpack + # the packed values in `new_size`, so set the shape here. + images.set_shape([None, new_height_const, new_width_const, None]) - # If we can determine that the height and width will be unmodified by this - # transformation, we avoid performing the resize. 
- if all(x is not None - for x in [new_width_const, width, new_height_const, height]) and ( - width == new_width_const and height == new_height_const): if not is_batch: images = array_ops.squeeze(images, squeeze_dims=[0]) return images - if method == ResizeMethod.BILINEAR: - images = gen_image_ops.resize_bilinear(images, - size, - align_corners=align_corners) - elif method == ResizeMethod.NEAREST_NEIGHBOR: - images = gen_image_ops.resize_nearest_neighbor(images, - size, - align_corners=align_corners) - elif method == ResizeMethod.BICUBIC: - images = gen_image_ops.resize_bicubic(images, - size, - align_corners=align_corners) - elif method == ResizeMethod.AREA: - images = gen_image_ops.resize_area(images, - size, - align_corners=align_corners) - else: - raise ValueError('Resize method is not implemented.') - - # NOTE(mrry): The shape functions for the resize ops cannot unpack - # the packed values in `new_size`, so set the shape here. - images.set_shape([None, new_height_const, new_width_const, None]) - - if not is_batch: - images = array_ops.squeeze(images, squeeze_dims=[0]) - return images - def per_image_standardization(image): """Linearly scales `image` to have zero mean and unit norm. @@ -816,27 +834,28 @@ def per_image_standardization(image): Raises: ValueError: if the shape of 'image' is incompatible with this function. """ - image = ops.convert_to_tensor(image, name='image') - image = control_flow_ops.with_dependencies( - _Check3DImage(image, require_static=False), image) - num_pixels = math_ops.reduce_prod(array_ops.shape(image)) + with ops.name_scope(None, 'per_image_standardization', [image]) as scope: + image = ops.convert_to_tensor(image, name='image') + image = control_flow_ops.with_dependencies( + _Check3DImage(image, require_static=False), image) + num_pixels = math_ops.reduce_prod(array_ops.shape(image)) - image = math_ops.cast(image, dtype=dtypes.float32) - image_mean = math_ops.reduce_mean(image) + image = math_ops.cast(image, dtype=dtypes.float32) + image_mean = math_ops.reduce_mean(image) - variance = (math_ops.reduce_mean(math_ops.square(image)) - - math_ops.square(image_mean)) - variance = gen_nn_ops.relu(variance) - stddev = math_ops.sqrt(variance) + variance = (math_ops.reduce_mean(math_ops.square(image)) - + math_ops.square(image_mean)) + variance = gen_nn_ops.relu(variance) + stddev = math_ops.sqrt(variance) - # Apply a minimum normalization that protects us against uniform images. - min_stddev = math_ops.rsqrt(math_ops.cast(num_pixels, dtypes.float32)) - pixel_value_scale = math_ops.maximum(stddev, min_stddev) - pixel_value_offset = image_mean + # Apply a minimum normalization that protects us against uniform images. 
+ min_stddev = math_ops.rsqrt(math_ops.cast(num_pixels, dtypes.float32)) + pixel_value_scale = math_ops.maximum(stddev, min_stddev) + pixel_value_offset = image_mean - image = math_ops.subtract(image, pixel_value_offset) - image = math_ops.div(image, pixel_value_scale) - return image + image = math_ops.subtract(image, pixel_value_offset) + image = math_ops.div(image, pixel_value_scale, name=scope) + return image def random_brightness(image, max_delta, seed=None): diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py index 4af9bd2a009..3d73b772914 100644 --- a/tensorflow/python/ops/image_ops_test.py +++ b/tensorflow/python/ops/image_ops_test.py @@ -857,6 +857,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) y = image_ops.flip_left_right(x_tf) + self.assertTrue(y.op.name.startswith('flip_left_right')) y_tf = y.eval() self.assertAllEqual(y_tf, y_np) @@ -867,6 +868,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) y = image_ops.random_flip_left_right(x_tf) + self.assertTrue(y.op.name.startswith('random_flip_left_right')) count_flipped = 0 count_unflipped = 0 @@ -897,6 +899,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) y = image_ops.flip_up_down(x_tf) + self.assertTrue(y.op.name.startswith('flip_up_down')) y_tf = y.eval() self.assertAllEqual(y_tf, y_np) @@ -907,6 +910,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) y = image_ops.random_flip_up_down(x_tf) + self.assertTrue(y.op.name.startswith('random_flip_up_down')) count_flipped = 0 count_unflipped = 0 for _ in range(50): @@ -936,6 +940,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) y = image_ops.transpose_image(x_tf) + self.assertTrue(y.op.name.startswith('transpose_image')) y_tf = y.eval() self.assertAllEqual(y_tf, y_np) @@ -1160,6 +1165,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x = constant_op.constant(x_np, shape=x_shape) y = image_ops.per_image_standardization(x) + self.assertTrue(y.op.name.startswith('per_image_standardization')) y_tf = y.eval() self.assertAllClose(y_tf, y_np, atol=1e-4) @@ -1341,6 +1347,11 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase): for params, err_msg in test_config: self._assertRaises(x, x_shape, *params, err_msg=err_msg) + def testNameScope(self): + image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3]) + y = image_ops.crop_to_bounding_box(image, 0, 0, 55, 66) + self.assertTrue(y.name.startswith('crop_to_bounding_box')) + class CentralCropTest(test_util.TensorFlowTestCase): @@ -1417,6 +1428,13 @@ class CentralCropTest(test_util.TensorFlowTestCase): with self.assertRaises(ValueError): _ = image_ops.central_crop(x, 1.01) + def testNameScope(self): + x_shape = [13, 9, 3] + x_np = np.ones(x_shape, dtype=np.float32) + with self.test_session(use_gpu=True): + y = image_ops.central_crop(x_np, 1.0) + self.assertTrue(y.op.name.startswith('central_crop')) + class PadToBoundingBoxTest(test_util.TensorFlowTestCase): @@ -1620,6 +1638,11 @@ class 
PadToBoundingBoxTest(test_util.TensorFlowTestCase): for config_item in test_config: self._assertRaises(x, x_shape, *config_item) + def testNameScope(self): + image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3]) + y = image_ops.pad_to_bounding_box(image, 0, 0, 55, 66) + self.assertTrue(y.op.name.startswith('pad_to_bounding_box')) + class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase): @@ -2224,6 +2247,13 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): self._assertShapeInference([59, 60, None], [55, 66], [55, 66, None]) self._assertShapeInference([None, None, None], [55, 66], [55, 66, None]) + def testNameScope(self): + img_shape = [1, 3, 2, 1] + with self.test_session(use_gpu=True): + single_image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3]) + y = image_ops.resize_images(single_image, [55, 66]) + self.assertTrue(y.op.name.startswith('resize_images')) + class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase): @@ -2499,6 +2529,11 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase): self._assertRaises(x, x_shape, target_height, target_width, "target_width must be > 0") + def testNameScope(self): + image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3]) + y = image_ops.resize_image_with_crop_or_pad(image, 55, 66) + self.assertTrue(y.op.name.startswith('resize_image_with_crop_or_pad')) + def _SimpleColorRamp(): """Build a simple color ramp RGB image.""" diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py index ff7137d492c..0593ed2cfa6 100644 --- a/tensorflow/python/ops/nn_fused_batchnorm_test.py +++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py @@ -171,6 +171,10 @@ class BatchNormalizationTest(test.TestCase): x, x_shape, y, y_shape, delta=1e-3, x_init_value=x_init_val) _, numerical_grad = gradient_checker.compute_gradient( x32, x_shape, y32, y_shape, delta=1e-3, x_init_value=x32_init_val) + + # If grad is empty, no error. 
+ if theoretical_grad.size == 0 and numerical_grad.size == 0: + return 0 return np.fabs(theoretical_grad - numerical_grad).max() def _test_gradient(self, @@ -371,6 +375,17 @@ class BatchNormalizationTest(test.TestCase): self._test_inference( x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC') + def testInferenceShape5(self): + x_shape = [0, 131, 127, 6] + for dtype in [np.float16, np.float32]: + if test.is_gpu_available(cuda_only=True): + self._test_inference( + x_shape, dtype, [131], np.float32, use_gpu=True, data_format='NCHW') + self._test_inference( + x_shape, dtype, [6], np.float32, use_gpu=True, data_format='NHWC') + self._test_inference( + x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC') + def testTrainingShape1(self): x_shape = [1, 1, 6, 1] for dtype in [np.float16, np.float32]: @@ -409,6 +424,17 @@ class BatchNormalizationTest(test.TestCase): self._test_training( x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC') + def testTrainingShape5(self): + x_shape = [0, 131, 127, 6] + for dtype in [np.float16, np.float32]: + if test.is_gpu_available(cuda_only=True): + self._test_training( + x_shape, dtype, [131], np.float32, use_gpu=True, data_format='NCHW') + self._test_training( + x_shape, dtype, [6], np.float32, use_gpu=True, data_format='NHWC') + self._test_training( + x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC') + def testBatchNormGradShape1(self): for is_training in [True, False]: x_shape = [1, 1, 6, 1] @@ -496,6 +522,33 @@ class BatchNormalizationTest(test.TestCase): data_format='NHWC', is_training=is_training) + def testBatchNormGradShape5(self): + for is_training in [True, False]: + x_shape = [0, 7, 11, 4] + for dtype in [np.float16, np.float32]: + if test.is_gpu_available(cuda_only=True): + self._test_gradient( + x_shape, + dtype, [7], + np.float32, + use_gpu=True, + data_format='NCHW', + is_training=is_training) + self._test_gradient( + x_shape, + dtype, [4], + np.float32, + use_gpu=True, + data_format='NHWC', + is_training=is_training) + self._test_gradient( + x_shape, + dtype, [4], + np.float32, + use_gpu=False, + data_format='NHWC', + is_training=is_training) + def _testBatchNormGradGrad(self, config): shape = config['shape'] err_tolerance = config['err_tolerance'] diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py index 382c7a80a67..3ee49650e01 100644 --- a/tensorflow/python/training/learning_rate_decay.py +++ b/tensorflow/python/training/learning_rate_decay.py @@ -120,7 +120,7 @@ def piecewise_constant(x, boundaries, values, name=None): `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`. boundaries: A list of `Tensor`s or `int`s or `float`s with strictly increasing entries, and with all elements having the same type as `x`. - values: A list of `Tensor`s or float`s or `int`s that specifies the values + values: A list of `Tensor`s or `float`s or `int`s that specifies the values for the intervals defined by `boundaries`. It should have one more element than `boundaries`, and all elements should have the same type. name: A string. Optional name of the operation. Defaults to @@ -424,11 +424,12 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate, return math_ops.div(learning_rate, denom, name=name) -def cosine_decay(learning_rate, global_step, decay_steps, name=None): +def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, + name=None): """Applies cosine decay to the learning rate. 
See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent - with Warm Restarts. + with Warm Restarts. https://arxiv.org/abs/1608.03983 When training a model, it is often recommended to lower the learning rate as the training progresses. This function applies a cosine decay function @@ -439,7 +440,8 @@ def cosine_decay(learning_rate, global_step, decay_steps, name=None): The function returns the decayed learning rate. It is computed as: ```python global_step = min(global_step, decay_steps) - decayed = 0.5 * (1 + cos(pi * global_step / decay_steps)) + cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps)) + decayed = (1 - alpha) * cosine_decay + alpha decayed_learning_rate = learning_rate * decayed ``` @@ -456,6 +458,8 @@ def cosine_decay(learning_rate, global_step, decay_steps, name=None): Global step to use for the decay computation. decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number of steps to decay over. + alpha: A scalar `float32` or `float64` Tensor or a Python number. + Minimum learning rate value as a fraction of learning_rate. name: String. Optional name of the operation. Defaults to 'CosineDecay'. Returns: A scalar `Tensor` of the same type as `learning_rate`. The decayed @@ -476,7 +480,96 @@ def cosine_decay(learning_rate, global_step, decay_steps, name=None): cosine_decayed = 0.5 * ( 1.0 + math_ops.cos(constant_op.constant(math.pi) * completed_fraction)) - return math_ops.multiply(learning_rate, cosine_decayed) + decayed = (1 - alpha) * cosine_decayed + alpha + return math_ops.multiply(learning_rate, decayed) + + +def cosine_decay_restarts(learning_rate, global_step, first_decay_steps, + t_mul=2.0, m_mul=1.0, alpha=0.0, name=None): + """Applies cosine decay with restarts to the learning rate. + + See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent + with Warm Restarts. https://arxiv.org/abs/1608.03983 + + When training a model, it is often recommended to lower the learning rate as + the training progresses. This function applies a cosine decay function with + restarts to a provided initial learning rate. It requires a `global_step` + value to compute the decayed learning rate. You can just pass a TensorFlow + variable that you increment at each training step. + + The function returns the decayed learning rate while taking into account + possible warm restarts. The learning rate multiplier first decays + from 1 to `alpha` for `first_decay_steps` steps. Then, a warm + restart is performed. Each new warm restart runs for `t_mul` times more steps + and with `m_mul` times smaller initial learning rate. + + Example usage: + ```python + first_decay_steps = 1000 + lr_decayed = cosine_decay_restarts(learning_rate, global_step, + first_decay_steps) + ``` + + Args: + learning_rate: A scalar `float32` or `float64` Tensor or a Python number. + The initial learning rate. + global_step: A scalar `int32` or `int64` `Tensor` or a Python number. + Global step to use for the decay computation. + first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. + Number of steps to decay over. + t_mul: A scalar `float32` or `float64` `Tensor` or a Python number. + Used to derive the number of iterations in the i-th period + m_mul: A scalar `float32` or `float64` `Tensor` or a Python number. + Used to derive the initial learning rate of the i-th period: + alpha: A scalar `float32` or `float64` Tensor or a Python number. + Minimum learning rate value as a fraction of the learning_rate. + name: String. 
Optional name of the operation. Defaults to 'SGDRDecay'. + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + Raises: + ValueError: if `global_step` is not supplied. + """ + if global_step is None: + raise ValueError("cosine decay restarts requires global_step") + with ops.name_scope(name, "SGDRDecay", + [learning_rate, global_step]) as name: + learning_rate = ops.convert_to_tensor(learning_rate, + name="initial_learning_rate") + dtype = learning_rate.dtype + global_step = math_ops.cast(global_step, dtype) + first_decay_steps = math_ops.cast(first_decay_steps, dtype) + alpha = math_ops.cast(alpha, dtype) + t_mul = math_ops.cast(t_mul, dtype) + m_mul = math_ops.cast(m_mul, dtype) + + completed_fraction = global_step / first_decay_steps + + def compute_step(completed_fraction, geometric=False): + if geometric: + i_restart = math_ops.floor(math_ops.log(1.0 - completed_fraction * ( + 1.0 - t_mul)) / math_ops.log(t_mul)) + + sum_r = (1.0 - t_mul ** i_restart) / (1.0 - t_mul) + completed_fraction = (completed_fraction - sum_r) / t_mul ** i_restart + + else: + i_restart = math_ops.floor(completed_fraction) + completed_fraction = completed_fraction - i_restart + + return i_restart, completed_fraction + + i_restart, completed_fraction = control_flow_ops.cond( + math_ops.equal(t_mul, 1.0), + lambda: compute_step(completed_fraction, geometric=False), + lambda: compute_step(completed_fraction, geometric=True)) + + m_fac = m_mul ** i_restart + cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos( + constant_op.constant(math.pi) * completed_fraction)) + decayed = (1 - alpha) * cosine_decayed + alpha + + return math_ops.multiply(learning_rate, decayed, name=name) def linear_cosine_decay(learning_rate, global_step, decay_steps, @@ -487,6 +580,10 @@ def linear_cosine_decay(learning_rate, global_step, decay_steps, See [Bello et al., ICML2017] Neural Optimizer Search with RL. https://arxiv.org/abs/1709.07417 + For the idea of warm starts here controlled by `num_periods`, + see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent + with Warm Restarts. https://arxiv.org/abs/1608.03983 + Note that linear cosine decay is more aggressive than cosine decay and larger initial learning rates can typically be used. @@ -563,6 +660,10 @@ def noisy_linear_cosine_decay(learning_rate, global_step, decay_steps, See [Bello et al., ICML2017] Neural Optimizer Search with RL. https://arxiv.org/abs/1709.07417 + For the idea of warm starts here controlled by `num_periods`, + see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent + with Warm Restarts. https://arxiv.org/abs/1608.03983 + Note that linear cosine decay is more aggressive than cosine decay and larger initial learning rates can typically be used. 
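(Editor's illustration, not part of the patch.) To make the new restart schedule concrete, the sketch below wires `tf.train.cosine_decay_restarts` (added above and exported through the updated `tensorflow.train.pbtxt` golden further down) into a plain gradient-descent training op. The tiny `loss` and the particular argument values are hypothetical; only the function name and its signature come from this patch.

```python
# Minimal sketch, assuming TF 1.x graph mode; `loss` is a stand-in model loss.
import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
loss = tf.reduce_sum(tf.square(tf.Variable([1.0, 2.0])))  # hypothetical loss

# The first cosine period lasts 1000 steps; each restart period is twice as
# long (t_mul=2.0), starts at half the previous peak (m_mul=0.5), and the
# decayed rate never falls below alpha * learning_rate = 0.001.
learning_rate = tf.train.cosine_decay_restarts(
    learning_rate=0.1,
    global_step=global_step,
    first_decay_steps=1000,
    t_mul=2.0,
    m_mul=0.5,
    alpha=0.01)

train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(
    loss, global_step=global_step)
```

The `np_cosine_decay_restarts` helper in the test file that follows computes the same multiplier in NumPy and can be used to check the values this schedule produces.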
diff --git a/tensorflow/python/training/learning_rate_decay_test.py b/tensorflow/python/training/learning_rate_decay_test.py index ff41d80940a..1ce8c156a0b 100644 --- a/tensorflow/python/training/learning_rate_decay_test.py +++ b/tensorflow/python/training/learning_rate_decay_test.py @@ -342,10 +342,11 @@ class InverseDecayTest(test_util.TensorFlowTestCase): class CosineDecayTest(test_util.TensorFlowTestCase): - def np_cosine_decay(self, step, decay_steps): + def np_cosine_decay(self, step, decay_steps, alpha=0.0): step = min(step, decay_steps) completed_fraction = step / decay_steps - return 0.5 * (1.0 + math.cos(math.pi * completed_fraction)) + decay = 0.5 * (1.0 + math.cos(math.pi * completed_fraction)) + return (1.0 - alpha) * decay + alpha def testDecay(self): num_training_steps = 1000 @@ -357,6 +358,77 @@ class CosineDecayTest(test_util.TensorFlowTestCase): expected = self.np_cosine_decay(step, num_training_steps) self.assertAllClose(decayed_lr.eval(), expected, 1e-6) + def testAlpha(self): + num_training_steps = 1000 + initial_lr = 1.0 + alpha = 0.1 + for step in range(0, 1500, 250): + with self.test_session(): + decayed_lr = learning_rate_decay.cosine_decay( + initial_lr, step, num_training_steps, alpha) + expected = self.np_cosine_decay(step, num_training_steps, alpha) + self.assertAllClose(decayed_lr.eval(), expected, 1e-6) + + +class CosineDecayRestartsTest(test_util.TensorFlowTestCase): + def np_cosine_decay_restarts(self, step, decay_steps, t_mul=2.0, m_mul=1.0, + alpha=0.0): + fac = 1.0 + while step >= decay_steps: + step = step - decay_steps + decay_steps *= t_mul + fac *= m_mul + + completed_fraction = step / decay_steps + decay = fac * 0.5 * (1.0 + math.cos(math.pi * completed_fraction)) + return (1.0 - alpha) * decay + alpha + + def testDecay(self): + num_training_steps = 1000 + initial_lr = 1.0 + for step in range(0, 1500, 250): + with self.test_session(): + decayed_lr = learning_rate_decay.cosine_decay_restarts( + initial_lr, step, num_training_steps) + expected = self.np_cosine_decay_restarts(step, num_training_steps) + self.assertAllClose(decayed_lr.eval(), expected, 1e-6) + + def testAlpha(self): + num_training_steps = 1000 + initial_lr = 1.0 + alpha = 0.1 + for step in range(0, 1500, 250): + with self.test_session(): + decayed_lr = learning_rate_decay.cosine_decay_restarts( + initial_lr, step, num_training_steps, alpha=alpha) + expected = self.np_cosine_decay_restarts(step, num_training_steps, + alpha=alpha) + self.assertAllClose(decayed_lr.eval(), expected, 1e-6) + + def testMMul(self): + num_training_steps = 1000 + initial_lr = 1.0 + m_mul = 0.9 + for step in range(0, 1500, 250): + with self.test_session(): + decayed_lr = learning_rate_decay.cosine_decay_restarts( + initial_lr, step, num_training_steps, m_mul=m_mul) + expected = self.np_cosine_decay_restarts(step, num_training_steps, + m_mul=m_mul) + self.assertAllClose(decayed_lr.eval(), expected, 1e-6) + + def testTMul(self): + num_training_steps = 1000 + initial_lr = 1.0 + t_mul = 1.0 + for step in range(0, 1500, 250): + with self.test_session(): + decayed_lr = learning_rate_decay.cosine_decay_restarts( + initial_lr, step, num_training_steps, t_mul=t_mul) + expected = self.np_cosine_decay_restarts(step, num_training_steps, + t_mul=t_mul) + self.assertAllClose(decayed_lr.eval(), expected, 1e-6) + class LinearCosineDecayTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py index fa02ad84cce..03811fa38dd 100644 --- 
a/tensorflow/python/training/training.py +++ b/tensorflow/python/training/training.py @@ -38,6 +38,7 @@ See the @{$python/train} guide. @@clip_by_global_norm @@global_norm @@cosine_decay +@@cosine_decay_restarts @@linear_cosine_decay @@noisy_linear_cosine_decay @@exponential_decay diff --git a/tensorflow/tools/api/golden/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.pbtxt index b2ef17b39e7..e49c719a334 100644 --- a/tensorflow/tools/api/golden/tensorflow.train.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.train.pbtxt @@ -266,7 +266,11 @@ tf_module { } member_method { name: "cosine_decay" - argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], " + } + member_method { + name: "cosine_decay_restarts" + argspec: "args=[\'learning_rate\', \'global_step\', \'first_decay_steps\', \'t_mul\', \'m_mul\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'2.0\', \'1.0\', \'0.0\', \'None\'], " } member_method { name: "create_global_step" diff --git a/tensorflow/tools/ci_build/Dockerfile.android b/tensorflow/tools/ci_build/Dockerfile.android index 99a69d7b43b..dcf077791a9 100644 --- a/tensorflow/tools/ci_build/Dockerfile.android +++ b/tensorflow/tools/ci_build/Dockerfile.android @@ -1,4 +1,4 @@ -FROM ubuntu:14.04 +FROM ubuntu:16.04 LABEL maintainer="Jan Prach " diff --git a/tensorflow/tools/ci_build/Dockerfile.cpu b/tensorflow/tools/ci_build/Dockerfile.cpu index 57a854a9df7..c61fda09af0 100644 --- a/tensorflow/tools/ci_build/Dockerfile.cpu +++ b/tensorflow/tools/ci_build/Dockerfile.cpu @@ -1,4 +1,4 @@ -FROM ubuntu:14.04 +FROM ubuntu:16.04 LABEL maintainer="Jan Prach " diff --git a/tensorflow/tools/ci_build/Dockerfile.cpu.mpi b/tensorflow/tools/ci_build/Dockerfile.cpu.mpi index 2bf7fd1d234..d9f5b7c0364 100644 --- a/tensorflow/tools/ci_build/Dockerfile.cpu.mpi +++ b/tensorflow/tools/ci_build/Dockerfile.cpu.mpi @@ -1,4 +1,4 @@ -FROM ubuntu:14.04 +FROM ubuntu:16.04 LABEL authors="Andrew Gibiansky , Joel Hestness " diff --git a/tensorflow/tools/ci_build/Dockerfile.hadoop b/tensorflow/tools/ci_build/Dockerfile.hadoop index 6010aedb339..d05dedafbe2 100644 --- a/tensorflow/tools/ci_build/Dockerfile.hadoop +++ b/tensorflow/tools/ci_build/Dockerfile.hadoop @@ -1,4 +1,4 @@ -FROM ubuntu:14.04 +FROM ubuntu:16.04 LABEL maintainer="Jonathan Hseu " diff --git a/tensorflow/tools/ci_build/Dockerfile.pi b/tensorflow/tools/ci_build/Dockerfile.pi index 75ef30d32b0..176f9f63219 100644 --- a/tensorflow/tools/ci_build/Dockerfile.pi +++ b/tensorflow/tools/ci_build/Dockerfile.pi @@ -1,4 +1,4 @@ -FROM ubuntu:14.04 +FROM ubuntu:16.04 LABEL maintainer="Jan Prach " diff --git a/tensorflow/tools/ci_build/Dockerfile.pi-python3 b/tensorflow/tools/ci_build/Dockerfile.pi-python3 index b1c648ba301..14ebb9069f9 100644 --- a/tensorflow/tools/ci_build/Dockerfile.pi-python3 +++ b/tensorflow/tools/ci_build/Dockerfile.pi-python3 @@ -1,4 +1,4 @@ -FROM ubuntu:14.04 +FROM ubuntu:16.04 LABEL maintainer="Jan Prach " diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh index 4021d794b6c..b728c878da0 100755 --- a/tensorflow/tools/ci_build/ci_sanity.sh +++ b/tensorflow/tools/ci_build/ci_sanity.sh @@ -111,9 +111,9 @@ do_pylint() { fi if [[ $1 == "PYTHON2" ]]; then - PYLINT_BIN="python /usr/local/lib/python2.7/dist-packages/pylint/lint.py" + 
PYLINT_BIN="python -m pylint" elif [[ $1 == "PYTHON3" ]]; then - PYLINT_BIN="python3 /usr/local/lib/python3.4/dist-packages/pylint/lint.py" + PYLINT_BIN="python3 -m pylint" else echo "Unrecognized python version (PYTHON2 | PYTHON3): $1" return 1 diff --git a/tensorflow/tools/ci_build/install/install_bazel.sh b/tensorflow/tools/ci_build/install/install_bazel.sh index 1454264a800..cf8737c2d8c 100755 --- a/tensorflow/tools/ci_build/install/install_bazel.sh +++ b/tensorflow/tools/ci_build/install/install_bazel.sh @@ -15,7 +15,7 @@ # ============================================================================== # Select bazel version. -BAZEL_VERSION="0.5.4" +BAZEL_VERSION="0.8.0" set +e local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}') diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh index 7a1630aff60..6e846ef8784 100755 --- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh @@ -22,9 +22,24 @@ # fkrull/deadsnakes is for Python3.6 add-apt-repository -y ppa:fkrull/deadsnakes + apt-get update +apt-get upgrade + +# Install python dep +apt-get install python-dev +# Install bz2 dep +apt-get install libbz2-dev +# Install curses dep +apt-get install libncurses5 libncurses5-dev +apt-get install libncursesw5 libncursesw5-dev +# Install readline dep +apt-get install libreadline6 libreadline6-dev +# Install sqlite3 dependencies +apt-get install libsqlite3-dev set -e + # Install Python 3.6 and dev library wget https://www.python.org/ftp/python/3.6.1/Python-3.6.1.tar.xz tar xvf Python-3.6.1.tar.xz @@ -63,6 +78,10 @@ pip3 install scikit-learn==0.18.1 # pandas required by `inflow` pip3 install pandas==0.19.2 +pip3 install gnureadline + +pip3 install bz2file + # Install recent-enough version of wheel for Python 3.6 wheel builds pip3 install wheel==0.29.0 diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD index 51e4c6cef38..4f90c4d9406 100644 --- a/tensorflow/tools/compatibility/BUILD +++ b/tensorflow/tools/compatibility/BUILD @@ -10,10 +10,7 @@ load( py_binary( name = "tf_upgrade", - srcs = [ - "ast_edits.py", - "tf_upgrade.py", - ], + srcs = ["tf_upgrade.py"], srcs_version = "PY2AND3", ) @@ -22,7 +19,7 @@ py_test( srcs = ["tf_upgrade_test.py"], srcs_version = "PY2AND3", deps = [ - "tf_upgrade", + ":tf_upgrade", "//tensorflow/python:client_testlib", "//tensorflow/python:framework_test_lib", "@six_archive//:six", @@ -48,11 +45,11 @@ genrule( "test_file_v1_0.py", "report.txt", ], - cmd = ("$(location tf_upgrade)" + + cmd = ("$(location :tf_upgrade)" + " --infile $(location testdata/test_file_v0_11.py)" + " --outfile $(location test_file_v1_0.py)" + " --reportfile $(location report.txt)"), - tools = ["tf_upgrade"], + tools = [":tf_upgrade"], ) py_test( diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py index 72fe4a48cdd..fa1cc739056 100644 --- a/tensorflow/tools/compatibility/tf_upgrade.py +++ b/tensorflow/tools/compatibility/tf_upgrade.py @@ -19,11 +19,486 @@ from __future__ import division from __future__ import print_function import argparse - -from tensorflow.tools.compatibility import ast_edits +import ast +import collections +import os +import shutil +import sys +import tempfile +import traceback -class TFAPIChangeSpec(ast_edits.APIChangeSpec): +class APIChangeSpec(object): + """This class defines the 
transformations that need to happen. + + This class must provide the following fields: + + * `function_keyword_renames`: maps function names to a map of old -> new + argument names + * `function_renames`: maps function names to new function names + * `change_to_function`: a set of function names that have changed (for + notifications) + * `function_reorders`: maps functions whose argument order has changed to the + list of arguments in the new order + * `function_handle`: maps function names to custom handlers for the function + + For an example, see `TFAPIChangeSpec`. + """ + + +class _FileEditTuple(collections.namedtuple( + "_FileEditTuple", ["comment", "line", "start", "old", "new"])): + """Each edit that is recorded by a _FileEditRecorder. + + Fields: + comment: A description of the edit and why it was made. + line: The line number in the file where the edit occurs (1-indexed). + start: The column offset within the line where the edit occurs (0-indexed). + old: text string to remove (this must match what was in file). + new: text string to add in place of `old`. + """ + + __slots__ = () + + +class _FileEditRecorder(object): + """Record changes that need to be done to the file.""" + + def __init__(self, filename): + # all edits are lists of chars + self._filename = filename + + self._line_to_edit = collections.defaultdict(list) + self._errors = [] + + def process(self, text): + """Process a list of strings, each corresponding to the recorded changes. + + Args: + text: A list of lines of text (assumed to contain newlines) + Returns: + A tuple of the modified text and a textual description of what is done. + Raises: + ValueError: if substitution source location does not have expected text. + """ + + change_report = "" + + # Iterate over each line + for line, edits in self._line_to_edit.items(): + offset = 0 + # Sort by column so that edits are processed in order, making the + # indexing adjustments cumulative for edits that change the string + # length + edits.sort(key=lambda x: x.start) + + # Extract each line to a list of characters, because mutable lists + # are editable, unlike immutable strings.
+ char_array = list(text[line - 1]) + + # Record a description of the change + change_report += "%r Line %d\n" % (self._filename, line) + change_report += "-" * 80 + "\n\n" + for e in edits: + change_report += "%s\n" % e.comment + change_report += "\n Old: %s" % (text[line - 1]) + + # Make underscore buffers for underlining where in the line the edit was + change_list = [" "] * len(text[line - 1]) + change_list_new = [" "] * len(text[line - 1]) + + # Iterate for each edit + for e in edits: + # Create effective start, end by accounting for change in length due + # to previous edits + start_eff = e.start + offset + end_eff = start_eff + len(e.old) + + # Make sure the edit is changing what it should be changing + old_actual = "".join(char_array[start_eff:end_eff]) + if old_actual != e.old: + raise ValueError("Expected text %r but got %r" % + ("".join(e.old), "".join(old_actual))) + # Make the edit + char_array[start_eff:end_eff] = list(e.new) + + # Create the underline highlighting of the before and after + change_list[e.start:e.start + len(e.old)] = "~" * len(e.old) + change_list_new[start_eff:end_eff] = "~" * len(e.new) + + # Keep track of how to generate effective ranges + offset += len(e.new) - len(e.old) + + # Finish the report comment + change_report += " %s\n" % "".join(change_list) + text[line - 1] = "".join(char_array) + change_report += " New: %s" % (text[line - 1]) + change_report += " %s\n\n" % "".join(change_list_new) + return "".join(text), change_report, self._errors + + def add(self, comment, line, start, old, new, error=None): + """Add a new change that is needed. + + Args: + comment: A description of what was changed + line: Line number (1 indexed) + start: Column offset (0 indexed) + old: old text + new: new text + error: this "edit" is something that cannot be fixed automatically + Returns: + None + """ + + self._line_to_edit[line].append( + _FileEditTuple(comment, line, start, old, new)) + if error: + self._errors.append("%s:%d: %s" % (self._filename, line, error)) + + +class _ASTCallVisitor(ast.NodeVisitor): + """AST Visitor that processes function calls. + + Updates function calls from old API version to new API version using a given + change spec. + """ + + def __init__(self, filename, lines, api_change_spec): + self._filename = filename + self._file_edit = _FileEditRecorder(filename) + self._lines = lines + self._api_change_spec = api_change_spec + + def process(self, lines): + return self._file_edit.process(lines) + + def generic_visit(self, node): + ast.NodeVisitor.generic_visit(self, node) + + def _rename_functions(self, node, full_name): + function_renames = self._api_change_spec.function_renames + try: + new_name = function_renames[full_name] + self._file_edit.add("Renamed function %r to %r" % (full_name, + new_name), + node.lineno, node.col_offset, full_name, new_name) + except KeyError: + pass + + def _get_attribute_full_path(self, node): + """Traverse an attribute to generate a full name e.g. tf.foo.bar. + + Args: + node: A Node of type Attribute. + + Returns: + a '.'-delimited full-name or None if the tree was not a simple form. + i.e. `foo()+b).bar` returns None, while `a.b.c` would return "a.b.c". + """ + curr = node + items = [] + while not isinstance(curr, ast.Name): + if not isinstance(curr, ast.Attribute): + return None + items.append(curr.attr) + curr = curr.value + items.append(curr.id) + return ".".join(reversed(items)) + + def _find_true_position(self, node): + """Return correct line number and column offset for a given node. 
+ + This is necessary mainly because ListComp's location reporting reports + the next token after the list comprehension list opening. + + Args: + node: Node for which we wish to know the lineno and col_offset + """ + import re + find_open = re.compile("^\s*(\\[).*$") + find_string_chars = re.compile("['\"]") + + if isinstance(node, ast.ListComp): + # Strangely, ast.ListComp returns the col_offset of the first token + # after the '[' token which appears to be a bug. Workaround by + # explicitly finding the real start of the list comprehension. + line = node.lineno + col = node.col_offset + # loop over lines + while 1: + # Reverse the text to and regular expression search for whitespace + text = self._lines[line-1] + reversed_preceding_text = text[:col][::-1] + # First find if a [ can be found with only whitespace between it and + # col. + m = find_open.match(reversed_preceding_text) + if m: + new_col_offset = col - m.start(1) - 1 + return line, new_col_offset + else: + if (reversed_preceding_text=="" or + reversed_preceding_text.isspace()): + line = line - 1 + prev_line = self._lines[line - 1] + # TODO(aselle): + # this is poor comment detection, but it is good enough for + # cases where the comment does not contain string literal starting/ + # ending characters. If ast gave us start and end locations of the + # ast nodes rather than just start, we could use string literal + # node ranges to filter out spurious #'s that appear in string + # literals. + comment_start = prev_line.find("#") + if comment_start == -1: + col = len(prev_line) -1 + elif find_string_chars.search(prev_line[comment_start:]) is None: + col = comment_start + else: + return None, None + else: + return None, None + # Most other nodes return proper locations (with notably does not), but + # it is not possible to use that in an argument. + return node.lineno, node.col_offset + + + def visit_Call(self, node): # pylint: disable=invalid-name + """Handle visiting a call node in the AST. + + Args: + node: Current Node + """ + + + # Find a simple attribute name path e.g. "tf.foo.bar" + full_name = self._get_attribute_full_path(node.func) + + # Make sure the func is marked as being part of a call + node.func.is_function_for_call = True + + if full_name: + # Call special handlers + function_handles = self._api_change_spec.function_handle + if full_name in function_handles: + function_handles[full_name](self._file_edit, node) + + # Examine any non-keyword argument and make it into a keyword argument + # if reordering required. 
+ function_reorders = self._api_change_spec.function_reorders + function_keyword_renames = ( + self._api_change_spec.function_keyword_renames) + + if full_name in function_reorders: + reordered = function_reorders[full_name] + for idx, arg in enumerate(node.args): + lineno, col_offset = self._find_true_position(arg) + if lineno is None or col_offset is None: + self._file_edit.add( + "Failed to add keyword %r to reordered function %r" + % (reordered[idx], full_name), arg.lineno, arg.col_offset, + "", "", + error="A necessary keyword argument failed to be inserted.") + else: + keyword_arg = reordered[idx] + if (full_name in function_keyword_renames and + keyword_arg in function_keyword_renames[full_name]): + keyword_arg = function_keyword_renames[full_name][keyword_arg] + self._file_edit.add("Added keyword %r to reordered function %r" + % (reordered[idx], full_name), lineno, + col_offset, "", keyword_arg + "=") + + # Examine each keyword argument and convert it to the final renamed form + renamed_keywords = ({} if full_name not in function_keyword_renames else + function_keyword_renames[full_name]) + for keyword in node.keywords: + argkey = keyword.arg + argval = keyword.value + + if argkey in renamed_keywords: + argval_lineno, argval_col_offset = self._find_true_position(argval) + if argval_lineno is not None and argval_col_offset is not None: + # TODO(aselle): We should scan backward to find the start of the + # keyword key. Unfortunately ast does not give you the location of + # keyword keys, so we are forced to infer it from the keyword arg + # value. + key_start = argval_col_offset - len(argkey) - 1 + key_end = key_start + len(argkey) + 1 + if (self._lines[argval_lineno - 1][key_start:key_end] == + argkey + "="): + self._file_edit.add("Renamed keyword argument from %r to %r" % + (argkey, renamed_keywords[argkey]), + argval_lineno, + argval_col_offset - len(argkey) - 1, + argkey + "=", renamed_keywords[argkey] + "=") + continue + self._file_edit.add( + "Failed to rename keyword argument from %r to %r" % + (argkey, renamed_keywords[argkey]), + argval.lineno, + argval.col_offset - len(argkey) - 1, + "", "", + error="Failed to find keyword lexographically. Fix manually.") + + ast.NodeVisitor.generic_visit(self, node) + + def visit_Attribute(self, node): # pylint: disable=invalid-name + """Handle bare Attributes i.e. [tf.foo, tf.bar]. + + Args: + node: Node that is of type ast.Attribute + """ + full_name = self._get_attribute_full_path(node) + if full_name: + self._rename_functions(node, full_name) + if full_name in self._api_change_spec.change_to_function: + if not hasattr(node, "is_function_for_call"): + new_text = full_name + "()" + self._file_edit.add("Changed %r to %r"%(full_name, new_text), + node.lineno, node.col_offset, full_name, new_text) + + ast.NodeVisitor.generic_visit(self, node) + + +class ASTCodeUpgrader(object): + """Handles upgrading a set of Python files using a given API change spec.""" + + def __init__(self, api_change_spec): + if not isinstance(api_change_spec, APIChangeSpec): + raise TypeError("Must pass APIChangeSpec to ASTCodeUpgrader, got %s" % + type(api_change_spec)) + self._api_change_spec = api_change_spec + + def process_file(self, in_filename, out_filename): + """Process the given python file for incompatible changes. 
+ + Args: + in_filename: filename to parse + out_filename: output file to write to + Returns: + A tuple representing number of files processed, log of actions, errors + """ + + # Write to a temporary file, just in case we are doing an in-place modify. + with open(in_filename, "r") as in_file, \ + tempfile.NamedTemporaryFile("w", delete=False) as temp_file: + ret = self.process_opened_file( + in_filename, in_file, out_filename, temp_file) + + shutil.move(temp_file.name, out_filename) + return ret + + # Broad exceptions are required here because ast throws whatever it wants. + # pylint: disable=broad-except + def process_opened_file(self, in_filename, in_file, out_filename, out_file): + """Process the given python file for incompatible changes. + + This function is split out to facilitate StringIO testing from + tf_upgrade_test.py. + + Args: + in_filename: filename to parse + in_file: opened file (or StringIO) + out_filename: output file to write to + out_file: opened file (or StringIO) + Returns: + A tuple representing number of files processed, log of actions, errors + """ + process_errors = [] + text = "-" * 80 + "\n" + text += "Processing file %r\n outputting to %r\n" % (in_filename, + out_filename) + text += "-" * 80 + "\n\n" + + parsed_ast = None + lines = in_file.readlines() + try: + parsed_ast = ast.parse("".join(lines)) + except Exception: + text += "Failed to parse %r\n\n" % in_filename + text += traceback.format_exc() + if parsed_ast: + visitor = _ASTCallVisitor(in_filename, lines, self._api_change_spec) + visitor.visit(parsed_ast) + out_text, new_text, process_errors = visitor.process(lines) + text += new_text + if out_file: + out_file.write(out_text) + text += "\n" + return 1, text, process_errors + # pylint: enable=broad-except + + def process_tree(self, root_directory, output_root_directory, + copy_other_files): + """Processes upgrades on an entire tree of python files in place. + + Note that only Python files are processed. If you have custom code in + other languages, you will need to manually upgrade those. + + Args: + root_directory: Directory to walk and process. + output_root_directory: Directory to use as base. + copy_other_files: Copy files that are not touched by this converter. + + Returns: + A tuple of files processed, the report string for all files, and errors + """ + + # make sure output directory doesn't exist + if output_root_directory and os.path.exists(output_root_directory): + print("Output directory %r must not already exist."
% ( + output_root_directory)) + sys.exit(1) + + # make sure output directory does not overlap with root_directory + norm_root = os.path.split(os.path.normpath(root_directory)) + norm_output = os.path.split(os.path.normpath(output_root_directory)) + if norm_root == norm_output: + print("Output directory %r same as input directory %r" % ( + root_directory, output_root_directory)) + sys.exit(1) + + # Collect list of files to process (we do this to correctly handle if the + # user puts the output directory in some sub directory of the input dir) + files_to_process = [] + files_to_copy = [] + for dir_name, _, file_list in os.walk(root_directory): + py_files = [f for f in file_list if f.endswith(".py")] + copy_files = [f for f in file_list if not f.endswith(".py")] + for filename in py_files: + fullpath = os.path.join(dir_name, filename) + fullpath_output = os.path.join( + output_root_directory, os.path.relpath(fullpath, root_directory)) + files_to_process.append((fullpath, fullpath_output)) + if copy_other_files: + for filename in copy_files: + fullpath = os.path.join(dir_name, filename) + fullpath_output = os.path.join( + output_root_directory, os.path.relpath(fullpath, root_directory)) + files_to_copy.append((fullpath, fullpath_output)) + + file_count = 0 + tree_errors = [] + report = "" + report += ("=" * 80) + "\n" + report += "Input tree: %r\n" % root_directory + report += ("=" * 80) + "\n" + + for input_path, output_path in files_to_process: + output_directory = os.path.dirname(output_path) + if not os.path.isdir(output_directory): + os.makedirs(output_directory) + file_count += 1 + _, l_report, l_errors = self.process_file(input_path, output_path) + tree_errors += l_errors + report += l_report + for input_path, output_path in files_to_copy: + output_directory = os.path.dirname(output_path) + if not os.path.isdir(output_directory): + os.makedirs(output_directory) + shutil.copy(input_path, output_path) + return file_count, report, tree_errors + + +class TFAPIChangeSpec(APIChangeSpec): """List of maps that describe what changed in the API.""" def __init__(self): @@ -238,7 +713,7 @@ Simple usage: default="report.txt") args = parser.parse_args() - upgrade = ast_edits.ASTCodeUpgrader(TFAPIChangeSpec()) + upgrade = ASTCodeUpgrader(TFAPIChangeSpec()) report_text = None report_filename = args.report_filename files_processed = 0 diff --git a/tensorflow/tools/compatibility/tf_upgrade_test.py b/tensorflow/tools/compatibility/tf_upgrade_test.py index ac838a2791f..a495f9883b2 100644 --- a/tensorflow/tools/compatibility/tf_upgrade_test.py +++ b/tensorflow/tools/compatibility/tf_upgrade_test.py @@ -22,7 +22,6 @@ import tempfile import six from tensorflow.python.framework import test_util from tensorflow.python.platform import test as test_lib -from tensorflow.tools.compatibility import ast_edits from tensorflow.tools.compatibility import tf_upgrade @@ -37,7 +36,7 @@ class TestUpgrade(test_util.TensorFlowTestCase): def _upgrade(self, old_file_text): in_file = six.StringIO(old_file_text) out_file = six.StringIO() - upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec()) + upgrader = tf_upgrade.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec()) count, report, errors = ( upgrader.process_opened_file("test.py", in_file, "test_out.py", out_file)) @@ -140,7 +139,7 @@ class TestUpgradeFiles(test_util.TensorFlowTestCase): upgraded = "tf.multiply(a, b)\n" temp_file.write(original) temp_file.close() - upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec()) + upgrader = 
tf_upgrade.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec()) upgrader.process_file(temp_file.name, temp_file.name) self.assertAllEqual(open(temp_file.name).read(), upgraded) os.unlink(temp_file.name) diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel index 0a6860e7915..cd22f1832f7 100644 --- a/tensorflow/tools/docker/Dockerfile.devel +++ b/tensorflow/tools/docker/Dockerfile.devel @@ -57,7 +57,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \ >>/etc/bazel.bazelrc # Install the most recent bazel release. -ENV BAZEL_VERSION 0.5.4 +ENV BAZEL_VERSION 0.8.0 WORKDIR / RUN mkdir /bazel && \ cd /bazel && \ diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu index 4164cc3f888..d0c540ae560 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-gpu +++ b/tensorflow/tools/docker/Dockerfile.devel-gpu @@ -66,7 +66,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \ >>/etc/bazel.bazelrc # Install the most recent bazel release. -ENV BAZEL_VERSION 0.5.4 +ENV BAZEL_VERSION 0.8.0 WORKDIR / RUN mkdir /bazel && \ cd /bazel && \ diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 21e7318ddf1..e03faee61e7 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -179,10 +179,15 @@ def find_files(pattern, root): matches = ['../' + x for x in find_files('*', 'external') if '.py' not in x] -matches += ['../' + x for x in find_files('*', '_solib_k8') if '.py' not in x] -matches += [ - '../' + x for x in find_files('*', '_solib_local') if '.py' not in x -] + +so_lib_paths = [i for i in os.listdir('.') + if os.path.isdir(i) + and fnmatch.fnmatch(i, '_solib_*')] + +for path in so_lib_paths: + matches.extend( + ['../' + x for x in find_files('*', path) if '.py' not in x] + ) if os.name == 'nt': EXTENSION_NAME = 'python/_pywrap_tensorflow_internal.pyd' diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index b295e2bc1e2..fa9d0c77c56 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -76,11 +76,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "mkl_dnn", urls = [ - "https://mirror.bazel.build/github.com/01org/mkl-dnn/archive/aab753280e83137ba955f8f19d72cb6aaba545ef.tar.gz", - "https://github.com/01org/mkl-dnn/archive/aab753280e83137ba955f8f19d72cb6aaba545ef.tar.gz", + "https://mirror.bazel.build/github.com/01org/mkl-dnn/archive/e0bfcaa7fcb2b1e1558f5f0676933c1db807a729.tar.gz", + "https://github.com/01org/mkl-dnn/archive/e0bfcaa7fcb2b1e1558f5f0676933c1db807a729.tar.gz", ], - sha256 = "fb67f255a96bd4ad39b8dd104eca5aa92200c95c1ed36e59641e6c0478eefd11", - strip_prefix = "mkl-dnn-aab753280e83137ba955f8f19d72cb6aaba545ef", + sha256 = "02e244f63dd95402691a361392504c143eede9a89043426f174836638a9cbf09", + strip_prefix = "mkl-dnn-e0bfcaa7fcb2b1e1558f5f0676933c1db807a729", build_file = str(Label("//third_party/mkl_dnn:mkldnn.BUILD")), ) diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl index b752734a08a..2a37c65bc74 100644 --- a/third_party/gpus/cuda/BUILD.tpl +++ b/third_party/gpus/cuda/BUILD.tpl @@ -46,6 +46,7 @@ cc_library( includes = [ ".", "cuda/include", + "cuda/include/crt", ], visibility = ["//visibility:public"], )
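As a closing note on the tf_upgrade.py changes above: a short usage sketch of the now self-contained upgrader, mirroring tf_upgrade_test.py and the BUILD genrule. The snippet text, the placeholder file names, and the assumption that the change spec still maps `tf.mul` to `tf.multiply` are illustrative; only the class and method names come from the diff itself.

```python
import six

from tensorflow.tools.compatibility import tf_upgrade

upgrader = tf_upgrade.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())

# In-memory upgrade of a single snippet, the same path the unit test takes.
in_file = six.StringIO("tf.mul(a, b)\n")
out_file = six.StringIO()
count, report, errors = upgrader.process_opened_file(
    "test.py", in_file, "test_out.py", out_file)
print(report)
print(out_file.getvalue())  # expected to contain tf.multiply(a, b)

# Whole-file and whole-tree entry points used by the genrule and the CLI:
# upgrader.process_file("old_script.py", "upgraded_script.py")
# upgrader.process_tree("old_project/", "upgraded_project/",
#                       copy_other_files=True)
```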