Merge changes from github.

PiperOrigin-RevId: 180746153
This commit is contained in:
Raghuraman Krishnamoorthi 2018-01-03 19:06:54 -08:00 committed by TensorFlower Gardener
parent 2eef71c3f9
commit 71896cc7e5
103 changed files with 2654 additions and 744 deletions

View File

@ -1,4 +1,4 @@
Copyright 2017 The TensorFlow Authors. All rights reserved.
Copyright 2018 The TensorFlow Authors. All rights reserved.
Apache License
Version 2.0, January 2004

View File

@ -46,8 +46,8 @@ packages on Linux, Mac, and Windows.
**Individual whl files**
* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/))
* Linux GPU: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/42/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/)) / [Python 3.6](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp36-cp36m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=cpu-slave/))
* Linux GPU: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/42/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/)) / [Python 3.6](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp36-cp36m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/))
* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/))
* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly_gpu-1.head-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly_gpu-1.head-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=36/))

View File

@ -763,24 +763,6 @@ Status LgammaGrad(const Scope& scope, const Operation& op,
}
REGISTER_GRADIENT_OP("Lgamma", LgammaGrad);
Status SelectGrad(const Scope& scope, const Operation& op,
const std::vector<Output>& grad_inputs,
std::vector<Output>* grad_outputs) {
auto comparator = op.input(0);
auto x = op.input(1);
auto zeros = ZerosLike(scope, x);
auto grad = grad_inputs[0];
auto gx_1 = Where3(scope, comparator, grad, zeros);
auto gx_2 = Where3(scope, comparator, zeros, grad);
grad_outputs->push_back(NoGradient());
grad_outputs->push_back(gx_1);
grad_outputs->push_back(gx_2);
return scope.status();
}
REGISTER_GRADIENT_OP("Select", SelectGrad);
Status MinOrMaxGrad(const Scope& scope, const Operation& op,
const std::vector<Output>& grad_inputs,
std::vector<Output>* grad_outputs) {

View File

@ -904,13 +904,5 @@ TEST_F(NaryGradTest, Prod) {
RunTest({x}, {x_shape}, {y}, {y_shape});
}
TEST_F(NaryGradTest, Select) {
TensorShape shape({3, 4});
auto x1 = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
auto x2 = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
auto y = Where3(scope_, Greater(scope_, x1, x2), x1, x2);
RunTest({x1, x2}, {shape, shape}, {y}, {shape});
}
} // namespace
} // namespace tensorflow

View File

@ -116,7 +116,7 @@ Status CreateCond(const Scope& scope, const CondGraphBuilderFn& cond,
return Status::OK();
}
// Create the bdoy subgraph defined by `body`. `outputs` must be non-null and
// Create the body subgraph defined by `body`. `outputs` must be non-null and
// empty.
Status CreateBody(const Scope& scope, const BodyGraphBuilderFn& body,
const std::vector<Output>& inputs,

View File

@ -4,6 +4,7 @@ package(
default_visibility = ["//tensorflow/compiler/tf2xla:internal"],
)
load("//tensorflow:tensorflow.bzl", "tf_copts")
load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
tf_kernel_library(
@ -167,6 +168,7 @@ tf_kernel_library(
cc_library(
name = "index_ops_kernel_argmax_float_1d",
srcs = ["index_ops_kernel_argmax_float_1d.cc"],
copts = tf_copts(),
visibility = ["//visibility:public"],
deps = [
"//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
@ -179,6 +181,7 @@ cc_library(
cc_library(
name = "index_ops_kernel_argmax_float_2d",
srcs = ["index_ops_kernel_argmax_float_2d.cc"],
copts = tf_copts(),
visibility = ["//visibility:public"],
deps = [
"//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",

View File

@ -39,7 +39,7 @@ Status MakeXlaCompilerArgumentsFromInputs(
*has_uninitialized_vars = false;
*has_tensor_arrays = false;
for (int i = 0; i < ctx->num_inputs(); ++i) {
VLOG(2) << " Input " << i
VLOG(2) << " Input " << i
<< " type: " << DataTypeString(ctx->input_type(i))
<< " shape: " << ctx->InputShape(i).DebugString();
XlaCompiler::Argument& arg = (*args)[i];

View File

@ -19,7 +19,7 @@ limitations under the License.
#include "third_party/eigen3/Eigen/Core"
#ifdef __AVX__
#ifdef TF_XLA_HAS_AVX
xla::cpu::runtime::V8F32AVX __xla_cpu_runtime_ExpV8F32AVX(
xla::cpu::runtime::V8F32AVX x) {
return Eigen::internal::pexp(x);
@ -29,7 +29,7 @@ xla::cpu::runtime::V8F32AVX __xla_cpu_runtime_LogV8F32AVX(
xla::cpu::runtime::V8F32AVX x) {
return Eigen::internal::plog(x);
}
#endif // __AVX__
#endif // TF_XLA_HAS_AVX
namespace xla {
namespace cpu {

View File

@ -24,6 +24,11 @@ limitations under the License.
#include "tensorflow/core/platform/macros.h"
#if defined(__AVX__)
#include <immintrin.h>
#define TF_XLA_HAS_AVX
#endif
namespace xla {
namespace cpu {
namespace runtime {
@ -31,14 +36,16 @@ namespace runtime {
extern const char *const kExpV8F32AVXSymbolName;
extern const char *const kLogV8F32AVXSymbolName;
typedef float V8F32AVX __attribute__((__vector_size__(32)));
#ifdef TF_XLA_HAS_AVX
typedef __m256 V8F32AVX;
#endif
} // namespace runtime
} // namespace cpu
} // namespace xla
extern "C" {
#ifdef __AVX__
#ifdef TF_XLA_HAS_AVX
// The following functions are vectorized versions of a selection of libm
// library functions.
// References to these functions are created by the LLVM vectorizer.

View File

@ -19,7 +19,7 @@ limitations under the License.
#include "third_party/eigen3/Eigen/Core"
#ifdef __ARM_NEON__
#ifdef TF_XLA_HAS_NEON
xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_ExpV4F32NEON(
xla::cpu::runtime::V4F32NEON x) {
@ -32,7 +32,7 @@ xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_LogV4F32NEON(
return Eigen::internal::plog(p);
}
#endif // __ARM_NEON__
#endif // TF_XLA_HAS_NEON
namespace xla {
namespace cpu {

View File

@ -27,6 +27,7 @@ limitations under the License.
// __attribute__((__vector_size__(*))). Unfortunately, the typedef for the ARM
// NEON SIMD types is not portable, so the type has to come from <arm_neon.h>
#include <arm_neon.h>
#define TF_XLA_HAS_NEON
#endif // __ARM_NEON__
namespace xla {
@ -36,12 +37,9 @@ namespace runtime {
extern const char *const kExpV4F32NEONSymbolName;
extern const char *const kLogV4F32NEONSymbolName;
#ifdef __ARM_NEON__
#ifdef TF_XLA_HAS_NEON
typedef float32x4_t V4F32NEON;
#else
// On non-ARM platforms ensure the declaration is present
struct V4F32NEON;
#endif // __ARM_NEON__
#endif // TF_XLA_HAS_NEON
} // namespace runtime
} // namespace cpu
@ -49,7 +47,7 @@ struct V4F32NEON;
extern "C" {
#ifdef __ARM_NEON__
#ifdef TF_XLA_HAS_NEON
// The following functions are vectorized versions of a selection of libm
// library functions.
// References to these functions are created by the LLVM vectorizer.
@ -58,7 +56,7 @@ xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_ExpV4F32NEON(
xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_LogV4F32NEON(
xla::cpu::runtime::V4F32NEON x);
#endif // __ARM_NEON__
#endif // TF_XLA_HAS_NEON
}
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_NEON_H_

View File

@ -19,7 +19,7 @@ limitations under the License.
#include "third_party/eigen3/Eigen/Core"
#ifdef __SSE4_1__
#ifdef TF_XLA_HAS_SSE4_1
xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_ExpV4F32SSE(
xla::cpu::runtime::V4F32SSE x) {
@ -33,7 +33,7 @@ xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_LogV4F32SSE(
return Eigen::internal::plog(p);
}
#endif // __SSE4_1__
#endif // TF_XLA_HAS_SSE4_1
namespace xla {
namespace cpu {

View File

@ -24,6 +24,13 @@ limitations under the License.
#include "tensorflow/core/platform/macros.h"
// MSVC does not have the __SSE4_1__ macro. Eigen enables EIGEN_VECTORIZE_SSE4_1
// when __AVX__ is defined, so we should do the same.
#if defined(__SSE4_1__) || (defined(_MSC_VER) && defined(__AVX__))
#include <smmintrin.h>
#define TF_XLA_HAS_SSE4_1
#endif
namespace xla {
namespace cpu {
namespace runtime {
@ -31,7 +38,9 @@ namespace runtime {
extern const char *const kExpV4F32SSESymbolName;
extern const char *const kLogV4F32SSESymbolName;
typedef float V4F32SSE __attribute__((__vector_size__(16)));
#ifdef TF_XLA_HAS_SSE4_1
typedef __m128 V4F32SSE;
#endif
} // namespace runtime
} // namespace cpu
@ -39,7 +48,7 @@ typedef float V4F32SSE __attribute__((__vector_size__(16)));
extern "C" {
#ifdef __SSE4_1__
#ifdef TF_XLA_HAS_SSE4_1
// The following functions are vectorized versions of a selection of libm
// library functions.
// References to these functions are created by the LLVM vectorizer.
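Taken together, the AVX, NEON, and SSE changes above follow one pattern: the compiler macro (`__AVX__`, `__ARM_NEON__`, `__SSE4_1__`) is tested in exactly one header, next to the intrinsic include, where it defines a `TF_XLA_HAS_*` feature macro; every other file keys off the feature macro and picks up the real intrinsic vector type instead of a GCC `__vector_size__` typedef. A condensed sketch of the idea, with hypothetical names (`my_feature.h`, `VectorExp`):

```cpp
// my_feature.h -- define the feature macro once, where the intrinsics live.
// MSVC has no __SSE4_1__, so also accept _MSC_VER with __AVX__, mirroring the
// check added in this commit.
#if defined(__SSE4_1__) || (defined(_MSC_VER) && defined(__AVX__))
#include <smmintrin.h>
#define TF_XLA_HAS_SSE4_1
#endif

#ifdef TF_XLA_HAS_SSE4_1
typedef __m128 V4F32SSE;  // the real intrinsic type, not a vector typedef
extern "C" V4F32SSE VectorExp(V4F32SSE x);  // hypothetical vectorized libm entry
#endif

// any_other_file.cc -- test only the feature macro, never the compiler macro.
#ifdef TF_XLA_HAS_SSE4_1
// ... register or call VectorExp ...
#endif
```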

View File

@ -64,14 +64,14 @@ llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module,
&ir_builder),
llvm::ConstantFP::get(vector_type, 9.0), &ir_builder);
std::array<float, 7> numerator_coeffs(
{-2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f,
4.89352455891786e-03f});
std::array<float, 7> numerator_coeffs{
-2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f,
4.89352455891786e-03f};
std::array<float, 4> denominator_coeffs(
{1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
4.89352518554385e-03f});
std::array<float, 4> denominator_coeffs{
1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
4.89352518554385e-03f};
llvm::Value* input_squared =
ir_builder.CreateFMul(input_clamped, input_clamped);
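For context on the arrays above: the emitted tanh clamps its input to [-9, 9] (visible earlier in this hunk), squares it, and then evaluates a rational approximation whose numerator is x times a degree-6 polynomial in x² and whose denominator is a degree-3 polynomial in x², both by Horner's rule. A scalar C++ sketch consistent with those coefficients (a reading aid for the IR-emission code, not the code itself):

```cpp
#include <algorithm>
#include <array>

// Scalar model of the vectorized tanh: tanh(x) ~= x * P(x^2) / Q(x^2)
// on the clamped range [-9, 9], with P and Q evaluated by Horner's rule.
float FastTanhRef(float x) {
  static const std::array<float, 7> p = {
      -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
      5.12229709037114e-08f,  1.48572235717979e-05f, 6.37261928875436e-04f,
      4.89352455891786e-03f};
  static const std::array<float, 4> q = {
      1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
      4.89352518554385e-03f};
  x = std::min(9.0f, std::max(-9.0f, x));
  const float x2 = x * x;
  float num = p[0];
  for (int i = 1; i < 7; ++i) num = num * x2 + p[i];
  float den = q[0];
  for (int i = 1; i < 4; ++i) den = den * x2 + q[i];
  return x * num / den;
}
```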

View File

@ -103,17 +103,17 @@ llvm::StringRef GetHostCpuName() {
CompilerFunctor::VectorIntrinsics GetAvailableIntrinsics() {
CompilerFunctor::VectorIntrinsics intrinsics;
#ifdef __SSE4_1__
#ifdef TF_XLA_HAS_SSE4_1
intrinsics.sse_intrinsics = true;
#else
intrinsics.sse_intrinsics = false;
#endif
#ifdef __AVX__
#ifdef TF_XLA_HAS_AVX
intrinsics.avx_intrinsics = true;
#else
intrinsics.avx_intrinsics = false;
#endif
#ifdef __ARM_NEON__
#ifdef TF_XLA_HAS_NEON
intrinsics.neon_intrinsics = true;
#else
intrinsics.neon_intrinsics = false;
@ -215,15 +215,15 @@ bool RegisterKnownJITSymbols() {
REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32);
REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32);
REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64);
#ifdef __ARM_NEON__
#ifdef TF_XLA_HAS_NEON
REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32NEON);
REGISTER_CPU_RUNTIME_SYMBOL(LogV4F32NEON);
#endif
#ifdef __SSE4_1__
#ifdef TF_XLA_HAS_SSE4_1
REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32SSE);
REGISTER_CPU_RUNTIME_SYMBOL(LogV4F32SSE);
#endif
#ifdef __AVX__
#ifdef TF_XLA_HAS_AVX
REGISTER_CPU_RUNTIME_SYMBOL(ExpV8F32AVX);
REGISTER_CPU_RUNTIME_SYMBOL(LogV8F32AVX);
#endif

View File

@ -407,7 +407,9 @@ endif()
# Let's get to work!
include(tf_core_framework.cmake)
include(tf_stream_executor.cmake)
if (tensorflow_ENABLE_GPU)
include(tf_stream_executor.cmake)
endif()
include(tf_core_cpu.cmake)
include(tf_core_ops.cmake)

View File

@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
(
cd "$(dirname "$0")"
mkdir -p _build
(
cd _build
rm -rf -- *
cmake ..
)
)

View File

@ -86,9 +86,7 @@ tensorflow/python/ops/distributions
tensorflow/python/ops/linalg
tensorflow/python/ops/losses
tensorflow/python/platform
tensorflow/python/platform/default
tensorflow/python/platform/summary
tensorflow/python/profiler/
tensorflow/python/profiler
tensorflow/python/profiler/internal
tensorflow/python/saved_model
tensorflow/python/summary
@ -115,8 +113,6 @@ tensorflow/contrib/batching/kernels
tensorflow/contrib/batching/python
tensorflow/contrib/batching/python/ops
tensorflow/contrib/bayesflow
tensorflow/contrib/bayesflow/examples
tensorflow/contrib/bayesflow/examples/reinforce_simple
tensorflow/contrib/bayesflow/python
tensorflow/contrib/bayesflow/python/ops
tensorflow/contrib/boosted_trees
@ -212,16 +208,6 @@ tensorflow/contrib/input_pipeline/python/ops
tensorflow/contrib/integrate
tensorflow/contrib/integrate/python
tensorflow/contrib/integrate/python/ops
tensorflow/contrib/ios_examples
tensorflow/contrib/ios_examples/benchmark
tensorflow/contrib/ios_examples/benchmark/benchmark.xcodeproj
tensorflow/contrib/ios_examples/benchmark/data
tensorflow/contrib/ios_examples/camera
tensorflow/contrib/ios_examples/camera/camera_example.xcodeproj
tensorflow/contrib/ios_examples/camera/en.lproj
tensorflow/contrib/ios_examples/simple
tensorflow/contrib/ios_examples/simple/data
tensorflow/contrib/ios_examples/simple/tf_ios_makefile_example.xcodeproj
tensorflow/contrib/keras
tensorflow/contrib/keras/api
tensorflow/contrib/keras/api/keras
@ -276,9 +262,6 @@ tensorflow/contrib/layers/python/ops
tensorflow/contrib/learn
tensorflow/contrib/learn/python
tensorflow/contrib/learn/python/learn
tensorflow/contrib/learn/python/learn/dataframe
tensorflow/contrib/learn/python/learn/dataframe/queues
tensorflow/contrib/learn/python/learn/dataframe/transforms
tensorflow/contrib/learn/python/learn/datasets
tensorflow/contrib/learn/python/learn/datasets/data
tensorflow/contrib/learn/python/learn/estimators
@ -301,6 +284,9 @@ tensorflow/contrib/linear_optimizer/kernels
tensorflow/contrib/linear_optimizer/kernels/g3doc
tensorflow/contrib/linear_optimizer/python
tensorflow/contrib/linear_optimizer/python/ops
# TODO(drpngx): Fix failing imports
# tensorflow/contrib/lite/python
# tensorflow/contrib/lite/toco/python
tensorflow/contrib/lookup
tensorflow/contrib/losses
tensorflow/contrib/losses/python
@ -314,7 +300,6 @@ tensorflow/contrib/memory_stats/python
tensorflow/contrib/memory_stats/python/ops
tensorflow/contrib/meta_graph_transform
tensorflow/contrib/metrics
tensorflow/contrib/metrics/ops
tensorflow/contrib/metrics/python
tensorflow/contrib/metrics/python/metrics
tensorflow/contrib/metrics/python/ops
@ -346,7 +331,6 @@ tensorflow/contrib/pi_examples/label_image
tensorflow/contrib/pi_examples/label_image/data
tensorflow/contrib/periodic_resample
tensorflow/contrib/periodic_resample/python
tensorflow/contrib/periodic_resample/python/kernels
tensorflow/contrib/periodic_resample/python/ops
tensorflow/contrib/predictor
tensorflow/contrib/quantization
@ -411,13 +395,9 @@ tensorflow/contrib/tensorboard/plugins
tensorflow/contrib/tensorboard/plugins/projector
tensorflow/contrib/tensor_forest
tensorflow/contrib/tensor_forest/client
tensorflow/contrib/tensor_forest/core
tensorflow/contrib/tensor_forest/core/ops
tensorflow/contrib/tensor_forest/data
tensorflow/contrib/tensor_forest/hybrid
tensorflow/contrib/tensor_forest/hybrid/core
tensorflow/contrib/tensor_forest/hybrid/core/ops
tensorflow/contrib/tensor_forest/hybrid/ops
tensorflow/contrib/tensor_forest/hybrid/python
tensorflow/contrib/tensor_forest/hybrid/python/layers
tensorflow/contrib/tensor_forest/hybrid/python/models

View File

@ -126,10 +126,15 @@ STRING(REGEX REPLACE ";" "\\\\;" python_protos "${python_protos}")
STRING(REGEX REPLACE "\n" ";" python_protos "${python_protos}")
foreach(python_proto ${python_protos})
file(GLOB_RECURSE tf_python_protos_src RELATIVE ${tensorflow_source_dir}
"${tensorflow_source_dir}/${python_proto}/*.proto"
)
list(APPEND tf_python_protos_srcs ${tf_python_protos_src})
if(NOT python_proto MATCHES "\#")
if(NOT EXISTS "${tensorflow_source_dir}/${python_proto}")
message(SEND_ERROR "Python proto directory not found: ${python_proto}")
endif()
file(GLOB_RECURSE tf_python_protos_src RELATIVE ${tensorflow_source_dir}
"${tensorflow_source_dir}/${python_proto}/*.proto"
)
list(APPEND tf_python_protos_srcs ${tf_python_protos_src})
endif()
endforeach(python_proto)
RELATIVE_PROTOBUF_GENERATE_PYTHON(
@ -142,10 +147,15 @@ STRING(REGEX REPLACE ";" "\\\\;" python_protos_cc "${python_protos_cc}")
STRING(REGEX REPLACE "\n" ";" python_protos_cc "${python_protos_cc}")
foreach(python_proto_cc ${python_protos_cc})
file(GLOB_RECURSE tf_python_protos_cc_src RELATIVE ${tensorflow_source_dir}
"${tensorflow_source_dir}/${python_proto_cc}/*.proto"
)
list(APPEND tf_python_protos_cc_srcs ${tf_python_protos_cc_src})
if(NOT python_proto_cc MATCHES "\#")
if(NOT EXISTS "${tensorflow_source_dir}/${python_proto_cc}")
message(SEND_ERROR "Python proto CC directory not found: ${python_proto_cc}")
endif()
file(GLOB_RECURSE tf_python_protos_cc_src RELATIVE ${tensorflow_source_dir}
"${tensorflow_source_dir}/${python_proto_cc}/*.proto"
)
list(APPEND tf_python_protos_cc_srcs ${tf_python_protos_cc_src})
endif()
endforeach(python_proto_cc)
RELATIVE_PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS
@ -199,7 +209,12 @@ STRING(REGEX REPLACE ";" "\\\\;" python_modules "${python_modules}")
STRING(REGEX REPLACE "\n" ";" python_modules "${python_modules}")
foreach(python_module ${python_modules})
add_python_module(${python_module})
if(NOT python_module MATCHES "\#")
if(NOT EXISTS "${tensorflow_source_dir}/${python_module}")
message(SEND_ERROR "Python module not found: ${python_module}")
endif()
add_python_module(${python_module})
endif()
endforeach(python_module)
add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD

View File

@ -96,7 +96,7 @@ class GANEstimator(estimator.Estimator):
# Generate samples from generator.
predictions = np.array([
x for x in gan_estimator.predict(predict_input_fn)])
```
```
"""
def __init__(self,

View File

@ -2674,16 +2674,25 @@ def spatial_softmax(features,
indexing='ij')
pos_x = array_ops.reshape(pos_x, [height * width])
pos_y = array_ops.reshape(pos_y, [height * width])
if temperature is None:
temperature_collections = utils.get_variable_collections(
variables_collections, 'temperature')
temperature = variables.model_variable(
'temperature',
shape=(),
dtype=dtypes.float32,
initializer=init_ops.ones_initializer(),
collections=temperature_collections,
trainable=trainable)
temp_initializer = init_ops.ones_initializer()
else:
temp_initializer = init_ops.constant_initializer(temperature)
if not trainable:
temp_collections = None
else:
temp_collections = utils.get_variable_collections(
variables_collections, 'temperature')
temperature = variables.model_variable(
'temperature',
shape=(),
dtype=dtypes.float32,
initializer=temp_initializer,
collections=temp_collections,
trainable=trainable)
if data_format == 'NCHW':
features = array_ops.reshape(features, [-1, height * width])
else:

View File

@ -22,10 +22,6 @@ limitations under the License.
#include "tensorflow/core/lib/strings/str_util.h"
namespace tensorflow {
namespace {
template <typename T>
bool ConvertHelper(const string& s, T* value);
}
template <typename T, typename Tlabel>
class DecodeLibsvmOp : public OpKernel {
@ -57,7 +53,7 @@ class DecodeLibsvmOp : public OpKernel {
"]: \"", input_flat(i), "\""));
Tlabel label_value;
OP_REQUIRES(
ctx, ConvertHelper<Tlabel>(entries[0], &label_value),
ctx, strings::SafeStringToNumeric<Tlabel>(entries[0], &label_value),
errors::InvalidArgument("Label format incorrect: ", entries[0]));
label(i) = label_value;
for (int j = 1; j < entries.size(); j++) {
@ -74,7 +70,7 @@ class DecodeLibsvmOp : public OpKernel {
"Feature index should be >= 0, got ", feature_index));
T feature_value;
OP_REQUIRES(
ctx, ConvertHelper<T>(pair[1], &feature_value),
ctx, strings::SafeStringToNumeric<T>(pair[1], &feature_value),
errors::InvalidArgument("Feature format incorrect: ", entries[j]));
out_values.emplace_back(feature_value);
out_indices.emplace_back(std::pair<int64, int64>(i, feature_index));
@ -128,25 +124,6 @@ class DecodeLibsvmOp : public OpKernel {
int64 num_features_;
};
namespace {
template <>
bool ConvertHelper<float>(const string& s, float* value) {
return strings::safe_strtof(s.c_str(), value);
}
template <>
bool ConvertHelper<double>(const string& s, double* value) {
return strings::safe_strtod(s.c_str(), value);
}
template <>
bool ConvertHelper<int32>(const string& s, int32* value) {
return strings::safe_strto32(s.c_str(), value);
}
template <>
bool ConvertHelper<int64>(const string& s, int64* value) {
return strings::safe_strto64(s.c_str(), value);
}
} // namespace
#define REGISTER_KERNEL(type) \
REGISTER_KERNEL_BUILDER(Name("DecodeLibsvm") \
.Device(DEVICE_CPU) \

View File

@ -80,8 +80,7 @@ FlatBufferModel::FlatBufferModel(const char* filename, bool mmap_file,
} else {
allocation_ = new FileCopyAllocation(filename, error_reporter);
}
if (!allocation_->valid()) return;
if (!CheckModelIdentifier()) return;
if (!allocation_->valid() || !CheckModelIdentifier()) return;
model_ = VerifyAndGetModel(allocation_->base(), allocation_->bytes());
}

View File

@ -137,8 +137,8 @@ Following are the ops supported for using On-Device Smart Reply model:
* **HASHTABLE_LOOKUP**
This is a custom op that uses label id from predict op and looks up the
response text from the given label id.
This is an op inside TensorFlow Lite that uses label id from predict op and
looks up the response text from the given label id.
## Further Information

View File

@ -144,6 +144,7 @@ py_test(
tags = ["no_pip"],
deps = [
":predictor_factories",
":testing_common",
],
)

View File

@ -68,10 +68,10 @@ class CoreEstimatorPredictor(predictor.Predictor):
serving_input_receiver = serving_input_receiver_fn()
signature_def = _get_signature_def(
serving_input_receiver, estimator, output_key)
checkpoint_path = estimator.model_dir
checkpoint_dir = estimator.model_dir
self._session = monitored_session.MonitoredSession(
session_creator=monitored_session.ChiefSessionCreator(
checkpoint_filename_with_path=checkpoint_path))
checkpoint_dir=checkpoint_dir))
feed_tensor_info = signature_def.inputs
self._feed_tensors = {k: self._graph.get_tensor_by_name(v.name)

View File

@ -21,6 +21,8 @@ from __future__ import print_function
from tensorflow.contrib.predictor import contrib_estimator_predictor
from tensorflow.contrib.predictor import core_estimator_predictor
from tensorflow.contrib.predictor import saved_model_predictor
from tensorflow.contrib.learn.python.learn.estimators import estimator as contrib_estimator
from tensorflow.python.estimator import estimator as core_estimator
@ -85,7 +87,7 @@ def from_estimator(estimator,
TypeError: if `estimator` is a contrib `Estimator` instead of a core
`Estimator`.
"""
if isinstance(estimator, estimator.Estimator):
if isinstance(estimator, contrib_estimator.Estimator):
raise TypeError('Espected estimator to be of type '
'tf.python.estimator.Estimator, but got type '
'tf.contrib.learn.Estimator. You likely want to call '

View File

@ -19,6 +19,7 @@ from __future__ import division
from __future__ import print_function
from tensorflow.contrib.predictor import predictor_factories
from tensorflow.contrib.predictor import testing_common
from tensorflow.python.platform import test
MODEL_DIR_NAME = 'contrib/predictor/test_export_dir'
@ -46,6 +47,29 @@ class PredictorFactoriesTest(test.TestCase):
with self.assertRaisesRegexp(RuntimeError, bad_tags_regex):
predictor_factories.from_saved_model(self._export_dir, tags='bad_tag')
def testFromContribEstimator(self):
estimator = testing_common.get_arithmetic_estimator(core=False)
input_fn = testing_common.get_arithmetic_input_fn(core=False)
predictor_factories.from_contrib_estimator(estimator, input_fn,
output_alternative_key='sum')
def testFromContribEstimatorWithCoreEstimatorRaises(self):
estimator = testing_common.get_arithmetic_estimator(core=True)
input_fn = testing_common.get_arithmetic_input_fn(core=True)
with self.assertRaises(TypeError):
predictor_factories.from_contrib_estimator(estimator, input_fn)
def testFromCoreEstimator(self):
estimator = testing_common.get_arithmetic_estimator(core=True)
input_fn = testing_common.get_arithmetic_input_fn(core=True)
predictor_factories.from_estimator(estimator, input_fn)
def testFromCoreEstimatorWithContribEstimatorRaises(self):
estimator = testing_common.get_arithmetic_estimator(core=False)
input_fn = testing_common.get_arithmetic_input_fn(core=False)
with self.assertRaises(TypeError):
predictor_factories.from_estimator(estimator, input_fn)
if __name__ == '__main__':
test.main()

View File

@ -50,7 +50,7 @@ def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
# A 1024-point STFT with frames of 64 ms and 75% overlap.
stfts = tf.contrib.signal.stft(pcm, frame_length=1024, frame_step=256,
fft_length=1024)
spectrograms = tf.abs(stft)
spectrograms = tf.abs(stfts)
# Warp the linear scale spectrograms into the mel-scale.
num_spectrogram_bins = stfts.shape[-1].value

View File

@ -676,7 +676,7 @@ file were implicitly obtained from each provided variable's `var.op.name`.
This works well when the variable names in the checkpoint file match those in
the graph. However, sometimes, we want to restore a model from a checkpoint
whose variables have different names those in the current graph. In this case,
whose variables have different names to those in the current graph. In this case,
we must provide the `Saver` a dictionary that maps from each checkpoint variable
name to each graph variable. Consider the following example where the checkpoint
variables names are obtained via a simple function:

View File

@ -753,9 +753,10 @@ def train(train_op,
if logdir:
sv.start_standard_services(sess)
elif startup_delay_steps > 0:
# (use sys.maxsize because sys.maxint doesn't exist in Python 3)
_wait_for_step(sess, global_step,
min(startup_delay_steps, number_of_steps or
sys.maxint))
sys.maxsize))
threads = sv.start_queue_runners(sess)
logging.info('Starting Queues.')
if is_chief and sync_optimizer is not None:

View File

@ -530,7 +530,6 @@ py_library(
srcs_version = "PY2AND3",
deps = [
":client_lib",
"//tensorflow/contrib/framework:framework_py",
"//tensorflow/contrib/layers:layers_py",
"//tensorflow/contrib/learn",
"//tensorflow/python:array_ops",

View File

@ -17,7 +17,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.contrib import framework as contrib_framework
from tensorflow.contrib import layers
from tensorflow.contrib.learn.python.learn.estimators import estimator
@ -190,7 +189,7 @@ def get_model_fn(params,
features, labels, input_weights=weights,
num_trainers=num_trainers,
trainer_id=trainer_id),
state_ops.assign_add(contrib_framework.get_global_step(), 1))
state_ops.assign_add(training_util.get_global_step(), 1))
# Put weights back in
if weights is not None:

View File

@ -26,7 +26,6 @@ py_library(
"python/training/resample.py",
"python/training/sampling_ops.py",
"python/training/sequence_queueing_state_saver.py",
"python/training/sgdr_learning_rate_decay.py",
"python/training/training.py",
"python/training/tuner.py",
],

View File

@ -79,6 +79,7 @@ load(
"if_linux_x86_64",
"if_mobile",
"if_not_mobile",
"if_windows",
"if_not_windows",
"tf_copts",
"tf_cc_test",
@ -562,7 +563,7 @@ cc_library(
"platform/prefetch.h",
"platform/thread_annotations.h",
"platform/types.h",
],
] + if_windows(["platform/windows/integral_types.h"]),
visibility = ["//visibility:public"],
deps =
[

View File

@ -761,6 +761,10 @@ int64 MinSystemMemory(int64 available_memory) {
// builds in windows); because in non-opt builds more system memory
// is necessary.
min_system_memory *= 2;
#endif
#if defined(NVIDIA_TEGRA)
// 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM and Video RAM
min_system_memory = 1<<30;
#endif
return min_system_memory;
}

View File

@ -194,10 +194,9 @@ cc_library(
],
)
cc_library(
tf_kernel_library(
name = "fill_functor",
srcs = ["fill_functor.cc"],
hdrs = ["fill_functor.h"],
prefix = "fill_functor",
deps = [
"//tensorflow/core:framework",
"//third_party/eigen3",
@ -3043,6 +3042,7 @@ tf_kernel_library(
"//conditions:default": [],
}),
hdrs = [
"fill_functor.h",
"conv_grad_ops.h",
"deep_conv2d.h",
"gemm_functors.h",
@ -3067,6 +3067,7 @@ tf_kernel_library(
":conv_2d",
":conv_3d",
":image_resizer_state",
":fill_functor",
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
@ -3173,7 +3174,9 @@ tf_kernel_library(
tf_kernel_library(
name = "fused_batch_norm_op",
prefix = "fused_batch_norm_op",
deps = NN_DEPS,
deps = NN_DEPS + [
":fill_functor",
],
)
tf_kernel_library(

View File

@ -151,18 +151,6 @@ typedef Eigen::GpuDevice GPUDevice;
typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL
namespace functor {
// Partial specialization of FillFunctor<Device=CPUDevice, T>.
template <typename T>
struct FillFunctor<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
typename TTypes<T>::ConstScalar in) {
out.device(d) = out.constant(in());
}
};
} // end namespace functor
template <typename Device, typename T, typename Index>
class FillOp : public OpKernel {
@ -191,28 +179,6 @@ class FillOp : public OpKernel {
}
};
#ifdef TENSORFLOW_USE_SYCL
namespace functor {
// Partial specialization of FillFunctor<Device=SYCLDevice, T>.
template <typename T>
struct FillFunctor<SYCLDevice, T> {
void operator()(const SYCLDevice& d, typename TTypes<T>::Flat out,
typename TTypes<T>::ConstScalar in) {
#if !defined(EIGEN_HAS_INDEX_LIST)
Eigen::array<int, 1> rank1{1};
#else
Eigen::IndexList<Eigen::type2index<1> > rank1;
#endif
const int size = out.dimension(0);
Eigen::array<int, 1> broadcast_dims{size};
To32Bit(out).device(d) = in.reshape(rank1).broadcast(broadcast_dims);
}
};
} // namespace functor
#endif // TENSORFLOW_USE_SYCL
#define REGISTER_KERNEL(D, TYPE) \
REGISTER_KERNEL_BUILDER(Name("Fill") \
.Device(DEVICE_##D) \

View File

@ -30,6 +30,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/fill_functor.h"
#ifdef TENSORFLOW_USE_LIBXSMM
#include "tensorflow/core/kernels/xsmm_conv2d.h"
#endif
@ -595,6 +596,13 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
if (filter_shape.num_elements() == 0) {
return;
}
// If input is empty, set gradients to zero.
if (input.shape().num_elements() == 0) {
functor::SetZeroFunctor<Device, T> f;
f(context->eigen_device<Device>(), filter_backprop->flat<T>());
return;
}
// For now we take the stride from the second and third dimensions only (we
// do not support striding on the batch or depth dimension).

View File

@ -19,12 +19,15 @@ limitations under the License.
#include <algorithm>
#include <array>
#include <limits>
#include <utility>
#include "cuda/include/cuda.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/lib/math/math_util.h"
namespace tensorflow {
@ -223,185 +226,136 @@ __global__ void SwapDimension1And2InTensor3Simple(int nthreads, const T* input,
// Use shared memory tiles to swap dimension-1 and dimension-2 of a 3D tensor,
// where dimensions are zero-based: output[i][j][k] = input[i][k][j].
//
// Each thread block operates on a single tile, a square of dimensions TileSize
// x TileSize. We require that the thread block's X dimension equals TileSize,
// and its Y dimension equals NumSubTiles.
// Each thread block operates on a single tile, a rectangle of dimensions
// TileSizeI x TileSizeJ.
//
// For best performance, you should probably set TileSize equal to the number of
// threads in a warp (32 in nvidia GPUs). With a TileSize of 32, NumSubTiles ==
// 4 or 8 seems to get the best performance on K40 GPUs.
template <typename T, int TileSize, int NumSubTiles, bool conjugate = false>
__global__ void SwapDimension1And2InTensor3UsingTiles(const T* input,
Dimension<3> input_dims,
T* output) {
// One extra line in the inner dimension to avoid share memory bank conflict.
__shared__ T shared_memory_tile[TileSize][TileSize + 1];
static_assert(TileSize % NumSubTiles == 0,
"TileSize must be divisible by NumSubTiles");
eigen_assert(blockDim.x == TileSize);
eigen_assert(blockDim.y == NumSubTiles);
// In general, for best performance, you should probably set TileSizeI,
// TileSizeJ equal to the number of threads in a warp (32 in nvidia GPUs).
// With a TileSizeI, TileSizeJ of 32, NumThreads of 128 or 256 seems to get
// the best performance on K40 GPUs.
template <typename T, int NumThreads, int TileSizeI, int TileSizeJ,
bool conjugate = false>
__global__ void SwapDimension1And2InTensor3UsingTiles(
const T* __restrict__ input, Dimension<3> input_dims,
T* __restrict__ output) {
eigen_assert(blockDim.x == NumThreads);
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
eigen_assert(gridDim.y == 1);
eigen_assert(gridDim.z == 1);
// We break down the tile into NumSubTiles groups, so each thread processes
// kSubTileSize elements (except at the edges of the input).
const int kSubTileSize = TileSize / NumSubTiles;
constexpr int ReadRowPerPass = NumThreads / TileSizeJ;
constexpr int WriteRowPerPass = NumThreads / TileSizeI;
// One extra line in the inner dimension to avoid shared memory bank conflict.
__shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
int x = threadIdx.x;
Dimension<3> output_dims = {
input_dims[0],
input_dims[2],
input_dims[1],
input_dims[0], input_dims[2], input_dims[1],
};
Dimension<3> input_dims_in_tiles = {
input_dims[0],
(input_dims[1] + TileSize - 1) / TileSize,
(input_dims[2] + TileSize - 1) / TileSize,
input_dims[0], (input_dims[1] + TileSizeI - 1) / TileSizeI,
(input_dims[2] + TileSizeJ - 1) / TileSizeJ,
};
Index<3> input_tile_index =
FlatToTensorIndex(blockIdx.x, input_dims_in_tiles);
Index<3> input_tile_origin = {
input_tile_index[0],
input_tile_index[1] * TileSize,
input_tile_index[2] * TileSize,
input_tile_index[0], input_tile_index[1] * TileSizeI,
input_tile_index[2] * TileSizeJ,
};
int input_origin_flat_index =
TensorIndexToFlat(input_tile_origin, input_dims);
int tile_width = TileSize;
bool full_tile = true;
int tile_width = TileSizeJ;
// Only the last row or column may not have the full size.
if (input_tile_index[2] == input_dims_in_tiles[2] - 1) {
tile_width = input_dims[2] - (input_dims_in_tiles[2] - 1) * TileSize;
tile_width = input_dims[2] - (input_dims_in_tiles[2] - 1) * TileSizeJ;
full_tile &= false;
}
int tile_height = TileSize;
int tile_height = TileSizeI;
if (input_tile_index[1] == input_dims_in_tiles[1] - 1) {
tile_height = input_dims[1] - (input_dims_in_tiles[1] - 1) * TileSize;
tile_height = input_dims[1] - (input_dims_in_tiles[1] - 1) * TileSizeI;
full_tile &= false;
}
int input_flat_index = input_origin_flat_index + x;
int y_start = static_cast<int>(threadIdx.y) * kSubTileSize;
// Calculate effective thread number. This ensures that we use the largest
// number of threads available to form a regular thread block with no
// trailing incomplete lines.
constexpr int in_effective_thread_num = NumThreads / TileSizeJ * TileSizeJ;
// Load the data from input memory to the shared memory tile.
if (x < tile_width) {
int y_end = min(y_start + kSubTileSize, tile_height);
for (int y = y_start; y < y_end; y++) {
shared_memory_tile[y][x] = maybe_conj<T, conjugate>::run(
input[input_flat_index + y * input_dims[2]]);
if (x < in_effective_thread_num) {
// Orient the logical thread block with respect to the input array.
// i.e. align the contiguous dimension of thread blocks with the contiguous
// dimension of the input array.
int ti = x / TileSizeJ;
int tj = x % TileSizeJ;
int input_index = input_origin_flat_index + ti * input_dims[2] + tj;
int input_increment = ReadRowPerPass * input_dims[2];
if (full_tile) {
#pragma unroll
for (int i_loc = ti; i_loc < (TileSizeI); i_loc += ReadRowPerPass) {
shared_memory_tile[i_loc][tj] =
maybe_conj<T, conjugate>::run(input[input_index]);
input_index += input_increment;
}
} else {
if (tj < tile_width) {
for (int i_loc = ti; i_loc < (tile_height); i_loc += ReadRowPerPass) {
shared_memory_tile[i_loc][tj] =
maybe_conj<T, conjugate>::run(input[input_index]);
input_index += input_increment;
}
}
}
}
__syncthreads();
Index<3> output_tile_index = {
input_tile_index[0],
input_tile_index[2],
input_tile_index[1],
input_tile_index[0], input_tile_index[2], input_tile_index[1],
};
Index<3> output_tile_origin = {
output_tile_index[0],
output_tile_index[1] * TileSize,
output_tile_index[2] * TileSize,
output_tile_index[0], output_tile_index[1] * TileSizeJ,
output_tile_index[2] * TileSizeI,
};
int output_origin_flat_index =
TensorIndexToFlat(output_tile_origin, output_dims);
int output_flat_index = output_origin_flat_index + x;
constexpr int out_effective_thread_num = NumThreads / TileSizeI * TileSizeI;
// Load the data from the shared memory tile to the output memory.
if (x < tile_height) {
int y_end = min(y_start + kSubTileSize, tile_width);
for (int y = y_start; y < y_end; y++) {
output[output_flat_index + y * output_dims[2]] = shared_memory_tile[x][y];
}
}
}
if (x < out_effective_thread_num) {
// Re-orient the logical thread block with respect to the output array.
// i.e. align the contiguous dimension of thread blocks with contiguous
// dimension of the output array.
int ti = x / TileSizeI;
int tj = x % TileSizeI;
int output_index = output_origin_flat_index + ti * output_dims[2] + tj;
int output_increment = WriteRowPerPass * output_dims[2];
// Use shared memory tiles to swap dimension-1 and dimension-2 of a 3D tensor
// when only one of the dimension sizes is smaller than 16,
// where dimensions are zero-based: output[i][j][k] = input[i][k][j].
//
// small_dim = the_smaller_dimension_size
// large_dim = the_larger_dimension_size
// tile_num_per_block = blockDim.x
// kTileLength = small_dim
//
// Each thread block operates on a single rectangle tile, where its width is
// kTileLength (we currently set it to 64) and its height is small_dim,
// We set the thread block's X dimension to be tile_num_per_block, and its Y
// and Z to be one.
template <typename T, int ShmemSize, bool SmallDim2, bool conjugate = false>
__global__ void SwapDimension1And2InTensor3SmallDim(const T* input,
int batch_per_block,
Dimension<3> input_dims,
T* output) {
// TODO(yangzihao) avoid share memory bank conflict.
__shared__ T shared_memory_tile[ShmemSize];
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
eigen_assert(gridDim.z == 1);
int block_offset = blockIdx.x * blockDim.x;
int x = threadIdx.x;
int tile_height = blockDim.x;
// Get tile height, width, and thread/block origin indices.
int small_dim = SmallDim2 ? input_dims[2] : input_dims[1];
int large_dim = SmallDim2 ? input_dims[1] : input_dims[2];
int global_offset = small_dim * large_dim * (blockIdx.y * batch_per_block) +
(SmallDim2 ? block_offset * small_dim : block_offset);
if (global_offset >= (input_dims[0] * input_dims[1] * input_dims[2])) return;
for (int batch = 0; batch < batch_per_block; ++batch) {
int block_origin_idx =
small_dim * large_dim * (blockIdx.y * batch_per_block + batch);
int thread_origin_idx =
block_origin_idx +
(SmallDim2 ? block_offset * small_dim : block_offset) + x;
if (block_offset + blockDim.x > large_dim) {
tile_height = large_dim - block_offset;
}
__syncthreads();
// Load a continuous memory region to shared memory tile.
if (x < tile_height) {
for (int y = 0; y < small_dim; y++) {
int shmem_index =
SmallDim2 ? (x + y * tile_height) : (x * small_dim + y);
shared_memory_tile[shmem_index] = maybe_conj<T, conjugate>::run(
ldg(input + thread_origin_idx +
y * (SmallDim2 ? tile_height : large_dim)));
if (full_tile) {
#pragma unroll
for (int i_loc = ti; i_loc < (TileSizeJ); i_loc += WriteRowPerPass) {
output[output_index] = shared_memory_tile[tj][i_loc];
output_index += output_increment;
}
}
__syncthreads();
// Get block origin index for output array.
int output_block_offset = block_origin_idx;
int output_block_idx = SmallDim2 ? block_offset : block_offset * small_dim;
int output_block_origin_idx = output_block_offset + output_block_idx;
// Store the transposed memory region in shared memory to device.
if (x < tile_height) {
for (int y = 0; y < small_dim; y++) {
int output_idx = output_block_origin_idx + x +
y * (SmallDim2 ? large_dim : tile_height);
int shmem_index =
SmallDim2 ? (x * small_dim + y) : (x + y * tile_height);
output[output_idx] = shared_memory_tile[shmem_index];
} else {
if (tj < tile_height) {
for (int i_loc = ti; i_loc < (tile_width); i_loc += WriteRowPerPass) {
output[output_index] = shared_memory_tile[tj][i_loc];
output_index += output_increment;
}
}
}
}
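The rewritten kernel above still computes the same permutation as its predecessor, output[i][j][k] = input[i][k][j]; only the tiling and thread mapping changed. A plain C++ reference for that contract, handy as a correctness oracle for the CUDA path (a hypothetical helper, not part of the commit):

```cpp
#include <vector>

// CPU reference for SwapDimension1And2InTensor3 on a row-major [d0, d1, d2]
// tensor: the output has shape [d0, d2, d1] and out[b][j][i] = in[b][i][j].
template <typename T>
std::vector<T> SwapDim1And2Ref(const std::vector<T>& in,
                               int d0, int d1, int d2) {
  std::vector<T> out(in.size());
  for (int b = 0; b < d0; ++b)
    for (int i = 0; i < d1; ++i)
      for (int j = 0; j < d2; ++j)
        out[(b * d2 + j) * d1 + i] = in[(b * d1 + i) * d2 + j];
  return out;
}
```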
@ -548,6 +502,380 @@ struct PadInput<GPUDevice, T, int, NDIMS> {
}
};
// We want std::equal_to and std::greater, but they're not constexpr until
// C++14.
struct EqualTo {
constexpr bool operator()(int a, int b) const { return a == b; }
};
struct GreaterThan {
constexpr bool operator()(int a, int b) const { return a > b; }
};
// For each data type, the tile size possibility frontier denotes the tile size
// combinations that consume the most computational resources constrained by
// - number of threads per SM limit,
// - limit on size of the short dimension (<=15) due to the definition of
// narrow matrix,
// - shared memory limit and
// - some experimentally determined, type-specific constraint on the product of
// two side lengths to increase grid-level parallelism.
//
// A tile size combination lies on the frontier if and only if one or more
// constraint mentioned above is hit. Tile size combinations lying outside this
// frontier are either not possible, or are slower than the alternatives.
//
// It is instrumental to consider, for each data type, two subsets of the
// corresponding frontier:
// - long side frontier: the union of the biggest tile size combination for
// each legal long side len.
// - non long side frontier: the frontier set minus the long side frontier.
//
// TileSizePossibilityFrontierCheck defines the frontier using only the long
// side frontier tile size combinations (since one can easily extrapolate
// the entire frontier from this subset). It serves as a utility function
// to help us determine where a tile size combination of interest lies with
// respect to the frontier.
template <typename Op>
constexpr bool TileSizePossibilityFrontierCheck(int TileLongSide,
int TileShortSide,
int size_of_t, Op op) {
// clang-format off
return (size_of_t == 16 && ((TileLongSide == 32 && op(TileShortSide, 4)) ||
(TileLongSide == 64 && op(TileShortSide, 4)) ||
(TileLongSide == 128 && op(TileShortSide, 4)) ||
(TileLongSide == 256 && op(TileShortSide, 2)))) ||
(size_of_t == 8 && ((TileLongSide == 32 && op(TileShortSide, 15)) ||
(TileLongSide == 64 && op(TileShortSide, 15)) ||
(TileLongSide == 128 && op(TileShortSide, 8)) ||
(TileLongSide == 256 && op(TileShortSide, 4)) ||
(TileLongSide == 512 && op(TileShortSide, 2)))) ||
(size_of_t == 4 && ((TileLongSide == 32 && op(TileShortSide, 15)) ||
(TileLongSide == 64 && op(TileShortSide, 15)) ||
(TileLongSide == 128 && op(TileShortSide, 15)) ||
(TileLongSide == 256 && op(TileShortSide, 8)) ||
(TileLongSide == 512 && op(TileShortSide, 4)) ||
(TileLongSide == 1024 && op(TileShortSide, 2)))) ||
(size_of_t == 2 && ((TileLongSide == 32 && op(TileShortSide, 15)) ||
(TileLongSide == 64 && op(TileShortSide, 15)) ||
(TileLongSide == 128 && op(TileShortSide, 15)) ||
(TileLongSide == 256 && op(TileShortSide, 8)) ||
(TileLongSide == 512 && op(TileShortSide, 4)) ||
(TileLongSide == 1024 && op(TileShortSide, 2)))) ||
(size_of_t == 1 && ((TileLongSide == 32 && op(TileShortSide, 15)) ||
(TileLongSide == 64 && op(TileShortSide, 15)) ||
(TileLongSide == 128 && op(TileShortSide, 15)) ||
(TileLongSide == 256 && op(TileShortSide, 8)) ||
(TileLongSide == 512 && op(TileShortSide, 4)) ||
(TileLongSide == 1024 && op(TileShortSide, 2))));
// clang-format on
}
constexpr bool TileSizeOnLongSideFrontier(int TileLongSide, int TileShortSide,
int size_of_t) {
return TileSizePossibilityFrontierCheck(TileLongSide, TileShortSide,
size_of_t, EqualTo());
}
constexpr bool TileSizeOutsideFrontier(int TileLongSide, int TileShortSide,
int size_of_t) {
return TileSizePossibilityFrontierCheck(TileLongSide, TileShortSide,
size_of_t, GreaterThan());
}
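To make the frontier concrete for 4-byte elements (e.g. float): per the table above, a long side of 256 pairs with a maximum short side of 8, so (256, 8) lies on the long-side frontier while (256, 9) falls outside it. Because the predicates are constexpr, this reading can be checked at compile time (illustrative assertions, not part of the commit):

```cpp
static_assert(TileSizeOnLongSideFrontier(256, 8, /*size_of_t=*/4),
              "(256, 8) is the largest 4-byte tile with long side 256");
static_assert(TileSizeOutsideFrontier(256, 9, /*size_of_t=*/4),
              "(256, 9) exceeds the 4-byte frontier");
```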
constexpr bool TileSizeOnNonLongSideFrontier(int TileLongSide,
int TileShortSide, int size_of_t) {
// For a tile size combination (longside, shortside), lying on the frontier
// implies that (longside, shortside) is on or within the frontier but
// (longside*2, shortside) or (longside, shortside+1) is not. With the above
// criterion, we simply need to use !TileSizeOnLongSideFrontier to ensure that
// it is not on the long side frontier.
return !TileSizeOutsideFrontier(TileLongSide, TileShortSide, size_of_t) &&
(TileSizeOutsideFrontier(TileLongSide * 2, TileShortSide, size_of_t) ||
TileSizeOutsideFrontier(TileLongSide, TileShortSide + 1,
size_of_t)) &&
!TileSizeOnLongSideFrontier(TileLongSide, TileShortSide, size_of_t);
}
// Helper function to launch a batch narrow matrix transpose kernel.
template <typename T, int TileLongSide, int TileShortSide>
void LaunchBatchNarrowMatrixTransposeKernel(
const GPUDevice& d, int tile_size_i, int tile_size_j, int total_tiles_count,
const T* input, const Dimension<3>& input_dims, T* output) {
constexpr int NumThreads = TileLongSide;
if (tile_size_i <= TileLongSide && tile_size_j <= TileShortSide) {
SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileLongSide,
TileShortSide>
<<<total_tiles_count, NumThreads, 0, d.stream()>>>(input, input_dims,
output);
} else {
SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileShortSide,
TileLongSide>
<<<total_tiles_count, NumThreads, 0, d.stream()>>>(input, input_dims,
output);
}
}
// Recursive template function to search, in a trial-and-error manner, for the
// minimum tile size configuration satisfying the requested tile side lengths.
// An important invariant of this search procedure is that for an unsatisfied
// request, we always try doubling the long side len first, and only after
// the request is satisfied for the long side len do we begin incrementing
// the short side len.
//
// We have three specializations of this search function depending on where the
// current tile size combination lies with respect to the frontier.
// - It lies within the frontier. If request is not satisfied, for the next tile
// size combination, we first try doubling the long side len and if that does
// not work, we then increment the short side len.
// - It lies on the non long side frontier. If the request is not satisfied, we
// can only increment the short side len.
// - It lies on the long side frontier. We launch the kernel without checking if
// the request is satisfied or not.
template <typename T, int TileLongSide, int TileShortSide,
typename dummy = void>
struct BatchNarrowMatrixTransposeDispatcher {
static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j,
int total_tiles_count, const T* input,
const Dimension<3>& input_dims, T* output) {
static_assert(
(TileLongSide & (TileLongSide - 1)) == 0,
"The length of the longer side of the tile is always a power of 2.");
bool request_satisfied = max(tile_size_i, tile_size_j) <= TileLongSide &&
min(tile_size_i, tile_size_j) <= TileShortSide;
if (request_satisfied) {
LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
output);
return;
}
// If the execution reaches here, then the kernel was not launched; we then
// determine whether it is the long side or the short side that falls short
// of the request and increase that parameter accordingly.
const bool long_side_request_not_satisfied =
max(tile_size_i, tile_size_j) > TileLongSide;
if (long_side_request_not_satisfied) {
BatchNarrowMatrixTransposeDispatcher<
T, TileLongSide * 2, TileShortSide>::DoIt(d, tile_size_i, tile_size_j,
total_tiles_count, input,
input_dims, output);
} else {
BatchNarrowMatrixTransposeDispatcher<
T, TileLongSide, TileShortSide + 1>::DoIt(d, tile_size_i, tile_size_j,
total_tiles_count, input,
input_dims, output);
}
}
};
template <typename T, int TileLongSide, int TileShortSide>
struct BatchNarrowMatrixTransposeDispatcher<
T, TileLongSide, TileShortSide,
typename std::enable_if<TileSizeOnNonLongSideFrontier(
TileLongSide, TileShortSide, sizeof(T)),
void>::type> {
static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j,
int total_tiles_count, const T* input,
const Dimension<3>& input_dims, T* output) {
static_assert(
(TileLongSide & (TileLongSide - 1)) == 0,
"The length of the longer side of the tile is always a power of 2.");
bool request_satisfied = max(tile_size_i, tile_size_j) <= TileLongSide &&
min(tile_size_i, tile_size_j) <= TileShortSide;
if (request_satisfied) {
LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
output);
return;
}
// If the execution reaches here, then the kernel was not launched; since
// we are on the non long side frontier, we increment the short dimension
// and try again.
BatchNarrowMatrixTransposeDispatcher<
T, TileLongSide, TileShortSide + 1>::DoIt(d, tile_size_i, tile_size_j,
total_tiles_count, input,
input_dims, output);
}
};
template <typename T, int TileLongSide, int TileShortSide>
struct BatchNarrowMatrixTransposeDispatcher<
T, TileLongSide, TileShortSide,
typename std::enable_if<TileSizeOnLongSideFrontier(
TileLongSide, TileShortSide, sizeof(T)),
void>::type> {
static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j,
int total_tiles_count, const T* input,
const Dimension<3>& input_dims, T* output) {
static_assert(
(TileLongSide & (TileLongSide - 1)) == 0,
"The length of the longer side of the tile is always a power of 2.");
LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
output);
}
};
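Taken together, the three specializations perform a compile-time search over
tile size combinations. Below is a minimal runtime analogue of that search
order, as a hedged sketch only: the real dispatch recurses over template
parameters, and the (1024, 15) bounds are assumptions mirroring the constants
in GetTileSizesFrontier below.

```
#include <algorithm>
#include <utility>

// Walks the search order described above: starting from tile (32, 2),
// double the long side until it covers the request, then grow the short
// side; at the frontier corner, stop and launch with what we have.
std::pair<int, int> SearchTileSizeSketch(int tile_size_i, int tile_size_j) {
  int long_side = 32;
  int short_side = 2;
  const int kMaxLongSide = 1024;  // assumed frontier bound
  const int kMaxShortSide = 15;   // assumed frontier bound
  while (!(std::max(tile_size_i, tile_size_j) <= long_side &&
           std::min(tile_size_i, tile_size_j) <= short_side)) {
    if (std::max(tile_size_i, tile_size_j) > long_side &&
        long_side < kMaxLongSide) {
      long_side *= 2;  // within the frontier: try the long side first
    } else if (short_side < kMaxShortSide) {
      ++short_side;    // non-long-side frontier: grow the short side
    } else {
      break;           // long-side frontier: launch unconditionally
    }
  }
  return {long_side, short_side};
}
```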
// This function tries to recover, in a brute force way, the frontier defined in
// TileSizePossibilityFrontierCheck as a vector of tile size combinations lying
// on the long side frontier. This vector is sufficient to determine the entire
// frontier.
//
// Note that if one changes the frontier definition in
// TileSizePossibilityFrontierCheck and forgets to ensure that the largest
// legal long side len has a corresponding short side len of 2, this function
// will fail and crash the program.
template <int SizeOfT>
const std::vector<std::pair<int, int>>& GetTileSizesFrontier() {
static_assert(
SizeOfT <= 16,
"Currently, only data types of sizes 16 bytes or less are supported.");
static_assert((SizeOfT & (SizeOfT - 1)) == 0,
"Data types must have sizes that are powers of 2.");
// Expensive work to populate sizes, lazily run in a thread-safe
// manner the first time GetTileSizesFrontier<N> is called.
static auto* frontier = [] {
auto* frontier = new std::vector<std::pair<int, int>>();
const int kMaxLongSideLen = 1024;
const int kMaxShortSideLen = 15;
for (int long_side = 32; long_side <= kMaxLongSideLen; long_side *= 2) {
for (int short_side = 2; short_side <= kMaxShortSideLen;
short_side += 1) {
if (TileSizeOnLongSideFrontier(long_side, short_side, SizeOfT)) {
// The current combination lies on the frontier, thus we
// add it to the frontier definition.
frontier->push_back(std::make_pair(long_side, short_side));
// The long side length is the largest one allowed iff its
// corresponding short side length is 2.
if (short_side == 2) return frontier;
// We have exhausted all the possibilities in the frontier
// with the given long side length.
break;
}
}
}
LOG(FATAL)
<< "The corresponding short side length of the largest long side "
"length has to be 2.";
}();
return *frontier;
}
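The `static auto* frontier = [] { ... }();` initializer above relies on the
C++11 guarantee that a function-local static is constructed exactly once, even
when several threads make the first call concurrently. A standalone sketch of
the same idiom:

```
#include <vector>

// The lambda runs once, on first call, under the compiler-provided
// initialization guard; every later call returns the cached vector.
const std::vector<int>& GetSquares() {
  static auto* squares = [] {
    auto* v = new std::vector<int>();
    for (int i = 1; i <= 10; ++i) v->push_back(i * i);
    return v;
  }();
  return *squares;
}
```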
// Helper structs that determine which data type to use, given the size of
// the matrix element type. A transpose of elements of size N will use a
// kernel which operates on an array of TransposeElemType<N>::type.
template <int ElemBytes>
struct TransposeElemType;
template <>
struct TransposeElemType<1> {
using type = uint8;
};
template <>
struct TransposeElemType<2> {
using type = uint16;
};
template <>
struct TransposeElemType<4> {
using type = uint32;
};
template <>
struct TransposeElemType<8> {
using type = uint64;
};
template <>
struct TransposeElemType<16> {
using type = float4;
};
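For example (a hedged sketch that assumes the specializations above are in
scope; `uint64` is TensorFlow's alias for a 64-bit unsigned integer), a
transpose of 8-byte elements such as `double` operates on `uint64` values of
identical size:

```
// Pick the bit-compatible unsigned type for an 8-byte element.
using Elem = typename TransposeElemType<sizeof(double)>::type;  // uint64
static_assert(sizeof(Elem) == sizeof(double),
              "transpose element type must match the data size");
```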
// A helper function to make RunSwapDimension1And2InTensor3 concise. This
// helper function looks at the data type and input matrix sizes and decides
// the thread numbers and tile sizes to use.
template <typename T, bool conjugate = false>
void SwapDimension1And2InTensor3WithNarrowMatrices(
const GPUDevice& d, const T* input, const Dimension<3>& input_dims,
T* output, const int kMinDimensionToUseTiles) {
// Get available tile sizes here for the data type requested:
const auto& tile_spec = GetTileSizesFrontier<sizeof(T)>();
int tile_long_side_len = 0;
int tile_short_side_len = 0;
float lowest_cost = std::numeric_limits<float>::max();
int data_long_side = max(input_dims[1], input_dims[2]);
for (auto tile_size_pair : tile_spec) {
int proposed_tile_long_side_len = tile_size_pair.first;
// Number of threads that will not be doing anything useful when reading
// the matrix because the thread block size is bigger than the data block
// size.
int num_wasted_threads =
data_long_side - MathUtil::FloorOfRatio<int>(
data_long_side, proposed_tile_long_side_len) *
proposed_tile_long_side_len;
int num_full_tiles = MathUtil::FloorOfRatio<int>(
data_long_side, proposed_tile_long_side_len);
float cost = 0;
    // However, if we can execute two or more full tiles, then we gladly
    // accept any number of wasted threads and ignore their cost.
if (num_full_tiles <= 1) cost = num_wasted_threads;
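    // Worked example (illustrative): with data_long_side = 100, a proposed
    // long side of 64 gives floor(100 / 64) = 1 full tile and a cost of
    // 100 - 1 * 64 = 36, while a proposed long side of 32 gives 3 full
    // tiles, so its wasted threads are ignored and the cost stays 0.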
// Using less than or equal to here because given the same cost, we
// would like to launch as many threads as possible.
if (cost <= lowest_cost) {
tile_long_side_len = proposed_tile_long_side_len;
tile_short_side_len = tile_size_pair.second;
lowest_cost = cost;
}
}
// Request tile sizes such that the longer side of threadblock aligns with
// the longer side of input data block to maximize read throughput.
// The ideal tile shape is one where the length of the shorter side of the
// tile is equal to the length of the shorter side of the input matrix.
int requested_tile_size_i = input_dims[1] >= kMinDimensionToUseTiles
? tile_long_side_len
: input_dims[1];
int requested_tile_size_j = input_dims[1] >= kMinDimensionToUseTiles
? input_dims[2]
: tile_long_side_len;
// Truncate the shorter size requested according to the manual limit set in
// tile_spec to make sure that we do not launch configurations violating
// hardware limits.
requested_tile_size_i = requested_tile_size_i == tile_long_side_len
? tile_long_side_len
: min(requested_tile_size_i, tile_short_side_len);
requested_tile_size_j = requested_tile_size_j == tile_long_side_len
? tile_long_side_len
: min(requested_tile_size_j, tile_short_side_len);
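  // Worked example (illustrative): for a 300 x 5 input with
  // tile_long_side_len = 128 and tile_short_side_len = 4, we request
  // (128, 5) above and then truncate the short side to min(5, 4) = 4,
  // yielding a (128, 4) tile request.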
Dimension<3> input_dims_in_tiles = {
input_dims[0],
MathUtil::CeilOfRatio<int>(input_dims[1], requested_tile_size_i),
MathUtil::CeilOfRatio<int>(input_dims[2], requested_tile_size_j),
};
int total_tiles_count =
input_dims_in_tiles[0] * input_dims_in_tiles[1] * input_dims_in_tiles[2];
using ElemType = typename TransposeElemType<sizeof(T)>::type;
static_assert(alignof(T) >= alignof(ElemType), "Unexpected data alignment.");
BatchNarrowMatrixTransposeDispatcher<ElemType, 32, 2>::DoIt(
d, requested_tile_size_i, requested_tile_size_j, total_tiles_count,
reinterpret_cast<const ElemType*>(input), input_dims,
reinterpret_cast<ElemType*>(output));
}
// Launch the GPU kernel that would swap dimension-1 and dimension-2 in a
// 3D tensor. It looks at the shape of the incoming data, and decides the best
// strategy to launch.
@ -558,60 +886,33 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
// If one dimension is trivial, use SmallDim kernel for swapping.
// Otherwise, the trivial swapping relying on the ldg cache is more efficient.
static const int kMinDimensionToUseTiles = 16;
bool use_tiles = (input_dims[1] >= kMinDimensionToUseTiles &&
input_dims[2] >= kMinDimensionToUseTiles);
bool use_small_dim = ((input_dims[1] >= kMinDimensionToUseTiles &&
input_dims[2] < kMinDimensionToUseTiles)) ||
((input_dims[1] < kMinDimensionToUseTiles &&
input_dims[2] >= kMinDimensionToUseTiles));
static const int NumSubTiles = 8;
static const int kMinDimensionToUseRectTiles = 96;
if (use_tiles) {
static const int TileSize = 32;
Dimension<3> input_dims_in_tiles = {
input_dims[0],
(input_dims[1] + TileSize - 1) / TileSize,
(input_dims[2] + TileSize - 1) / TileSize,
};
int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
input_dims_in_tiles[2];
// We get best performance when TileSize is the number of threads in a warp
bool large_matrix = input_dims[1] >= kMinDimensionToUseTiles &&
input_dims[2] >= kMinDimensionToUseTiles;
bool narrow_matrix = input_dims[1] >= kMinDimensionToUseRectTiles ||
input_dims[2] >= kMinDimensionToUseRectTiles;
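  // For example (illustrative): a 64 x 64 slice takes the large-matrix path,
  // a 256 x 4 slice takes the narrow-matrix path, and an 8 x 8 slice falls
  // through to the element-wise kernel below.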
if (large_matrix) {
// We get best performance when kTileSize is the number of threads in a warp
// (32 on our GPUs) and NumSubTiles is 8, so our block size is 8 * 32 = 256
// threads.
SwapDimension1And2InTensor3UsingTiles<T, TileSize, NumSubTiles, conjugate>
<<<total_tiles_count, dim3(TileSize, NumSubTiles), 0, d.stream()>>>(
input, input_dims, output);
} else if (use_small_dim) {
// When only one of the dimensions is smaller than kMinDimensionToUseTiles,
// we use one block to process a rectangle region with the size of
// kTileLength * small_dim. We found that when set kTileLength to 64 on
// TitanX Maxwell GPU, it achieves the best performance.
// large_dim
// +---------------...--------+
// | | | |
// small_dim | | ... | |
// | | | |
// +--------------...---------+
// \----- ------/ \- -/
// V V
// kTileLength(tile_height) tile_height
static const int kTileLength = 64;
static const int kGridDimY = 65535;
int large_dim = std::max(input_dims[2], input_dims[1]);
int tile_num_per_block = (large_dim + kTileLength - 1) / kTileLength;
int grid_dim_y = std::min(input_dims[0], kGridDimY);
int batch_per_block = (input_dims[0] + grid_dim_y - 1) / grid_dim_y;
if (input_dims[2] < input_dims[1]) {
SwapDimension1And2InTensor3SmallDim<
T, kTileLength * kMinDimensionToUseTiles, true, conjugate>
<<<dim3(tile_num_per_block, grid_dim_y), kTileLength, 0,
d.stream()>>>(input, batch_per_block, input_dims, output);
} else {
SwapDimension1And2InTensor3SmallDim<
T, kTileLength * kMinDimensionToUseTiles, false, conjugate>
<<<dim3(tile_num_per_block, grid_dim_y), kTileLength, 0,
d.stream()>>>(input, batch_per_block, input_dims, output);
}
constexpr int kTileSize = 32;
constexpr int kNumThreads = 256;
Dimension<3> input_dims_in_tiles = {
input_dims[0], MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize),
MathUtil::CeilOfRatio<int>(input_dims[2], kTileSize),
};
int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
input_dims_in_tiles[2];
    SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize,
                                          conjugate>
        <<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims,
                                                            output);
} else if (narrow_matrix) {
    SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(
        d, input, input_dims, output, kMinDimensionToUseTiles);
} else {
int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);

View File

@ -112,7 +112,7 @@ class GraphDefBuilderWrapper {
Status AddTensor(const Tensor& val, Node** output) {
AddTensorInternal(val, output);
if (*output == nullptr) {
return errors::Internal("AddTesor: Failed to build Const op.");
return errors::Internal("AddTensor: Failed to build Const op.");
}
return Status::OK();
}

View File

@ -19,6 +19,7 @@ limitations under the License.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/variant_encode_decode.h"
@ -74,6 +75,7 @@ DEFINE_SETZERO_SYCL(int32);
DEFINE_SETZERO_SYCL(int64);
#undef DEFINE_SETZERO_SYCL
#endif // TENSORFLOW_USE_SYCL
template <typename T>
void SetOneFunctor<Eigen::ThreadPoolDevice, T>::operator()(
const Eigen::ThreadPoolDevice& d, typename TTypes<T>::Flat out) {
@ -112,5 +114,47 @@ DEFINE_SETONE_SYCL(double);
#undef DEFINE_SETONE_SYCL
#endif // TENSORFLOW_USE_SYCL
template <typename T>
struct FillFunctor<Eigen::ThreadPoolDevice, T> {
void operator()(const Eigen::ThreadPoolDevice& d, typename TTypes<T>::Flat out,
typename TTypes<T>::ConstScalar in) {
out.device(d) = out.constant(in());
}
};
// Explicit instantiations.
#define DEFINE_FILL_CPU(T) \
template struct FillFunctor<Eigen::ThreadPoolDevice, T>;
TF_CALL_ALL_TYPES(DEFINE_FILL_CPU);
DEFINE_FILL_CPU(quint8);
DEFINE_FILL_CPU(quint16);
#undef DEFINE_FILL_CPU
#ifdef TENSORFLOW_USE_SYCL
template <typename T>
struct FillFunctor<Eigen::SyclDevice, T> {
void operator()(const Eigen::SyclDevice& d, typename TTypes<T>::Flat out,
typename TTypes<T>::ConstScalar in) {
#if !defined(EIGEN_HAS_INDEX_LIST)
Eigen::array<int, 1> rank1{1};
#else
Eigen::IndexList<Eigen::type2index<1> > rank1;
#endif
const int size = out.dimension(0);
Eigen::array<int, 1> broadcast_dims{size};
To32Bit(out).device(d) = in.reshape(rank1).broadcast(broadcast_dims);
}
};
#define DEFINE_FILL_SYCL(T) \
template struct FillFunctor<Eigen::SyclDevice, T>;
DEFINE_FILL_SYCL(float);
DEFINE_FILL_SYCL(double);
TF_CALL_INTEGRAL_TYPES(DEFINE_FILL_SYCL)
#undef DEFINE_FILL_SYCL
#endif // TENSORFLOW_USE_SYCL
} // namespace functor
} // namespace tensorflow

View File

@ -0,0 +1,115 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/fill_functor.h"
#include "tensorflow/core/platform/types.h"
namespace Eigen {
namespace internal {
template <typename T>
struct scalar_const_op {
typedef typename packet_traits<T>::type Packet;
const T* val;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
scalar_const_op(const scalar_const_op& x)
: val(x.val) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_const_op(const T* v) : val(v) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()() const {
return *val;
}
template <typename PacketType = Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp() const {
return internal::pset1<PacketType>(*val);
}
};
template <typename T>
struct functor_traits<scalar_const_op<T> > {
enum {
Cost = 1,
PacketAccess = packet_traits<T>::Vectorizable,
IsRepeatable = true
};
};
} // end namespace internal
} // end namespace Eigen
namespace tensorflow {
namespace functor {
typedef Eigen::GpuDevice GPUDevice;
// Partial specialization FillFunctor<Device=GPUDevice, T>
template <typename T>
struct FillFunctor<GPUDevice, T> {
void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
typename TTypes<T>::ConstScalar in) {
Eigen::internal::scalar_const_op<T> f(in.data());
To32Bit(out).device(d) = To32Bit(out).nullaryExpr(f);
}
};
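A CPU-side sketch of the same fill-via-nullaryExpr pattern (hedged: plain
Eigen Tensor, no device involved; `ConstOf` is a hypothetical stand-in for
`scalar_const_op`, which captures the scalar by value instead of by pointer):

```
#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

// A nullary functor returning a captured scalar; the nullary expression
// invokes it once per output coefficient.
struct ConstOf {
  float val;
  float operator()() const { return val; }
};

int main() {
  Eigen::Tensor<float, 1> out(8);
  out = out.nullaryExpr(ConstOf{3.5f});
  std::cout << out << std::endl;  // eight coefficients, all 3.5
}
```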
#define DEFINE_FILL_GPU(T) template struct FillFunctor<GPUDevice, T>;
TF_CALL_REAL_NUMBER_TYPES(DEFINE_FILL_GPU);
TF_CALL_bfloat16(DEFINE_FILL_GPU);
TF_CALL_bool(DEFINE_FILL_GPU);
#undef DEFINE_FILL_GPU
// Partial specialization of FillFunctor<Device=GPUDevice, T>.
template <typename T>
struct SetZeroFunctor<GPUDevice, T> {
void operator()(const GPUDevice& d, typename TTypes<T>::Flat out) {
To32Bit(out).device(d) = To32Bit(out).constant(T(0));
}
};
#define DEFINE_SETZERO_GPU(T) template struct SetZeroFunctor<GPUDevice, T>;
TF_CALL_NUMBER_TYPES(DEFINE_SETZERO_GPU);
TF_CALL_bfloat16(DEFINE_SETZERO_GPU);
TF_CALL_bool(DEFINE_SETZERO_GPU);
#undef DEFINE_SETZERO_GPU
// Partial specialization of FillFunctor<Device=GPUDevice, T>.
template <typename T>
struct SetOneFunctor<GPUDevice, T> {
void operator()(const GPUDevice& d, typename TTypes<T>::Flat out) {
To32Bit(out).device(d) = To32Bit(out).constant(T(1));
}
};
#define DEFINE_SETONE_GPU(T) template struct SetOneFunctor<GPUDevice, T>;
TF_CALL_NUMBER_TYPES(DEFINE_SETONE_GPU);
TF_CALL_bfloat16(DEFINE_SETONE_GPU);
TF_CALL_bool(DEFINE_SETONE_GPU);
#undef DEFINE_SETONE_GPU
} // end namespace functor
} // end namespace tensorflow
#endif // GOOGLE_CUDA

View File

@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/fill_functor.h"
#include "tensorflow/core/kernels/fused_batch_norm_op.h"
#include "tensorflow/core/util/tensor_format.h"
@ -239,6 +240,14 @@ struct FusedBatchNorm<GPUDevice, T, U> {
<< " offset shape: " << offset.shape().DebugString()
<< " tensor format: " << tensor_format;
// If input is empty, return NaN mean/variance
if (x.shape().num_elements() == 0) {
functor::SetNanFunctor<U> f;
f(context->eigen_device<GPUDevice>(), batch_mean->flat<U>());
f(context->eigen_device<GPUDevice>(), batch_var->flat<U>());
return;
}
Tensor x_maybe_transformed = x;
Tensor x_transformed;
Tensor y_transformed;
@ -656,6 +665,14 @@ class FusedBatchNormGradOp : public OpKernel {
context, context->allocate_output(4, TensorShape({}), &placeholder_2));
FillZeros<Device>(placeholder_2);
// If input is empty, set gradients w.r.t scale/offset to zero.
if (x.shape().num_elements() == 0) {
functor::SetZeroFunctor<Device, U> f;
f(context->eigen_device<Device>(), scale_backprop->flat<U>());
f(context->eigen_device<Device>(), offset_backprop->flat<U>());
return;
}
if (is_training_) {
functor::FusedBatchNormGrad<Device, T, U>()(
context, y_backprop, x, scale, saved_mean_or_pop_mean,

View File

@ -65,8 +65,15 @@ void InvVarianceToVariance<T>::operator()(const Eigen::GpuDevice& d,
epsilon, sample_size, variance);
}
template <class T>
void SetNanFunctor<T>::operator()(const Eigen::GpuDevice& d,
typename TTypes<T>::Flat out) {
To32Bit(out).device(d) = To32Bit(out).constant(Eigen::NumTraits<T>::quiet_NaN());
}
template class VarianceToInvVariance<float>;
template class InvVarianceToVariance<float>;
template class SetNanFunctor<float>;
} // namespace functor
} // namespace tensorflow

View File

@ -49,6 +49,12 @@ struct InvVarianceToVariance {
int channels, T* variance);
};
// This function sets a GPU tensor to NaNs.
template <class T>
struct SetNanFunctor {
void operator()(const Eigen::GpuDevice& d, typename TTypes<T>::Flat out);
};
#endif // GOOGLE_CUDA
// Functor used by FusedBatchNormGradOp to do the computations when

View File

@ -210,7 +210,7 @@ class MatrixInverseOpGpu : public AsyncOpKernel {
done);
}
} else {
// For large matrices, we wompute the inverse of each matrix in the batch
// For large matrices, we compute the inverse of each matrix in the batch
// sequentially. Here we use the cuSolver methods GETRF/GETRS because they
// are MUCH faster than their batched cuBlas equivalents for large
// matrices.

View File

@ -147,6 +147,9 @@ void DnnPoolingOp<T>::Compute(
Tensor* tensor_out = nullptr;
OP_REQUIRES_OK(context,
context->allocate_output(0, tensor_out_shape, &tensor_out));
if (tensor_in.shape().num_elements() == 0) {
return;
}
PoolParameters params{context, size, stride,
padding, data_format, tensor_in.shape()};
@ -247,6 +250,9 @@ void DnnPoolingGradOp<T>::Compute(
Tensor* input_backprop = nullptr;
OP_REQUIRES_OK(context,
context->allocate_output(0, tensor_in_shape, &input_backprop));
if (tensor_in_shape.num_elements() == 0) {
return;
}
PoolParameters params{context, size, stride,
padding, data_format, tensor_in_shape};

View File

@ -38,6 +38,7 @@ class RecordInputOp : public OpKernel {
GETATTR(int64, batch_size);
GETATTR(string, compression_type);
#undef GETATTR
OP_REQUIRES_OK(ctx, ctx->GetAttr("compression_type", &compression_type));
RecordYielder::Options yopts;
yopts.file_pattern = file_pattern;

View File

@ -131,7 +131,7 @@ class WhereCPUOp : public OpKernel {
OP_REQUIRES(
context, input.dtype() != DT_HALF,
errors::Unimplemented("No WhereOp available for float16/half type on "
"GPU; dying in CPU WhereOp to avoid silently "
"CPU; dying in CPU WhereOp to avoid silently "
"creating costly copies from device."));
const int input_dims = input.dims();

View File

@ -38,7 +38,7 @@ computation is performed on the underlying representation of x.
.Output("z: T") \
.SetIsCommutative() \
.Attr("T: {int8, int16, int32, int64, uint8, uint16, uint32, uint64}") \
.SetShapeFn(shape_inference::UnchangedShape)
.SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
REGISTER_OP("PopulationCount")
.Input("x: T")

View File

@ -114,6 +114,8 @@ int64 LogLevelStrToInt(const char* tf_env_var_val) {
return level;
}
} // namespace
int64 MinLogLevelFromEnv() {
const char* tf_env_var_val = getenv("TF_CPP_MIN_LOG_LEVEL");
return LogLevelStrToInt(tf_env_var_val);
@ -124,8 +126,6 @@ int64 MinVLogLevelFromEnv() {
return LogLevelStrToInt(tf_env_var_val);
}
} // namespace
LogMessage::~LogMessage() {
// Read the min log level once during the first call to logging.
static int64 min_log_level = MinLogLevelFromEnv();

View File

@ -305,6 +305,10 @@ T&& CheckNotNull(const char* file, int line, const char* exprtext, T&& t) {
return std::forward<T>(t);
}
int64 MinLogLevelFromEnv();
int64 MinVLogLevelFromEnv();
} // namespace internal
} // namespace tensorflow

View File

@ -28,6 +28,8 @@ filegroup(
tf_cc_binary(
name = "s3_file_system.so",
srcs = [
"aws_logging.cc",
"aws_logging.h",
"s3_crypto.cc",
"s3_crypto.h",
"s3_file_system.cc",
@ -66,6 +68,22 @@ cc_library(
alwayslink = 1,
)
cc_library(
name = "aws_logging",
srcs = [
"aws_logging.cc",
],
hdrs = [
"aws_logging.h",
],
deps = [
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"@aws//:aws",
],
alwayslink = 1,
)
cc_library(
name = "s3_file_system",
srcs = [
@ -75,6 +93,7 @@ cc_library(
"s3_file_system.h",
],
deps = [
":aws_logging",
":s3_crypto",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",

View File

@ -0,0 +1,121 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/platform/s3/aws_logging.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
#include <aws/core/Aws.h>
#include <aws/core/utils/logging/AWSLogging.h>
#include <aws/core/utils/logging/LogSystemInterface.h>
#include <cstdarg>
namespace tensorflow {
AWSLogSystem::AWSLogSystem(Aws::Utils::Logging::LogLevel log_level)
: log_level_(log_level) {}
void AWSLogSystem::Log(Aws::Utils::Logging::LogLevel log_level, const char* tag,
const char* format, ...) {
std::va_list args;
va_start(args, format);
  // A va_list must go through the va_list-taking variant; the variadic
  // strings::Printf would format the pointer itself.
  string s;
  strings::Appendv(&s, format, args);
va_end(args);
LogMessage(log_level, s);
}
void AWSLogSystem::LogStream(Aws::Utils::Logging::LogLevel log_level,
const char* tag,
const Aws::OStringStream& message_stream) {
LogMessage(log_level, message_stream.rdbuf()->str().c_str());
}
void AWSLogSystem::LogMessage(Aws::Utils::Logging::LogLevel log_level,
const std::string& message) {
switch (log_level) {
case Aws::Utils::Logging::LogLevel::Info:
LOG(INFO) << message;
break;
case Aws::Utils::Logging::LogLevel::Warn:
LOG(WARNING) << message;
break;
case Aws::Utils::Logging::LogLevel::Error:
LOG(ERROR) << message;
break;
case Aws::Utils::Logging::LogLevel::Fatal:
LOG(FATAL) << message;
break;
default:
LOG(ERROR) << message;
break;
}
}
namespace {
static const char* kAWSLoggingTag = "AWSLogging";
Aws::Utils::Logging::LogLevel ParseLogLevelFromEnv() {
Aws::Utils::Logging::LogLevel log_level = Aws::Utils::Logging::LogLevel::Info;
const int64_t level = tensorflow::internal::MinLogLevelFromEnv();
switch (level) {
case INFO:
log_level = Aws::Utils::Logging::LogLevel::Info;
break;
case WARNING:
log_level = Aws::Utils::Logging::LogLevel::Warn;
break;
case ERROR:
log_level = Aws::Utils::Logging::LogLevel::Error;
break;
case FATAL:
log_level = Aws::Utils::Logging::LogLevel::Fatal;
break;
default:
log_level = Aws::Utils::Logging::LogLevel::Info;
break;
}
return log_level;
}
}  // namespace
static bool initialized = false;
static mutex s3_logging_mutex(LINKER_INITIALIZED);
void AWSLogSystem::InitializeAWSLogging() {
std::lock_guard<mutex> s3_logging_lock(s3_logging_mutex);
if (!initialized) {
Aws::Utils::Logging::InitializeAWSLogging(
Aws::MakeShared<AWSLogSystem>(kAWSLoggingTag, ParseLogLevelFromEnv()));
initialized = true;
return;
}
}
void AWSLogSystem::ShutdownAWSLogging() {
std::lock_guard<mutex> s3_logging_lock(s3_logging_mutex);
if (initialized) {
Aws::Utils::Logging::ShutdownAWSLogging();
initialized = false;
return;
}
}
} // namespace tensorflow

View File

@ -0,0 +1,68 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_S3_S3_LOGGING_H_
#define TENSORFLOW_CONTRIB_S3_S3_LOGGING_H_
#include <atomic>
#include <string>
#include <aws/core/utils/logging/LogLevel.h>
#include <aws/core/utils/logging/LogSystemInterface.h>
#include "tensorflow/core/platform/default/logging.h"
namespace tensorflow {
class AWSLogSystem : public Aws::Utils::Logging::LogSystemInterface {
public:
static void InitializeAWSLogging();
static void ShutdownAWSLogging();
explicit AWSLogSystem(Aws::Utils::Logging::LogLevel log_level);
virtual ~AWSLogSystem() = default;
// Gets the currently configured log level.
virtual Aws::Utils::Logging::LogLevel GetLogLevel(void) const override {
return log_level_;
}
// Set a new log level. This has the immediate effect of changing the log.
void SetLogLevel(Aws::Utils::Logging::LogLevel log_level) {
log_level_.store(log_level);
}
// Does a printf style output to ProcessFormattedStatement. Don't use this,
// it's unsafe. See LogStream.
// Since non-static C++ methods have an implicit this argument,
// TF_PRINTF_ATTRIBUTE should be counted from two (vs. one).
virtual void Log(Aws::Utils::Logging::LogLevel log_level, const char* tag,
const char* format, ...) override TF_PRINTF_ATTRIBUTE(4, 5);
// Writes the stream to ProcessFormattedStatement.
virtual void LogStream(Aws::Utils::Logging::LogLevel log_level,
const char* tag,
const Aws::OStringStream& messageStream) override;
private:
void LogMessage(Aws::Utils::Logging::LogLevel log_level,
const string& message);
std::atomic<Aws::Utils::Logging::LogLevel> log_level_;
TF_DISALLOW_COPY_AND_ASSIGN(AWSLogSystem);
};
} // namespace tensorflow
#endif // TENSORFLOW_CONTRIB_S3_S3_LOGGING_H_

View File

@ -15,10 +15,13 @@ limitations under the License.
#include "tensorflow/core/platform/s3/s3_file_system.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/s3/aws_logging.h"
#include "tensorflow/core/platform/s3/s3_crypto.h"
#include <aws/core/Aws.h>
#include <aws/core/utils/FileSystemUtils.h>
#include <aws/core/utils/logging/AWSLogging.h>
#include <aws/core/utils/logging/LogSystemInterface.h>
#include <aws/s3/S3Client.h>
#include <aws/s3/S3Errors.h>
#include <aws/s3/model/CopyObjectRequest.h>
@ -33,6 +36,7 @@ limitations under the License.
namespace tensorflow {
namespace {
static const char* kS3FileSystemAllocationTag = "S3FileSystemAllocation";
static const size_t kS3ReadAppendableFileBufferSize = 1024 * 1024;
static const int kS3GetChildrenMaxKeys = 100;
@ -226,7 +230,11 @@ class S3ReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
uint64 length_;
};
} // namespace
S3FileSystem::S3FileSystem() {
AWSLogSystem::InitializeAWSLogging();
Aws::SDKOptions options;
options.cryptoOptions.sha256Factory_create_fn = []() {
return Aws::MakeShared<S3SHA256Factory>(S3CryptoAllocationTag);
@ -240,6 +248,8 @@ S3FileSystem::S3FileSystem() {
S3FileSystem::~S3FileSystem() {
Aws::SDKOptions options;
Aws::ShutdownAPI(options);
AWSLogSystem::ShutdownAWSLogging();
}
Status S3FileSystem::NewRandomAccessFile(

View File

@ -232,10 +232,10 @@ data and target values for the training set, respectively, and `test_set.data`
and `test_set.target` contain feature data and target values for the test set.
Later on, in
["Fit the DNNClassifier to the Iris Training Data,"](#fit-dnnclassifier)
["Fit the DNNClassifier to the Iris Training Data,"](#fit_the_dnnclassifier_to_the_iris_training_data)
you'll use `training_set.data` and
`training_set.target` to train your model, and in
["Evaluate Model Accuracy,"](#evaluate-accuracy) you'll use `test_set.data` and
["Evaluate Model Accuracy,"](#evaluate_model_accuracy) you'll use `test_set.data` and
`test_set.target`. But first, you'll construct your model in the next section.
## Construct a Deep Neural Network Classifier

View File

@ -177,7 +177,7 @@ dataset = dataset.batch(batch_size=FLAGS.batch_size)
to:
```
dataset = dataset.apply(tf.data.contrib.map_and_batch(
dataset = dataset.apply(tf.contrib.data.map_and_batch(
map_func=parse_fn, batch_size=FLAGS.batch_size))
```

View File

@ -300,7 +300,7 @@ functions, methods, and properties. We also adhere to the
[Google Python style guide](https://google.github.io/styleguide/pyguide.html).
The TensorFlow C++ code base adheres to the
[Google C++ style guide](http://google.github.io/styleguide/cppguide.html).
[Google C++ style guide](https://google.github.io/styleguide/cppguide.html).
(<sup>*</sup> With one exception: we use 2-space indentation instead of 4-space
indentation.)

View File

@ -487,7 +487,7 @@ subgraph inside.
![](../images/mnist_deep.png)
For more information about visualizing your TensorFlow application with
TensorBoard, see the [TensorBoard tutorial](TODO).
TensorBoard, see the [TensorBoard tutorial](../get_started/summaries_and_tensorboard.md).
## Programming with multiple graphs

View File

@ -246,7 +246,7 @@ results as in your server testing.
The demo app updates its UI list of results automatically based on the labels
text file you copy into assets alongside your frozen graph, which means you can
easily try out different models without needing to make any code changes. You
will need to updaye `LABEL_FILENAME` and `MODEL_FILENAME` to point to the files
will need to update `LABEL_FILENAME` and `MODEL_FILENAME` to point to the files
you've added if you change the paths though.
## How does this Model Work?

View File

@ -51,7 +51,7 @@ def read_tensor_from_image_file(file_name, input_height=299, input_width=299,
image_reader = tf.image.decode_jpeg(file_reader, channels = 3,
name='jpeg_reader')
float_caster = tf.cast(image_reader, tf.float32)
dims_expander = tf.expand_dims(float_caster, 0);
dims_expander = tf.expand_dims(float_caster, 0)
resized = tf.image.resize_bilinear(dims_expander, [input_height, input_width])
normalized = tf.divide(tf.subtract(resized, [input_mean]), [input_std])
sess = tf.Session()
@ -118,8 +118,8 @@ if __name__ == "__main__":
input_name = "import/" + input_layer
output_name = "import/" + output_layer
input_operation = graph.get_operation_by_name(input_name);
output_operation = graph.get_operation_by_name(output_name);
input_operation = graph.get_operation_by_name(input_name)
output_operation = graph.get_operation_by_name(output_name)
with tf.Session(graph=graph) as sess:
results = sess.run(output_operation.outputs[0],

View File

@ -97,10 +97,27 @@ tf_java_op_gen_srcjar(
# file before making it an executable. See tf_java_op_gen_srcjar().
cc_library(
name = "java_op_gen_tool",
srcs = glob([
"src/gen/cc/*.h",
"src/gen/cc/*.cc",
]),
srcs = [
"src/gen/cc/op_gen_main.cc",
],
copts = tf_copts(),
deps = [
":java_op_gen_lib",
"//tensorflow/core:framework",
"//tensorflow/core:framework_internal",
"//tensorflow/core:lib",
],
)
cc_library(
name = "java_op_gen_lib",
srcs = [
"src/gen/cc/op_generator.cc",
],
hdrs = [
"src/gen/cc/java_defs.h",
"src/gen/cc/op_generator.h",
],
copts = tf_copts(),
deps = [
"//tensorflow/core:framework",
@ -280,21 +297,6 @@ tf_java_test(
],
)
#java_test(
# name = "OperatorProcessorTest",
# size = "small",
# srcs = ["src/test/java/org/tensorflow/processor/OperatorProcessorTest.java"],
# javacopts = JAVACOPTS,
# resources = [":processor_test_resources"],
# test_class = "org.tensorflow.processor.OperatorProcessorTest",
# deps = [
# ":processor_library",
# "//third_party/java/junit",
# "@com_google_testing_compile",
# "@com_google_truth",
# ],
#)
filegroup(
name = "processor_test_resources",
srcs = glob([

View File

@ -0,0 +1,273 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_JAVA_SRC_GEN_CC_JAVA_DEFS_H_
#define TENSORFLOW_JAVA_SRC_GEN_CC_JAVA_DEFS_H_
#include <string>
#include <vector>
#include <deque>
#include "tensorflow/core/platform/env.h"
namespace tensorflow {
namespace java {
// An enumeration of different modifiers commonly used in Java
enum Modifier {
PUBLIC = (1 << 0),
PROTECTED = (1 << 1),
PRIVATE = (1 << 2),
STATIC = (1 << 3),
FINAL = (1 << 4),
};
class Annotation;
// A definition of any kind of Java type (classes, interfaces...)
//
// Note that most of the data fields of this class are only useful in specific
// contexts and are not required in many cases. For example, annotations and
// supertypes are only useful when declaring a type.
class Type {
public:
enum Kind {
PRIMITIVE, CLASS, INTERFACE, ENUM, GENERIC, ANNOTATION
};
static const Type Byte() {
return Type(Type::PRIMITIVE, "byte");
}
static const Type Char() {
return Type(Type::PRIMITIVE, "char");
}
static const Type Short() {
return Type(Type::PRIMITIVE, "short");
}
static const Type Int() {
return Type(Type::PRIMITIVE, "int");
}
static const Type Long() {
return Type(Type::PRIMITIVE, "long");
}
static const Type Float() {
return Type(Type::PRIMITIVE, "float");
}
static const Type Double() {
return Type(Type::PRIMITIVE, "double");
}
static const Type Boolean() {
return Type(Type::PRIMITIVE, "boolean");
}
static const Type Void() {
// For simplicity, we consider 'void' as a primitive type, like the Java
// Reflection API does
return Type(Type::PRIMITIVE, "void");
}
static Type Class(const string& name, const string& package = "") {
return Type(Type::CLASS, name, package);
}
static Type Interface(const string& name, const string& package = "") {
return Type(Type::INTERFACE, name, package);
}
static Type Enum(const string& name, const string& package = "") {
return Type(Type::ENUM, name, package);
}
static Type Generic(const string& name = "") {
return Type(Type::GENERIC, name);
}
static Type ClassOf(const Type& type) {
return Class("Class").add_parameter(type);
}
static Type ListOf(const Type& type) {
return Interface("List", "java.util").add_parameter(type);
}
static Type IterableOf(const Type& type) {
return Interface("Iterable").add_parameter(type);
}
const Kind& kind() const { return kind_; }
const string& name() const { return name_; }
const string& package() const { return package_; }
const string& description() const { return description_; }
Type& description(const string& description) {
description_ = description;
return *this;
}
const std::vector<Type>& parameters() const { return parameters_; }
Type& add_parameter(const Type& parameter) {
parameters_.push_back(parameter);
return *this;
}
const std::vector<Annotation>& annotations() const { return annotations_; }
Type& add_annotation(const Annotation& annotation) {
annotations_.push_back(annotation);
return *this;
}
const std::deque<Type>& supertypes() const { return supertypes_; }
Type& add_supertype(const Type& type) {
if (type.kind_ == CLASS) {
supertypes_.push_front(type); // keep superclass at the front of the list
} else if (type.kind_ == INTERFACE) {
supertypes_.push_back(type);
}
return *this;
}
// Returns true if "type" is of a known collection type (only a few for now)
bool IsCollection() const {
return name_ == "List" || name_ == "Iterable";
}
// Returns true if this instance is a wildcard (<?>)
bool IsWildcard() const {
return kind_ == GENERIC && name_.empty();
}
protected:
Type(Kind kind, const string& name, const string& package = "")
: kind_(kind), name_(name), package_(package) {}
private:
Kind kind_;
string name_;
string package_;
string description_;
std::vector<Type> parameters_;
std::vector<Annotation> annotations_;
std::deque<Type> supertypes_;
};
// Definition of a Java annotation
//
// This class only defines the usage of an annotation in a specific context,
// giving optionally a set of attributes to initialize.
class Annotation : public Type {
public:
static Annotation Create(const string& type_name, const string& pkg = "") {
return Annotation(type_name, pkg);
}
const string& attributes() const { return attributes_; }
Annotation& attributes(const string& attributes) {
attributes_ = attributes;
return *this;
}
private:
string attributes_;
Annotation(const string& name, const string& package)
: Type(Kind::ANNOTATION, name, package) {}
};
// A definition of a Java variable
//
// This class declares an instance of a type, such as a class field or a
// method argument, which can be documented.
class Variable {
public:
static Variable Create(const string& name, const Type& type) {
return Variable(name, type, false);
}
static Variable Varargs(const string& name, const Type& type) {
return Variable(name, type, true);
}
const string& name() const { return name_; }
const Type& type() const { return type_; }
bool variadic() const { return variadic_; }
const string& description() const { return description_; }
Variable& description(const string& description) {
description_ = description;
return *this;
}
private:
string name_;
Type type_;
bool variadic_;
string description_;
Variable(const string& name, const Type& type, bool variadic)
: name_(name), type_(type), variadic_(variadic) {}
};
// A definition of a Java class method
//
// This class defines the signature of a method, including its name, return
// type and arguments.
class Method {
public:
static Method Create(const string& name, const Type& return_type) {
return Method(name, return_type, false);
}
static Method ConstructorFor(const Type& clazz) {
return Method(clazz.name(), clazz, true);
}
bool constructor() const { return constructor_; }
const string& name() const { return name_; }
const Type& return_type() const { return return_type_; }
const string& description() const { return description_; }
Method& description(const string& description) {
description_ = description;
return *this;
}
const string& return_description() const { return return_description_; }
Method& return_description(const string& description) {
return_description_ = description;
return *this;
}
const std::vector<Variable>& arguments() const { return arguments_; }
Method& add_arguments(const std::vector<Variable>& args) {
arguments_.insert(arguments_.cend(), args.cbegin(), args.cend());
return *this;
}
Method& add_argument(const Variable& var) {
arguments_.push_back(var);
return *this;
}
const std::vector<Annotation>& annotations() const { return annotations_; }
Method& add_annotation(const Annotation& annotation) {
annotations_.push_back(annotation);
return *this;
}
private:
string name_;
Type return_type_;
bool constructor_;
string description_;
string return_description_;
std::vector<Variable> arguments_;
std::vector<Annotation> annotations_;
Method(const string& name, const Type& return_type, bool constructor)
: name_(name), return_type_(return_type), constructor_(constructor) {}
};
// A piece of code to read from a file.
class Snippet {
public:
static Snippet Create(const string& fname, Env* env = Env::Default()) {
return Snippet(fname, env);
}
const string& data() const { return data_; }
private:
string data_;
Snippet(const string& fname, Env* env) {
TF_CHECK_OK(ReadFileToString(env, fname, &data_));
}
};
} // namespace java
} // namespace tensorflow
#endif // TENSORFLOW_JAVA_SRC_GEN_CC_JAVA_DEFS_H_
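A hedged usage sketch of the value types above (hypothetical names, not taken
from the generator itself; assumes the header is included with TensorFlow's
`string` alias available):

```
#include "tensorflow/java/src/gen/cc/java_defs.h"

namespace tensorflow {
namespace java {

// Build a Java method signature 'List<Tensor> outputs(int index)' using
// the fluent setters defined above.
Method SketchOutputsAccessor() {
  Type tensor = Type::Class("Tensor", "org.tensorflow");
  Type list_of_tensor = Type::ListOf(tensor);  // List<Tensor>
  return Method::Create("outputs", list_of_tensor)
      .description("Returns the output tensors of this operation")
      .add_argument(Variable::Create("index", Type::Int()));
}

}  // namespace java
}  // namespace tensorflow
```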

View File

@ -25,7 +25,7 @@
#include "tensorflow/java/src/gen/cc/op_generator.h"
namespace tensorflow {
namespace op_gen {
namespace java {
const char kUsageHeader[] =
"\n\nGenerator of operation wrappers in Java.\n\n"
@ -51,7 +51,7 @@ const char kUsageHeader[] =
"Finally, the '--base_package' overrides the default parent package "
"under which the generated subpackage and classes are to be located.\n\n";
} // namespace op_gen
} // namespace java
} // namespace tensorflow
int main(int argc, char* argv[]) {
@ -67,13 +67,13 @@ int main(int argc, char* argv[]) {
tensorflow::Flag(
"base_package", &base_package,
"Package parent to the generated subpackage and classes")};
tensorflow::string usage = tensorflow::op_gen::kUsageHeader;
tensorflow::string usage = tensorflow::java::kUsageHeader;
usage += tensorflow::Flags::Usage(argv[0], flag_list);
bool parsed_flags_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
tensorflow::port::InitMain(usage.c_str(), &argc, &argv);
QCHECK(parsed_flags_ok && !lib_name.empty() && !output_dir.empty()) << usage;
tensorflow::OpGenerator generator;
tensorflow::java::OpGenerator generator;
tensorflow::OpList ops;
tensorflow::OpRegistry::Global()->Export(true, &ops);
tensorflow::Status status =

View File

@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/java/src/gen/cc/op_generator.h"
namespace tensorflow {
namespace java {
namespace {
string CamelCase(const string& str, char delimiter, bool upper) {
@ -63,4 +64,5 @@ Status OpGenerator::Run(const OpList& ops, const string& lib_name,
return Status::OK();
}
} // namespace java
} // namespace tensorflow

View File

@ -23,6 +23,7 @@ limitations under the License.
#include "tensorflow/core/platform/env.h"
namespace tensorflow {
namespace java {
/// \brief A generator of Java operation wrappers.
///
@ -46,6 +47,7 @@ class OpGenerator {
Env* env;
};
} // namespace java
} // namespace tensorflow
#endif // TENSORFLOW_JAVA_SRC_GEN_CC_OP_GENERATOR_H_

View File

@ -52,7 +52,7 @@ def tf_java_op_gen_srcjar(name,
# Generate a source archive containing generated code for these ops.
gen_srcjar = out_dir + name + ".srcjar"
gen_cmds += ["$(location @local_jdk//:jar) cMf $(location :" + gen_srcjar + ") -C $(@D) ."]
gen_cmds += ["$(location @local_jdk//:jar) cMf $(location :" + gen_srcjar + ") -C $(@D) src"]
gen_tools += ["@local_jdk//:jar"] + ["@local_jdk//:jdk"]
gen_tools += tf_binary_additional_srcs()
native.genrule(

View File

@ -803,7 +803,7 @@ class Dataset(object):
```python
# Preprocess 4 files concurrently, and interleave blocks of 16 records from
# each file.
filenames = ["/var/data/file1.txt", "/var/data/file2.txt", ..."]
filenames = ["/var/data/file1.txt", "/var/data/file2.txt", ...]
dataset = (Dataset.from_tensor_slices(filenames)
.interleave(lambda x:
TextLineDataset(x).map(parse_fn, num_parallel_calls=1),

View File

@ -23,6 +23,9 @@
set -e
# Filter out LOG(INFO)
export TF_CPP_MIN_LOG_LEVEL=1
IS_VIRTUALENV=0
PYTHON_BIN_PATH=""
while true; do

View File

@ -688,9 +688,10 @@ class Estimator(object):
Raises:
ValueError: if input_fn takes invalid arguments.
"""
del mode # unused
input_fn_args = util.fn_args(input_fn)
kwargs = {}
if 'mode' in input_fn_args:
kwargs['mode'] = mode
if 'params' in input_fn_args:
kwargs['params'] = self.params
if 'config' in input_fn_args:

View File

@ -420,6 +420,7 @@ class EstimatorTrainTest(test.TestCase):
self.assertEqual(1, model_fn_call_count[0])
def test_callable_input_fn(self):
expected_mode = model_fn_lib.ModeKeys.TRAIN
expected_params = {'batch_size': 10}
expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
input_fn_call_count = [0]
@ -432,8 +433,9 @@ class EstimatorTrainTest(test.TestCase):
class InputFn(object):
def __call__(self, params, config):
def __call__(self, mode, params, config):
input_fn_call_count[0] += 1
test_self.assertEqual(expected_mode, mode)
test_self.assertEqual(expected_params, params)
test_self.assertEqual(4321, config.tf_random_seed)
return dummy_input_fn()
@ -446,6 +448,7 @@ class EstimatorTrainTest(test.TestCase):
self.assertEqual(1, input_fn_call_count[0])
def test_input_fn_args(self):
expected_mode = model_fn_lib.ModeKeys.TRAIN
expected_params = {'batch_size': 10}
expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
input_fn_call_count = [0]
@ -454,8 +457,9 @@ class EstimatorTrainTest(test.TestCase):
del params, config
return model_fn_global_step_incrementer(features, labels, mode)
def _input_fn(params, config):
def _input_fn(mode, params, config):
input_fn_call_count[0] += 1
self.assertEqual(expected_mode, mode)
self.assertEqual(expected_params, params)
self.assertEqual(4321, config.tf_random_seed)
return dummy_input_fn()
@ -992,6 +996,7 @@ class EstimatorDatasetIntegrationTest(test.TestCase):
class EstimatorEvaluateTest(test.TestCase):
def test_input_fn_args(self):
expected_mode = model_fn_lib.ModeKeys.EVAL
expected_params = {'batch_size': 10}
expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
input_fn_call_count = [0]
@ -1000,8 +1005,9 @@ class EstimatorEvaluateTest(test.TestCase):
del params, config
return model_fn_global_step_incrementer(features, labels, mode)
def _input_fn(params, config):
def _input_fn(mode, params, config):
input_fn_call_count[0] += 1
self.assertEqual(expected_mode, mode)
self.assertEqual(expected_params, params)
self.assertEqual(4321, config.tf_random_seed)
return dummy_input_fn()
@ -1265,6 +1271,7 @@ class EstimatorEvaluateTest(test.TestCase):
class EstimatorPredictTest(test.TestCase):
def test_input_fn_args(self):
expected_mode = model_fn_lib.ModeKeys.PREDICT
expected_params = {'batch_size': 10}
expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
input_fn_call_count = [0]
@ -1277,8 +1284,9 @@ class EstimatorPredictTest(test.TestCase):
train_op=state_ops.assign_add(training.get_global_step(), 1),
predictions=constant_op.constant([[10.]]))
def _input_fn(params, config):
def _input_fn(mode, params, config):
input_fn_call_count[0] += 1
self.assertEqual(expected_mode, mode)
self.assertEqual(expected_params, params)
self.assertEqual(4321, config.tf_random_seed)
return dummy_input_fn()

View File

@ -677,11 +677,19 @@ class _TrainingExecutor(object):
'RunConfig or set the TF_CONFIG environment variable.')
logging.info('Start Tensorflow server.')
    if config.session_config is None:
      session_config = config_pb2.ConfigProto(log_device_placement=False)
    else:
      session_config = config_pb2.ConfigProto(
          log_device_placement=False,
          gpu_options=config.session_config.gpu_options)
server = server_lib.Server(
config.cluster_spec,
job_name=config.task_type,
task_index=config.task_id,
config=config_pb2.ConfigProto(log_device_placement=False),
config=session_config,
start=False)
server.start()
return server

View File

@ -796,6 +796,20 @@ class Conv2DTest(test.TestCase):
data_format=data_format,
use_gpu=use_gpu)
@test_util.run_in_graph_and_eager_modes()
def testConv2DBackpropFilterWithEmptyInput(self):
expected = [0, 0, 0, 0]
for (data_format, use_gpu) in GetTestConfigs():
self._RunAndVerifyBackpropFilter(
input_sizes=[0, 2, 3, 1],
filter_sizes=[2, 2, 1, 1],
output_sizes=[0, 1, 2, 1],
strides=[1, 1],
padding="VALID",
expected=expected,
data_format=data_format,
use_gpu=use_gpu)
@test_util.run_in_graph_and_eager_modes()
def testConv2D2x2Depth3ValidBackpropFilter(self):
expected = [

View File

@ -361,6 +361,16 @@ class PoolingTest(test.TestCase):
expected=expected_output,
use_gpu=use_gpu)
def _testAvgPoolEmptyInput(self, use_gpu):
self._VerifyValues(
nn_ops.avg_pool,
input_sizes=[0, 8, 8, 8],
ksize=[1, 3, 3, 1],
strides=[1, 2, 2, 1],
padding="SAME",
expected=[],
use_gpu=use_gpu)
def testAvgPooling(self):
for use_gpu in True, False:
self._testAvgPoolValidPadding(use_gpu)
@ -371,6 +381,7 @@ class PoolingTest(test.TestCase):
self._testAvgPoolSamePadding4(use_gpu)
self._testAvgPoolSamePaddingPacket4(use_gpu)
self._testAvgPoolSamePaddingPacket8(use_gpu)
self._testAvgPoolEmptyInput(use_gpu)
def _testMaxPoolValidPadding(self, use_gpu):
expected_output = [13.0, 14.0, 15.0]
@ -543,6 +554,16 @@ class PoolingTest(test.TestCase):
use_gpu=use_gpu,
v2=v2)
def _testMaxPoolEmptyInput(self, use_gpu):
self._VerifyValues(
gen_nn_ops._max_pool_v2,
input_sizes=[0, 8, 8, 8],
ksize=[1, 3, 3, 1],
strides=[1, 2, 2, 1],
padding="SAME",
expected=[],
use_gpu=use_gpu)
def testMaxPooling(self):
for use_gpu in True, False:
self._testMaxPoolValidPadding(use_gpu)
@ -551,6 +572,7 @@ class PoolingTest(test.TestCase):
self._testMaxPoolValidPaddingUnevenStride(use_gpu)
self._testMaxPoolSamePaddingPacket4(use_gpu)
self._testMaxPoolSamePaddingPacket8(use_gpu)
self._testMaxPoolEmptyInput(use_gpu)
# Tests for DepthwiseMaxPooling on CPU only.
def testDepthwiseMaxPool1x1DepthWindow1(self):

View File

@ -135,5 +135,36 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
bitwise_ops.right_shift(lhs, rhs)])
def testShapeInference(self):
dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
dtypes.uint8, dtypes.uint16]
with self.test_session(use_gpu=True) as sess:
for dtype in dtype_list:
lhs = constant_op.constant([[0], [3], [5]], dtype=dtype)
rhs = constant_op.constant([[1, 2, 4]], dtype=dtype)
and_tensor = bitwise_ops.bitwise_and(lhs, rhs)
or_tensor = bitwise_ops.bitwise_or(lhs, rhs)
xor_tensor = bitwise_ops.bitwise_xor(lhs, rhs)
ls_tensor = bitwise_ops.left_shift(lhs, rhs)
rs_tensor = bitwise_ops.right_shift(lhs, rhs)
and_result, or_result, xor_result, ls_result, rs_result = sess.run(
[and_tensor, or_tensor, xor_tensor, ls_tensor, rs_tensor])
# Compare shape inference with result
self.assertAllEqual(and_tensor.get_shape().as_list(), and_result.shape)
self.assertAllEqual(and_tensor.get_shape().as_list(), [3, 3])
self.assertAllEqual(or_tensor.get_shape().as_list(), or_result.shape)
self.assertAllEqual(or_tensor.get_shape().as_list(), [3, 3])
self.assertAllEqual(xor_tensor.get_shape().as_list(), xor_result.shape)
self.assertAllEqual(xor_tensor.get_shape().as_list(), [3, 3])
self.assertAllEqual(ls_tensor.get_shape().as_list(), ls_result.shape)
self.assertAllEqual(ls_tensor.get_shape().as_list(), [3, 3])
self.assertAllEqual(rs_tensor.get_shape().as_list(), rs_result.shape)
self.assertAllEqual(rs_tensor.get_shape().as_list(), [3, 3])
if __name__ == "__main__":
googletest.main()

View File

@ -977,9 +977,7 @@ def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False,
`hessians()` adds ops to the graph to output the Hessian matrix of `ys`
with respect to `xs`. It returns a list of `Tensor` of length `len(xs)`
where each tensor is the Hessian of `sum(ys)`. This function currently
only supports evaluating the Hessian with respect to (a list of) one-
dimensional tensors.
where each tensor is the Hessian of `sum(ys)`.
The Hessian is a matrix of second-order partial derivatives of a scalar
tensor (see https://en.wikipedia.org/wiki/Hessian_matrix for more details).
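In symbols, consistent with the docstring above, entry (i, j) of the Hessian
returned for each `x` in `xs` is

```
H_{ij} = \frac{\partial^2\,\mathrm{sum}(ys)}{\partial x_i\,\partial x_j}
```

and, with the reshape added below, the returned tensor has shape
`shape(x) + shape(x)` rather than a flattened matrix.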
@ -1005,31 +1003,32 @@ def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False,
'colocate_gradients_with_ops': colocate_gradients_with_ops,
'gate_gradients': gate_gradients,
'aggregation_method': aggregation_method
}
}
# Compute first-order derivatives and iterate for each x in xs.
hessians = []
_gradients = gradients(ys, xs, **kwargs)
for i, _gradient, x in zip(range(len(xs)), _gradients, xs):
# Ensure that x is a vector.
check_rank = check_ops.assert_rank(
x, 1, message='Cannot compute Hessian because element %d of `xs` does '
'not have rank one.' % i
)
with ops.control_dependencies([check_rank]):
# Declare an iterator and tensor array loop variables for the gradients.
n = array_ops.size(x)
loop_vars = [
for gradient, x in zip(_gradients, xs):
# Change the shape to one dimension without graph branching.
gradient = array_ops.reshape(gradient, [-1])
# Declare an iterator and tensor array loop variables for the gradients.
n = array_ops.size(x)
loop_vars = [
array_ops.constant(0, dtypes.int32),
tensor_array_ops.TensorArray(x.dtype, n)
]
# Iterate over all elements of the gradient and compute second order
# derivatives.
_, hessian = control_flow_ops.while_loop(
lambda j, _: j < n,
lambda j, result: (j + 1,
result.write(j, gradients(_gradient[j], x)[0])),
loop_vars
)
]
# Iterate over all elements of the gradient and compute second order
# derivatives.
_, hessian = control_flow_ops.while_loop(
lambda j, _: j < n,
lambda j, result: (j + 1,
result.write(j, gradients(gradient[j], x)[0])),
loop_vars
)
hessians.append(hessian.stack())
_shape = array_ops.shape(x)
_reshaped_hessian = array_ops.reshape(
hessian.stack(), array_ops.concat((_shape, _shape), 0)
)
hessians.append(_reshaped_hessian)
return hessians

View File

@ -621,6 +621,45 @@ class HessianTest(test_util.TensorFlowTestCase):
with self.assertRaises(ValueError):
gradients.hessians(x, x)
def testHessian2D_square_matrix(self):
# Manually compute the Hessian explicitly for a low-dimensional problem
# and check that `hessian` matches. Specifically, the Hessian of
# f(x) = 1/2 * x^T * x is H = constant (block identity matrix)
m = 3
rng = np.random.RandomState([1, 2, 3])
x_value = rng.randn(m, m).astype("float32")
with self.test_session(use_gpu=True):
x = constant_op.constant(x_value)
x_square = math_ops.reduce_sum(
math_ops.matmul(array_ops.transpose(x), x) * 0.5
)
hess = gradients.hessians(x_square, x)[0]
hess_actual = hess.eval()
hess_value = np.bmat([
[elem*np.ones((m, m)) for elem in vec]
for vec in np.eye(m)
]).astype("float32")
self.assertAllEqual((m, m, m, m), hess_actual.shape)
self.assertAllClose(hess_value, hess_actual.reshape((m * m, m * m)))
def testHessian2D_non_square_matrix(self):
m = 3
n = 4
rng = np.random.RandomState([1, 2, 3])
x_value = rng.randn(m, n).astype("float32")
with self.test_session(use_gpu=True):
x = constant_op.constant(x_value)
x_square = math_ops.reduce_sum(
math_ops.matmul(array_ops.transpose(x), x) * 0.5
)
hess = gradients.hessians(x_square, x)[0]
hess_actual = hess.eval()
hess_value = np.bmat([
[elem*np.ones((n, n)) for elem in vec]
for vec in np.eye(m)
]).astype("float32")
self.assertAllEqual((m, n, m, n), hess_actual.shape)
self.assertAllClose(hess_value, hess_actual.reshape((m * n, m * n)))
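The expected values in these tests follow from a short derivation (a hedged
sketch; X is m x n and 1_{n x n} denotes the all-ones matrix):

```
f(X) = \tfrac{1}{2}\sum_{j,k}\,(X^{T}X)_{jk}
     = \tfrac{1}{2}\sum_{i}\Big(\sum_{j} X_{ij}\Big)^{2},
\qquad
\frac{\partial^{2} f}{\partial X_{ab}\,\partial X_{cd}} = \delta_{ac},
```

so the Hessian, reshaped to (m·n) x (m·n), is the Kronecker product
I_m ⊗ 1_{n x n}, which is exactly the block matrix that `np.bmat` constructs
in the tests above.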
@test_util.with_c_api
class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):

View File

@ -219,15 +219,17 @@ def random_flip_up_down(image, seed=None):
Raises:
ValueError: if the shape of `image` not supported.
"""
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
mirror_cond = math_ops.less(uniform_random, .5)
result = control_flow_ops.cond(mirror_cond,
lambda: array_ops.reverse(image, [0]),
lambda: image)
return fix_image_flip_shape(image, result)
with ops.name_scope(None, 'random_flip_up_down', [image]) as scope:
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
mirror_cond = math_ops.less(uniform_random, .5)
result = control_flow_ops.cond(mirror_cond,
lambda: array_ops.reverse(image, [0]),
lambda: image,
name=scope)
return fix_image_flip_shape(image, result)
def random_flip_left_right(image, seed=None):
@ -248,15 +250,19 @@ def random_flip_left_right(image, seed=None):
Raises:
ValueError: if the shape of `image` not supported.
"""
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
mirror_cond = math_ops.less(uniform_random, .5)
result = control_flow_ops.cond(mirror_cond,
lambda: array_ops.reverse(image, [1]),
lambda: image)
return fix_image_flip_shape(image, result)
with ops.name_scope(None, 'random_flip_left_right', [image]) as scope:
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
mirror_cond = math_ops.less(uniform_random, .5)
result = control_flow_ops.cond(mirror_cond,
lambda: array_ops.reverse(image, [1]),
lambda: image,
name=scope)
return fix_image_flip_shape(image, result)
def flip_left_right(image):
@ -276,10 +282,12 @@ def flip_left_right(image):
Raises:
ValueError: if the shape of `image` is not supported.
"""
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
return fix_image_flip_shape(image, array_ops.reverse(image, [1]))
with ops.name_scope(None, 'flip_left_right', [image]) as scope:
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
return fix_image_flip_shape(image,
array_ops.reverse(image, [1], name=scope))
def flip_up_down(image):
@ -299,10 +307,12 @@ def flip_up_down(image):
Raises:
ValueError: if the shape of `image` is not supported.
"""
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
return fix_image_flip_shape(image, array_ops.reverse(image, [0]))
with ops.name_scope(None, 'flip_up_down', [image]) as scope:
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
return fix_image_flip_shape(image,
array_ops.reverse(image, [0], name=scope))
def rot90(image, k=1, name=None):
@ -356,10 +366,11 @@ def transpose_image(image):
Raises:
ValueError: if the shape of `image` is not supported.
"""
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
return array_ops.transpose(image, [1, 0, 2], name='transpose_image')
with ops.name_scope(None, 'transpose_image', [image]) as scope:
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
return array_ops.transpose(image, [1, 0, 2], name=scope)
def central_crop(image, central_fraction):
@ -386,33 +397,34 @@ def central_crop(image, central_fraction):
Returns:
3-D float Tensor
"""
image = ops.convert_to_tensor(image, name='image')
if central_fraction <= 0.0 or central_fraction > 1.0:
raise ValueError('central_fraction must be within (0, 1]')
if central_fraction == 1.0:
with ops.name_scope(None, 'central_crop', [image]):
image = ops.convert_to_tensor(image, name='image')
if central_fraction <= 0.0 or central_fraction > 1.0:
raise ValueError('central_fraction must be within (0, 1]')
if central_fraction == 1.0:
return image
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
img_shape = array_ops.shape(image)
depth = image.get_shape()[2]
img_h = math_ops.to_double(img_shape[0])
img_w = math_ops.to_double(img_shape[1])
bbox_h_start = math_ops.to_int32((img_h - img_h * central_fraction) / 2)
bbox_w_start = math_ops.to_int32((img_w - img_w * central_fraction) / 2)
bbox_h_size = img_shape[0] - bbox_h_start * 2
bbox_w_size = img_shape[1] - bbox_w_start * 2
bbox_begin = array_ops.stack([bbox_h_start, bbox_w_start, 0])
bbox_size = array_ops.stack([bbox_h_size, bbox_w_size, -1])
image = array_ops.slice(image, bbox_begin, bbox_size)
# The first two dimensions are dynamic and unknown.
image.set_shape([None, None, depth])
return image
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
img_shape = array_ops.shape(image)
depth = image.get_shape()[2]
img_h = math_ops.to_double(img_shape[0])
img_w = math_ops.to_double(img_shape[1])
bbox_h_start = math_ops.to_int32((img_h - img_h * central_fraction) / 2)
bbox_w_start = math_ops.to_int32((img_w - img_w * central_fraction) / 2)
bbox_h_size = img_shape[0] - bbox_h_start * 2
bbox_w_size = img_shape[1] - bbox_w_start * 2
bbox_begin = array_ops.stack([bbox_h_start, bbox_w_start, 0])
bbox_size = array_ops.stack([bbox_h_size, bbox_w_size, -1])
image = array_ops.slice(image, bbox_begin, bbox_size)
# The first two dimensions are dynamic and unknown.
image.set_shape([None, None, depth])
return image
def pad_to_bounding_box(image, offset_height, offset_width, target_height,
target_width):
@ -444,53 +456,54 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height,
`target_*` arguments, or either `offset_height` or `offset_width` is
negative.
"""
image = ops.convert_to_tensor(image, name='image')
with ops.name_scope(None, 'pad_to_bounding_box', [image]):
image = ops.convert_to_tensor(image, name='image')
is_batch = True
image_shape = image.get_shape()
if image_shape.ndims == 3:
is_batch = False
image = array_ops.expand_dims(image, 0)
elif image_shape.ndims is None:
is_batch = False
image = array_ops.expand_dims(image, 0)
image.set_shape([None] * 4)
elif image_shape.ndims != 4:
raise ValueError('\'image\' must have either 3 or 4 dimensions.')
is_batch = True
image_shape = image.get_shape()
if image_shape.ndims == 3:
is_batch = False
image = array_ops.expand_dims(image, 0)
elif image_shape.ndims is None:
is_batch = False
image = array_ops.expand_dims(image, 0)
image.set_shape([None] * 4)
elif image_shape.ndims != 4:
raise ValueError('\'image\' must have either 3 or 4 dimensions.')
assert_ops = _CheckAtLeast3DImage(image, require_static=False)
assert_ops = _CheckAtLeast3DImage(image, require_static=False)
batch, height, width, depth = _ImageDimensions(image, rank=4)
batch, height, width, depth = _ImageDimensions(image, rank=4)
after_padding_width = target_width - offset_width - width
after_padding_width = target_width - offset_width - width
after_padding_height = target_height - offset_height - height
after_padding_height = target_height - offset_height - height
assert_ops += _assert(offset_height >= 0, ValueError,
'offset_height must be >= 0')
assert_ops += _assert(offset_width >= 0, ValueError,
'offset_width must be >= 0')
assert_ops += _assert(after_padding_width >= 0, ValueError,
'width must be <= target - offset')
assert_ops += _assert(after_padding_height >= 0, ValueError,
'height must be <= target - offset')
image = control_flow_ops.with_dependencies(assert_ops, image)
assert_ops += _assert(offset_height >= 0, ValueError,
'offset_height must be >= 0')
assert_ops += _assert(offset_width >= 0, ValueError,
'offset_width must be >= 0')
assert_ops += _assert(after_padding_width >= 0, ValueError,
'width must be <= target - offset')
assert_ops += _assert(after_padding_height >= 0, ValueError,
'height must be <= target - offset')
image = control_flow_ops.with_dependencies(assert_ops, image)
# Do not pad on the depth dimensions.
paddings = array_ops.reshape(
array_ops.stack([
0, 0, offset_height, after_padding_height, offset_width,
after_padding_width, 0, 0
]), [4, 2])
padded = array_ops.pad(image, paddings)
# Do not pad on the depth dimensions.
paddings = array_ops.reshape(
array_ops.stack([
0, 0, offset_height, after_padding_height, offset_width,
after_padding_width, 0, 0
]), [4, 2])
padded = array_ops.pad(image, paddings)
padded_shape = [None if _is_tensor(i) else i
for i in [batch, target_height, target_width, depth]]
padded.set_shape(padded_shape)
padded_shape = [None if _is_tensor(i) else i
for i in [batch, target_height, target_width, depth]]
padded.set_shape(padded_shape)
if not is_batch:
padded = array_ops.squeeze(padded, squeeze_dims=[0])
if not is_batch:
padded = array_ops.squeeze(padded, squeeze_dims=[0])
return padded
return padded
def crop_to_bounding_box(image, offset_height, offset_width, target_height,
@ -523,51 +536,52 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height,
`target_*` arguments, or either `offset_height` or `offset_width` is
negative, or either `target_height` or `target_width` is not positive.
"""
image = ops.convert_to_tensor(image, name='image')
with ops.name_scope(None, 'crop_to_bounding_box', [image]):
image = ops.convert_to_tensor(image, name='image')
is_batch = True
image_shape = image.get_shape()
if image_shape.ndims == 3:
is_batch = False
image = array_ops.expand_dims(image, 0)
elif image_shape.ndims is None:
is_batch = False
image = array_ops.expand_dims(image, 0)
image.set_shape([None] * 4)
elif image_shape.ndims != 4:
raise ValueError('\'image\' must have either 3 or 4 dimensions.')
is_batch = True
image_shape = image.get_shape()
if image_shape.ndims == 3:
is_batch = False
image = array_ops.expand_dims(image, 0)
elif image_shape.ndims is None:
is_batch = False
image = array_ops.expand_dims(image, 0)
image.set_shape([None] * 4)
elif image_shape.ndims != 4:
raise ValueError('\'image\' must have either 3 or 4 dimensions.')
assert_ops = _CheckAtLeast3DImage(image, require_static=False)
assert_ops = _CheckAtLeast3DImage(image, require_static=False)
batch, height, width, depth = _ImageDimensions(image, rank=4)
batch, height, width, depth = _ImageDimensions(image, rank=4)
assert_ops += _assert(offset_width >= 0, ValueError,
'offset_width must be >= 0.')
assert_ops += _assert(offset_height >= 0, ValueError,
'offset_height must be >= 0.')
assert_ops += _assert(target_width > 0, ValueError,
'target_width must be > 0.')
assert_ops += _assert(target_height > 0, ValueError,
'target_height must be > 0.')
assert_ops += _assert(width >= (target_width + offset_width), ValueError,
'width must be >= target + offset.')
assert_ops += _assert(height >= (target_height + offset_height), ValueError,
'height must be >= target + offset.')
image = control_flow_ops.with_dependencies(assert_ops, image)
assert_ops += _assert(offset_width >= 0, ValueError,
'offset_width must be >= 0.')
assert_ops += _assert(offset_height >= 0, ValueError,
'offset_height must be >= 0.')
assert_ops += _assert(target_width > 0, ValueError,
'target_width must be > 0.')
assert_ops += _assert(target_height > 0, ValueError,
'target_height must be > 0.')
assert_ops += _assert(width >= (target_width + offset_width), ValueError,
'width must be >= target + offset.')
assert_ops += _assert(height >= (target_height + offset_height), ValueError,
'height must be >= target + offset.')
image = control_flow_ops.with_dependencies(assert_ops, image)
cropped = array_ops.slice(
image,
array_ops.stack([0, offset_height, offset_width, 0]),
array_ops.stack([-1, target_height, target_width, -1]))
cropped = array_ops.slice(
image,
array_ops.stack([0, offset_height, offset_width, 0]),
array_ops.stack([-1, target_height, target_width, -1]))
cropped_shape = [None if _is_tensor(i) else i
for i in [batch, target_height, target_width, depth]]
cropped.set_shape(cropped_shape)
cropped_shape = [None if _is_tensor(i) else i
for i in [batch, target_height, target_width, depth]]
cropped.set_shape(cropped_shape)
if not is_batch:
cropped = array_ops.squeeze(cropped, squeeze_dims=[0])
if not is_batch:
cropped = array_ops.squeeze(cropped, squeeze_dims=[0])
return cropped
return cropped
def resize_image_with_crop_or_pad(image, target_height, target_width):
@ -598,88 +612,90 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
If `images` was 3-D, a 3-D float Tensor of shape
`[new_height, new_width, channels]`.
"""
image = ops.convert_to_tensor(image, name='image')
image_shape = image.get_shape()
is_batch = True
if image_shape.ndims == 3:
is_batch = False
image = array_ops.expand_dims(image, 0)
elif image_shape.ndims is None:
is_batch = False
image = array_ops.expand_dims(image, 0)
image.set_shape([None] * 4)
elif image_shape.ndims != 4:
raise ValueError('\'image\' must have either 3 or 4 dimensions.')
with ops.name_scope(None, 'resize_image_with_crop_or_pad', [image]):
image = ops.convert_to_tensor(image, name='image')
image_shape = image.get_shape()
is_batch = True
if image_shape.ndims == 3:
is_batch = False
image = array_ops.expand_dims(image, 0)
elif image_shape.ndims is None:
is_batch = False
image = array_ops.expand_dims(image, 0)
image.set_shape([None] * 4)
elif image_shape.ndims != 4:
raise ValueError('\'image\' must have either 3 or 4 dimensions.')
assert_ops = _CheckAtLeast3DImage(image, require_static=False)
assert_ops += _assert(target_width > 0, ValueError,
'target_width must be > 0.')
assert_ops += _assert(target_height > 0, ValueError,
'target_height must be > 0.')
assert_ops = _CheckAtLeast3DImage(image, require_static=False)
assert_ops += _assert(target_width > 0, ValueError,
'target_width must be > 0.')
assert_ops += _assert(target_height > 0, ValueError,
'target_height must be > 0.')
image = control_flow_ops.with_dependencies(assert_ops, image)
# `crop_to_bounding_box` and `pad_to_bounding_box` have their own checks.
# Make sure our checks come first, so that error messages are clearer.
if _is_tensor(target_height):
target_height = control_flow_ops.with_dependencies(
assert_ops, target_height)
if _is_tensor(target_width):
target_width = control_flow_ops.with_dependencies(assert_ops, target_width)
image = control_flow_ops.with_dependencies(assert_ops, image)
# `crop_to_bounding_box` and `pad_to_bounding_box` have their own checks.
# Make sure our checks come first, so that error messages are clearer.
if _is_tensor(target_height):
target_height = control_flow_ops.with_dependencies(
assert_ops, target_height)
if _is_tensor(target_width):
target_width = control_flow_ops.with_dependencies(
assert_ops, target_width)
def max_(x, y):
if _is_tensor(x) or _is_tensor(y):
return math_ops.maximum(x, y)
else:
return max(x, y)
def max_(x, y):
if _is_tensor(x) or _is_tensor(y):
return math_ops.maximum(x, y)
else:
return max(x, y)
def min_(x, y):
if _is_tensor(x) or _is_tensor(y):
return math_ops.minimum(x, y)
else:
return min(x, y)
def min_(x, y):
if _is_tensor(x) or _is_tensor(y):
return math_ops.minimum(x, y)
else:
return min(x, y)
def equal_(x, y):
if _is_tensor(x) or _is_tensor(y):
return math_ops.equal(x, y)
else:
return x == y
def equal_(x, y):
if _is_tensor(x) or _is_tensor(y):
return math_ops.equal(x, y)
else:
return x == y
_, height, width, _ = _ImageDimensions(image, rank=4)
width_diff = target_width - width
offset_crop_width = max_(-width_diff // 2, 0)
offset_pad_width = max_(width_diff // 2, 0)
_, height, width, _ = _ImageDimensions(image, rank=4)
width_diff = target_width - width
offset_crop_width = max_(-width_diff // 2, 0)
offset_pad_width = max_(width_diff // 2, 0)
height_diff = target_height - height
offset_crop_height = max_(-height_diff // 2, 0)
offset_pad_height = max_(height_diff // 2, 0)
height_diff = target_height - height
offset_crop_height = max_(-height_diff // 2, 0)
offset_pad_height = max_(height_diff // 2, 0)
# Maybe crop if needed.
cropped = crop_to_bounding_box(image, offset_crop_height, offset_crop_width,
min_(target_height, height),
min_(target_width, width))
# Maybe crop if needed.
cropped = crop_to_bounding_box(image, offset_crop_height, offset_crop_width,
min_(target_height, height),
min_(target_width, width))
# Maybe pad if needed.
resized = pad_to_bounding_box(cropped, offset_pad_height, offset_pad_width,
target_height, target_width)
# Maybe pad if needed.
resized = pad_to_bounding_box(cropped, offset_pad_height, offset_pad_width,
target_height, target_width)
# In theory all the checks below are redundant.
if resized.get_shape().ndims is None:
raise ValueError('resized contains no shape.')
# In theory all the checks below are redundant.
if resized.get_shape().ndims is None:
raise ValueError('resized contains no shape.')
_, resized_height, resized_width, _ = _ImageDimensions(resized, rank=4)
_, resized_height, resized_width, _ = _ImageDimensions(resized, rank=4)
assert_ops = []
assert_ops += _assert(equal_(resized_height, target_height), ValueError,
'resized height is not correct.')
assert_ops += _assert(equal_(resized_width, target_width), ValueError,
'resized width is not correct.')
assert_ops = []
assert_ops += _assert(equal_(resized_height, target_height), ValueError,
'resized height is not correct.')
assert_ops += _assert(equal_(resized_width, target_width), ValueError,
'resized width is not correct.')
resized = control_flow_ops.with_dependencies(assert_ops, resized)
resized = control_flow_ops.with_dependencies(assert_ops, resized)
if not is_batch:
resized = array_ops.squeeze(resized, squeeze_dims=[0])
if not is_batch:
resized = array_ops.squeeze(resized, squeeze_dims=[0])
return resized
return resized
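# A plain-Python sketch of the per-dimension offset arithmetic above
# (crop_or_pad_offsets is a hypothetical helper, for illustration only):
def crop_or_pad_offsets(size, target):
  diff = target - size
  offset_crop = max(-diff // 2, 0)  # crop this much when size > target
  offset_pad = max(diff // 2, 0)    # pad this much when size < target
  return offset_crop, offset_pad

assert crop_or_pad_offsets(10, 6) == (2, 0)
assert crop_or_pad_offsets(6, 10) == (0, 2)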
class ResizeMethod(object):
@ -736,66 +752,68 @@ def resize_images(images,
If `images` was 3-D, a 3-D float Tensor of shape
`[new_height, new_width, channels]`.
"""
images = ops.convert_to_tensor(images, name='images')
if images.get_shape().ndims is None:
raise ValueError('\'images\' contains no shape.')
# TODO(shlens): Migrate this functionality to the underlying Ops.
is_batch = True
if images.get_shape().ndims == 3:
is_batch = False
images = array_ops.expand_dims(images, 0)
elif images.get_shape().ndims != 4:
raise ValueError('\'images\' must have either 3 or 4 dimensions.')
with ops.name_scope(None, 'resize_images', [images, size]):
images = ops.convert_to_tensor(images, name='images')
if images.get_shape().ndims is None:
raise ValueError('\'images\' contains no shape.')
# TODO(shlens): Migrate this functionality to the underlying Ops.
is_batch = True
if images.get_shape().ndims == 3:
is_batch = False
images = array_ops.expand_dims(images, 0)
elif images.get_shape().ndims != 4:
raise ValueError('\'images\' must have either 3 or 4 dimensions.')
_, height, width, _ = images.get_shape().as_list()
_, height, width, _ = images.get_shape().as_list()
try:
size = ops.convert_to_tensor(size, dtypes.int32, name='size')
except (TypeError, ValueError):
raise ValueError('\'size\' must be a 1-D int32 Tensor')
if not size.get_shape().is_compatible_with([2]):
raise ValueError('\'size\' must be a 1-D Tensor of 2 elements: '
'new_height, new_width')
size_const_as_shape = tensor_util.constant_value_as_shape(size)
new_height_const = size_const_as_shape[0].value
new_width_const = size_const_as_shape[1].value
try:
size = ops.convert_to_tensor(size, dtypes.int32, name='size')
except (TypeError, ValueError):
raise ValueError('\'size\' must be a 1-D int32 Tensor')
if not size.get_shape().is_compatible_with([2]):
raise ValueError('\'size\' must be a 1-D Tensor of 2 elements: '
'new_height, new_width')
size_const_as_shape = tensor_util.constant_value_as_shape(size)
new_height_const = size_const_as_shape[0].value
new_width_const = size_const_as_shape[1].value
# If we can determine that the height and width will be unmodified by this
# transformation, we avoid performing the resize.
if all(x is not None
for x in [new_width_const, width, new_height_const, height]) and (
width == new_width_const and height == new_height_const):
if not is_batch:
images = array_ops.squeeze(images, squeeze_dims=[0])
return images
if method == ResizeMethod.BILINEAR:
images = gen_image_ops.resize_bilinear(images,
size,
align_corners=align_corners)
elif method == ResizeMethod.NEAREST_NEIGHBOR:
images = gen_image_ops.resize_nearest_neighbor(images,
size,
align_corners=
align_corners)
elif method == ResizeMethod.BICUBIC:
images = gen_image_ops.resize_bicubic(images,
size,
align_corners=align_corners)
elif method == ResizeMethod.AREA:
images = gen_image_ops.resize_area(images,
size,
align_corners=align_corners)
else:
raise ValueError('Resize method is not implemented.')
# NOTE(mrry): The shape functions for the resize ops cannot unpack
# the packed values in `new_size`, so set the shape here.
images.set_shape([None, new_height_const, new_width_const, None])
# If we can determine that the height and width will be unmodified by this
# transformation, we avoid performing the resize.
if all(x is not None
for x in [new_width_const, width, new_height_const, height]) and (
width == new_width_const and height == new_height_const):
if not is_batch:
images = array_ops.squeeze(images, squeeze_dims=[0])
return images
if method == ResizeMethod.BILINEAR:
images = gen_image_ops.resize_bilinear(images,
size,
align_corners=align_corners)
elif method == ResizeMethod.NEAREST_NEIGHBOR:
images = gen_image_ops.resize_nearest_neighbor(images,
size,
align_corners=align_corners)
elif method == ResizeMethod.BICUBIC:
images = gen_image_ops.resize_bicubic(images,
size,
align_corners=align_corners)
elif method == ResizeMethod.AREA:
images = gen_image_ops.resize_area(images,
size,
align_corners=align_corners)
else:
raise ValueError('Resize method is not implemented.')
# NOTE(mrry): The shape functions for the resize ops cannot unpack
# the packed values in `new_size`, so set the shape here.
images.set_shape([None, new_height_const, new_width_const, None])
if not is_batch:
images = array_ops.squeeze(images, squeeze_dims=[0])
return images
def per_image_standardization(image):
"""Linearly scales `image` to have zero mean and unit norm.
@ -816,27 +834,28 @@ def per_image_standardization(image):
Raises:
ValueError: if the shape of 'image' is incompatible with this function.
"""
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
num_pixels = math_ops.reduce_prod(array_ops.shape(image))
with ops.name_scope(None, 'per_image_standardization', [image]) as scope:
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
num_pixels = math_ops.reduce_prod(array_ops.shape(image))
image = math_ops.cast(image, dtype=dtypes.float32)
image_mean = math_ops.reduce_mean(image)
image = math_ops.cast(image, dtype=dtypes.float32)
image_mean = math_ops.reduce_mean(image)
variance = (math_ops.reduce_mean(math_ops.square(image)) -
math_ops.square(image_mean))
variance = gen_nn_ops.relu(variance)
stddev = math_ops.sqrt(variance)
variance = (math_ops.reduce_mean(math_ops.square(image)) -
math_ops.square(image_mean))
variance = gen_nn_ops.relu(variance)
stddev = math_ops.sqrt(variance)
# Apply a minimum normalization that protects us against uniform images.
min_stddev = math_ops.rsqrt(math_ops.cast(num_pixels, dtypes.float32))
pixel_value_scale = math_ops.maximum(stddev, min_stddev)
pixel_value_offset = image_mean
# Apply a minimum normalization that protects us against uniform images.
min_stddev = math_ops.rsqrt(math_ops.cast(num_pixels, dtypes.float32))
pixel_value_scale = math_ops.maximum(stddev, min_stddev)
pixel_value_offset = image_mean
image = math_ops.subtract(image, pixel_value_offset)
image = math_ops.div(image, pixel_value_scale)
return image
image = math_ops.subtract(image, pixel_value_offset)
image = math_ops.div(image, pixel_value_scale, name=scope)
return image
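# A NumPy reference for the formula above (a sketch, not the TF kernel):
# subtract the mean, then divide by max(stddev, 1/sqrt(num_pixels)); the
# floor keeps uniform images from dividing by (near) zero.
import numpy as np
def per_image_standardization_np(image):
  image = image.astype(np.float32)
  mean = image.mean()
  stddev = max(image.std(), 1.0 / np.sqrt(image.size))
  return (image - mean) / stddev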
def random_brightness(image, max_delta, seed=None):


@ -857,6 +857,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
with self.test_session(use_gpu=True):
x_tf = constant_op.constant(x_np, shape=x_np.shape)
y = image_ops.flip_left_right(x_tf)
self.assertTrue(y.op.name.startswith('flip_left_right'))
y_tf = y.eval()
self.assertAllEqual(y_tf, y_np)
@ -867,6 +868,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
with self.test_session(use_gpu=True):
x_tf = constant_op.constant(x_np, shape=x_np.shape)
y = image_ops.random_flip_left_right(x_tf)
self.assertTrue(y.op.name.startswith('random_flip_left_right'))
count_flipped = 0
count_unflipped = 0
@ -897,6 +899,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
with self.test_session(use_gpu=True):
x_tf = constant_op.constant(x_np, shape=x_np.shape)
y = image_ops.flip_up_down(x_tf)
self.assertTrue(y.op.name.startswith('flip_up_down'))
y_tf = y.eval()
self.assertAllEqual(y_tf, y_np)
@ -907,6 +910,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
with self.test_session(use_gpu=True):
x_tf = constant_op.constant(x_np, shape=x_np.shape)
y = image_ops.random_flip_up_down(x_tf)
self.assertTrue(y.op.name.startswith('random_flip_up_down'))
count_flipped = 0
count_unflipped = 0
for _ in range(50):
@ -936,6 +940,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
with self.test_session(use_gpu=True):
x_tf = constant_op.constant(x_np, shape=x_np.shape)
y = image_ops.transpose_image(x_tf)
self.assertTrue(y.op.name.startswith('transpose_image'))
y_tf = y.eval()
self.assertAllEqual(y_tf, y_np)
@ -1160,6 +1165,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
with self.test_session(use_gpu=True):
x = constant_op.constant(x_np, shape=x_shape)
y = image_ops.per_image_standardization(x)
self.assertTrue(y.op.name.startswith('per_image_standardization'))
y_tf = y.eval()
self.assertAllClose(y_tf, y_np, atol=1e-4)
@ -1341,6 +1347,11 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
for params, err_msg in test_config:
self._assertRaises(x, x_shape, *params, err_msg=err_msg)
def testNameScope(self):
image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3])
y = image_ops.crop_to_bounding_box(image, 0, 0, 55, 66)
self.assertTrue(y.name.startswith('crop_to_bounding_box'))
class CentralCropTest(test_util.TensorFlowTestCase):
@ -1417,6 +1428,13 @@ class CentralCropTest(test_util.TensorFlowTestCase):
with self.assertRaises(ValueError):
_ = image_ops.central_crop(x, 1.01)
def testNameScope(self):
x_shape = [13, 9, 3]
x_np = np.ones(x_shape, dtype=np.float32)
with self.test_session(use_gpu=True):
y = image_ops.central_crop(x_np, 1.0)
self.assertTrue(y.op.name.startswith('central_crop'))
class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
@ -1620,6 +1638,11 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
for config_item in test_config:
self._assertRaises(x, x_shape, *config_item)
def testNameScope(self):
image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3])
y = image_ops.pad_to_bounding_box(image, 0, 0, 55, 66)
self.assertTrue(y.op.name.startswith('pad_to_bounding_box'))
class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
@ -2224,6 +2247,13 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
self._assertShapeInference([59, 60, None], [55, 66], [55, 66, None])
self._assertShapeInference([None, None, None], [55, 66], [55, 66, None])
def testNameScope(self):
img_shape = [1, 3, 2, 1]
with self.test_session(use_gpu=True):
single_image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3])
y = image_ops.resize_images(single_image, [55, 66])
self.assertTrue(y.op.name.startswith('resize_images'))
class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
@ -2499,6 +2529,11 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
self._assertRaises(x, x_shape, target_height, target_width,
"target_width must be > 0")
def testNameScope(self):
image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3])
y = image_ops.resize_image_with_crop_or_pad(image, 55, 66)
self.assertTrue(y.op.name.startswith('resize_image_with_crop_or_pad'))
def _SimpleColorRamp():
"""Build a simple color ramp RGB image."""


@ -171,6 +171,10 @@ class BatchNormalizationTest(test.TestCase):
x, x_shape, y, y_shape, delta=1e-3, x_init_value=x_init_val)
_, numerical_grad = gradient_checker.compute_gradient(
x32, x_shape, y32, y_shape, delta=1e-3, x_init_value=x32_init_val)
# If grad is empty, no error.
if theoretical_grad.size == 0 and numerical_grad.size == 0:
return 0
return np.fabs(theoretical_grad - numerical_grad).max()
def _test_gradient(self,
@ -371,6 +375,17 @@ class BatchNormalizationTest(test.TestCase):
self._test_inference(
x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
def testInferenceShape5(self):
x_shape = [0, 131, 127, 6]
for dtype in [np.float16, np.float32]:
if test.is_gpu_available(cuda_only=True):
self._test_inference(
x_shape, dtype, [131], np.float32, use_gpu=True, data_format='NCHW')
self._test_inference(
x_shape, dtype, [6], np.float32, use_gpu=True, data_format='NHWC')
self._test_inference(
x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
def testTrainingShape1(self):
x_shape = [1, 1, 6, 1]
for dtype in [np.float16, np.float32]:
@ -409,6 +424,17 @@ class BatchNormalizationTest(test.TestCase):
self._test_training(
x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
def testTrainingShape5(self):
x_shape = [0, 131, 127, 6]
for dtype in [np.float16, np.float32]:
if test.is_gpu_available(cuda_only=True):
self._test_training(
x_shape, dtype, [131], np.float32, use_gpu=True, data_format='NCHW')
self._test_training(
x_shape, dtype, [6], np.float32, use_gpu=True, data_format='NHWC')
self._test_training(
x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
def testBatchNormGradShape1(self):
for is_training in [True, False]:
x_shape = [1, 1, 6, 1]
@ -496,6 +522,33 @@ class BatchNormalizationTest(test.TestCase):
data_format='NHWC',
is_training=is_training)
def testBatchNormGradShape5(self):
for is_training in [True, False]:
x_shape = [0, 7, 11, 4]
for dtype in [np.float16, np.float32]:
if test.is_gpu_available(cuda_only=True):
self._test_gradient(
x_shape,
dtype, [7],
np.float32,
use_gpu=True,
data_format='NCHW',
is_training=is_training)
self._test_gradient(
x_shape,
dtype, [4],
np.float32,
use_gpu=True,
data_format='NHWC',
is_training=is_training)
self._test_gradient(
x_shape,
dtype, [4],
np.float32,
use_gpu=False,
data_format='NHWC',
is_training=is_training)
def _testBatchNormGradGrad(self, config):
shape = config['shape']
err_tolerance = config['err_tolerance']


@ -120,7 +120,7 @@ def piecewise_constant(x, boundaries, values, name=None):
`float64`, `uint8`, `int8`, `int16`, `int32`, `int64`.
boundaries: A list of `Tensor`s or `int`s or `float`s with strictly
increasing entries, and with all elements having the same type as `x`.
values: A list of `Tensor`s or float`s or `int`s that specifies the values
values: A list of `Tensor`s or `float`s or `int`s that specifies the values
for the intervals defined by `boundaries`. It should have one more element
than `boundaries`, and all elements should have the same type.
name: A string. Optional name of the operation. Defaults to
@ -424,11 +424,12 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
return math_ops.div(learning_rate, denom, name=name)
def cosine_decay(learning_rate, global_step, decay_steps, name=None):
def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0,
name=None):
"""Applies cosine decay to the learning rate.
See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
with Warm Restarts.
with Warm Restarts. https://arxiv.org/abs/1608.03983
When training a model, it is often recommended to lower the learning rate as
the training progresses. This function applies a cosine decay function
@ -439,7 +440,8 @@ def cosine_decay(learning_rate, global_step, decay_steps, name=None):
The function returns the decayed learning rate. It is computed as:
```python
global_step = min(global_step, decay_steps)
decayed = 0.5 * (1 + cos(pi * global_step / decay_steps))
cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps))
decayed = (1 - alpha) * cosine_decay + alpha
decayed_learning_rate = learning_rate * decayed
```
@ -456,6 +458,8 @@ def cosine_decay(learning_rate, global_step, decay_steps, name=None):
Global step to use for the decay computation.
decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
Number of steps to decay over.
alpha: A scalar `float32` or `float64` Tensor or a Python number.
Minimum learning rate value as a fraction of learning_rate.
name: String. Optional name of the operation. Defaults to 'CosineDecay'.
Returns:
A scalar `Tensor` of the same type as `learning_rate`. The decayed
@ -476,7 +480,96 @@ def cosine_decay(learning_rate, global_step, decay_steps, name=None):
cosine_decayed = 0.5 * (
1.0 + math_ops.cos(constant_op.constant(math.pi) * completed_fraction))
return math_ops.multiply(learning_rate, cosine_decayed)
decayed = (1 - alpha) * cosine_decayed + alpha
return math_ops.multiply(learning_rate, decayed)
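# A plain-Python reference for the decayed rate (mirrors the docstring
# formula; alpha floors the decay multiplier at alpha instead of 0):
import math
def cosine_decay_np(learning_rate, global_step, decay_steps, alpha=0.0):
  step = min(global_step, decay_steps)
  cosine = 0.5 * (1.0 + math.cos(math.pi * step / decay_steps))
  return learning_rate * ((1.0 - alpha) * cosine + alpha)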
def cosine_decay_restarts(learning_rate, global_step, first_decay_steps,
t_mul=2.0, m_mul=1.0, alpha=0.0, name=None):
"""Applies cosine decay with restarts to the learning rate.
See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
with Warm Restarts. https://arxiv.org/abs/1608.03983
When training a model, it is often recommended to lower the learning rate as
the training progresses. This function applies a cosine decay function with
restarts to a provided initial learning rate. It requires a `global_step`
value to compute the decayed learning rate. You can just pass a TensorFlow
variable that you increment at each training step.
The function returns the decayed learning rate while taking into account
possible warm restarts. The learning rate multiplier first decays
from 1 to `alpha` for `first_decay_steps` steps. Then, a warm
restart is performed. Each new warm restart runs for `t_mul` times more steps
and with `m_mul` times smaller initial learning rate.
Example usage:
```python
first_decay_steps = 1000
lr_decayed = cosine_decay_restarts(learning_rate, global_step,
first_decay_steps)
```
Args:
learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
The initial learning rate.
global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
Global step to use for the decay computation.
first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
Number of steps to decay over.
t_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
Used to derive the number of iterations in the i-th period.
m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
Used to derive the initial learning rate of the i-th period.
alpha: A scalar `float32` or `float64` Tensor or a Python number.
Minimum learning rate value as a fraction of the learning_rate.
name: String. Optional name of the operation. Defaults to 'SGDRDecay'.
Returns:
A scalar `Tensor` of the same type as `learning_rate`. The decayed
learning rate.
Raises:
ValueError: if `global_step` is not supplied.
"""
if global_step is None:
raise ValueError("cosine decay restarts requires global_step")
with ops.name_scope(name, "SGDRDecay",
[learning_rate, global_step]) as name:
learning_rate = ops.convert_to_tensor(learning_rate,
name="initial_learning_rate")
dtype = learning_rate.dtype
global_step = math_ops.cast(global_step, dtype)
first_decay_steps = math_ops.cast(first_decay_steps, dtype)
alpha = math_ops.cast(alpha, dtype)
t_mul = math_ops.cast(t_mul, dtype)
m_mul = math_ops.cast(m_mul, dtype)
completed_fraction = global_step / first_decay_steps
def compute_step(completed_fraction, geometric=False):
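      # With t_mul != 1, the fraction completed after i full periods is the
      # geometric sum (1 - t_mul**i) / (1 - t_mul); inverting that sum for i
      # gives the floor/log expression below. With t_mul == 1 every period
      # has the same length, so the restart index is just
      # floor(completed_fraction).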
if geometric:
i_restart = math_ops.floor(math_ops.log(1.0 - completed_fraction * (
1.0 - t_mul)) / math_ops.log(t_mul))
sum_r = (1.0 - t_mul ** i_restart) / (1.0 - t_mul)
completed_fraction = (completed_fraction - sum_r) / t_mul ** i_restart
else:
i_restart = math_ops.floor(completed_fraction)
completed_fraction = completed_fraction - i_restart
return i_restart, completed_fraction
i_restart, completed_fraction = control_flow_ops.cond(
math_ops.equal(t_mul, 1.0),
lambda: compute_step(completed_fraction, geometric=False),
lambda: compute_step(completed_fraction, geometric=True))
m_fac = m_mul ** i_restart
cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos(
constant_op.constant(math.pi) * completed_fraction))
decayed = (1 - alpha) * cosine_decayed + alpha
return math_ops.multiply(learning_rate, decayed, name=name)
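# A loop-based reference for the closed-form restart logic above (the same
# recurrence the unit tests use): each period is t_mul times longer and
# starts with an m_mul-times-smaller multiplier.
import math
def cosine_decay_restarts_np(learning_rate, step, first_decay_steps,
                             t_mul=2.0, m_mul=1.0, alpha=0.0):
  fac, decay_steps = 1.0, first_decay_steps
  while step >= decay_steps:
    step -= decay_steps
    decay_steps *= t_mul
    fac *= m_mul
  cosine = fac * 0.5 * (1.0 + math.cos(math.pi * step / decay_steps))
  return learning_rate * ((1.0 - alpha) * cosine + alpha)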
def linear_cosine_decay(learning_rate, global_step, decay_steps,
@ -487,6 +580,10 @@ def linear_cosine_decay(learning_rate, global_step, decay_steps,
See [Bello et al., ICML2017] Neural Optimizer Search with RL.
https://arxiv.org/abs/1709.07417
For the idea of warm starts, controlled here by `num_periods`,
see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
with Warm Restarts. https://arxiv.org/abs/1608.03983
Note that linear cosine decay is more aggressive than cosine decay and
larger initial learning rates can typically be used.
@ -563,6 +660,10 @@ def noisy_linear_cosine_decay(learning_rate, global_step, decay_steps,
See [Bello et al., ICML2017] Neural Optimizer Search with RL.
https://arxiv.org/abs/1709.07417
For the idea of warm starts, controlled here by `num_periods`,
see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
with Warm Restarts. https://arxiv.org/abs/1608.03983
Note that linear cosine decay is more aggressive than cosine decay and
larger initial learning rates can typically be used.


@ -342,10 +342,11 @@ class InverseDecayTest(test_util.TensorFlowTestCase):
class CosineDecayTest(test_util.TensorFlowTestCase):
def np_cosine_decay(self, step, decay_steps):
def np_cosine_decay(self, step, decay_steps, alpha=0.0):
step = min(step, decay_steps)
completed_fraction = step / decay_steps
return 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
decay = 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
return (1.0 - alpha) * decay + alpha
def testDecay(self):
num_training_steps = 1000
@ -357,6 +358,77 @@ class CosineDecayTest(test_util.TensorFlowTestCase):
expected = self.np_cosine_decay(step, num_training_steps)
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
def testAlpha(self):
num_training_steps = 1000
initial_lr = 1.0
alpha = 0.1
for step in range(0, 1500, 250):
with self.test_session():
decayed_lr = learning_rate_decay.cosine_decay(
initial_lr, step, num_training_steps, alpha)
expected = self.np_cosine_decay(step, num_training_steps, alpha)
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
class CosineDecayRestartsTest(test_util.TensorFlowTestCase):
def np_cosine_decay_restarts(self, step, decay_steps, t_mul=2.0, m_mul=1.0,
alpha=0.0):
fac = 1.0
while step >= decay_steps:
step = step - decay_steps
decay_steps *= t_mul
fac *= m_mul
completed_fraction = step / decay_steps
decay = fac * 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
return (1.0 - alpha) * decay + alpha
def testDecay(self):
num_training_steps = 1000
initial_lr = 1.0
for step in range(0, 1500, 250):
with self.test_session():
decayed_lr = learning_rate_decay.cosine_decay_restarts(
initial_lr, step, num_training_steps)
expected = self.np_cosine_decay_restarts(step, num_training_steps)
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
def testAlpha(self):
num_training_steps = 1000
initial_lr = 1.0
alpha = 0.1
for step in range(0, 1500, 250):
with self.test_session():
decayed_lr = learning_rate_decay.cosine_decay_restarts(
initial_lr, step, num_training_steps, alpha=alpha)
expected = self.np_cosine_decay_restarts(step, num_training_steps,
alpha=alpha)
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
def testMMul(self):
num_training_steps = 1000
initial_lr = 1.0
m_mul = 0.9
for step in range(0, 1500, 250):
with self.test_session():
decayed_lr = learning_rate_decay.cosine_decay_restarts(
initial_lr, step, num_training_steps, m_mul=m_mul)
expected = self.np_cosine_decay_restarts(step, num_training_steps,
m_mul=m_mul)
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
def testTMul(self):
num_training_steps = 1000
initial_lr = 1.0
t_mul = 1.0
for step in range(0, 1500, 250):
with self.test_session():
decayed_lr = learning_rate_decay.cosine_decay_restarts(
initial_lr, step, num_training_steps, t_mul=t_mul)
expected = self.np_cosine_decay_restarts(step, num_training_steps,
t_mul=t_mul)
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
class LinearCosineDecayTest(test_util.TensorFlowTestCase):


@ -38,6 +38,7 @@ See the @{$python/train} guide.
@@clip_by_global_norm
@@global_norm
@@cosine_decay
@@cosine_decay_restarts
@@linear_cosine_decay
@@noisy_linear_cosine_decay
@@exponential_decay


@ -266,7 +266,11 @@ tf_module {
}
member_method {
name: "cosine_decay"
argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
}
member_method {
name: "cosine_decay_restarts"
argspec: "args=[\'learning_rate\', \'global_step\', \'first_decay_steps\', \'t_mul\', \'m_mul\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'2.0\', \'1.0\', \'0.0\', \'None\'], "
}
member_method {
name: "create_global_step"


@ -1,4 +1,4 @@
FROM ubuntu:14.04
FROM ubuntu:16.04
LABEL maintainer="Jan Prach <jendap@google.com>"


@ -1,4 +1,4 @@
FROM ubuntu:14.04
FROM ubuntu:16.04
LABEL maintainer="Jan Prach <jendap@google.com>"


@ -1,4 +1,4 @@
FROM ubuntu:14.04
FROM ubuntu:16.04
LABEL authors="Andrew Gibiansky <andrew.gibiansky@gmail.com>, Joel Hestness <jthestness@gmail.com>"


@ -1,4 +1,4 @@
FROM ubuntu:14.04
FROM ubuntu:16.04
LABEL maintainer="Jonathan Hseu <jhseu@google.com>"


@ -1,4 +1,4 @@
FROM ubuntu:14.04
FROM ubuntu:16.04
LABEL maintainer="Jan Prach <jendap@google.com>"


@ -1,4 +1,4 @@
FROM ubuntu:14.04
FROM ubuntu:16.04
LABEL maintainer="Jan Prach <jendap@google.com>"


@ -111,9 +111,9 @@ do_pylint() {
fi
if [[ $1 == "PYTHON2" ]]; then
PYLINT_BIN="python /usr/local/lib/python2.7/dist-packages/pylint/lint.py"
PYLINT_BIN="python -m pylint"
elif [[ $1 == "PYTHON3" ]]; then
PYLINT_BIN="python3 /usr/local/lib/python3.4/dist-packages/pylint/lint.py"
PYLINT_BIN="python3 -m pylint"
else
echo "Unrecognized python version (PYTHON2 | PYTHON3): $1"
return 1


@ -15,7 +15,7 @@
# ==============================================================================
# Select bazel version.
BAZEL_VERSION="0.5.4"
BAZEL_VERSION="0.8.0"
set +e
local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}')


@ -22,9 +22,24 @@
# fkrull/deadsnakes is for Python3.6
add-apt-repository -y ppa:fkrull/deadsnakes
apt-get update
apt-get upgrade
# Install python dep
apt-get install python-dev
# Install bz2 dep
apt-get install libbz2-dev
# Install curses dep
apt-get install libncurses5 libncurses5-dev
apt-get install libncursesw5 libncursesw5-dev
# Install readline dep
apt-get install libreadline6 libreadline6-dev
# Install sqlite3 dependencies
apt-get install libsqlite3-dev
set -e
# Install Python 3.6 and dev library
wget https://www.python.org/ftp/python/3.6.1/Python-3.6.1.tar.xz
tar xvf Python-3.6.1.tar.xz
@ -63,6 +78,10 @@ pip3 install scikit-learn==0.18.1
# pandas required by `inflow`
pip3 install pandas==0.19.2
pip3 install gnureadline
pip3 install bz2file
# Install recent-enough version of wheel for Python 3.6 wheel builds
pip3 install wheel==0.29.0


@ -10,10 +10,7 @@ load(
py_binary(
name = "tf_upgrade",
srcs = [
"ast_edits.py",
"tf_upgrade.py",
],
srcs = ["tf_upgrade.py"],
srcs_version = "PY2AND3",
)
@ -22,7 +19,7 @@ py_test(
srcs = ["tf_upgrade_test.py"],
srcs_version = "PY2AND3",
deps = [
"tf_upgrade",
":tf_upgrade",
"//tensorflow/python:client_testlib",
"//tensorflow/python:framework_test_lib",
"@six_archive//:six",
@ -48,11 +45,11 @@ genrule(
"test_file_v1_0.py",
"report.txt",
],
cmd = ("$(location tf_upgrade)" +
cmd = ("$(location :tf_upgrade)" +
" --infile $(location testdata/test_file_v0_11.py)" +
" --outfile $(location test_file_v1_0.py)" +
" --reportfile $(location report.txt)"),
tools = ["tf_upgrade"],
tools = [":tf_upgrade"],
)
py_test(


@ -19,11 +19,486 @@ from __future__ import division
from __future__ import print_function
import argparse
from tensorflow.tools.compatibility import ast_edits
import ast
import collections
import os
import shutil
import sys
import tempfile
import traceback
class TFAPIChangeSpec(ast_edits.APIChangeSpec):
class APIChangeSpec(object):
"""This class defines the transformations that need to happen.
This class must provide the following fields:
* `function_keyword_renames`: maps function names to a map of old -> new
argument names
* `function_renames`: maps function names to new function names
* `change_to_function`: a set of function names that have changed (for
notifications)
* `function_reorders`: maps functions whose argument order has changed to the
list of arguments in the new order
* `function_handle`: maps function names to custom handlers for the function
For an example, see `TFAPIChangeSpec`.
"""
class _FileEditTuple(collections.namedtuple(
"_FileEditTuple", ["comment", "line", "start", "old", "new"])):
"""Each edit that is recorded by a _FileEditRecorder.
Fields:
comment: A description of the edit and why it was made.
line: The line number in the file where the edit occurs (1-indexed).
start: The column offset in the line where the edit occurs (0-indexed).
old: text string to remove (this must match what was in file).
new: text string to add in place of `old`.
"""
__slots__ = ()
class _FileEditRecorder(object):
"""Record changes that need to be done to the file."""
def __init__(self, filename):
# all edits are lists of chars
self._filename = filename
self._line_to_edit = collections.defaultdict(list)
self._errors = []
def process(self, text):
"""Process a list of strings, each corresponding to the recorded changes.
Args:
text: A list of lines of text (assumed to contain newlines)
Returns:
A tuple of the modified text and a textual description of what is done.
Raises:
ValueError: if substitution source location does not have expected text.
"""
change_report = ""
# Iterate over each line
for line, edits in self._line_to_edit.items():
offset = 0
# Sort by column so that edits are processed in order, making indexing
# adjustments cumulative for changes that alter the string length.
edits.sort(key=lambda x: x.start)
# Extract each line to a list of characters, because mutable lists
# are editable, unlike immutable strings.
char_array = list(text[line - 1])
# Record a description of the change
change_report += "%r Line %d\n" % (self._filename, line)
change_report += "-" * 80 + "\n\n"
for e in edits:
change_report += "%s\n" % e.comment
change_report += "\n Old: %s" % (text[line - 1])
# Make underscore buffers for underlining where in the line the edits were made
change_list = [" "] * len(text[line - 1])
change_list_new = [" "] * len(text[line - 1])
# Iterate for each edit
for e in edits:
# Create effective start, end by accounting for change in length due
# to previous edits
start_eff = e.start + offset
end_eff = start_eff + len(e.old)
# Make sure the edit is changing what it should be changing
old_actual = "".join(char_array[start_eff:end_eff])
if old_actual != e.old:
raise ValueError("Expected text %r but got %r" %
("".join(e.old), "".join(old_actual)))
# Make the edit
char_array[start_eff:end_eff] = list(e.new)
# Create the underline highlighting of the before and after
change_list[e.start:e.start + len(e.old)] = "~" * len(e.old)
change_list_new[start_eff:end_eff] = "~" * len(e.new)
# Keep track of how to generate effective ranges
offset += len(e.new) - len(e.old)
# Finish the report comment
change_report += " %s\n" % "".join(change_list)
text[line - 1] = "".join(char_array)
change_report += " New: %s" % (text[line - 1])
change_report += " %s\n\n" % "".join(change_list_new)
return "".join(text), change_report, self._errors
def add(self, comment, line, start, old, new, error=None):
"""Add a new change that is needed.
Args:
comment: A description of what was changed
line: Line number (1 indexed)
start: Column offset (0 indexed)
old: old text
new: new text
error: this "edit" is something that cannot be fixed automatically
Returns:
None
"""
self._line_to_edit[line].append(
_FileEditTuple(comment, line, start, old, new))
if error:
self._errors.append("%s:%d: %s" % (self._filename, line, error))
class _ASTCallVisitor(ast.NodeVisitor):
"""AST Visitor that processes function calls.
Updates function calls from old API version to new API version using a given
change spec.
"""
def __init__(self, filename, lines, api_change_spec):
self._filename = filename
self._file_edit = _FileEditRecorder(filename)
self._lines = lines
self._api_change_spec = api_change_spec
def process(self, lines):
return self._file_edit.process(lines)
def generic_visit(self, node):
ast.NodeVisitor.generic_visit(self, node)
def _rename_functions(self, node, full_name):
function_renames = self._api_change_spec.function_renames
try:
new_name = function_renames[full_name]
self._file_edit.add("Renamed function %r to %r" % (full_name,
new_name),
node.lineno, node.col_offset, full_name, new_name)
except KeyError:
pass
def _get_attribute_full_path(self, node):
"""Traverse an attribute to generate a full name e.g. tf.foo.bar.
Args:
node: A Node of type Attribute.
Returns:
a '.'-delimited full-name or None if the tree was not a simple form.
i.e. `foo()+b).bar` returns None, while `a.b.c` would return "a.b.c".
"""
curr = node
items = []
while not isinstance(curr, ast.Name):
if not isinstance(curr, ast.Attribute):
return None
items.append(curr.attr)
curr = curr.value
items.append(curr.id)
return ".".join(reversed(items))
def _find_true_position(self, node):
"""Return correct line number and column offset for a given node.
This is necessary mainly because ListComp's location reporting gives
the next token after the list comprehension's opening bracket.
Args:
node: Node for which we wish to know the lineno and col_offset
"""
import re
find_open = re.compile("^\s*(\\[).*$")
find_string_chars = re.compile("['\"]")
if isinstance(node, ast.ListComp):
# Strangely, ast.ListComp returns the col_offset of the first token
# after the '[' token which appears to be a bug. Workaround by
# explicitly finding the real start of the list comprehension.
line = node.lineno
col = node.col_offset
# loop over lines
while 1:
# Reverse the text and use a regular expression to search for whitespace
text = self._lines[line-1]
reversed_preceding_text = text[:col][::-1]
# First find if a [ can be found with only whitespace between it and
# col.
m = find_open.match(reversed_preceding_text)
if m:
new_col_offset = col - m.start(1) - 1
return line, new_col_offset
else:
if (reversed_preceding_text=="" or
reversed_preceding_text.isspace()):
line = line - 1
prev_line = self._lines[line - 1]
# TODO(aselle):
# this is poor comment detection, but it is good enough for
# cases where the comment does not contain string literal starting/
# ending characters. If ast gave us start and end locations of the
# ast nodes rather than just start, we could use string literal
# node ranges to filter out spurious #'s that appear in string
# literals.
comment_start = prev_line.find("#")
if comment_start == -1:
col = len(prev_line) - 1
elif find_string_chars.search(prev_line[comment_start:]) is None:
col = comment_start
else:
return None, None
else:
return None, None
# Most other nodes return proper locations (the `with` statement notably
# does not), but it is not possible to use that in an argument.
return node.lineno, node.col_offset
def visit_Call(self, node): # pylint: disable=invalid-name
"""Handle visiting a call node in the AST.
Args:
node: Current Node
"""
# Find a simple attribute name path e.g. "tf.foo.bar"
full_name = self._get_attribute_full_path(node.func)
# Make sure the func is marked as being part of a call
node.func.is_function_for_call = True
if full_name:
# Call special handlers
function_handles = self._api_change_spec.function_handle
if full_name in function_handles:
function_handles[full_name](self._file_edit, node)
# Examine any non-keyword argument and make it into a keyword argument
# if reordering required.
function_reorders = self._api_change_spec.function_reorders
function_keyword_renames = (
self._api_change_spec.function_keyword_renames)
if full_name in function_reorders:
reordered = function_reorders[full_name]
for idx, arg in enumerate(node.args):
lineno, col_offset = self._find_true_position(arg)
if lineno is None or col_offset is None:
self._file_edit.add(
"Failed to add keyword %r to reordered function %r"
% (reordered[idx], full_name), arg.lineno, arg.col_offset,
"", "",
error="A necessary keyword argument failed to be inserted.")
else:
keyword_arg = reordered[idx]
if (full_name in function_keyword_renames and
keyword_arg in function_keyword_renames[full_name]):
keyword_arg = function_keyword_renames[full_name][keyword_arg]
self._file_edit.add("Added keyword %r to reordered function %r"
% (reordered[idx], full_name), lineno,
col_offset, "", keyword_arg + "=")
# Examine each keyword argument and convert it to the final renamed form
renamed_keywords = ({} if full_name not in function_keyword_renames else
function_keyword_renames[full_name])
for keyword in node.keywords:
argkey = keyword.arg
argval = keyword.value
if argkey in renamed_keywords:
argval_lineno, argval_col_offset = self._find_true_position(argval)
if argval_lineno is not None and argval_col_offset is not None:
# TODO(aselle): We should scan backward to find the start of the
# keyword key. Unfortunately ast does not give you the location of
# keyword keys, so we are forced to infer it from the keyword arg
# value.
key_start = argval_col_offset - len(argkey) - 1
key_end = key_start + len(argkey) + 1
if (self._lines[argval_lineno - 1][key_start:key_end] ==
argkey + "="):
self._file_edit.add("Renamed keyword argument from %r to %r" %
(argkey, renamed_keywords[argkey]),
argval_lineno,
argval_col_offset - len(argkey) - 1,
argkey + "=", renamed_keywords[argkey] + "=")
continue
self._file_edit.add(
"Failed to rename keyword argument from %r to %r" %
(argkey, renamed_keywords[argkey]),
argval.lineno,
argval.col_offset - len(argkey) - 1,
"", "",
error="Failed to find keyword lexographically. Fix manually.")
ast.NodeVisitor.generic_visit(self, node)
def visit_Attribute(self, node): # pylint: disable=invalid-name
"""Handle bare Attributes i.e. [tf.foo, tf.bar].
Args:
node: Node that is of type ast.Attribute
"""
full_name = self._get_attribute_full_path(node)
if full_name:
self._rename_functions(node, full_name)
if full_name in self._api_change_spec.change_to_function:
if not hasattr(node, "is_function_for_call"):
new_text = full_name + "()"
self._file_edit.add("Changed %r to %r"%(full_name, new_text),
node.lineno, node.col_offset, full_name, new_text)
ast.NodeVisitor.generic_visit(self, node)
class ASTCodeUpgrader(object):
"""Handles upgrading a set of Python files using a given API change spec."""
def __init__(self, api_change_spec):
if not isinstance(api_change_spec, APIChangeSpec):
raise TypeError("Must pass APIChangeSpec to ASTCodeUpgrader, got %s" %
type(api_change_spec))
self._api_change_spec = api_change_spec
def process_file(self, in_filename, out_filename):
"""Process the given python file for incompatible changes.
Args:
in_filename: filename to parse
out_filename: output file to write to
Returns:
A tuple representing number of files processed, log of actions, errors
"""
# Write to a temporary file, just in case we are doing an in-place modify.
with open(in_filename, "r") as in_file, \
tempfile.NamedTemporaryFile("w", delete=False) as temp_file:
ret = self.process_opened_file(
in_filename, in_file, out_filename, temp_file)
shutil.move(temp_file.name, out_filename)
return ret
# Broad exceptions are required here because ast throws whatever it wants.
# pylint: disable=broad-except
def process_opened_file(self, in_filename, in_file, out_filename, out_file):
"""Process the given python file for incompatible changes.
This function is split out to facilitate StringIO testing from
tf_upgrade_test.py.
Args:
in_filename: filename to parse
in_file: opened file (or StringIO)
out_filename: output file to write to
out_file: opened file (or StringIO)
Returns:
A tuple representing number of files processed, log of actions, errors
"""
process_errors = []
text = "-" * 80 + "\n"
text += "Processing file %r\n outputting to %r\n" % (in_filename,
out_filename)
text += "-" * 80 + "\n\n"
parsed_ast = None
lines = in_file.readlines()
try:
parsed_ast = ast.parse("".join(lines))
except Exception:
text += "Failed to parse %r\n\n" % in_filename
text += traceback.format_exc()
if parsed_ast:
visitor = _ASTCallVisitor(in_filename, lines, self._api_change_spec)
visitor.visit(parsed_ast)
out_text, new_text, process_errors = visitor.process(lines)
text += new_text
if out_file:
out_file.write(out_text)
text += "\n"
return 1, text, process_errors
# pylint: enable=broad-except
def process_tree(self, root_directory, output_root_directory,
copy_other_files):
"""Processes upgrades on an entire tree of python files in place.
Note that only Python files are processed. If you have custom code in
other languages, you will need to manually upgrade it.
Args:
root_directory: Directory to walk and process.
output_root_directory: Directory to use as base.
copy_other_files: Copy files that are not touched by this converter.
Returns:
A tuple of files processed, the report string for all files, and errors
"""
# make sure output directory doesn't exist
if output_root_directory and os.path.exists(output_root_directory):
print("Output directory %r must not already exist." % (
output_root_directory))
sys.exit(1)
# make sure output directory does not overlap with root_directory
norm_root = os.path.split(os.path.normpath(root_directory))
norm_output = os.path.split(os.path.normpath(output_root_directory))
if norm_root == norm_output:
print("Output directory %r same as input directory %r" % (
root_directory, output_root_directory))
sys.exit(1)
    # Collect the list of files to process up front; this correctly handles
    # the case where the user puts the output directory inside the input
    # directory.
files_to_process = []
files_to_copy = []
for dir_name, _, file_list in os.walk(root_directory):
py_files = [f for f in file_list if f.endswith(".py")]
copy_files = [f for f in file_list if not f.endswith(".py")]
for filename in py_files:
fullpath = os.path.join(dir_name, filename)
fullpath_output = os.path.join(
output_root_directory, os.path.relpath(fullpath, root_directory))
files_to_process.append((fullpath, fullpath_output))
if copy_other_files:
for filename in copy_files:
fullpath = os.path.join(dir_name, filename)
fullpath_output = os.path.join(
output_root_directory, os.path.relpath(fullpath, root_directory))
files_to_copy.append((fullpath, fullpath_output))
file_count = 0
tree_errors = []
report = ""
report += ("=" * 80) + "\n"
report += "Input tree: %r\n" % root_directory
report += ("=" * 80) + "\n"
for input_path, output_path in files_to_process:
output_directory = os.path.dirname(output_path)
if not os.path.isdir(output_directory):
os.makedirs(output_directory)
file_count += 1
_, l_report, l_errors = self.process_file(input_path, output_path)
tree_errors += l_errors
report += l_report
for input_path, output_path in files_to_copy:
output_directory = os.path.dirname(output_path)
if not os.path.isdir(output_directory):
os.makedirs(output_directory)
shutil.copy(input_path, output_path)
return file_count, report, tree_errors
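# A minimal tree-upgrade sketch (hypothetical directories): the output tree
# must not already exist and must differ from the input tree.
#
#   upgrader = ASTCodeUpgrader(TFAPIChangeSpec())
#   count, report, errors = upgrader.process_tree(
#       "my_project/", "my_project_v1/", copy_other_files=True)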
class TFAPIChangeSpec(APIChangeSpec):
"""List of maps that describe what changed in the API."""
def __init__(self):
@@ -238,7 +713,7 @@ Simple usage:
default="report.txt")
args = parser.parse_args()
upgrade = ast_edits.ASTCodeUpgrader(TFAPIChangeSpec())
upgrade = ASTCodeUpgrader(TFAPIChangeSpec())
report_text = None
report_filename = args.report_filename
files_processed = 0
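# Typical command-line invocations (flag names as documented for
# tf_upgrade.py; treat them as assumptions if your copy differs):
#
#   python tf_upgrade.py --infile foo.py --outfile foo-upgraded.py
#   python tf_upgrade.py --intree coolcode --outtree coolcode-upgraded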

@@ -22,7 +22,6 @@ import tempfile
import six
from tensorflow.python.framework import test_util
from tensorflow.python.platform import test as test_lib
from tensorflow.tools.compatibility import ast_edits
from tensorflow.tools.compatibility import tf_upgrade
@@ -37,7 +36,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
def _upgrade(self, old_file_text):
in_file = six.StringIO(old_file_text)
out_file = six.StringIO()
upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
upgrader = tf_upgrade.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
count, report, errors = (
upgrader.process_opened_file("test.py", in_file,
"test_out.py", out_file))
@@ -140,7 +139,7 @@ class TestUpgradeFiles(test_util.TensorFlowTestCase):
upgraded = "tf.multiply(a, b)\n"
temp_file.write(original)
temp_file.close()
upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
upgrader = tf_upgrade.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
upgrader.process_file(temp_file.name, temp_file.name)
self.assertAllEqual(open(temp_file.name).read(), upgraded)
os.unlink(temp_file.name)

@@ -57,7 +57,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
>>/etc/bazel.bazelrc
# Install the most recent bazel release.
ENV BAZEL_VERSION 0.5.4
ENV BAZEL_VERSION 0.8.0
WORKDIR /
RUN mkdir /bazel && \
cd /bazel && \

@@ -66,7 +66,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
>>/etc/bazel.bazelrc
# Install the most recent bazel release.
ENV BAZEL_VERSION 0.5.4
ENV BAZEL_VERSION 0.8.0
WORKDIR /
RUN mkdir /bazel && \
cd /bazel && \
