Merge changes from github.
Change: 131437429

parent 0f867ebf83
commit b0bdff4827
@@ -34,9 +34,9 @@ and discussion.**

People who are a little more adventurous can also try our nightly binaries:

* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
-* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/140/artifact/pip_test/whl/tensorflow-0.8.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
+* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/140/artifact/pip_test/whl/tensorflow-0.8.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac1-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac1-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac1-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac1-slave/))
-* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
+* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
* [Android](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/lastSuccessfulBuild/artifact/bazel-out/local_linux/bin/tensorflow/examples/android/tensorflow_demo.apk) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/))

#### *Try your first TensorFlow program*
configure (10 changes)
@@ -98,7 +98,7 @@ while true; do
    fi
  fi
  if [ -e "$GCC_HOST_COMPILER_PATH" ]; then
-    export CC=$GCC_HOST_COMPILER_PATH
+    export GCC_HOST_COMPILER_PATH
    break
  fi
  echo "Invalid gcc path. ${GCC_HOST_COMPILER_PATH} cannot be found" 1>&2
@@ -142,7 +142,7 @@ while true; do

  if [ -e "${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH}" ]; then
    export CUDA_TOOLKIT_PATH
-    export CUDA_VERSION=$TF_CUDA_VERSION
+    export TF_CUDA_VERSION
    break
  fi
  echo "Invalid path to CUDA $TF_CUDA_VERSION toolkit. ${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH} cannot be found"
@@ -203,7 +203,7 @@ while true; do
  fi

  if [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_ALT_PATH}" -o -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_PATH}" ]; then
-    export CUDNN_VERSION=$TF_CUDNN_VERSION
+    export TF_CUDNN_VERSION
    export CUDNN_INSTALL_PATH
    break
  fi
@@ -211,7 +211,7 @@ while true; do
  if [ "$OSNAME" == "Linux" ]; then
    CUDNN_PATH_FROM_LDCONFIG="$(ldconfig -p | sed -n 's/.*libcudnn.so .* => \(.*\)/\1/p')"
    if [ -e "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}" ]; then
-      export CUDNN_VERSION=$TF_CUDNN_VERSION
+      export TF_CUDNN_VERSION
      export CUDNN_INSTALL_PATH="$(dirname ${CUDNN_PATH_FROM_LDCONFIG})"
      break
    fi
@@ -263,7 +263,7 @@ EOF
      exit 1
    fi
  else
-    export CUDA_COMPUTE_CAPABILITIES=$TF_CUDA_COMPUTE_CAPABILITIES
+    export TF_CUDA_COMPUTE_CAPABILITIES
    break
  fi
  TF_CUDA_COMPUTE_CAPABILITIES=""
@@ -174,6 +174,15 @@ cc_binary(
    ],
)

+cc_binary(
+    name = "libtensorflow_c.so",
+    linkshared = 1,
+    deps = [
+        "//tensorflow/c:c_api",
+        "//tensorflow/core:tensorflow",
+    ],
+)
+
cc_binary(
    name = "libtensorflow_cc.so",
    linkshared = 1,
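The new `libtensorflow_c.so` target packages the TensorFlow C API as a standalone shared library. As a hedged smoke test (the `bazel-bin` output path is an assumption about where bazel places the artifact, not something stated in the diff), the library can be loaded and queried for its version string:

```python
# Hypothetical smoke test for the new shared-library target; the bazel-bin
# path below is an assumption, not part of this commit.
import ctypes

lib = ctypes.CDLL("bazel-bin/tensorflow/libtensorflow_c.so")
lib.TF_Version.restype = ctypes.c_char_p  # TF_Version() is part of the C API
print(lib.TF_Version())                   # e.g. b'0.10.0rc0'
```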
@@ -232,7 +232,7 @@ string PrintAttrValue(string op, const AttrValue& attr_value) {
string ToCamelCase(const string& str) {
  string result;
  const char joiner = '_';
-  int i = 0;
+  size_t i = 0;
  bool cap = true;
  while (i < str.size()) {
    const char c = str[i++];
@@ -51,7 +51,8 @@ include(highwayhash)

# Let's get to work!
include(tf_core_framework.cmake)
-include(tf_stream_executor.cmake)
+# NOTE: Disabled until issue #3996 is fixed.
+# include(tf_stream_executor.cmake)
include(tf_core_cpu.cmake)
include(tf_models.cmake)
include(tf_core_ops.cmake)
@@ -13,6 +13,8 @@ file(GLOB_RECURSE tf_core_kernels_exclude_srcs
    "${tensorflow_source_dir}/tensorflow/core/kernels/*testutil.cc"
    "${tensorflow_source_dir}/tensorflow/core/kernels/*main.cc"
    "${tensorflow_source_dir}/tensorflow/core/kernels/*.cu.cc"
+    "${tensorflow_source_dir}/tensorflow/core/kernels/debug_ops.h"
+    "${tensorflow_source_dir}/tensorflow/core/kernels/debug_ops.cc"
)

list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_exclude_srcs})
@@ -72,6 +72,9 @@ rundown:
  unused because no other code references the variables, but in fact their
  constructors have the important side effect of registering the class.

+- You'll need to include the Accelerate framework in the "Link Binary with
+  Libraries" build phase of your project.
+
- C++11 support (or later) should be enabled by setting `C++ Language Dialect` to
  `GNU++11` (or `GNU++14`), and `C++ Standard Library` to `libc++`.
@@ -9,6 +9,7 @@
/* Begin PBXBuildFile section */
590E7D881D02091F00DF5523 /* libprotobuf-lite.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 590E7D861D02091F00DF5523 /* libprotobuf-lite.a */; };
590E7D8A1D0209DD00DF5523 /* libprotobuf.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 590E7D871D02091F00DF5523 /* libprotobuf.a */; };
+5993C7701D5D4E7F0048CE6A /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 5993C76F1D5D4E7F0048CE6A /* Accelerate.framework */; };
59A3D0011CF4E68100C4259F /* AppDelegate.mm in Sources */ = {isa = PBXBuildFile; fileRef = 59A3CFF21CF4E68100C4259F /* AppDelegate.mm */; };
59A3D0031CF4E68100C4259F /* grace_hopper.jpg in Resources */ = {isa = PBXBuildFile; fileRef = 59A3CFF51CF4E68100C4259F /* grace_hopper.jpg */; };
59A3D0051CF4E68100C4259F /* imagenet_comp_graph_label_strings.txt in Resources */ = {isa = PBXBuildFile; fileRef = 59A3CFF71CF4E68100C4259F /* imagenet_comp_graph_label_strings.txt */; };
@@ -25,6 +26,7 @@
590E7D861D02091F00DF5523 /* libprotobuf-lite.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "libprotobuf-lite.a"; path = "../../makefile/gen/protobuf_ios/lib/libprotobuf-lite.a"; sourceTree = "<group>"; };
590E7D871D02091F00DF5523 /* libprotobuf.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libprotobuf.a; path = ../../makefile/gen/protobuf_ios/lib/libprotobuf.a; sourceTree = "<group>"; };
5911579B1CF4011C00C31E3A /* benchmark.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = benchmark.app; sourceTree = BUILT_PRODUCTS_DIR; };
+5993C76F1D5D4E7F0048CE6A /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
59A3CFF11CF4E68100C4259F /* AppDelegate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
59A3CFF21CF4E68100C4259F /* AppDelegate.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = AppDelegate.mm; sourceTree = "<group>"; };
59A3CFF41CF4E68100C4259F /* cropped_panda.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = cropped_panda.jpg; sourceTree = "<group>"; };
@@ -50,6 +52,7 @@
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
+5993C7701D5D4E7F0048CE6A /* Accelerate.framework in Frameworks */,
590E7D8A1D0209DD00DF5523 /* libprotobuf.a in Frameworks */,
590E7D881D02091F00DF5523 /* libprotobuf-lite.a in Frameworks */,
59A3D0181CF4E86100C4259F /* UIKit.framework in Frameworks */,
@@ -63,6 +66,7 @@
591157921CF4011C00C31E3A = {
isa = PBXGroup;
children = (
+5993C76F1D5D4E7F0048CE6A /* Accelerate.framework */,
590E7D861D02091F00DF5523 /* libprotobuf-lite.a */,
590E7D871D02091F00DF5523 /* libprotobuf.a */,
59A3D0171CF4E86100C4259F /* UIKit.framework */,
@@ -24,6 +24,7 @@
592FF90D18EDD0DA00C164F8 /* MainStoryboard_iPhone.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 592FF90A18EDD0DA00C164F8 /* MainStoryboard_iPhone.storyboard */; };
592FF92518EE240200C164F8 /* CameraExampleAppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 592FF92218EE240200C164F8 /* CameraExampleAppDelegate.m */; };
592FF92618EE240200C164F8 /* CameraExampleViewController.mm in Sources */ = {isa = PBXBuildFile; fileRef = 592FF92418EE240200C164F8 /* CameraExampleViewController.mm */; };
+5993C7721D5D4E980048CE6A /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 5993C7711D5D4E980048CE6A /* Accelerate.framework */; };
/* End PBXBuildFile section */

/* Begin PBXFileReference section */
@@ -52,6 +53,7 @@
592FF92218EE240200C164F8 /* CameraExampleAppDelegate.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = CameraExampleAppDelegate.m; sourceTree = SOURCE_ROOT; };
592FF92318EE240200C164F8 /* CameraExampleViewController.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CameraExampleViewController.h; sourceTree = SOURCE_ROOT; };
592FF92418EE240200C164F8 /* CameraExampleViewController.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = CameraExampleViewController.mm; sourceTree = SOURCE_ROOT; };
+5993C7711D5D4E980048CE6A /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS9.3.sdk/System/Library/Frameworks/Accelerate.framework; sourceTree = DEVELOPER_DIR; };
/* End PBXFileReference section */

/* Begin PBXFrameworksBuildPhase section */
@@ -59,6 +61,7 @@
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
+5993C7721D5D4E980048CE6A /* Accelerate.framework in Frameworks */,
591D3EDF1CFFAD230059011C /* libprotobuf-lite.a in Frameworks */,
591D3EE01CFFAD230059011C /* libprotobuf.a in Frameworks */,
591D3ECF1CFF7FCE0059011C /* ImageIO.framework in Frameworks */,
@@ -103,6 +106,7 @@
592FF8B718ECBD7600C164F8 /* Frameworks */ = {
isa = PBXGroup;
children = (
+5993C7711D5D4E980048CE6A /* Accelerate.framework */,
591D3EDD1CFFAD230059011C /* libprotobuf-lite.a */,
591D3EDE1CFFAD230059011C /* libprotobuf.a */,
591D3ECE1CFF7FCE0059011C /* ImageIO.framework */,
@@ -9,6 +9,7 @@
/* Begin PBXBuildFile section */
590E7D881D02091F00DF5523 /* libprotobuf-lite.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 590E7D861D02091F00DF5523 /* libprotobuf-lite.a */; };
590E7D8A1D0209DD00DF5523 /* libprotobuf.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 590E7D871D02091F00DF5523 /* libprotobuf.a */; };
+5993C7741D5D4EAF0048CE6A /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 5993C7731D5D4EAF0048CE6A /* Accelerate.framework */; };
59A3D0011CF4E68100C4259F /* AppDelegate.mm in Sources */ = {isa = PBXBuildFile; fileRef = 59A3CFF21CF4E68100C4259F /* AppDelegate.mm */; };
59A3D0031CF4E68100C4259F /* grace_hopper.jpg in Resources */ = {isa = PBXBuildFile; fileRef = 59A3CFF51CF4E68100C4259F /* grace_hopper.jpg */; };
59A3D0051CF4E68100C4259F /* imagenet_comp_graph_label_strings.txt in Resources */ = {isa = PBXBuildFile; fileRef = 59A3CFF71CF4E68100C4259F /* imagenet_comp_graph_label_strings.txt */; };
@@ -25,6 +26,7 @@
590E7D861D02091F00DF5523 /* libprotobuf-lite.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "libprotobuf-lite.a"; path = "../../makefile/gen/protobuf_ios/lib/libprotobuf-lite.a"; sourceTree = "<group>"; };
590E7D871D02091F00DF5523 /* libprotobuf.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libprotobuf.a; path = ../../makefile/gen/protobuf_ios/lib/libprotobuf.a; sourceTree = "<group>"; };
5911579B1CF4011C00C31E3A /* tf_ios_makefile_example.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = tf_ios_makefile_example.app; sourceTree = BUILT_PRODUCTS_DIR; };
+5993C7731D5D4EAF0048CE6A /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
59A3CFF11CF4E68100C4259F /* AppDelegate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
59A3CFF21CF4E68100C4259F /* AppDelegate.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = AppDelegate.mm; sourceTree = "<group>"; };
59A3CFF41CF4E68100C4259F /* cropped_panda.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = cropped_panda.jpg; sourceTree = "<group>"; };
@@ -50,6 +52,7 @@
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
+5993C7741D5D4EAF0048CE6A /* Accelerate.framework in Frameworks */,
590E7D8A1D0209DD00DF5523 /* libprotobuf.a in Frameworks */,
590E7D881D02091F00DF5523 /* libprotobuf-lite.a in Frameworks */,
59A3D0181CF4E86100C4259F /* UIKit.framework in Frameworks */,
@@ -63,6 +66,7 @@
591157921CF4011C00C31E3A = {
isa = PBXGroup;
children = (
+5993C7731D5D4EAF0048CE6A /* Accelerate.framework */,
590E7D861D02091F00DF5523 /* libprotobuf-lite.a */,
590E7D871D02091F00DF5523 /* libprotobuf.a */,
59A3D0171CF4E86100C4259F /* UIKit.framework */,
@@ -27,6 +27,7 @@ common machine learning algorithms.
@@convolution2d_transpose
@@flatten
@@fully_connected
+@@layer_norm
@@max_pool2d
@@one_hot_encoding
@@repeat
@@ -52,6 +52,7 @@ __all__ = ['avg_pool2d',
           'dropout',
           'flatten',
           'fully_connected',
+           'layer_norm',
           'linear',
           'max_pool2d',
           'one_hot_encoding',
|
||||
outputs.set_shape(inputs_shape)
|
||||
if activation_fn:
|
||||
outputs = activation_fn(outputs)
|
||||
return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
|
||||
return utils.collect_named_outputs(outputs_collections,
|
||||
sc.original_name_scope, outputs)
|
||||
|
||||
|
||||
@add_arg_scope
|
||||
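The `sc.name` to `sc.original_name_scope` change above (repeated for each layer below) matters when a layer is built inside an enclosing `tf.name_scope`: `sc.name` is only the variable-scope name (e.g. `Conv`), while `sc.original_name_scope` keeps the full prefix (e.g. `fe/Conv/`) that the outputs collection should record. A hedged sketch mirroring the new tests added later in this commit:

```python
# Minimal illustration (TF 0.10-era contrib API assumed); the new
# testCreateConvWithCollection below asserts exactly this behavior.
import tensorflow as tf

images = tf.random_uniform((5, 3, 3, 3), seed=1)
with tf.name_scope('fe'):
    net = tf.contrib.layers.convolution2d(
        images, 32, [3, 3], outputs_collections='outputs', scope='Conv')
print(tf.get_collection('outputs')[0].name)  # expected: fe/Conv
```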
@@ -328,7 +330,8 @@ def bias_add(inputs,
    outputs = nn.bias_add(inputs, biases)
    if activation_fn:
      outputs = activation_fn(outputs)
-    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
+    return utils.collect_named_outputs(outputs_collections,
+                                       sc.original_name_scope, outputs)


@add_arg_scope
@@ -441,7 +444,8 @@ def convolution2d(inputs,
      outputs = nn.bias_add(outputs, biases)
    if activation_fn:
      outputs = activation_fn(outputs)
-    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
+    return utils.collect_named_outputs(outputs_collections,
+                                       sc.original_name_scope, outputs)


@add_arg_scope
@@ -541,7 +545,8 @@ def convolution2d_in_plane(

    if activation_fn:
      outputs = activation_fn(outputs)
-    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
+    return utils.collect_named_outputs(outputs_collections,
+                                       sc.original_name_scope, outputs)


@add_arg_scope
@@ -668,7 +673,8 @@ def convolution2d_transpose(

    if activation_fn:
      outputs = activation_fn(outputs)
-    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
+    return utils.collect_named_outputs(outputs_collections,
+                                       sc.original_name_scope, outputs)


@add_arg_scope
@@ -845,7 +851,95 @@ def fully_connected(inputs,
      # Reshape back outputs
      outputs = array_ops.reshape(outputs, array_ops.pack(out_shape))
      outputs.set_shape(static_shape)
-    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
+    return utils.collect_named_outputs(outputs_collections,
+                                       sc.original_name_scope, outputs)
+
+
+@add_arg_scope
+def layer_norm(inputs,
+               center=True,
+               scale=True,
+               activation_fn=None,
+               reuse=None,
+               variables_collections=None,
+               outputs_collections=None,
+               trainable=True,
+               scope=None):
+  """Adds a Layer Normalization layer from https://arxiv.org/abs/1607.06450.
+
+    "Layer Normalization"
+
+    Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton
+
+  Can be used as a normalizer function for conv2d and fully_connected.
+
+  Args:
+    inputs: a tensor with 2 or more dimensions. The normalization
+            occurs over all but the first dimension.
+    center: If True, subtract `beta`. If False, `beta` is ignored.
+    scale: If True, multiply by `gamma`. If False, `gamma` is
+      not used. When the next layer is linear (also e.g. `nn.relu`), this can be
+      disabled since the scaling can be done by the next layer.
+    activation_fn: Optional activation function.
+    reuse: whether or not the layer and its variables should be reused. To be
+      able to reuse the layer scope must be given.
+    variables_collections: optional collections for the variables.
+    outputs_collections: collections to add the outputs.
+    trainable: If `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
+    scope: Optional scope for `variable_op_scope`.
+
+  Returns:
+    A `Tensor` representing the output of the operation.
+
+  Raises:
+    ValueError: if rank or last dimension of `inputs` is undefined.
+  """
+  with variable_scope.variable_scope(scope, 'LayerNorm', [inputs],
+                                     reuse=reuse) as sc:
+    inputs = ops.convert_to_tensor(inputs)
+    inputs_shape = inputs.get_shape()
+    inputs_rank = inputs_shape.ndims
+    if inputs_rank is None:
+      raise ValueError('Inputs %s has undefined rank.' % inputs.name)
+    dtype = inputs.dtype.base_dtype
+    axis = list(range(1, inputs_rank))
+    params_shape = inputs_shape[-1:]
+    if not params_shape.is_fully_defined():
+      raise ValueError('Inputs %s has undefined last dimension %s.' % (
+          inputs.name, params_shape))
+    # Allocate parameters for the beta and gamma of the normalization.
+    beta, gamma = None, None
+    if center:
+      beta_collections = utils.get_variable_collections(variables_collections,
+                                                        'beta')
+      beta = variables.model_variable('beta',
+                                      shape=params_shape,
+                                      dtype=dtype,
+                                      initializer=init_ops.zeros_initializer,
+                                      collections=beta_collections,
+                                      trainable=trainable)
+    if scale:
+      gamma_collections = utils.get_variable_collections(variables_collections,
+                                                         'gamma')
+      gamma = variables.model_variable('gamma',
+                                       shape=params_shape,
+                                       dtype=dtype,
+                                       initializer=init_ops.ones_initializer,
+                                       collections=gamma_collections,
+                                       trainable=trainable)
+    # Calculate the moments on the last axis (layer activations).
+    mean, variance = nn.moments(inputs, axis, keep_dims=True)
+    # Compute layer normalization using the batch_normalization function.
+    variance_epsilon = 1E-12
+    outputs = nn.batch_normalization(
+        inputs, mean, variance, beta, gamma, variance_epsilon)
+    outputs.set_shape(inputs_shape)
+    if activation_fn:
+      outputs = activation_fn(outputs)
+    return utils.collect_named_outputs(outputs_collections,
+                                       sc.original_name_scope,
+                                       outputs)


@add_arg_scope
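Per the docstring and code above, the new layer computes y = gamma * (x - mean) / sqrt(variance + 1e-12) + beta, with mean and variance taken over all but the first (batch) axis. A minimal usage sketch, not part of the diff, assuming the TF 0.10-era contrib API added here:

```python
# Minimal sketch of the new layer_norm (an illustration, not from the commit).
import tensorflow as tf

images = tf.random_uniform((5, 3, 3, 3), seed=1)
# Applied directly:
normalized = tf.contrib.layers.layer_norm(images)
# Or, as the docstring suggests, as the normalizer for another contrib layer:
net = tf.contrib.layers.convolution2d(
    images, 32, [3, 3], normalizer_fn=tf.contrib.layers.layer_norm)
```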
@@ -1094,7 +1188,8 @@ def separable_convolution2d(

    if activation_fn:
      outputs = activation_fn(outputs)
-    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
+    return utils.collect_named_outputs(outputs_collections,
+                                       sc.original_name_scope, outputs)


@add_arg_scope
@@ -203,6 +203,16 @@ class Convolution2dTest(tf.test.TestCase):
          scope='conv1')
      self.assertEquals(output.op.name, 'conv1/Relu')

+  def testCreateConvWithCollection(self):
+    height, width = 3, 3
+    images = tf.random_uniform((5, height, width, 3), seed=1)
+    with tf.name_scope('fe'):
+      conv = tf.contrib.layers.convolution2d(
+          images, 32, [3, 3], outputs_collections='outputs',
+          scope='Conv')
+    namedOutputs = tf.get_collection('outputs')[0]
+    self.assertEquals(namedOutputs.name, 'fe/Conv')
+
  def testCreateConvWithoutActivation(self):
    height, width = 3, 3
    with self.test_session():
@@ -989,6 +999,16 @@ class FCTest(tf.test.TestCase):
      output = tf.contrib.layers.fully_connected(inputs, 32, scope='fc1')
      self.assertEquals(output.op.name, 'fc1/Relu')

+  def testCreateFCWithCollection(self):
+    height, width = 3, 3
+    inputs = tf.random_uniform((5, height * width * 3), seed=1)
+    with tf.name_scope('fe'):
+      fc = tf.contrib.layers.fully_connected(
+          inputs, 7, outputs_collections='outputs',
+          scope='fc')
+    namedOutputs = tf.get_collection('outputs')[0]
+    self.assertEquals(namedOutputs.name, 'fe/fc')
+
  def testCreateFcCreatesWeightsAndBiasesVars(self):
    height, width = 3, 3
    inputs = tf.random_uniform((5, height * width * 3), seed=1)
@@ -1542,6 +1562,90 @@ class BatchNormTest(tf.test.TestCase):
      self.assertAllClose(moving_variance.eval(), expected_var)


+class LayerNormTest(tf.test.TestCase):
+
+  def testUnknownShape(self):
+    with tf.Graph().as_default() as g, self.test_session(g):
+      inputs = tf.placeholder(dtype=tf.float32)
+      with self.assertRaisesRegexp(ValueError, 'undefined rank'):
+        tf.contrib.layers.layer_norm(inputs)
+
+  def testUnknownLastDim(self):
+    with tf.Graph().as_default() as g, self.test_session(g):
+      inputs = tf.placeholder(dtype=tf.float32)
+      inputs.set_shape(tf.TensorShape((5, 3, 3, None)))
+      with self.assertRaisesRegexp(ValueError, 'undefined last dimension'):
+        tf.contrib.layers.layer_norm(inputs)
+
+  def testCreateOp(self):
+    height, width = 3, 3
+    with self.test_session():
+      images = np.random.uniform(size=(5, height, width, 3))
+      output = tf.contrib.layers.layer_norm(images)
+      self.assertTrue(output.op.name.startswith('LayerNorm/batchnorm'))
+      self.assertListEqual(output.get_shape().as_list(), [5, height, width, 3])
+
+  def testCreateVariables(self):
+    height, width = 3, 3
+    with self.test_session():
+      images = tf.random_uniform((5, height, width, 3), seed=1)
+      tf.contrib.layers.layer_norm(images)
+      beta = tf.contrib.framework.get_variables_by_name('beta')[0]
+      gamma = tf.contrib.framework.get_variables_by_name('gamma')[0]
+      self.assertEquals(beta.op.name, 'LayerNorm/beta')
+      self.assertEquals(gamma.op.name, 'LayerNorm/gamma')
+
+  def testReuseVariables(self):
+    height, width = 3, 3
+    with self.test_session():
+      images = tf.random_uniform((5, height, width, 3), seed=1)
+      tf.contrib.layers.layer_norm(images, scope='ln')
+      tf.contrib.layers.layer_norm(images, scope='ln', reuse=True)
+      beta = tf.contrib.framework.get_variables_by_name('beta')
+      gamma = tf.contrib.framework.get_variables_by_name('gamma')
+      self.assertEquals(len(beta), 1)
+      self.assertEquals(len(gamma), 1)
+
+  def testReuseVars(self):
+    height, width = 3, 3
+    with self.test_session() as sess:
+      image_shape = (10, height, width, 3)
+      image_values = np.random.rand(*image_shape)
+      images = tf.constant(image_values, shape=image_shape, dtype=tf.float32)
+      output_train = tf.contrib.layers.layer_norm(images, scope='LN')
+      output_eval = tf.contrib.layers.layer_norm(images,
+                                                 scope='LN',
+                                                 reuse=True)
+      # Initialize all variables
+      sess.run(tf.initialize_all_variables())
+      # output_train and output_eval should be the same.
+      self.assertAllClose(sess.run([output_train]), sess.run([output_eval]))
+
+  def doOutputTest(self, input_shape):
+    with self.test_session() as sess:
+      input_values = np.random.rand(*input_shape)
+      inputs = tf.constant(input_values, shape=input_shape, dtype=tf.float32)
+      output_op = tf.contrib.layers.layer_norm(inputs, scope='LN')
+      # Initialize all variables
+      sess.run(tf.initialize_all_variables())
+      # The mean and variance of the output should be close to 0 and 1
+      # respectively.
+      moments_axis = tuple([i for i in range(1, len(input_shape))])
+      outputs = sess.run(output_op)
+      expected_mean = np.zeros(input_shape[0])
+      expected_var = np.ones(input_shape[0])
+      mean = np.mean(outputs, axis=moments_axis)
+      var = np.var(outputs, axis=moments_axis)
+      tol = 1e-5
+      self.assertAllClose(mean, expected_mean, rtol=tol, atol=tol)
+      self.assertAllClose(var, expected_var, rtol=tol, atol=tol)
+
+  def testOutput2DInput(self):
+    self.doOutputTest((10, 300))
+
+  def testOutput4DInput(self):
+    self.doOutputTest((100, 10, 10, 3))
+
+
class MaxPool2DTest(tf.test.TestCase):

  def testCreateMaxPool(self):
@@ -684,6 +684,9 @@ py_test(
    size = "small",
    srcs = ["python/learn/utils/export_test.py"],
    srcs_version = "PY2AND3",
+    tags = [
+        "manual",  # http://b/31032996
+    ],
    deps = [
        ":learn",
        "//tensorflow:tensorflow_py",
@@ -285,6 +285,7 @@ ifeq ($(TARGET),IOS)
CXXFLAGS += -miphoneos-version-min=$(MIN_SDK_VERSION) \
-arch armv7 \
-D__thread= \
+-DUSE_GEMM_FOR_CONV \
-Wno-c++11-narrowing \
-mno-thumb \
-DTF_LEAN_BINARY \
@@ -295,6 +296,7 @@ ifeq ($(TARGET),IOS)
${IPHONEOS_SYSROOT}
LDFLAGS := -arch armv7 \
-miphoneos-version-min=${MIN_SDK_VERSION} \
+-framework Accelerate \
-Xlinker -S \
-Xlinker -x \
-Xlinker -dead_strip \
@@ -306,6 +308,7 @@ ifeq ($(TARGET),IOS)
CXXFLAGS += -miphoneos-version-min=$(MIN_SDK_VERSION) \
-arch armv7s \
-D__thread= \
+-DUSE_GEMM_FOR_CONV \
-Wno-c++11-narrowing \
-mno-thumb \
-DTF_LEAN_BINARY \
@@ -316,6 +319,7 @@ ifeq ($(TARGET),IOS)
${IPHONEOS_SYSROOT}
LDFLAGS := -arch armv7s \
-miphoneos-version-min=${MIN_SDK_VERSION} \
+-framework Accelerate \
-Xlinker -S \
-Xlinker -x \
-Xlinker -dead_strip \
@@ -327,6 +331,7 @@ ifeq ($(TARGET),IOS)
CXXFLAGS += -miphoneos-version-min=$(MIN_SDK_VERSION) \
-arch arm64 \
-D__thread= \
+-DUSE_GEMM_FOR_CONV \
-Wno-c++11-narrowing \
-DTF_LEAN_BINARY \
-D__ANDROID_TYPES_SLIM__ \
@@ -336,6 +341,7 @@ ifeq ($(TARGET),IOS)
${IPHONEOS_SYSROOT}
LDFLAGS := -arch arm64 \
-miphoneos-version-min=${MIN_SDK_VERSION} \
+-framework Accelerate \
-Xlinker -S \
-Xlinker -x \
-Xlinker -dead_strip \
@@ -347,6 +353,7 @@ ifeq ($(TARGET),IOS)
CXXFLAGS += -mios-simulator-version-min=$(MIN_SDK_VERSION) \
-arch i386 \
-D__thread= \
+-DUSE_GEMM_FOR_CONV \
-Wno-c++11-narrowing \
-DTF_LEAN_BINARY \
-D__ANDROID_TYPES_SLIM__ \
@@ -356,6 +363,7 @@ ifeq ($(TARGET),IOS)
${IPHONESIMULATOR_SYSROOT}
LDFLAGS := -arch i386 \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
+-framework Accelerate \
-Xlinker -S \
-Xlinker -x \
-Xlinker -dead_strip \
@@ -367,6 +375,7 @@ ifeq ($(TARGET),IOS)
CXXFLAGS += -mios-simulator-version-min=$(MIN_SDK_VERSION) \
-arch x86_64 \
-D__thread= \
+-DUSE_GEMM_FOR_CONV \
-Wno-c++11-narrowing \
-DTF_LEAN_BINARY \
-D__ANDROID_TYPES_SLIM__ \
@@ -376,6 +385,7 @@ ifeq ($(TARGET),IOS)
${IPHONESIMULATOR_SYSROOT}
LDFLAGS := -arch x86_64 \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
+-framework Accelerate \
-Xlinker -S \
-Xlinker -x \
-Xlinker -dead_strip \
@@ -260,15 +260,17 @@ For other variations of valid optimization flags, see [clang optimization levels
## Raspberry Pi

Building on the Raspberry Pi is similar to a normal Linux system. First
-download the dependencies and build protobuf:
+download the dependencies, install the required packages and build protobuf:

```bash
tensorflow/contrib/makefile/download_dependencies.sh
+sudo apt-get install autoconf automake libtool
cd tensorflow/contrib/makefile/downloads/protobuf/
./autogen.sh
./configure
make
sudo make install
+sudo ldconfig # refresh shared library cache
cd ../../../../..
```
@@ -99,6 +99,7 @@ tensorflow/core/kernels/cwise_op_equal_to.cc
tensorflow/core/kernels/cwise_op_div.cc
tensorflow/core/kernels/cwise_op_add.cc
tensorflow/core/kernels/ctc_decoder_ops.cc
+tensorflow/core/kernels/conv_ops_using_gemm.cc
tensorflow/core/kernels/conv_ops.cc
tensorflow/core/kernels/conv_grad_ops.cc
tensorflow/core/kernels/control_flow_ops.cc
@@ -59,7 +59,7 @@ def confusion_matrix(predictions, labels, num_classes=None,
    name: Scope name.

  Returns:
-    A l X l matrix represeting the confusion matrix, where l in the number of
+    A k X k matrix represeting the confusion matrix, where k is the number of
    possible labels in the classification task.

  Raises:
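A hedged illustration of the corrected wording (assuming this is the contrib.metrics `confusion_matrix` of that era): with k = 3 possible labels, the result is a 3 x 3 matrix.

```python
# Illustration only, not from the commit; argument order follows the hunk
# header above (predictions, labels, num_classes).
import tensorflow as tf

predictions = tf.constant([0, 2, 1, 1])
labels = tf.constant([0, 2, 2, 1])
cm = tf.contrib.metrics.confusion_matrix(predictions, labels, num_classes=3)
print(cm.get_shape())  # (3, 3), i.e. k x k with k = 3
```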
@@ -787,7 +787,8 @@ struct CudaVersion {
};

std::vector<CudaVersion> supported_cuda_compute_capabilities = {
-    TF_CUDA_CAPABILITIES,};
+    TF_CUDA_CAPABILITIES,
+};

std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() {
  auto cuda_caps = supported_cuda_compute_capabilities;
@@ -154,7 +154,7 @@ Status DebugNodeInserter::InsertNodes(

  // Create all requested debug nodes and their edges to the Copy node.
  std::vector<Node*> node_added_debug_nodes;
-  for (int i = 0; i < tensor_watches[tensor_name].size(); ++i) {
+  for (size_t i = 0; i < tensor_watches[tensor_name].size(); ++i) {
    const string& debug_op_name = tensor_watches[tensor_name][i];

    Node* debug_node;
@@ -30,7 +30,7 @@ namespace {
template <typename T>
void OutputToLog(const T& proto) {
  string type_name = proto.GetTypeName();
-  const int index = type_name.find_last_of(".");
+  const size_t index = type_name.find_last_of(".");
  if (index != string::npos) type_name = type_name.substr(index + 1);
  LOG(INFO) << LogMemory::kLogMemoryLabel << " " << type_name << " { "
            << ProtoShortDebugString(proto) << " }";
@@ -156,7 +156,7 @@ string OpRegistry::DebugString(bool include_internal) const {
bool OpRegistry::MustCallDeferred() const {
  if (initialized_) return false;
  initialized_ = true;
-  for (int i = 0; i < deferred_.size(); ++i) {
+  for (size_t i = 0; i < deferred_.size(); ++i) {
    TF_QCHECK_OK(RegisterAlreadyLocked(deferred_[i]));
  }
  deferred_.clear();
@@ -166,7 +166,7 @@ bool OpRegistry::MustCallDeferred() const {
Status OpRegistry::CallDeferred() const {
  if (initialized_) return Status::OK();
  initialized_ = true;
-  for (int i = 0; i < deferred_.size(); ++i) {
+  for (size_t i = 0; i < deferred_.size(); ++i) {
    Status s = RegisterAlreadyLocked(deferred_[i]);
    if (!s.ok()) {
      return s;
@@ -57,7 +57,7 @@ class DimensionHandle {

  const Dimension* ptr_ = nullptr;

-  friend class DimensionOrConstant;
+  friend struct DimensionOrConstant;
  friend class InferenceContext;
  friend class ShapeInferenceTest;
  friend class ShapeInferenceTestutil;
@@ -740,7 +740,7 @@ string Tensor::SummarizeValue(int64 max_entries) const {
  string ret;
  // TODO(irving): Don't call flat every time around this
  // loop.
-  for (int64 i = 0; i < limit; ++i) {
+  for (size_t i = 0; i < limit; ++i) {
    if (i > 0) strings::StrAppend(&ret, " ");
    switch (dtype()) {
      case DT_STRING:
@@ -242,8 +242,12 @@ class Conv2DOp : public BinaryOp<T> {
      Name("Conv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      Conv2DOp<CPUDevice, T>);

+// If we're using the alternative GEMM-based implementation of Conv2D for the
+// CPU implementation, don't register this EigenTensor-based version.
+#if !defined(USE_GEMM_FOR_CONV)
TF_CALL_half(REGISTER_CPU);
TF_CALL_float(REGISTER_CPU);
+#endif  // USE_GEMM_FOR_CONV

// To be used inside depthwise_conv_op.cc.
template class LaunchConv2DOp<CPUDevice, float>;
tensorflow/core/kernels/conv_ops_using_gemm.cc (new file, 622 additions)
@@ -0,0 +1,622 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// This file contains a set of different implementations of the two-dimensional
// convolution operation. The standard TensorFlow Conv2d kernel uses EigenTensor
// to implement the computation, but this module has a variety of different ways
// of producing the same result. These methods are designed to be easier to
// understand and connect to other libraries, so that we can take advantage of
// platforms that have specialized implementations of GEMM for example.
//
// The basic interface is a Conv functor object that's templated by the types
// of the data it will be operating on, and is passed in the arguments needed to
// calculate the convolution. The simplest implementation of this functor is
// ReferenceConvFunctor, which is a readable but slow reference version.
//
// A faster version uses the approach of packing image patches into a matrix
// before calling a matrix multiply, the Im2ColConvFunctor. In turn, this can
// use a variety of different methods to calculate the matrix multiplication,
// or GEMM. The simplest but slowest is the ReferenceGemmFunctor, but the
// FastGemmFunctor will use whatever optimized libraries are available. By
// default it uses Eigen, but on Apple platforms it will take advantage of the
// system's Accelerate BLAS library to get better performance than the standard
// TensorFlow convolution kernel.
//
// The version actually used is defined at the bottom of this file using the
// REGISTER_KERNEL_BUILDER() macro. To try out different implementations (for
// example to switch to a reference one for easier debugging) you can swap out
// the default functors in that call.
//
// The registration itself is guarded with the USE_GEMM_FOR_CONV macro. The iOS
// makefile build defines this, but if you want to enable this implementation
// and disable the standard EigenTensor one in other build setups, you'll need
// to define it there too.

#include <string.h>
#include <map>
#include <vector>
#include "tensorflow/core/framework/common_shape_fns.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/resource_mgr.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/bounds_check.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"

#if defined(__APPLE__)
#include <Accelerate/Accelerate.h>
#define USE_ACCELERATE_GEMM
#endif  // __APPLE__

namespace tensorflow {

namespace {
// This function implements the convolution operation in as simple a form as
// possible. It won't give great performance, but it is very useful for
// stepping through and instrumenting for debugging, creating minimal benchmarks
// to prototype with, and sharing with teams that want to run this outside of
// our environment.
// With that in mind, I've avoided using anything except pretty standard C++
// types. This is especially noticeable in the data access through raw array
// indexing. It's deliberate in this case though, since it makes the underlying
// memory order very explicit, which is important for both inspecting memory
// contents during debugging and for specifying what we expect to others.
// The memory layout of the data is, from biggest stride to smallest:
// input_data = [input_batches, input_height, input_width, input_depth]
// filter_data = [filter_height, filter_width, input_depth, filter_count]
// output_data = [input_batches, output_height, output_width, filter_count]
template <class T1, class T2, class T3>
class ReferenceConvFunctor {
 public:
  void operator()(OpKernelContext* context, const T1* input_data,
                  int input_batches, int input_height, int input_width,
                  int input_depth, const T2* filter_data, int filter_height,
                  int filter_width, int filter_count, int stride_rows,
                  int stride_cols, Padding padding, T3* output_data,
                  int output_height, int output_width) {
    // The two different padding modes we support can be a bit confusing. SAME
    // means we're trying to produce an output image that's the same size as the
    // input. It's complicated by stride, which shrinks the output image by a
    // a factor, but it means we end up sampling from outside the borders of the
    // input. These out-of-bounds values are read as zeroes. VALID means only
    // produce output values where the filters can read all their values from
    // within the input image. It effectively removes the margins of the output
    // image compared to the one produced by SAME. Stride complicates this
    // definition though, because it can result in the right and bottom filter
    // patches sampling from outside the borders if it's greater than 1.
    // Most of the logic for sorting this all out is done before this function,
    // when we calculate the output size, but the positioning of the origin of
    // the filters is different between the two modes, since SAME positions the
    // first filter off the edge of the input.
    int filter_left_offset;
    int filter_top_offset;
    if (padding == VALID) {
      filter_left_offset =
          ((output_width - 1) * stride_cols + filter_width - input_width + 1) /
          2;
      filter_top_offset = ((output_height - 1) * stride_rows + filter_height -
                           input_height + 1) /
                          2;
    } else {
      filter_left_offset =
          ((output_width - 1) * stride_cols + filter_width - input_width) / 2;
      filter_top_offset =
          ((output_height - 1) * stride_rows + filter_height - input_height) /
          2;
    }
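    // Worked example (illustrative numbers, not from the original source):
    // with SAME padding, input_width = 5, filter_width = 3 and
    // stride_cols = 1 (so output_width = 5), this gives
    // filter_left_offset = ((5 - 1) * 1 + 3 - 5) / 2 = 1, i.e. the first
    // patch starts one pixel off the left edge and reads zeroes there.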

    // If we've got multiple images in our input, work through each of them.
    for (int batch = 0; batch < input_batches; ++batch) {
      // Walk through all the output image values, sliding the filter to
      // different positions in the input.
      for (int out_y = 0; out_y < output_height; ++out_y) {
        for (int out_x = 0; out_x < output_width; ++out_x) {
          // Each filter kernel produces one output channel.
          for (int out_channel = 0; out_channel < filter_count; ++out_channel) {
            // We're going to calculate a single output value, which means we
            // need to multiply a three dimensional kernel of weights against
            // the current location within the input image.
            /*
              *-------------------------------...
              |\ ^
              | \in_depth
              |  \ v
              |   *-------------------------------...
              |   |          ^
              |   |     in_y_origin
              |   |          v   \
              |   |<in_x_origin>*---*^
              |   |            \|   |filter_height
              .   |             *---*v
              .   |             <--->
              .          filter_width
              .
            */
            const int in_x_origin = (out_x * stride_cols) - filter_left_offset;
            const int in_y_origin = (out_y * stride_rows) - filter_top_offset;
            T3 total(0);
            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
                for (int in_channel = 0; in_channel < input_depth;
                     ++in_channel) {
                  const int in_x = in_x_origin + filter_x;
                  const int in_y = in_y_origin + filter_y;
                  T1 input_value;
                  // If the location is outside the bounds of the input image,
                  // use zero as a default value.
                  if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                      (in_y < input_height)) {
                    input_value =
                        input_data[(batch * input_height * input_width *
                                    input_depth) +
                                   (in_y * input_width * input_depth) +
                                   (in_x * input_depth) + in_channel];
                  } else {
                    input_value = T1(0);
                  }
                  const T2 filter_value =
                      filter_data[(filter_y * filter_width * input_depth *
                                   filter_count) +
                                  (filter_x * input_depth * filter_count) +
                                  (in_channel * filter_count) + out_channel];
                  total += (input_value * filter_value);
                }
              }
            }
            output_data[(batch * output_height * output_width * filter_count) +
                        (out_y * output_width * filter_count) +
                        (out_x * filter_count) + out_channel] = total;
          }
        }
      }
    }
  }
};

// A readable but slow implementation of matrix multiplication, useful for
// debugging and understanding the algorithm. Use instead of FastGemmFunctor in
// the Im2ColConvFunctor template definition inside the op registration to
// enable. Assumes row-major ordering of the values in memory.
template <class T1, class T2, class T3>
class ReferenceGemmFunctor {
 public:
  void operator()(size_t m, size_t n, size_t k, const T1* a, size_t lda,
                  const T2* b, size_t ldb, T3* c, size_t ldc) {
    const size_t a_i_stride = lda;
    const size_t a_l_stride = 1;
    const size_t b_j_stride = 1;
    const size_t b_l_stride = ldb;
    const size_t c_i_stride = ldc;
    const size_t c_j_stride = 1;
    size_t i, j, l;
    for (j = 0; j < n; j++) {
      for (i = 0; i < m; i++) {
        T3 total(0);
        for (l = 0; l < k; l++) {
          const size_t a_index = ((i * a_i_stride) + (l * a_l_stride));
          const T1 a_value = a[a_index];
          const size_t b_index = ((j * b_j_stride) + (l * b_l_stride));
          const T2 b_value = b[b_index];
          total += (a_value * b_value);
        }
        const size_t c_index = ((i * c_i_stride) + (j * c_j_stride));
        c[c_index] = total;
      }
    }
  }
};
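// Quick sanity check (illustrative values, not from the original source):
// with m = 1, n = 1, k = 2, lda = 2, ldb = 1, ldc = 1, a = {3, 4} and
// b = {5, 6}, the inner loop accumulates total = 3 * 5 + 4 * 6 = 39,
// which is stored at c[0].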

// Uses the optimized Eigen library to implement the matrix multiplication
// required by the Im2ColConvFunctor class. We supply the two input and one
// output types so that the accumulator can potentially be higher-precision than
// the inputs, even though we don't currently take advantage of this.
template <class T1, class T2, class T3>
class FastGemmFunctor {
 public:
  // Convenience wrappers for the Eigen matrix types we'll be using.
  typedef Eigen::Map<
      const Eigen::Matrix<T1, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
      ConstMatrixT1;
  typedef Eigen::Map<
      const Eigen::Matrix<T2, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
      ConstMatrixT2;
  typedef Eigen::Map<
      Eigen::Matrix<T3, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
      MatrixT3;
  void operator()(size_t m, size_t n, size_t k, const T1* a, size_t lda,
                  const T2* b, size_t ldb, T3* c, size_t ldc) {
    ConstMatrixT1 a_matrix(a, m, k);
    ConstMatrixT2 b_matrix(b, k, n);
    MatrixT3 c_matrix(c, m, n);
    c_matrix.noalias() = a_matrix * b_matrix;
  }
};

// If we have Apple's Accelerate framework, use their implementation of GEMM to
// get a performance boost for float.
#if defined(USE_ACCELERATE_GEMM)
template <>
class FastGemmFunctor<float, float, float> {
 public:
  void operator()(size_t m, size_t n, size_t k, const float* a, size_t lda,
                  const float* b, size_t ldb, float* c, size_t ldc) {
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0f, a,
                lda, b, ldb, 0.0f, c, ldc);
  }
};
#endif  // USE_ACCELERATE_GEMM

// Used to keep track of persistent memory buffers used within the op.
template <class T, size_t size>
struct Im2ColBufferResource : public ResourceBase {
  mutex mu;
  T data[size];
  string DebugString() { return "Im2ColBufferResource"; }
};

// Implements convolution as a two stage process, first packing the patches of
// the input image into columns (im2col) and then running GEMM to produce the
// final result.
template <class T1, class T2, class T3, class TGemmFunctor>
class Im2ColConvFunctor {
 public:
  void operator()(OpKernelContext* context, const T1* input_data,
                  int input_batches, int input_height, int input_width,
                  int input_depth, const T2* filter_data, int filter_height,
                  int filter_width, int filter_count, int stride_rows,
                  int stride_cols, Padding padding, T3* output_data,
                  int output_height, int output_width) {
    if ((input_batches <= 0) || (input_width <= 0) || (input_height <= 0) ||
        (input_depth <= 0)) {
      LOG(WARNING) << "Conv2D was called with bad input dimensions: "
                   << input_batches << ", " << input_height << ", "
                   << input_width << ", " << input_depth;
      return;
    }
    if ((filter_width <= 0) || (filter_height <= 0) || (filter_count <= 0)) {
      LOG(WARNING) << "Conv2D was called with bad filter dimensions: "
                   << filter_width << ", " << filter_height << ", "
                   << filter_count;
      return;
    }
    if ((output_width <= 0) || (output_height <= 0)) {
      LOG(WARNING) << "Conv2D was called with bad output width or height: "
                   << output_width << ", " << output_height;
      return;
    }

    // These calculations define how the patches will be positioned within the
    // input image. The actual definitions are quite complex, and rely on the
    // previously-calculated output size.
    int filter_left_offset;
    int filter_top_offset;
    if (padding == VALID) {
      filter_left_offset =
          ((output_width - 1) * stride_cols + filter_width - input_width + 1) /
          2;
      filter_top_offset = ((output_height - 1) * stride_rows + filter_height -
                           input_height + 1) /
                          2;
    } else {
      filter_left_offset =
          ((output_width - 1) * stride_cols + filter_width - input_width) / 2;
      filter_top_offset =
          ((output_height - 1) * stride_rows + filter_height - input_height) /
          2;
    }

    // The im2col buffer has # of patches rows, and # of filters cols.
    // It's laid out like this, in row major order in memory:
    //         < filter value count >
    //   ^   +---------------------+
    // patch |                     |
    // count |                     |
    //   v   +---------------------+
    // Each patch row contains a filter_width x filter_height patch of the
    // input, with the depth channel as the most contiguous in memory, followed
    // by the width, then the height. This is the standard memory order in the
    // image world if it helps to visualize it.
    const int filter_value_count = filter_width * filter_height * input_depth;

    // We don't want to allocate a buffer to hold all the patches if the size is
    // going to be extremely large, so break it into chunks if it's bigger than
    // a limit. Each chunk will be processed serially, so we can refill the
    // buffer for the next chunk and reuse it, keeping maximum memory size down.
    // In this case, we've picked 16 megabytes as a reasonable limit.
    const size_t max_chunk_size = (16 * 1024 * 1024);
    OP_REQUIRES(context, (filter_value_count * sizeof(T1)) <= max_chunk_size,
                errors::InvalidArgument("Im2Col patch too large for buffer"));
    const size_t patches_per_chunk =
        max_chunk_size / (filter_value_count * sizeof(T1));
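    // Illustrative arithmetic (not from the original source): a 3 x 3 filter
    // over 256 input channels gives filter_value_count = 3 * 3 * 256 = 2304,
    // or 9216 bytes of float data per patch, so patches_per_chunk =
    // 16777216 / 9216 = 1820 patches fit in each 16 MB chunk.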
|
||||
|
||||
// Because memory allocation is very expensive on mobile platforms, try to
|
||||
// allocate a persistent buffer that will be kept around between calls. We
|
||||
// use TensorFlow's resource management to ensure that the memory will be
|
||||
// released when the session is over.
|
||||
Im2ColBufferResource<T1, max_chunk_size>* im2col_buffer_resource;
|
||||
std::function<Status(Im2ColBufferResource<T1, max_chunk_size>**)> creator =
|
||||
[](Im2ColBufferResource<T1, max_chunk_size>** resource) {
|
||||
*resource = new Im2ColBufferResource<T1, max_chunk_size>();
|
||||
return Status::OK();
|
||||
};
|
||||
OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
|
||||
"Conv2d", "im2col_buffer",
|
||||
&im2col_buffer_resource, creator));
|
||||
// This means that multiple ops can't be run simultaneously on different
|
||||
// threads, because we have a single shared resource. The platforms this is
|
||||
// aimed at have intra-op parallelism as their focus though, so it shouldn't
|
||||
// be an issue.
|
||||
mutex_lock lock_buffer(im2col_buffer_resource->mu);
|
||||
core::ScopedUnref unref_buffer(im2col_buffer_resource);
|
||||
T1* im2col_buffer = im2col_buffer_resource->data;
|
||||
|
||||
for (int batch = 0; batch < input_batches; ++batch) {
|
||||
const T1* input_batch_start =
|
||||
input_data + (batch * input_height * input_width * input_depth);
|
||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
||||
const int in_y_origin = (out_y * stride_rows) - filter_top_offset;
|
||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
||||
const int in_x_origin = (out_x * stride_cols) - filter_left_offset;
|
||||
const int patch_index = (batch * output_width * output_height) +
|
||||
(out_y * output_width) + out_x;
|
||||
const int patch_index_within_chunk = patch_index % patches_per_chunk;
|
||||
T1* im2col_patch_start =
|
||||
im2col_buffer + (patch_index_within_chunk * filter_value_count);
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
const int in_y = in_y_origin + filter_y;
|
||||
T1* im2col_row_start =
|
||||
im2col_patch_start + (filter_y * filter_width * input_depth);
|
||||
// If we're off the top or the bottom of the input, fill the whole
|
||||
// row with zeroes.
|
||||
if ((in_y < 0) || (in_y >= input_height)) {
|
||||
T1* im2col_row_end =
|
||||
im2col_row_start + (filter_width * input_depth);
|
||||
std::fill(im2col_row_start, im2col_row_end, T1(0));
|
||||
} else {
|
||||
// What we're doing here is trying to copy and fill the im2col
|
||||
// buffer as efficiently as possible, using functions to set or
|
||||
// duplicate values en masse. We know we don't have to worry about
|
||||
// vertical edges because we dealt with that case above, so we
|
||||
// just need to handle filters that overlap the left or right
|
||||
// edges. Here's what that looks like:
|
||||
//
|
||||
// < left_zero_count > < center_copy_count > < right_zero_count >
|
||||
// +------------------+---------------------+--------------------+
|
||||
// | (filter) | (image) | (filter) |
|
||||
// +------------------+---------------------+--------------------+
|
||||
// in_x_origin 0 input_width in_x_end
|
||||
//
|
||||
// In reality it's unlikely that a filter patch will be wider
|
||||
// than an input, but this shows all the edge cases.
|
||||
// We use std::fill() to set the left and right sections to zeroes
|
||||
// and std::copy() to copy over the input data for the center.
|
||||
const int in_x_end = in_x_origin + filter_width;
|
||||
const int left_zero_count = std::max(0, 0 - in_x_origin);
|
||||
const int right_zero_count = std::max(0, in_x_end - input_width);
|
||||
const int center_copy_count =
|
||||
filter_width - (left_zero_count + right_zero_count);
|
||||
if (left_zero_count > 0) {
|
||||
T1* im2col_left_start = im2col_row_start;
|
||||
T1* im2col_left_end =
|
||||
im2col_left_start + (left_zero_count * input_depth);
|
||||
std::fill(im2col_left_start, im2col_left_end, T1(0));
|
||||
}
|
||||
if (center_copy_count > 0) {
|
||||
const T1* input_row_start =
|
||||
input_batch_start + (in_y * input_width * input_depth) +
|
||||
(std::max(0, in_x_origin) * input_depth);
|
||||
const T1* input_row_end =
|
||||
input_row_start + (center_copy_count * input_depth);
|
||||
T1* im2col_center_start =
|
||||
im2col_row_start + (left_zero_count * input_depth);
|
||||
std::copy(input_row_start, input_row_end, im2col_center_start);
|
||||
}
|
||||
if (right_zero_count > 0) {
|
||||
T1* im2col_right_start =
|
||||
im2col_row_start +
|
||||
((left_zero_count + center_copy_count) * input_depth);
|
||||
T1* im2col_right_end =
|
||||
im2col_right_start + (right_zero_count * input_depth);
|
||||
std::fill(im2col_right_start, im2col_right_end, T1(0));
|
||||
}
|
||||
}
|
||||
}
|
||||
const bool is_last_in_chunk =
|
||||
(patch_index_within_chunk == (patches_per_chunk - 1));
|
||||
const bool is_last_overall =
|
||||
((batch == (input_batches - 1)) &&
|
||||
(out_y == (output_height - 1)) && (out_x == (output_width - 1)));
|
||||
if (is_last_in_chunk || is_last_overall) {
|
||||
// Now we've assembled a set of image patches into a matrix, apply a
|
||||
// GEMM matrix multiply of the patches as rows, times the filter
|
||||
// weights in columns, to get partial results in the output matrix.
|
||||
const int how_many_patches = patch_index_within_chunk + 1;
|
||||
const int m = how_many_patches;
|
||||
const int n = filter_count;
|
||||
const int k = filter_value_count;
|
||||
const int lda = filter_value_count;
|
||||
const int ldb = filter_count;
|
||||
const int ldc = filter_count;
|
||||
const size_t start_patch_index =
|
||||
patch_index - (how_many_patches - 1);
|
||||
T3* chunk_output_data =
|
||||
output_data + (start_patch_index * filter_count);
|
||||
TGemmFunctor gemm_functor;
|
||||
gemm_functor(m, n, k, im2col_buffer, lda, filter_data, ldb,
|
||||
chunk_output_data, ldc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
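The functor's comments describe the im2col layout in prose. As a rough NumPy illustration of the same two-stage idea (not the kernel's actual code path), assuming VALID padding, a single NHWC image, and hypothetical names:

# Illustrative sketch of im2col + GEMM convolution; names are made up here.
import numpy as np

def conv2d_im2col(image, filters, stride):
    in_h, in_w, in_depth = image.shape
    f_h, f_w, _, filter_count = filters.shape
    out_h = (in_h - f_h) // stride + 1
    out_w = (in_w - f_w) // stride + 1
    # Pack each output position's receptive field into one row (im2col).
    # ravel() keeps depth most contiguous, then width, then height, matching
    # the layout the comments above describe.
    patches = np.empty((out_h * out_w, f_h * f_w * in_depth))
    for y in range(out_h):
        for x in range(out_w):
            patch = image[y * stride:y * stride + f_h,
                          x * stride:x * stride + f_w, :]
            patches[y * out_w + x] = patch.ravel()
    # One GEMM over all patches at once produces every output value.
    gemm_out = patches @ filters.reshape(-1, filter_count)
    return gemm_out.reshape(out_h, out_w, filter_count)

image = np.random.rand(8, 8, 3)
filters = np.random.rand(3, 3, 3, 4)
print(conv2d_im2col(image, filters, stride=1).shape)  # (6, 6, 4)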

// This TensorFlow kernel class handles all of the IO and housekeeping for the
// functors that actually implement the underlying algorithm. To swap in
// different implementations of the main calculations, use a different
// TConvFunctor parameter when instantiating the template.
template <class T, class TConvFunctor>
class Conv2DUsingGemmOp : public BinaryOp<T> {
 public:
  explicit Conv2DUsingGemmOp(OpKernelConstruction* context)
      : BinaryOp<T>(context) {
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(context, data_format_ == FORMAT_NHWC,
                errors::InvalidArgument(
                    "Data format not supported by this kernel", data_format));
    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
    const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
    OP_REQUIRES(
        context, stride_n == 1 && stride_c == 1,
        errors::InvalidArgument("Current implementation does not yet support "
                                "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    // Input tensor is of the following dimensions:
    // [ batch, in_rows, in_cols, in_depth ]
    const Tensor& input = context->input(0);

    // Input filter is of the following dimensions:
    // [ filter_rows, filter_cols, in_depth, out_depth]
    const Tensor& filter = context->input(1);

    // For 2D convolution, there should be 4 dimensions.
    OP_REQUIRES(context, input.dims() == 4,
                errors::InvalidArgument("input must be 4-dimensional",
                                        input.shape().DebugString()));
    OP_REQUIRES(context, filter.dims() == 4,
                errors::InvalidArgument("filter must be 4-dimensional: ",
                                        filter.shape().DebugString()));

    for (int i = 0; i < 3; i++) {
      OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
                                           std::numeric_limits<int>::max()),
                  errors::InvalidArgument("filter too large"));
    }

    // The last dimension for input is in_depth. It must be the same as the
    // filter's in_depth.
    const int64 in_depth = GetTensorDim(input, data_format_, 'C');
    OP_REQUIRES(
        context, in_depth == filter.dim_size(2),
        errors::InvalidArgument("input and filter must have the same depth: ",
                                in_depth, " vs ", filter.dim_size(2)));

    // The last dimension for filter is out_depth.
    const int out_depth = static_cast<int>(filter.dim_size(3));

    // The second dimension for input is rows/height.
    // The first dimension for filter is rows/height.
    const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H');
    OP_REQUIRES(context, FastBoundsCheck(input_rows_raw,
                                         std::numeric_limits<int>::max()),
                errors::InvalidArgument("Input rows too large"));
    const int input_rows = static_cast<int>(input_rows_raw);
    const int filter_rows = static_cast<int>(filter.dim_size(0));

    // The third dimension for input is columns/width.
    // The second dimension for filter is columns/width.
    const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W');
    OP_REQUIRES(context, FastBoundsCheck(input_cols_raw,
                                         std::numeric_limits<int>::max()),
                errors::InvalidArgument("Input cols too large"));
    const int input_cols = static_cast<int>(input_cols_raw);
    const int filter_cols = static_cast<int>(filter.dim_size(1));

    // The first dimension for input is batch.
    const int64 batch_raw = GetTensorDim(input, data_format_, 'N');
    OP_REQUIRES(context,
                FastBoundsCheck(batch_raw, std::numeric_limits<int>::max()),
                errors::InvalidArgument("batch is too large"));
    const int batch = static_cast<int>(batch_raw);

    // For now we take the stride from the second and third dimensions only (we
    // do not support striding on the batch or depth dimension).
    const int stride_rows = GetTensorDim(strides_, data_format_, 'H');
    const int stride_cols = GetTensorDim(strides_, data_format_, 'W');

    int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
    OP_REQUIRES_OK(context,
                   GetWindowedOutputSize(input_rows, filter_rows, stride_rows,
                                         padding_, &out_rows, &pad_rows));
    OP_REQUIRES_OK(context,
                   GetWindowedOutputSize(input_cols, filter_cols, stride_cols,
                                         padding_, &out_cols, &pad_cols));
    TensorShape out_shape =
        ShapeFromFormat(data_format_, batch, out_rows, out_cols, out_depth);

    // Output tensor is of the following dimensions:
    // [ in_batch, out_rows, out_cols, out_depth ]
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    VLOG(2) << "Conv2D: in_depth = " << in_depth
            << ", input_cols = " << input_cols
            << ", filter_cols = " << filter_cols
            << ", input_rows = " << input_rows
            << ", filter_rows = " << filter_rows
            << ", stride_rows = " << stride_rows
            << ", stride_cols = " << stride_cols
            << ", out_depth = " << out_depth;

    // If there is nothing to compute, return.
    if (out_shape.num_elements() == 0) {
      return;
    }
    TConvFunctor conv_functor;
    conv_functor(context, input.flat<T>().data(), batch, input_rows, input_cols,
                 in_depth, filter.flat<T>().data(), filter_rows, filter_cols,
                 out_depth, stride_rows, stride_cols, padding_,
                 output->flat<T>().data(), out_rows, out_cols);
  }

 private:
  std::vector<int32> strides_;
  Padding padding_;
  TensorFormat data_format_;

  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DUsingGemmOp);
};

#define REGISTER_CPU(T)                                         \
  REGISTER_KERNEL_BUILDER(                                      \
      Name("Conv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      Conv2DUsingGemmOp<                                        \
          T, Im2ColConvFunctor<T, T, T, FastGemmFunctor<T, T, T>>>);

// Only register this GEMM-based implementation of Conv2d if the compiler flags
// request the implementation explicitly, since otherwise it will clash with the
// default EigenTensor-based kernel.
#if defined(USE_GEMM_FOR_CONV)
TF_CALL_half(REGISTER_CPU);
TF_CALL_float(REGISTER_CPU);
#endif  // USE_GEMM_FOR_CONV

}  // namespace tensorflow
@@ -105,7 +105,7 @@ void MakeUnreducedIndices(gtl::InlinedVector<bool, 8> index_is_reduced,
 TensorShape GetOutputShape(gtl::InlinedVector<bool, 8> index_is_reduced,
                            const TensorShape& input_shape, bool keep_dims) {
   TensorShape output_shape;
-  for (int32 index = 0; index < index_is_reduced.size(); ++index) {
+  for (size_t index = 0; index < index_is_reduced.size(); ++index) {
     if (index_is_reduced[index]) {
       if (keep_dims) output_shape.AddDim(1);
     } else {
@@ -40,19 +40,19 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;

-template <typename Device>
+template <typename Device, typename Tlen>
 void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) {
   const Tensor& input = context->input(0);
   const Tensor& seq_lens = context->input(1);

-  auto seq_lens_t = seq_lens.vec<int64>();
+  auto seq_lens_t = seq_lens.vec<Tlen>();

-  std::vector<int64> seq_lens_vec(seq_lens_t.size());
+  std::vector<Tlen> seq_lens_vec(seq_lens_t.size());

   // Copy seq_len info down for validity checks
   context->eigen_device<Device>().memcpyDeviceToHost(
       seq_lens_vec.data(), seq_lens_t.data(),
-      sizeof(int64) * seq_lens_t.size());
+      sizeof(Tlen) * seq_lens_t.size());

   OP_REQUIRES(context, batch_dim != seq_dim,
               errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim));
@@ -76,8 +76,7 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) {
   }
 }

-template <>
-void CheckErrors<GPUDevice>(OpKernelContext* context, int batch_dim,
+void CheckErrorsGPU(OpKernelContext* context, int batch_dim,
                             int seq_dim) {
   const Tensor& input = context->input(0);
   const Tensor& seq_lens = context->input(1);
@@ -97,7 +96,19 @@ void CheckErrors<GPUDevice>(OpKernelContext* context, int batch_dim,
                   " vs. ", input.dim_size(batch_dim)));
 }

-template <typename Device, typename T>
+template <>
+void CheckErrors<GPUDevice, int32>(OpKernelContext* context, int batch_dim,
+                                   int seq_dim) {
+  CheckErrorsGPU(context, batch_dim, seq_dim);
+}
+
+template <>
+void CheckErrors<GPUDevice, int64>(OpKernelContext* context, int batch_dim,
+                                   int seq_dim) {
+  CheckErrorsGPU(context, batch_dim, seq_dim);
+}
+
+template <typename Device, typename T, typename Tlen>
 class ReverseSequenceOp : public OpKernel {
  public:
   explicit ReverseSequenceOp(OpKernelConstruction* context)
@@ -115,9 +126,9 @@ class ReverseSequenceOp : public OpKernel {
                 errors::InvalidArgument("seq_lens input must be 1-dim, not ",
                                         seq_lens.dims()));

-    auto seq_lens_t = seq_lens.vec<int64>();
+    auto seq_lens_t = seq_lens.vec<Tlen>();

-    CheckErrors<Device>(context, batch_dim_, seq_dim_);
+    CheckErrors<Device, Tlen>(context, batch_dim_, seq_dim_);

     const int input_dims = input.dims();

@@ -127,7 +138,7 @@ class ReverseSequenceOp : public OpKernel {

 #define HANDLE_DIM(NDIM)                                                      \
   case NDIM:                                                                  \
-    functor::ReverseSequence<Device, T, NDIM>::Compute(                       \
+    functor::ReverseSequence<Device, T, Tlen, NDIM>::Compute(                 \
         context->eigen_device<Device>(), input.tensor<T, NDIM>(), batch_dim_, \
         seq_dim_, seq_lens_t, output->tensor<T, NDIM>());                     \
     break;
@@ -153,42 +164,57 @@ class ReverseSequenceOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(ReverseSequenceOp);
 };

-#define REGISTER_REVERSE_SEQUENCE(type)                                      \
+#define REGISTER_REVERSE_SEQUENCE(type, len_type)                            \
   REGISTER_KERNEL_BUILDER(                                                   \
-      Name("ReverseSequence").Device(DEVICE_CPU).TypeConstraint<type>("T"),  \
-      ReverseSequenceOp<CPUDevice, type>);
+      Name("ReverseSequence").Device(DEVICE_CPU).TypeConstraint<type>("T").  \
+          TypeConstraint<len_type>("Tlen"),                                  \
+      ReverseSequenceOp<CPUDevice, type, len_type>);

-TF_CALL_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE);
+#define REGISTER_REVERSE_SEQUENCE_LEN(type) \
+  REGISTER_REVERSE_SEQUENCE(type, int32);   \
+  REGISTER_REVERSE_SEQUENCE(type, int64);
+
+TF_CALL_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_LEN);

 #if GOOGLE_CUDA

 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T, Dims)                                         \
-  template <>                                                             \
-  void ReverseSequence<GPUDevice, T, Dims>::Compute(                      \
-      const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input,    \
-      int32 batch_dim, int32 seq_dim, TTypes<int64>::ConstVec seq_lens,   \
-      typename TTypes<T, Dims>::Tensor output);                           \
-  extern template struct ReverseSequence<GPUDevice, T, Dims>;
+#define DECLARE_GPU_SPEC(T, Tlen, Dims)                                   \
+  template <>                                                             \
+  void ReverseSequence<GPUDevice, T, Tlen, Dims>::Compute(                \
+      const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input,    \
+      int32 batch_dim, int32 seq_dim,                                     \
+      typename TTypes<Tlen>::ConstVec seq_lens,                           \
+      typename TTypes<T, Dims>::Tensor output);                           \
+  extern template struct ReverseSequence<GPUDevice, T, Tlen, Dims>;

-#define DECLARE_GPU_SPECS(T) \
-  DECLARE_GPU_SPEC(T, 2);    \
-  DECLARE_GPU_SPEC(T, 3);    \
-  DECLARE_GPU_SPEC(T, 4);    \
-  DECLARE_GPU_SPEC(T, 5);
+#define DECLARE_GPU_SPEC_LEN(T, Dims) \
+  DECLARE_GPU_SPEC(T, int32, Dims);   \
+  DECLARE_GPU_SPEC(T, int64, Dims);
+
+#define DECLARE_GPU_SPECS(T)   \
+  DECLARE_GPU_SPEC_LEN(T, 2);  \
+  DECLARE_GPU_SPEC_LEN(T, 3);  \
+  DECLARE_GPU_SPEC_LEN(T, 4);  \
+  DECLARE_GPU_SPEC_LEN(T, 5);

 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);

 }  // namespace functor

 // Registration of the GPU implementations.
-#define REGISTER_REVERSE_SEQUENCE_GPU(type)                                  \
+#define REGISTER_REVERSE_SEQUENCE_GPU(type, len_type)                        \
   REGISTER_KERNEL_BUILDER(                                                   \
-      Name("ReverseSequence").Device(DEVICE_GPU).TypeConstraint<type>("T"),  \
-      ReverseSequenceOp<GPUDevice, type>);
+      Name("ReverseSequence").Device(DEVICE_GPU).TypeConstraint<type>("T").  \
+          TypeConstraint<len_type>("Tlen"),                                  \
+      ReverseSequenceOp<GPUDevice, type, len_type>);

-TF_CALL_GPU_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_GPU);
+#define REGISTER_REVERSE_SEQUENCE_GPU_LEN(type) \
+  REGISTER_REVERSE_SEQUENCE_GPU(type, int32);   \
+  REGISTER_REVERSE_SEQUENCE_GPU(type, int64);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_GPU_LEN);

 #undef REGISTER_REVERSE_SEQUENCE_GPU
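A hypothetical usage sketch of what the Tlen change enables, against the Python API of this era (Tlen defaults to int64 for backward compatibility, so existing callers are unaffected):

# seq_lengths may now be int32 as well as int64.
import numpy as np
import tensorflow as tf

x = tf.constant(np.arange(12, dtype=np.float32).reshape(3, 4))
lengths = tf.constant([2, 3, 4], dtype=tf.int32)  # previously int64 only
y = tf.reverse_sequence(x, lengths, seq_dim=1, batch_dim=0)
with tf.Session() as sess:
  print(sess.run(y))  # each row reversed up to its own length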
@@ -25,12 +25,12 @@ namespace tensorflow {

 namespace generator {

-template <typename T, size_t Dims>
+template <typename T, typename Tlen, size_t Dims>
 class ReverseGenerator {
  public:
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   ReverseGenerator(typename TTypes<T, Dims>::ConstTensor input, int32 batch_dim,
-                   int32 seq_dim, TTypes<int64>::ConstVec seq_lengths)
+                   int32 seq_dim, typename TTypes<Tlen>::ConstVec seq_lengths)
       : input_(input),
         batch_dim_(batch_dim),
         seq_dim_(seq_dim),
@@ -51,21 +51,22 @@ class ReverseGenerator {
   typename TTypes<T, Dims>::ConstTensor input_;
   int32 batch_dim_;
   int32 seq_dim_;
-  TTypes<int64>::ConstVec seq_lengths_;
+  typename TTypes<Tlen>::ConstVec seq_lengths_;
 };

 }  // namespace generator

 namespace functor {

-template <typename Device, typename T, size_t Dims>
+template <typename Device, typename T, typename Tlen, size_t Dims>
 struct ReverseSequence {
   EIGEN_ALWAYS_INLINE static void Compute(
       const Device& d, typename TTypes<T, Dims>::ConstTensor input,
-      int32 batch_dim, int32 seq_dim, TTypes<int64>::ConstVec seq_lengths,
+      int32 batch_dim, int32 seq_dim,
+      typename TTypes<Tlen>::ConstVec seq_lengths,
       typename TTypes<T, Dims>::Tensor output) {
-    generator::ReverseGenerator<T, Dims> generator(input, batch_dim, seq_dim,
-                                                   seq_lengths);
+    generator::ReverseGenerator<T, Tlen, Dims> generator(input, batch_dim,
+                                                         seq_dim, seq_lengths);
     output.device(d) = input.generate(generator);
   }
 };
|
@ -24,15 +24,19 @@ namespace tensorflow {
|
||||
|
||||
typedef Eigen::GpuDevice GPUDevice;
|
||||
|
||||
#define DEFINE_GPU_SPEC(T, dims) \
|
||||
template class generator::ReverseGenerator<T, dims>; \
|
||||
template struct functor::ReverseSequence<GPUDevice, T, dims>;
|
||||
#define DEFINE_GPU_SPEC(T, Tlen, dims) \
|
||||
template class generator::ReverseGenerator<T, Tlen, dims>; \
|
||||
template struct functor::ReverseSequence<GPUDevice, T, Tlen, dims>;
|
||||
|
||||
#define DEFINE_GPU_SPEC_LEN(T, dims) \
|
||||
DEFINE_GPU_SPEC(T, int32, dims); \
|
||||
DEFINE_GPU_SPEC(T, int64, dims);
|
||||
|
||||
#define DEFINE_GPU_SPECS(T) \
|
||||
DEFINE_GPU_SPEC(T, 2); \
|
||||
DEFINE_GPU_SPEC(T, 3); \
|
||||
DEFINE_GPU_SPEC(T, 4); \
|
||||
DEFINE_GPU_SPEC(T, 5);
|
||||
DEFINE_GPU_SPEC_LEN(T, 2); \
|
||||
DEFINE_GPU_SPEC_LEN(T, 3); \
|
||||
DEFINE_GPU_SPEC_LEN(T, 4); \
|
||||
DEFINE_GPU_SPEC_LEN(T, 5);
|
||||
|
||||
TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
|
||||
|
||||
|
@@ -91,7 +91,7 @@ class SparseDenseBinaryOpShared : public OpKernel {
     auto VecGreaterEq = [](ArraySlice<int64> lhs, ArraySlice<int64> rhs) {
       if (lhs.size() > rhs.size()) return true;
       if (lhs.size() < rhs.size()) return false;
-      for (int i = 0; i < lhs.size(); ++i) {
+      for (size_t i = 0; i < lhs.size(); ++i) {
         if (lhs[i] < rhs[i]) return false;
       }
       return true;
@@ -1471,11 +1471,12 @@ This operation returns N 1-D integer tensors representing shape of `input[i]s`.
 // --------------------------------------------------------------------------
 REGISTER_OP("ReverseSequence")
     .Input("input: T")
-    .Input("seq_lengths: int64")
+    .Input("seq_lengths: Tlen")
     .Output("output: T")
     .Attr("seq_dim: int")
     .Attr("batch_dim: int = 0")
     .Attr("T: type")
+    .Attr("Tlen: {int32, int64} = DT_INT64")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
       ShapeHandle seq_lens_shape;
@@ -4711,6 +4711,42 @@ op {
   summary: "Decode a PNG-encoded image to a uint8 or uint16 tensor."
   description: "The attr `channels` indicates the desired number of color channels for the\ndecoded image.\n\nAccepted values are:\n\n* 0: Use the number of channels in the PNG-encoded image.\n* 1: output a grayscale image.\n* 3: output an RGB image.\n* 4: output an RGBA image.\n\nIf needed, the PNG-encoded image is transformed to match the requested number\nof color channels."
 }
+op {
+  name: "DecodeGif"
+  input_arg {
+    name: "contents"
+    description: "0-D. The GIF-encoded image."
+    type: DT_STRING
+  }
+  output_arg {
+    name: "image"
+    description: "3-D with shape `[height, width, channels]`."
+    type_attr: "dtype"
+  }
+  attr {
+    name: "channels"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    description: "Number of color channels for the decoded image."
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_UINT8
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+  summary: "Decode a GIF-encoded image to a uint8 or uint16 tensor."
+  description: "The attr `channels` indicates the desired number of color channels for the\ndecoded image.\n\nAccepted values are:\n\n* 0: Use the number of channels in the GIF-encoded image.\n* 1: output a grayscale image.\n* 3: output an RGB image.\n* 4: output an RGBA image.\n\nIf needed, the GIF-encoded image is transformed to match the requested number\nof color channels."
+}
 op {
   name: "DecodeRaw"
   input_arg {
@@ -222,7 +222,7 @@ Status SingleExampleProtoToTensors(
   const auto& feature_dict = features.feature();

   // Handle dense features.
-  for (int d = 0; d < fixed_len_features.size(); ++d) {
+  for (size_t d = 0; d < fixed_len_features.size(); ++d) {
     const FixedLenFeature& feature_config = fixed_len_features[d];
     const string& key = feature_config.key;
     const DataType& dtype = feature_config.dtype;
@@ -263,7 +263,7 @@ Status SingleExampleProtoToTensors(
   }

   // Handle sparse features.
-  for (int d = 0; d < var_len_features.size(); ++d) {
+  for (size_t d = 0; d < var_len_features.size(); ++d) {
     const VarLenFeature& feature_config = var_len_features[d];
     const string& key = feature_config.key;
     const DataType& dtype = feature_config.dtype;
@@ -338,7 +338,7 @@ Status BatchExampleProtoToTensors(
       fixed_len_features.size());

   // Preallocate dense_values, since we know their sizes.
-  for (int d = 0; d < fixed_len_features.size(); ++d) {
+  for (size_t d = 0; d < fixed_len_features.size(); ++d) {
     const FixedLenFeature& config = fixed_len_features[d];
     TensorShape out_shape;
     out_shape.AddDim(batch_size);
@@ -352,11 +352,11 @@ Status BatchExampleProtoToTensors(
   // Temporary vector to hold sparse values.
   std::vector<std::vector<Tensor>> sparse_values_tmp(var_len_features.size());

-  for (int d = 0; d < var_len_features.size(); ++d) {
+  for (size_t d = 0; d < var_len_features.size(); ++d) {
     sparse_values_tmp[d] = std::vector<Tensor>(batch_size);
   }

-  for (int b = 0; b < examples.size(); ++b) {
+  for (size_t b = 0; b < examples.size(); ++b) {
     const Example& ex = *(examples[b]);
     const string& example_name = (has_names) ? names[b] : "<unknown>";
     SingleExampleProtoToTensors(
@@ -364,7 +364,7 @@ Status BatchExampleProtoToTensors(
         &output_dense_values_tensor_ptrs, &sparse_values_tmp);
   }

-  for (int d = 0; d < var_len_features.size(); ++d) {
+  for (size_t d = 0; d < var_len_features.size(); ++d) {
     const VarLenFeature& feature_config = var_len_features[d];
     const DataType& dtype = feature_config.dtype;
     const std::vector<Tensor>& sparse_values_tensor = sparse_values_tmp[d];
@@ -283,7 +283,7 @@ void SparseTensor::Reorder(const VarDimArray& order) {
   // permutation (the inverse). This can be calculated with O(1)
   // additional
   // and O(n) time (INVPERM) but we just do the simple thing here.
-  std::vector<int64> permutation(reorder.size());
+  std::vector<size_t> permutation(reorder.size());
   for (std::size_t n = 0; n < reorder.size(); ++n) {
     permutation[reorder[n]] = n;
   }
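The inverse-permutation identity the comment above relies on, checked in a few lines of plain Python: if reorder[n] gives the new position of element n, then writing n into permutation[reorder[n]] builds the inverse in a single O(n) pass.

reorder = [2, 0, 3, 1]
permutation = [0] * len(reorder)
for n, r in enumerate(reorder):
    permutation[r] = n  # invert: position r came from index n
# Composing the two permutations yields the identity.
assert [reorder[i] for i in permutation] == list(range(len(reorder)))
print(permutation)  # [1, 3, 0, 2]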
@@ -703,7 +703,7 @@ def variable_summaries(var, name):
     mean = tf.reduce_mean(var)
     tf.scalar_summary('mean/' + name, mean)
     with tf.name_scope('stddev'):
-      stddev = tf.sqrt(tf.reduce_sum(tf.square(var - mean)))
+      stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
     tf.scalar_summary('sttdev/' + name, stddev)
     tf.scalar_summary('max/' + name, tf.reduce_max(var))
     tf.scalar_summary('min/' + name, tf.reduce_min(var))
@@ -75,7 +75,7 @@ def train():
     mean = tf.reduce_mean(var)
     tf.scalar_summary('mean/' + name, mean)
     with tf.name_scope('stddev'):
-      stddev = tf.sqrt(tf.reduce_sum(tf.square(var - mean)))
+      stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
     tf.scalar_summary('sttdev/' + name, stddev)
     tf.scalar_summary('max/' + name, tf.reduce_max(var))
     tf.scalar_summary('min/' + name, tf.reduce_min(var))
@@ -184,9 +184,9 @@ applies gradients.

 ### Gating Gradients

-Both `minimize()` and `compute_gradients()` accept a `gate_gradients`
-argument that controls the degree of parallelism during the application of
-the gradients.
+Both `minimize()` and `compute_gradients()` accept a `gate_gradients` argument
+that controls the degree of parallelism during the application of the
+gradients.

 The possible values are: `GATE_NONE`, `GATE_OP`, and `GATE_GRAPH`.

@@ -258,7 +258,3 @@ Use `get_slot_names()` to get the list of slot names created by the
 - - -

 #### `tf.train.Optimizer.get_name()` {#Optimizer.get_name}
-
-
-
-
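A hedged usage sketch of the gate_gradients argument the doc paragraph above describes, against the tf.train.Optimizer API of this era:

# GATE_OP is the default; GATE_NONE maximizes parallelism at the cost of
# reproducibility; GATE_GRAPH computes all gradients before any are applied.
import tensorflow as tf

v = tf.Variable(3.0)
loss = tf.square(v)
opt = tf.train.GradientDescentOptimizer(learning_rate=0.1)
train_op = opt.minimize(loss, gate_gradients=tf.train.Optimizer.GATE_NONE)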
@@ -35,7 +35,7 @@ the same shape in order for this function to work.

 ##### Returns:

-  A l X l matrix represeting the confusion matrix, where l in the number of
+  A k X k matrix represeting the confusion matrix, where k is the number of
   possible labels in the classification task.

 ##### Raises:
@@ -204,9 +204,9 @@ applies gradients.

 ### Gating Gradients

-Both `minimize()` and `compute_gradients()` accept a `gate_gradients`
-argument that controls the degree of parallelism during the application of
-the gradients.
+Both `minimize()` and `compute_gradients()` accept a `gate_gradients` argument
+that controls the degree of parallelism during the application of the
+gradients.

 The possible values are: `GATE_NONE`, `GATE_OP`, and `GATE_GRAPH`.

@@ -3771,5 +3771,3 @@ Generates a checkpoint state proto.
     CheckpointState proto with model_checkpoint_path and
     all_model_checkpoint_paths updated to either absolute paths or
     relative paths to the current save_dir.
-
-
@@ -8,9 +8,10 @@ github source.
 The TensorFlow Python API supports Python 2.7 and Python 3.3+.

 The GPU version (Linux & Mac OS X only) works best with Cuda Toolkit 7.5 and
-cuDNN v4. other versions are supported (Cuda toolkit >= 7.0 and cuDNN 6.5(v2),
-7.0(v3), v5) only when installing from sources. Please see [Cuda installation]
-(#optional-install-cuda-gpus-on-linux) for details.
+cuDNN v4. other versions are supported (Cuda toolkit >= 7.0 and
+cuDNN 6.5(v2), 7.0(v3), v5) only when installing from sources.
+Please see [Cuda installation](#optional-install-cuda-gpus-on-linux)
+for details.

 ## Overview
@@ -86,7 +86,7 @@ def variable_summaries(var, name):
     mean = tf.reduce_mean(var)
     tf.scalar_summary('mean/' + name, mean)
     with tf.name_scope('stddev'):
-      stddev = tf.sqrt(tf.reduce_sum(tf.square(var - mean)))
+      stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
     tf.scalar_summary('sttdev/' + name, stddev)
     tf.scalar_summary('max/' + name, tf.reduce_max(var))
     tf.scalar_summary('min/' + name, tf.reduce_min(var))
@@ -71,6 +71,34 @@ tf_py_test(
     ],
 )

+tf_py_test(
+    name = "flags_test",
+    size = "small",
+    srcs = ["platform/flags_test.py"],
+    additional_deps = [
+        ":platform",
+        ":platform_test",
+    ],
+    tags = [
+        "manual",
+        "notap",
+    ],
+)
+
+tf_py_test(
+    name = "app_test",
+    size = "small",
+    srcs = ["platform/app_test.py"],
+    additional_deps = [
+        ":platform",
+        ":platform_test",
+    ],
+    tags = [
+        "manual",
+        "notap",
+    ],
+)
+
 cc_library(
     name = "numpy_lib",
     srcs = ["lib/core/numpy.cc"],
@@ -58,7 +58,12 @@ class ClipTest(tf.test.TestCase):
       ans = tf.clip_by_norm(x, clip_norm)
       tf_ans = ans.eval()

+      clip_tensor = tf.constant(4.0)
+      ans = tf.clip_by_norm(x, clip_norm)
+      tf_ans_tensor = ans.eval()
+
     self.assertAllClose(np_ans, tf_ans)
+    self.assertAllClose(np_ans, tf_ans_tensor)

   def testClipByNormNotClipped(self):
     # No norm clipping when clip_norm >= 5
@@ -148,6 +153,28 @@ class ClipTest(tf.test.TestCase):
     self.assertAllClose(np_ans_0, tf_ans_1)
     self.assertAllClose(np_ans_1, tf_ans_2)

+  def testClipByGlobalNormClippedTensor(self):
+    # Norm clipping when clip_norm < 5
+    with self.test_session():
+      x0 = tf.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
+      x1 = tf.constant([1.0, -2.0])
+      # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
+      clip_norm = tf.constant(4.0)
+
+      # Answers are the original tensors scaled by 4.0/5.0
+      np_ans_0 = [[-1.6, 0.0, 0.0],
+                  [3.2, 0.0, 0.0]]
+      np_ans_1 = [0.8, -1.6]
+
+      ans, norm = tf.clip_by_global_norm((x0, x1), clip_norm)
+      tf_ans_1 = ans[0].eval()
+      tf_ans_2 = ans[1].eval()
+      tf_norm = norm.eval()
+
+    self.assertAllClose(tf_norm, 5.0)
+    self.assertAllClose(np_ans_0, tf_ans_1)
+    self.assertAllClose(np_ans_1, tf_ans_2)
+
   def testClipByGlobalNormSupportsNone(self):
     # Norm clipping when clip_norm < 5
     with self.test_session():
@@ -259,6 +286,19 @@ class ClipTest(tf.test.TestCase):

     self.assertAllClose(np_ans, tf_ans)

+  def testClipByAverageNormClippedTensor(self):
+    # Norm clipping when average clip_norm < 0.83333333
+    with self.test_session():
+      x = tf.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
+      # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
+      np_ans = [[-2.88, 0.0, 0.0],
+                [3.84, 0.0, 0.0]]
+      clip_norm = tf.constant(0.8)
+      ans = tf.clip_by_average_norm(x, clip_norm)
+      tf_ans = ans.eval()
+
+    self.assertAllClose(np_ans, tf_ans)
+
   def testClipByAverageNormNotClipped(self):
     # No norm clipping when average clip_norm >= 0.83333333
     with self.test_session():
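The arithmetic in the global-norm test above, checked with plain NumPy: the global norm of x0 and x1 is sqrt(4 + 16 + 1 + 4) = 5, so with clip_norm = 4 every element is scaled by 4/5.

import numpy as np

x0 = np.array([[-2.0, 0.0, 0.0], [4.0, 0.0, 0.0]])
x1 = np.array([1.0, -2.0])
global_norm = np.sqrt(np.sum(x0 ** 2) + np.sum(x1 ** 2))
print(global_norm)             # 5.0
print(x0 * 4.0 / global_norm)  # [[-1.6, 0., 0.], [3.2, 0., 0.]]
print(x1 * 4.0 / global_norm)  # [0.8, -1.6]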
@@ -47,7 +47,7 @@ class ReverseSequenceTest(tf.test.TestCase):
     self._testReverseSequence(x, batch_dim, seq_dim, seq_lengths,
                               truth, False, expected_err_re)

-  def _testBasic(self, dtype):
+  def _testBasic(self, dtype, len_dtype=np.int64):
     x = np.asarray([
         [[1, 2, 3, 4], [5, 6, 7, 8]],
         [[9, 10, 11, 12], [13, 14, 15, 16]],
@@ -56,7 +56,7 @@ class ReverseSequenceTest(tf.test.TestCase):
     x = x.transpose([2, 1, 0, 3, 4])  # permute axes 0 <=> 2

     # reverse dim 2 up to (0:3, none, 0:4) along dim=0
-    seq_lengths = np.asarray([3, 0, 4], dtype=np.int64)
+    seq_lengths = np.asarray([3, 0, 4], dtype=len_dtype)

     truth_orig = np.asarray(
         [[[3, 2, 1, 4], [7, 6, 5, 8]],  # reverse 0:3
@@ -70,6 +70,9 @@ class ReverseSequenceTest(tf.test.TestCase):
     batch_dim = 2
     self._testBothReverseSequence(x, batch_dim, seq_dim, seq_lengths, truth)

+  def testSeqLenghtInt32(self):
+    self._testBasic(np.float32, np.int32)
+
   def testFloatBasic(self):
     self._testBasic(np.float32)
@@ -743,5 +743,28 @@ class SparseMinimumMaximumTest(test_util.TensorFlowTestCase):
         tf.sparse_maximum(sp_zero, sp_one).eval()


+class SparseTransposeTest(tf.test.TestCase):
+
+  def _SparseTensorPlaceholder(self):
+    return tf.SparseTensor(
+        tf.placeholder(tf.int64),
+        tf.placeholder(tf.float64),
+        tf.placeholder(tf.int64))
+
+  def testTranspose(self):
+    with self.test_session(use_gpu=False) as sess:
+      np.random.seed(1618)
+      shapes = [np.random.randint(1, 10, size=rank) for rank in range(1, 6)]
+      for shape in shapes:
+        for dtype in [np.int32, np.int64, np.float32, np.float64]:
+          dn_input = np.random.randn(*shape).astype(dtype)
+          rank = tf.rank(dn_input).eval()
+          perm = np.random.choice(rank, rank, False)
+          sp_input, unused_a_nnz = _sparsify(dn_input)
+          sp_trans = tf.sparse_transpose(sp_input, perm=perm)
+          dn_trans = tf.sparse_tensor_to_dense(sp_trans).eval()
+          expected_trans = tf.transpose(dn_input, perm=perm).eval()
+          self.assertAllEqual(dn_trans, expected_trans)
+
 if __name__ == "__main__":
   googletest.main()
@@ -206,7 +206,7 @@ def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None):
   # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
   scale = clip_norm * math_ops.minimum(
       1.0 / use_norm,
-      constant_op.constant(1.0 / clip_norm, dtype=use_norm.dtype))
+      constant_op.constant(1.0, dtype=use_norm.dtype) / clip_norm)

   values = [
       ops.convert_to_tensor(
@@ -268,7 +268,7 @@ def clip_by_average_norm(t, clip_norm, name=None):
       math_ops.reduce_sum(t * t, math_ops.range(array_ops.rank(t))))
   tclip = array_ops.identity(
       t * clip_norm * math_ops.minimum(
-          l2norm_inv * n_element, constant_op.constant(1.0 / clip_norm)),
+          l2norm_inv * n_element, constant_op.constant(1.0) / clip_norm),
       name=name)

   return tclip
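A hedged reading of why both divisions were moved outside the constant() call: constant_op.constant(1.0 / clip_norm) only works when clip_norm is a plain Python number, whereas constant_op.constant(1.0) / clip_norm is an ordinary tensor division, so clip_norm may itself be a tensor, as the new *ClippedTensor tests exercise.

import tensorflow as tf

clip_norm = tf.constant(4.0)           # a tensor, not a Python float
scale = tf.constant(1.0) / clip_norm   # fine: elementwise tensor division
# tf.constant(1.0 / clip_norm)         # would fail: tf.constant cannot wrap
#                                      # the Tensor that 1.0 / clip_norm yields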
@@ -80,6 +80,7 @@ Queues](../../how_tos/threading_and_queues/index.md).
 @@FIFOQueue
 @@PaddingFIFOQueue
 @@RandomShuffleQueue
+@@PriorityQueue

 ## Dealing with the filesystem
@@ -19,6 +19,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import numbers
+
 import numpy as np

 from tensorflow.python.framework import common_shapes
@@ -1131,7 +1133,7 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):
   """
   with ops.name_scope(name, "dropout", [x]) as name:
     x = ops.convert_to_tensor(x, name="x")
-    if isinstance(keep_prob, float) and not 0 < keep_prob <= 1:
+    if isinstance(keep_prob, numbers.Real) and not 0 < keep_prob <= 1:
       raise ValueError("keep_prob must be a scalar tensor or a float in the "
                        "range (0, 1], got %g" % keep_prob)
     keep_prob = ops.convert_to_tensor(keep_prob,
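A small illustration of why the isinstance check was widened: an integer keep_prob is a numbers.Real but not a float, so the old check let invalid integer values bypass the range validation entirely.

import numbers

print(isinstance(2, float))         # False: the old check skipped int values
print(isinstance(2, numbers.Real))  # True: now validated like any number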
@@ -40,6 +40,7 @@ dimension, and dense along all other dimensions.
 @@sparse_retain
 @@sparse_reset_shape
 @@sparse_fill_empty_rows
+@@sparse_transpose

 ## Reduction
 @@sparse_reduce_sum
@@ -1582,3 +1583,49 @@ def _SparseSparseMaximumMinimumShape(op):  # pylint: disable=invalid-name
   op.inputs[4].get_shape().assert_has_rank(1)  # b_values
   op.inputs[5].get_shape().assert_has_rank(1)  # b_shape
   return [tensor_shape.unknown_shape(2), tensor_shape.unknown_shape(1)]
+
+
+def sparse_transpose(sp_input, perm=None, name=None):
+  """Transposes a `SparseTensor`
+
+  The returned tensor's dimension i will correspond to the input dimension
+  `perm[i]`. If `perm` is not given, it is set to (n-1...0), where n is
+  the rank of the input tensor. Hence by default, this operation performs a
+  regular matrix transpose on 2-D input Tensors.
+
+  For example, if `sp_input` has shape `[4, 5]` and `indices` / `values`:
+
+      [0, 3]: b
+      [0, 1]: a
+      [3, 1]: d
+      [2, 0]: c
+
+  then the output will be a `SparseTensor` of shape `[5, 4]` and
+  `indices` / `values`:
+
+      [0, 2]: c
+      [1, 0]: a
+      [1, 3]: d
+      [3, 0]: b
+
+  Args:
+    sp_input: The input `SparseTensor`.
+    perm: A permutation of the dimensions of `sp_input`.
+    name: A name prefix for the returned tensors (optional)
+  Returns:
+    A transposed `SparseTensor`.
+
+  Raises:
+    TypeError: If `sp_input` is not a `SparseTensor`.
+  """
+  with ops.op_scope([sp_input], name, "SparseTranspose") as name:
+    if perm is None:
+      rank = array_ops.rank(sp_input)
+      perm = (rank - 1) - math_ops.range(0, rank, 1)
+    indices = sp_input.indices
+    transposed_indices = array_ops.transpose(array_ops.gather(array_ops.transpose(indices), perm))
+    dense_shape = sp_input.shape
+    transposed_dense_shape = array_ops.gather(dense_shape, perm)
+    transposed_st = ops.SparseTensor(transposed_indices, sp_input.values, transposed_dense_shape)
+    transposed_st = sparse_reorder(transposed_st)
+    return transposed_st
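A usage sketch of the new sparse_transpose, mirroring the docstring's [4, 5] -> [5, 4] example (hedged: written against the Session-based API of this release):

import tensorflow as tf

sp = tf.SparseTensor(
    indices=[[0, 1], [0, 3], [2, 0], [3, 1]],
    values=["a", "b", "c", "d"],
    shape=[4, 5])
sp_t = tf.sparse_transpose(sp)  # perm defaults to reversed dimensions
with tf.Session() as sess:
  # Indices come back column-for-row swapped and reordered to row-major,
  # thanks to the final sparse_reorder call in the implementation.
  print(sess.run(sp_t.indices))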
@@ -25,6 +25,6 @@ from tensorflow.python.platform import flags

 def run(main=None):
   f = flags.FLAGS
-  f._parse_flags()
+  flags_passthrough = f._parse_flags()
   main = main or sys.modules['__main__'].main
-  sys.exit(main(sys.argv))
+  sys.exit(main(sys.argv[:1] + flags_passthrough))
tensorflow/python/platform/app_test.py (new file)
@@ -0,0 +1,45 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for our flags implementation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+from tensorflow.python.platform import app
+from tensorflow.python.platform import flags
+
+FLAGS = flags.FLAGS
+flags.DEFINE_boolean('myflag', False, '')
+
+def main(argv):
+  if (len(argv) != 3):
+    print("Length of argv was not 3: ", argv)
+    sys.exit(-1)
+
+  if argv[1] != "--passthrough":
+    print("--passthrough argument not in argv")
+    sys.exit(-1)
+
+  if argv[2] != "extra":
+    print("'extra' argument not in argv")
+    sys.exit(-1)
+
+
+if __name__ == '__main__':
+  sys.argv.extend(["--myflag", "--passthrough", "extra"])
+  app.run()
@@ -30,10 +30,11 @@ class _FlagValues(object):
     self.__dict__['__parsed'] = False

   def _parse_flags(self):
-    result, _ = _global_parser.parse_known_args()
+    result, unparsed = _global_parser.parse_known_args()
     for flag_name, val in vars(result).items():
       self.__dict__['__flags'][flag_name] = val
     self.__dict__['__parsed'] = True
+    return unparsed

   def __getattr__(self, name):
     """Retrieves the 'value' attribute of the flag --name."""
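The behavior relied on above, sketched with plain argparse: parse_known_args() returns the flags it recognized plus the leftover arguments, which app.run() now forwards to main() unchanged.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--myflag', action='store_true')
result, unparsed = parser.parse_known_args(
    ['--myflag', '--passthrough', 'extra'])
print(result.myflag)  # True
print(unparsed)       # ['--passthrough', 'extra']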
@@ -150,7 +150,7 @@ def replica_device_setter(ps_tasks=0, ps_device="/job:ps",
   else:
     cluster_spec = server_lib.ClusterSpec(cluster).as_dict()
     # Get ps_job_name from ps_device by striping "/job:".
-    ps_job_name = ps_device.lstrip("/job:")
+    ps_job_name = pydev.DeviceSpec.from_string(ps_device).job
     if ps_job_name not in cluster_spec or cluster_spec[ps_job_name] is None:
       return None
     ps_tasks = len(cluster_spec[ps_job_name])
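A plausible reading of why the lstrip() call was a bug: str.lstrip strips a set of characters, not a prefix, so it mangles any job name that starts with one of the letters in "/job:".

print("/job:ps".lstrip("/job:"))    # 'ps'  - correct only by accident
print("/job:boss".lstrip("/job:"))  # 'ss'  - leading 'b' and 'o' are eaten
# Parsing the device spec properly, as the replacement line does, avoids this.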
@@ -78,6 +78,23 @@ class DeviceSetterTest(tf.test.TestCase):
       self.assertDeviceEqual("/job:moon/task:1", w.initializer.device)
       self.assertDeviceEqual("/job:sun", a.device)

+  def testPS2TasksWithCPUConstraint(self):
+    cluster_spec = tf.train.ClusterSpec({
+        "sun": ["sun0:2222", "sun1:2222", "sun2:2222"],
+        "moon": ["moon0:2222", "moon1:2222"]})
+
+    with tf.device(tf.train.replica_device_setter(
+        ps_device="/job:moon/cpu:0", worker_device="/job:sun",
+        cluster=cluster_spec.as_cluster_def())):
+      v = tf.Variable([1, 2])
+      w = tf.Variable([2, 1])
+      a = v + w
+      self.assertDeviceEqual("/job:moon/task:0/cpu:0", v.device)
+      self.assertDeviceEqual("/job:moon/task:0/cpu:0", v.initializer.device)
+      self.assertDeviceEqual("/job:moon/task:1/cpu:0", w.device)
+      self.assertDeviceEqual("/job:moon/task:1/cpu:0", w.initializer.device)
+      self.assertDeviceEqual("/job:sun", a.device)
+

 if __name__ == "__main__":
   tf.test.main()
@@ -53,7 +53,7 @@ limitations under the License.
     }
     .card .card-bottom-row {
       position: absolute;
-      left: 50px;
+      left: 75px;
       bottom: 0;
       padding-right: 10px;
     }
@@ -71,6 +71,14 @@ limitations under the License.
       display: block;
     }

+    .log-option-button {
+      position: absolute;
+      left: 25px;
+      bottom: 0px;
+      color: #2196F3;
+      display: block;
+    }
+
     #content-container{
       display: block;
     }
@@ -145,6 +145,13 @@ The #center div contains tf-line-charts embedded inside tf-collapsable-panes.
             icon="fullscreen"
             on-tap="toggleSelected"
           ></paper-icon-button>
+          <paper-icon-button
+            class="log-option-button"
+            icon="line-weight"
+            on-tap="toggleYScale"
+            title="Toggle y-axis scale (log, linear)"
+          ></paper-icon-button>
+
         </div>
         <template is="dom-if" if="[[_showDownloadLinks]]">
           <div class="card-bottom-row">
@@ -242,6 +249,19 @@ The #center div contains tf-line-charts embedded inside tf-collapsable-panes.
             chartScaffold.chart().redraw();
           }
         },

+        toggleYScale: function(e) {
+          var currentTarget = Polymer.dom(e.currentTarget);
+
+          var b = currentTarget.parentNode.querySelector('.log-option-button');
+          var c = currentTarget.parentNode.querySelector('vz-line-chart');
+          if (c !== null) {
+            c.yScaleType = c.yScaleType === 'log' ? 'linear' : 'log';
+            b.icon = c.yScaleType === 'log' ? 'line-weight' : 'reorder';
+            c.redraw();
+          }
+        },
+
         validRuns: function(tag, runsChange, run2tagChange) {
           var _this = this;
           var result = this.selectedRuns.filter(function(r) {
@@ -191,6 +191,16 @@ such as different X scales (linear and temporal), tooltips and smoothing.
           value: 'step'
         },

+        /**
+         * The scale for the y-axis. Allows:
+         * - "linear" - linear scale (Plottable.Scales.Linear)
+         * - "log" - modified-log scale (Plottable.Scales.ModifiedLog)
+         */
+        yScaleType: {
+          type: String,
+          value: 'linear'
+        },
+
         /**
          * Change how the tooltip is sorted. Allows:
          * - "default" - Sort the tooltip by input order.
@@ -228,7 +238,7 @@ such as different X scales (linear and temporal), tooltips and smoothing.
         }
       },
       observers: [
-        "_makeChart(xType, colorScale, _attached)",
+        "_makeChart(xType, yScaleType, colorScale, _attached)",
         "_reloadFromCache(_chart)",
         "_smoothingChanged(smoothingEnabled, smoothingWeight, _chart)",
         "_tooltipSortingMethodChanged(tooltipSortingMethod, _chart)",
@@ -284,7 +294,7 @@ such as different X scales (linear and temporal), tooltips and smoothing.
         this.scopeSubtree(this.$.tooltip, true);
         this.scopeSubtree(this.$.chartsvg, true);
       },
-      _makeChart: function(xType, colorScale, _attached) {
+      _makeChart: function(xType, yScaleType, colorScale, _attached) {
         if (this._makeChartAsyncCallbackId === null) {
           this.cancelAsync(this._makeChartAsyncCallbackId);
         }
@@ -294,7 +304,7 @@ such as different X scales (linear and temporal), tooltips and smoothing.
         if (!this._attached) return;
         if (this._chart) this._chart.destroy();
         var tooltip = d3.select(this.$.tooltip);
-        var chart = new VZ.LineChart(xType, colorScale, tooltip);
+        var chart = new VZ.LineChart(xType, yScaleType, colorScale, tooltip);
         var svg = d3.select(this.$.chartsvg);
         chart.renderTo(svg);
         this._chart = chart;
@@ -49,7 +49,7 @@ module VZ {
     private targetSVG: d3.Selection<any>;

     constructor(
-        xType: string, colorScale: Plottable.Scales.Color,
+        xType: string, yScaleType: string, colorScale: Plottable.Scales.Color,
         tooltip: d3.Selection<any>) {
       this.seriesNames = [];
       this.name2datasets = {};
@@ -63,10 +63,10 @@ module VZ {
       // need to do a single bind, so we can deregister the callback from
       // old Plottable.Datasets. (Deregistration is done by identity checks.)
       this.onDatasetChanged = this._onDatasetChanged.bind(this);
-      this.buildChart(xType);
+      this.buildChart(xType, yScaleType);
     }

-    private buildChart(xType: string) {
+    private buildChart(xType: string, yScaleType: string) {
       if (this.outer) {
         this.outer.destroy();
       }
@@ -75,7 +75,7 @@ module VZ {
       this.xScale = xComponents.scale;
       this.xAxis = xComponents.axis;
       this.xAxis.margin(0).tickLabelPadding(3);
-      this.yScale = new Plottable.Scales.Linear();
+      this.yScale = LineChart.getYScaleFromType(yScaleType);
       this.yAxis = new Plottable.Axes.Numeric(this.yScale, 'left');
       let yFormatter = VZ.ChartHelpers.multiscaleFormatter(
           VZ.ChartHelpers.Y_AXIS_FORMATTER_PRECISION);
@@ -450,6 +450,16 @@ module VZ {
       return this.name2datasets[name];
     }

+    static getYScaleFromType(yScaleType: string): Plottable.QuantitativeScale<number> {
+      if (yScaleType === 'log') {
+        return new Plottable.Scales.ModifiedLog();
+      } else if (yScaleType === 'linear') {
+        return new Plottable.Scales.Linear();
+      } else {
+        throw new Error('Unrecognized yScale type ' + yScaleType);
+      }
+    }
+
     /**
      * Update the selected series on the chart.
      */
@@ -138,7 +138,12 @@ def main(unused_argv=None):

   status_bar.SetupStatusBarInsideGoogle('TensorBoard %s' % tag, FLAGS.port)
   print('Starting TensorBoard %s on port %d' % (tag, FLAGS.port))
-  print('(You can navigate to http://%s:%d)' % (FLAGS.host, FLAGS.port))
+
+  if FLAGS.host == "0.0.0.0":
+    print('(You can navigate to http://%s:%d)' % (socket.gethostbyname(socket.gethostname()), FLAGS.port))
+  else:
+    print('(You can navigate to http://%s:%d)' % (FLAGS.host, FLAGS.port))
+
   tb_server.serve_forever()
@@ -24,4 +24,4 @@ ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
 ENV CUDA_TOOLKIT_PATH /usr/local/cuda
 ENV CUDNN_INSTALL_PATH /usr/lib/x86_64-linux-gnu
 ENV TF_NEED_CUDA 1
-ENV CUDA_COMPUTE_CAPABILITIES 3.0,5.2
+ENV TF_CUDA_COMPUTE_CAPABILITIES 3.0,5.2
@@ -323,7 +323,21 @@ fi

 # Apply the final image name and tag
 FINAL_IMG="${FINAL_IMAGE_NAME}:${FINAL_TAG}"
-docker tag -f "${IMG}" "${FINAL_IMG}" || \
+
+DOCKER_VER=$(docker version | grep Version | head -1 | awk '{print $NF}')
+if [[ -z "${DOCKER_VER}" ]]; then
+  die "ERROR: Failed to determine docker version"
+fi
+DOCKER_MAJOR_VER=$(echo "${DOCKER_VER}" | cut -d. -f 1)
+DOCKER_MINOR_VER=$(echo "${DOCKER_VER}" | cut -d. -f 2)
+
+FORCE_TAG=""
+if [[ "${DOCKER_MAJOR_VER}" -le 1 ]] && \
+   [[ "${DOCKER_MINOR_VER}" -le 9 ]]; then
+  FORCE_TAG="--force"
+fi
+
+docker tag ${FORCE_TAG} "${IMG}" "${FINAL_IMG}" || \
     die "Failed to tag intermediate docker image ${IMG} as ${FINAL_IMG}"

 echo ""
@ -2,12 +2,14 @@
|
||||
|
||||
load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
|
||||
|
||||
# If TensorFlow is linked as a submodule, path_prefix is TensorFlow's directory
|
||||
# within the workspace (e.g. "tensorflow/"), and tf_repo_name is the name of the
|
||||
# local_repository rule (e.g. "@tf").
|
||||
# If TensorFlow is linked as a submodule.
|
||||
# path_prefix and tf_repo_name are no longer used.
|
||||
def tf_workspace(path_prefix = "", tf_repo_name = ""):
|
||||
cuda_configure(name = "local_config_cuda")
|
||||
|
||||
if path_prefix:
|
||||
print("path_prefix was specified to tf_workspace but is no longer used and will be removed in the future.")
|
||||
if tf_repo_name:
|
||||
print("tf_repo_name was specified to tf_workspace but is no longer used and will be removed in the future.")
|
||||
# These lines need to be changed when updating Eigen. They are parsed from
|
||||
# this file by the cmake and make builds to determine the eigen version and hash.
|
||||
eigen_version = "9e1b48c333aa"
|
||||
@ -18,7 +20,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
|
||||
url = "https://bitbucket.org/eigen/eigen/get/" + eigen_version + ".tar.gz",
|
||||
sha256 = eigen_sha256,
|
||||
strip_prefix = "eigen-eigen-" + eigen_version,
|
||||
build_file = path_prefix + "eigen.BUILD",
|
||||
build_file = str(Label("//:eigen.BUILD")),
|
||||
)
|
||||

  native.git_repository(
@ -37,7 +39,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
      name = "farmhash_archive",
      url = "https://github.com/google/farmhash/archive/34c13ddfab0e35422f4c3979f360635a8c050260.zip",
      sha256 = "e3d37a59101f38fd58fb799ed404d630f0eee18bfc2a2433910977cc8fea9c28",
      build_file = path_prefix + "farmhash.BUILD",
      build_file = str(Label("//:farmhash.BUILD")),
  )

  native.bind(
@ -56,28 +58,28 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
      name = "jpeg_archive",
      url = "http://www.ijg.org/files/jpegsrc.v9a.tar.gz",
      sha256 = "3a753ea48d917945dd54a2d97de388aa06ca2eb1066cbfdc6652036349fe05a7",
      build_file = path_prefix + "jpeg.BUILD",
      build_file = str(Label("//:jpeg.BUILD")),
  )

  native.new_http_archive(
      name = "png_archive",
      url = "https://github.com/glennrp/libpng/archive/v1.2.53.zip",
      sha256 = "c35bcc6387495ee6e757507a68ba036d38ad05b415c2553b3debe2a57647a692",
      build_file = path_prefix + "png.BUILD",
      build_file = str(Label("//:png.BUILD")),
  )

  native.new_http_archive(
      name = "gif_archive",
      url = "http://ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
      sha256 = "34a7377ba834397db019e8eb122e551a49c98f49df75ec3fcc92b9a794a4f6d1",
      build_file = path_prefix + "gif.BUILD",
      build_file = str(Label("//:gif.BUILD")),
  )

  native.new_http_archive(
      name = "six_archive",
      url = "https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz#md5=34eed507548117b2ab523ab14b2f8b55",
      sha256 = "105f8d68616f8248e24bf0e9372ef04d3cc10104f1980f54d57b2ce73a5ad56a",
      build_file = path_prefix + "six.BUILD",
      build_file = str(Label("//:six.BUILD")),
  )

  native.bind(
@ -95,7 +97,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
      name = "gmock_archive",
      url = "http://pkgs.fedoraproject.org/repo/pkgs/gmock/gmock-1.7.0.zip/073b984d8798ea1594f5e44d85b20d66/gmock-1.7.0.zip",
      sha256 = "26fcbb5925b74ad5fc8c26b0495dfc96353f4d553492eb97e85a8a6d2f43095b",
      build_file = path_prefix + "gmock.BUILD",
      build_file = str(Label("//:gmock.BUILD")),
  )

  native.bind(
@ -110,7 +112,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):

  native.bind(
      name = "python_headers",
      actual = tf_repo_name + "//util/python:python_headers",
      actual = str(Label("//util/python:python_headers")),
  )
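`native.bind` sets up an alias under `//external`, so consumers depend on `//external:python_headers` without knowing where the headers actually live; after this change the alias points into TensorFlow's own repository instead of routing through the caller-supplied `tf_repo_name`. A hypothetical consumer (target and file names assumed for illustration):

```python
# In some BUILD file; names here are illustrative, not from this commit.
cc_library(
    name = "my_python_extension",
    srcs = ["my_python_extension.cc"],
    deps = ["//external:python_headers"],  # resolved via the bind() above
)
```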

  # grpc expects //external:protobuf_clib and //external:protobuf_compiler
@ -130,7 +132,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
      commit = "d7ff4ff40071d2b486a052183e3e9f9382afb745",
      init_submodules = True,
      remote = "https://github.com/grpc/grpc.git",
      build_file = path_prefix + "grpc.BUILD",
      build_file = str(Label("//:grpc.BUILD")),
  )

  # protobuf expects //external:grpc_cpp_plugin to point to grpc's
@ -149,7 +151,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
      name = "jsoncpp_git",
      remote = "https://github.com/open-source-parsers/jsoncpp.git",
      commit = "11086dd6a7eba04289944367ca82cea71299ed70",
      build_file = path_prefix + "jsoncpp.BUILD",
      build_file = str(Label("//:jsoncpp.BUILD")),
  )

  native.bind(
@ -167,7 +169,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
      name = "nanopb_git",
      commit = "1251fa1",
      remote = "https://github.com/nanopb/nanopb.git",
      build_file = path_prefix + "nanopb.BUILD",
      build_file = str(Label("//:nanopb.BUILD")),
  )

  native.bind(
@ -179,26 +181,26 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
      name = "avro_archive",
      url = "http://www-us.apache.org/dist/avro/avro-1.8.0/cpp/avro-cpp-1.8.0.tar.gz",
      sha256 = "ec6e2ec957e95ca07f70cc25f02f5c416f47cb27bd987a6ec770dcbe72527368",
      build_file = path_prefix + "avro.BUILD",
      build_file = str(Label("//:avro.BUILD")),
  )

  native.new_http_archive(
      name = "boost_archive",
      url = "http://pilotfiber.dl.sourceforge.net/project/boost/boost/1.61.0/boost_1_61_0.tar.gz",
      sha256 = "a77c7cc660ec02704c6884fbb20c552d52d60a18f26573c9cee0788bf00ed7e6",
      build_file = path_prefix + "boost.BUILD",
      build_file = str(Label("//:boost.BUILD")),
  )

  native.new_http_archive(
      name = "bzip2_archive",
      url = "http://www.bzip.org/1.0.6/bzip2-1.0.6.tar.gz",
      sha256 = "a2848f34fcd5d6cf47def00461fcb528a0484d8edef8208d6d2e2909dc61d9cd",
      build_file = path_prefix + "bzip2.BUILD",
      build_file = str(Label("//:bzip2.BUILD")),
  )

  native.new_http_archive(
      name = "zlib_archive",
      url = "http://zlib.net/zlib-1.2.8.tar.gz",
      sha256 = "36658cb768a54c1d4dec43c3116c27ed893e88b02ecfcb44f2166f9c0b7f2a0d",
      build_file = path_prefix + "zlib.BUILD",
      build_file = str(Label("//:zlib.BUILD")),
  )
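Each `new_http_archive` above downloads the archive, verifies it against the pinned sha256, and overlays the named BUILD file onto the extracted sources; the rule's `name` becomes the repository part of the label. A hypothetical dependency on the zlib archive (the `:zlib` target name would come from `zlib.BUILD` and is assumed here):

```python
# In some BUILD file; the target name inside @zlib_archive is an assumption.
cc_library(
    name = "compresses_things",
    srcs = ["compresses_things.cc"],
    deps = ["@zlib_archive//:zlib"],
)
```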
40 third_party/gpus/cuda_configure.bzl vendored
@ -3,19 +3,25 @@

`cuda_configure` depends on the following environment variables:

  * `ENABLE_CUDA`: Whether to enable building with CUDA.
  * `CC`: The GCC host compiler path
  * `TF_NEED_CUDA`: Whether to enable building with CUDA.
  * `GCC_HOST_COMPILER_PATH`: The GCC host compiler path.
  * `CUDA_TOOLKIT_PATH`: The path to the CUDA toolkit. Default is
    `/usr/local/cuda`.
  * `CUDA_VERSION`: The version of the CUDA toolkit. If this is blank, then
  * `TF_CUDA_VERSION`: The version of the CUDA toolkit. If this is blank, then
    use the system default.
  * `CUDNN_VERSION`: The version of the cuDNN library.
  * `TF_CUDNN_VERSION`: The version of the cuDNN library.
  * `CUDNN_INSTALL_PATH`: The path to the cuDNN library. Default is
    `/usr/local/cuda`.
  * `CUDA_COMPUTE_CAPABILITIES`: The CUDA compute capabilities. Default is
  * `TF_CUDA_COMPUTE_CAPABILITIES`: The CUDA compute capabilities. Default is
    `3.5,5.2`.
"""
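The net effect of this hunk is a rename of the user-facing configuration variables to an explicit, mostly `TF_`-prefixed namespace. Collected in one place for reference (a summary of the docstring lines above, not code from the commit):

```python
# Old name -> new name, as documented in the docstring above.
# CUDA_TOOLKIT_PATH and CUDNN_INSTALL_PATH keep their old names.
_ENV_VAR_RENAMES = {
    "ENABLE_CUDA": "TF_NEED_CUDA",
    "CC": "GCC_HOST_COMPILER_PATH",
    "CUDA_VERSION": "TF_CUDA_VERSION",
    "CUDNN_VERSION": "TF_CUDNN_VERSION",
    "CUDA_COMPUTE_CAPABILITIES": "TF_CUDA_COMPUTE_CAPABILITIES",
}
```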

_GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH"
_CUDA_TOOLKIT_PATH = "CUDA_TOOLKIT_PATH"
_TF_CUDA_VERSION = "TF_CUDA_VERSION"
_TF_CUDNN_VERSION = "TF_CUDNN_VERSION"
_CUDNN_INSTALL_PATH = "CUDNN_INSTALL_PATH"
_TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES"

_DEFAULT_CUDA_VERSION = ""
_DEFAULT_CUDNN_VERSION = ""
@ -30,8 +36,8 @@ _DEFAULT_CUDA_COMPUTE_CAPABILITIES = ["3.5", "5.2"]
def find_cc(repository_ctx):
  """Find the C++ compiler."""
  cc_name = "gcc"
  if "CC" in repository_ctx.os.environ:
    cc_name = repository_ctx.os.environ["CC"].strip()
  if _GCC_HOST_COMPILER_PATH in repository_ctx.os.environ:
    cc_name = repository_ctx.os.environ[_GCC_HOST_COMPILER_PATH].strip()
  if not cc_name:
    cc_name = "gcc"
  if cc_name.startswith("/"):
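The compiler lookup now keys off `GCC_HOST_COMPILER_PATH` instead of the generic `CC`, and an empty override falls back to `gcc`. A pure-function sketch of the behavior above, where `environ` stands in for `repository_ctx.os.environ`:

```python
def _find_cc_sketch(environ):
  # Mirrors find_cc above: env override first, then the "gcc" default.
  cc_name = "gcc"
  if "GCC_HOST_COMPILER_PATH" in environ:
    cc_name = environ["GCC_HOST_COMPILER_PATH"].strip()
  if not cc_name:
    cc_name = "gcc"  # an empty override falls back to the default
  # A value starting with "/" is used as an absolute path; a bare name is
  # resolved against PATH by the code that follows this hunk.
  return cc_name
```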
@ -93,8 +99,8 @@ def _enable_cuda(repository_ctx):
def _cuda_toolkit_path(repository_ctx):
  """Finds the cuda toolkit directory."""
  cuda_toolkit_path = _DEFAULT_CUDA_TOOLKIT_PATH
  if "CUDA_TOOLKIT_PATH" in repository_ctx.os.environ:
    cuda_toolkit_path = repository_ctx.os.environ["CUDA_TOOLKIT_PATH"].strip()
  if _CUDA_TOOLKIT_PATH in repository_ctx.os.environ:
    cuda_toolkit_path = repository_ctx.os.environ[_CUDA_TOOLKIT_PATH].strip()
  if not repository_ctx.path(cuda_toolkit_path).exists:
    fail("Cannot find cuda toolkit path.")
  return cuda_toolkit_path
@ -103,8 +109,8 @@ def _cuda_toolkit_path(repository_ctx):
def _cudnn_install_basedir(repository_ctx):
  """Finds the cudnn install directory."""
  cudnn_install_path = _DEFAULT_CUDNN_INSTALL_PATH
  if "CUDNN_INSTALL_PATH" in repository_ctx.os.environ:
    cudnn_install_path = repository_ctx.os.environ["CUDNN_INSTALL_PATH"].strip()
  if _CUDNN_INSTALL_PATH in repository_ctx.os.environ:
    cudnn_install_path = repository_ctx.os.environ[_CUDNN_INSTALL_PATH].strip()
  if not repository_ctx.path(cudnn_install_path).exists:
    fail("Cannot find cudnn install path.")
  return cudnn_install_path
@ -112,25 +118,25 @@ def _cudnn_install_basedir(repository_ctx):

def _cuda_version(repository_ctx):
  """Detects the cuda version."""
  if "CUDA_VERSION" in repository_ctx.os.environ:
    return repository_ctx.os.environ["CUDA_VERSION"].strip()
  if _TF_CUDA_VERSION in repository_ctx.os.environ:
    return repository_ctx.os.environ[_TF_CUDA_VERSION].strip()
  else:
    return ""


def _cudnn_version(repository_ctx):
  """Detects the cudnn version."""
  if "CUDNN_VERSION" in repository_ctx.os.environ:
    return repository_ctx.os.environ["CUDNN_VERSION"].strip()
  if _TF_CUDNN_VERSION in repository_ctx.os.environ:
    return repository_ctx.os.environ[_TF_CUDNN_VERSION].strip()
  else:
    return ""

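Both helpers return `""` when the corresponding `TF_*` variable is unset, which downstream code can treat as "use whatever the system provides". A hypothetical consumer of that convention (helper name and naming scheme are assumptions, not from this commit):

```python
def _versioned_lib_name_sketch(lib, version):
  """Hypothetical helper: appends a version suffix only when one is pinned."""
  if version:
    return "lib%s.so.%s" % (lib, version)  # e.g. libcudart.so.7.5
  return "lib%s.so" % lib  # unpinned: defer to the system default
```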
def _compute_capabilities(repository_ctx):
  """Returns a list of strings representing cuda compute capabilities."""
  if "CUDA_COMPUTE_CAPABILITIES" not in repository_ctx.os.environ:
  if _TF_CUDA_COMPUTE_CAPABILITIES not in repository_ctx.os.environ:
    return _DEFAULT_CUDA_COMPUTE_CAPABILITIES
  capabilities_str = repository_ctx.os.environ["CUDA_COMPUTE_CAPABILITIES"]
  capabilities_str = repository_ctx.os.environ[_TF_CUDA_COMPUTE_CAPABILITIES]
  capabilities = capabilities_str.split(",")
  for capability in capabilities:
    # Workaround for Skylark's lack of support for regex. This check should
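The hunk is cut off mid-comment here. For context only, a regex-free validity check of the shape that comment describes might look like the following sketch; it is illustrative, not the code from this commit:

```python
def _check_capability_sketch(capability):
  # Accept strings shaped like "3.5": exactly one dot, digits on both sides.
  parts = capability.split(".")
  if len(parts) != 2 or not parts[0].isdigit() or not parts[1].isdigit():
    fail("Invalid compute capability: %s" % capability)
```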