Merge changes from github.
PiperOrigin-RevId: 160344052
Commit: 50b999a833 (parent: d6d58a3a17)
Changed files:
- CONTRIBUTING.md, ISSUE_TEMPLATE.md, README.md, RELEASE.md, configure
- tensorflow/
  - BUILD
  - c/, cc/
  - compiler/
    - jit/
    - plugin/ (BUILD, executor/)
    - tests/
    - tf2xla/kernels/ (batch_matmul_op.cc, batchtospace_op.cc, depthwise_conv_ops.cc, diag_op.cc, dynamic_stitch_op.cc, slice_op.cc, split_op.cc, strided_slice_op.cc, tensor_array_ops.cc, unpack_op.cc)
    - xla/
      - client/ (literal_util.cc, literal_util_test.cc)
      - service/ (algebraic_simplifier.cc, algebraic_simplifier_test.cc, buffer_assignment_test.cc, buffer_liveness_test.cc, compile_only_service.h, computation_placer.cc, computation_placer.h, elemental_ir_emitter.cc, gpu/, hlo_constant_folding_test.cc, hlo_instruction.cc, hlo_instruction.h, hlo_rematerialization_test.cc, shape_inference.cc, shape_inference.h, shape_inference_test.cc, tuple_points_to_analysis_test.cc, user_computation.cc)
      - tests/ (array_elementwise_ops_test.cc, dot_operation_test.cc, fusion_test.cc, multidimensional_slice_test.cc, params_test.cc, slice_test.cc, while_test.cc)
      - util.h, xla_data.proto
  - contrib/
    - android/java/org/tensorflow/contrib/android/
    - cloud/, cmake/, data/, distributions/python/ops/, graph_editor/, keras/python/keras/, layers/, learn/python/learn/, linalg/python/ops/, lookup/, makefile/
CONTRIBUTING.md

@@ -159,7 +159,12 @@ There are two ways to run TensorFlow unit tests.

   bazel test ${flags} //tensorflow/python/...
   ```

-2. Using Docker and TensorFlow's CI scripts.
+2. Using [Docker](www.docker.com) and TensorFlow's CI scripts.
+
+   ```bash
+   # Install Docker first, then this will build and run cpu tests
+   tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/...
+   ```

   See
   [TensorFlow Builds](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/ci_build) for details.
ISSUE_TEMPLATE.md

@@ -6,6 +6,7 @@ If you open a GitHub issue, here is our policy:

1. It must be a bug or a feature request.
2. The form below must be filled out.
+3. It shouldn't be a TensorBoard issue. Those go [here](https://github.com/tensorflow/tensorboard/issues).

**Here's why we have that policy**: TensorFlow developers respond to issues. We want to focus on work that benefits the whole community, e.g., fixing bugs and adding features. Support only helps individuals. GitHub also notifies thousands of people when issues are filed. We want them to see you communicating an interesting problem, rather than being redirected to Stack Overflow.
README.md (14 changes)

@@ -34,13 +34,13 @@ and discussion.**

People who are a little more adventurous can also try our nightly binaries:

-* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
-* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
-* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
-* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
-* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc2-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc2-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/))
-* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc2-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc2-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/))
-* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
+* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
+* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
+* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
+* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
+* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/))
+* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/))
+* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/))

#### *Try your first TensorFlow program*
RELEASE.md

@@ -113,6 +113,8 @@

  checkpoints containing such RNN cells, in which case you can use the
  [checkpoint_convert script](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py)
  to convert the variable names in your old checkpoints.
+* Added `tf.contrib.kernel_methods` module with Ops and estimators for primal
+  (explicit) kernel methods in TensorFlow.

## Bug Fixes and Other Changes
* In python, `Operation.get_attr` on type attributes returns the Python DType
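The release note above describes renaming variables inside an existing checkpoint. As a rough illustration (not the checkpoint_convert.py implementation itself), a minimal sketch of such a rename using the TF 1.x checkpoint APIs might look like the following; the paths and the old/new variable names are hypothetical:

```python
# Hedged sketch of a checkpoint variable rename; the real mapping is
# computed by checkpoint_convert.py, and the names below are made up.
import tensorflow as tf

reader = tf.train.NewCheckpointReader("/tmp/old_model.ckpt")   # hypothetical path
name_map = {
    "RNN/BasicLSTMCell/Linear/Matrix": "rnn/basic_lstm_cell/kernel",  # hypothetical
}

with tf.Graph().as_default(), tf.Session() as sess:
    new_vars = []
    for old_name in reader.get_variable_to_shape_map():
        tensor = reader.get_tensor(old_name)                   # read old value
        new_name = name_map.get(old_name, old_name)            # rename if mapped
        new_vars.append(tf.Variable(tensor, name=new_name))
    sess.run(tf.global_variables_initializer())
    tf.train.Saver(new_vars).save(sess, "/tmp/new_model.ckpt") # hypothetical path
```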
configure (vendored, 37 changes)

@@ -162,8 +162,12 @@ bazel version > bazel.version

curr_bazel_version=$(head -n 1 bazel.version | cut -d ' ' -f3)
rm -f bazel.version

echo "You have bazel $curr_bazel_version installed."
-if [ "$(version "$MIN_BAZEL_VERSION")" -gt "$(version "$curr_bazel_version")" ]; then
+if [ -z "$curr_bazel_version" ]; then
+  echo "WARNING: current bazel installation is not a release version."
+  echo "Make sure you are running at least bazel $MIN_BAZEL_VERSION."
+elif [ "$(version "$MIN_BAZEL_VERSION")" -gt "$(version "$curr_bazel_version")" ]; then
  echo "Please upgrade your bazel installation to version $MIN_BAZEL_VERSION or higher to build TensorFlow!"
  echo "Exiting..."
  exit 1
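The hunk above compares dotted version strings through a `version` helper rather than comparing them as raw strings. A minimal Python sketch of why that conversion is needed (the helper's exact encoding in configure is not shown here, so this is an assumed equivalent):

```python
# Hedged sketch of dotted-version comparison, as an assumed equivalent
# of configure's version() helper.
def version_key(v):
    # "0.4.5" -> (0, 4, 5); numeric tuples compare correctly,
    # unlike raw strings where "0.10.0" < "0.4.5".
    return tuple(int(part) for part in v.split("."))

assert version_key("0.10.0") > version_key("0.4.5")
assert "0.10.0" < "0.4.5"  # plain string comparison gets this wrong
```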
@@ -535,9 +539,9 @@ done

# Set default CUDA version if not set
if [ -z "$TF_CUDA_VERSION" ]; then
  TF_CUDA_VERSION="8.0"
-export TF_CUDA_VERSION
+  export TF_CUDA_VERSION
fi
-write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION"
+write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION"

# Set up which gcc nvcc should use as the host compiler
# No need to set this on Windows
@@ -586,6 +590,9 @@ while true; do

  # Result returned from "read" will be used unexpanded. That make "~" unusable.
  # Going through one more level of expansion to handle that.
  CUDNN_INSTALL_PATH=`"${PYTHON_BIN_PATH}" -c "import os; print(os.path.realpath(os.path.expanduser('${CUDNN_INSTALL_PATH}')))"`
+  if is_windows; then
+    CUDNN_INSTALL_PATH="$(cygpath -m "$CUDNN_INSTALL_PATH")"
+  fi
fi

if [[ -z "$TF_CUDNN_VERSION" ]]; then
@@ -652,16 +659,22 @@ write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION"

# Configure the compute capabilities that TensorFlow builds for.
# Since Cuda toolkit is not backward-compatible, this is not guaranteed to work.
+function get_native_cuda_compute_capabilities {
+  device_query_bin="$CUDA_TOOLKIT_PATH/extras/demo_suite/deviceQuery" # Also works on Windows without .exe
+  "$device_query_bin" | grep 'Capability' | grep -o '[0-9]*\.[0-9]*' | sed ':a;{N;s/\n/,/};ba'
+  exit 0 # ensure that this function always exit success even if device detection fails, to prevent the whole configure from aborting
+}
while true; do
  fromuser=""
-  default_cuda_compute_capabilities="3.5,5.2"
+  native_cuda_compute_capabilities=$(get_native_cuda_compute_capabilities)
+  default_cuda_compute_capabilities=${native_cuda_compute_capabilities:-"3.5,5.2"}
  if [ -z "$TF_CUDA_COMPUTE_CAPABILITIES" ]; then
    cat << EOF
Please specify a list of comma-separated Cuda compute capabilities you want to build with.
You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus.
Please note that each additional compute capability significantly increases your build time and binary size.
EOF
-    read -p "[Default is: \"3.5,5.2\"]: " TF_CUDA_COMPUTE_CAPABILITIES
+    read -p "[Default is: \"$default_cuda_compute_capabilities\"]: " TF_CUDA_COMPUTE_CAPABILITIES
    fromuser=1
  fi
  if [ -z "$TF_CUDA_COMPUTE_CAPABILITIES" ]; then
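The grep/sed pipeline in `get_native_cuda_compute_capabilities` extracts every `major.minor` number from deviceQuery's "Capability" lines and joins them with commas. A hedged Python rendering of the same parsing (the sample deviceQuery output below is an assumption, not captured from a real run):

```python
# Hedged Python equivalent of the deviceQuery parsing pipeline above;
# the sample text mimics deviceQuery output and is an assumption.
import re

sample = """Device 0: "Tesla K40"
  CUDA Capability Major/Minor version number:    3.5
Device 1: "GTX 1080"
  CUDA Capability Major/Minor version number:    6.1"""

caps = [m for line in sample.splitlines() if "Capability" in line
        for m in re.findall(r"[0-9]+\.[0-9]+", line)]
print(",".join(caps))  # -> "3.5,6.1", the comma-separated format configure expects
```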
@@ -832,17 +845,17 @@ while true; do

  if [ -e "$MPI_HOME/include" ] && [ -e "$MPI_HOME/lib" ]; then
    break
  fi

  echo "Invalid path to the MPI Toolkit. ${MPI_HOME}/include or ${MPI_HOME}/lib cannot be found."
  if [ -z "$fromuser" ]; then
    exit 1
  fi

  # Retry
-MPI_HOME=""
+  MPI_HOME=""
done

if [ "$TF_NEED_MPI" == "1" ]; then
  write_to_bazelrc 'build --define with_mpi_support=true'
@@ -850,11 +863,11 @@ if [ "$TF_NEED_MPI" == "1" ]; then

  ln -sf "${MPI_HOME}/include/mpi.h" third_party/mpi/mpi.h

-#Determine if we use OpenMPI or MVAPICH, these require different header files
+  #Determine if we use OpenMPI or MVAPICH, these require different header files
  #to be included here to make bazel dependency checker happy

  if [ -e "${MPI_HOME}/include/mpi_portable_platform.h" ]; then
-#OpenMPI
+    #OpenMPI
    ln -sf "${MPI_HOME}/include/mpi_portable_platform.h" third_party/mpi/
    sed -i -e "s/MPI_LIB_IS_OPENMPI=False/MPI_LIB_IS_OPENMPI=True/" third_party/mpi/mpi.bzl
  else
@@ -864,7 +877,7 @@ if [ "$TF_NEED_MPI" == "1" ]; then

    sed -i -e "s/MPI_LIB_IS_OPENMPI=True/MPI_LIB_IS_OPENMPI=False/" third_party/mpi/mpi.bzl
  fi

  if [ -e "${MPI_HOME}/lib/libmpi.so" ]; then
    ln -sf "${MPI_HOME}/lib/libmpi.so" third_party/mpi/
  else
tensorflow/BUILD

@@ -208,6 +208,7 @@ filegroup(

        "//tensorflow/compiler/jit/kernels:all_files",
        "//tensorflow/compiler/jit/legacy_flags:all_files",
        "//tensorflow/compiler/jit/ops:all_files",
+        "//tensorflow/compiler/plugin/executor:all_files",
        "//tensorflow/compiler/tests:all_files",
        "//tensorflow/compiler/tf2xla:all_files",
        "//tensorflow/compiler/tf2xla/cc:all_files",
@@ -26,7 +26,7 @@ usage() {

[ $# == 0 ] && usage && exit 0

# read the options
-ARGS=`getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@"`
+ARGS=$(getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@")
eval set -- "$ARGS"

# extract options and their arguments into variables.
tensorflow/cc/BUILD

@@ -472,10 +472,23 @@ cc_binary(

    name = "tutorials_example_trainer",
    srcs = ["tutorials/example_trainer.cc"],
    copts = tf_copts(),
-    linkopts = [
-        "-lpthread",
-        "-lm",
-    ],
+    linkopts = select({
+        "//tensorflow:windows": [],
+        "//tensorflow:windows_msvc": [],
+        "//tensorflow:darwin": [
+            "-lm",
+            "-lpthread",
+        ],
+        "//tensorflow:ios": [
+            "-lm",
+            "-lpthread",
+        ],
+        "//conditions:default": [
+            "-lm",
+            "-lpthread",
+            "-lrt",
+        ],
+    }),
    deps = [
        ":cc_ops",
        "//tensorflow/core:core_cpu",
tensorflow/cc/gradients/math_grad.cc

@@ -162,6 +162,32 @@ Status Log1pGrad(const Scope& scope, const Operation& op,

}
REGISTER_GRADIENT_OP("Log1p", Log1pGrad);

+Status SinhGrad(const Scope& scope, const Operation& op,
+                const std::vector<Output>& grad_inputs,
+                std::vector<Output>* grad_outputs) {
+  // y = sinh(x)
+  // dy/dx = cosh(x)
+  auto dydx = Cosh(scope, op.input(0));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("Sinh", SinhGrad);
+
+Status CoshGrad(const Scope& scope, const Operation& op,
+                const std::vector<Output>& grad_inputs,
+                std::vector<Output>* grad_outputs) {
+  // y = cosh(x)
+  // dy/dx = sinh(x)
+  auto dydx = Sinh(scope, op.input(0));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("Cosh", CoshGrad);
+
Status TanhGrad(const Scope& scope, const Operation& op,
                const std::vector<Output>& grad_inputs,
                std::vector<Output>* grad_outputs) {
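The rule these kernels implement is the chain rule for a unary op: for y = sinh(x), the upstream gradient is scaled by cosh(x) (the conjugation only matters for complex inputs; on reals it is the identity). A minimal numpy sketch verifying that scaling against a finite difference:

```python
# Hedged numpy check of the SinhGrad rule above: dx = dy * cosh(x).
import numpy as np

x = np.array([0.0, -1.0, 1.0, 2.0])
dy = np.array([1.0, 2.0, -1.0, 0.5])   # upstream gradient
dx = dy * np.cosh(x)                   # grad(x) = grad(y) * dy/dx

# finite-difference cross-check on the scalar loss sum(dy * sinh(x))
eps = 1e-6
fd = (np.sum(dy * np.sinh(x + eps)) - np.sum(dy * np.sinh(x - eps))) / (2 * eps)
assert np.isclose(fd, dx.sum())
```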
tensorflow/cc/gradients/math_grad_test.cc

@@ -45,6 +45,8 @@ class CWiseUnaryGradTest : public ::testing::Test {

    EXPM1,
    LOG,
    LOG1P,
+    SINH,
+    COSH,
    TANH,
    SIGMOID,
    SIGN,
@@ -111,6 +113,12 @@ class CWiseUnaryGradTest : public ::testing::Test {

      case LOG1P:
        y = Log1p(scope_, x);
        break;
+      case SINH:
+        y = Sinh(scope_, x);
+        break;
+      case COSH:
+        y = Cosh(scope_, x);
+        break;
      case TANH:
        y = Tanh(scope_, x);
        break;
@@ -337,6 +345,50 @@ TEST_F(CWiseUnaryGradTest, Log1p_Complex) {

  TestCWiseGrad<complex64>(LOG1P, x_fn, dy_fn, dx_fn);
}

+TEST_F(CWiseUnaryGradTest, Sinh) {
+  auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
+  auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
+  auto dx_fn = [this](const float x, const float dy) {
+    return dy * std::cosh(x);
+  };
+  TestCWiseGrad<float>(SINH, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Sinh_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(std::cosh(x));
+  };
+  TestCWiseGrad<complex64>(SINH, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Cosh) {
+  auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
+  auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
+  auto dx_fn = [this](const float x, const float dy) {
+    return dy * std::sinh(x);
+  };
+  TestCWiseGrad<float>(COSH, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Cosh_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(std::sinh(x));
+  };
+  TestCWiseGrad<complex64>(COSH, x_fn, dy_fn, dx_fn);
+}
+
TEST_F(CWiseUnaryGradTest, Tanh) {
  auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
  auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
|
@ -46,6 +46,19 @@ Status SoftmaxGrad(const Scope& scope, const Operation& op,
|
||||
}
|
||||
REGISTER_GRADIENT_OP("Softmax", SoftmaxGrad);
|
||||
|
||||
Status LogSoftmaxGrad(const Scope& scope, const Operation& op,
|
||||
const std::vector<Output>& grad_inputs,
|
||||
std::vector<Output>* grad_outputs) {
|
||||
|
||||
auto softmax = Exp(scope, op.output(0));
|
||||
auto sum = Sum(scope, grad_inputs[0], {1}, Sum::KeepDims(true));
|
||||
auto mul = Mul(scope, sum, softmax);
|
||||
auto dx = Sub(scope, grad_inputs[0], mul);
|
||||
grad_outputs->push_back(dx);
|
||||
return scope.status();
|
||||
}
|
||||
REGISTER_GRADIENT_OP("LogSoftmax", LogSoftmaxGrad);
|
||||
|
||||
Status ReluGradHelper(const Scope& scope, const Operation& op,
|
||||
const std::vector<Output>& grad_inputs,
|
||||
std::vector<Output>* grad_outputs) {
|
||||
|
tensorflow/cc/gradients/nn_grad_test.cc

@@ -57,6 +57,19 @@ TEST_F(NNGradTest, SoftmaxGrad) {

  RunTest(x, shape, y, shape);
}

+TEST_F(NNGradTest, LogSoftmaxGrad) {
+  TensorShape shape({5, 3});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+  auto y = LogSoftmax(scope_, x);
+  // Avoid numerical instability when computing finite differences.
+  Tensor x_init_value = test::AsTensor<float>(
+      {-0.9f, -0.7f, -0.5f, -0.3f, -0.1f,
+       0.1f, 0.3f, 0.5f, 0.7f, 0.8f,
+       -0.1f, 0.1f, 0.1f, 0.1f, 1.2f},
+      {5, 3});
+  RunTest(x, x_init_value, y, shape);
+}
+
TEST_F(NNGradTest, ReluGrad) {
  TensorShape shape({5, 2});
  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
tensorflow/compiler/jit/BUILD

@@ -15,7 +15,10 @@ package_group(

)

package(
-    default_visibility = [":internal"],
+    default_visibility = [
+        ":internal",
+        "//tensorflow/compiler/plugin/executor:__pkg__",
+    ],
)

load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
@@ -2,6 +2,7 @@ licenses(["notice"])  # Apache 2.0

package(
    default_visibility = [
+        "//tensorflow/compiler/plugin/executor:__pkg__",
        "//tensorflow/compiler/tf2xla:internal",
    ],
)
tensorflow/compiler/plugin/BUILD

@@ -32,5 +32,7 @@ package(

cc_library(
    name = "plugin",
-    deps = [],
+    deps = [
+        "//tensorflow/compiler/plugin/executor:plugin_lib",
+    ],
)
tensorflow/compiler/plugin/executor/BUILD (new file, 34 lines)

@@ -0,0 +1,34 @@

licenses(["restricted"])

package(default_visibility = ["//visibility:public"])

cc_library(
    name = "plugin_lib",
    srcs = glob([
        "*.cc",
    ]),
    hdrs = glob([
        "*.h",
    ]),
    deps = [
        "//tensorflow/compiler/jit:xla_device",
        "//tensorflow/compiler/jit:xla_jit_headers_lib",
        "//tensorflow/compiler/tf2xla:xla_compiler",
        "//tensorflow/compiler/xla:xla_headers_lib",
        "//tensorflow/compiler/xla/service",
        "//third_party/eigen3",
        "@local_config_cuda//cuda:cuda_headers",
        "@protobuf//:protobuf_headers",
    ],
)

filegroup(
    name = "all_files",
    srcs = glob(
        ["**/*"],
        exclude = [
            "**/METADATA",
            "**/OWNERS",
        ],
    ),
)
tensorflow/compiler/plugin/executor/compiler.cc (new file, 122 lines)

@@ -0,0 +1,122 @@

/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <stdlib.h>
#include <fstream>

#include "tensorflow/compiler/plugin/executor/compiler.h"
#include "tensorflow/compiler/plugin/executor/executable.h"

#include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
#include "tensorflow/compiler/xla/service/flatten_call_graph.h"
#include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
#include "tensorflow/compiler/xla/service/hlo_cse.h"
#include "tensorflow/compiler/xla/service/hlo_dce.h"
#include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
#include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h"
#include "tensorflow/compiler/xla/service/inliner.h"
#include "tensorflow/compiler/xla/service/reshape_mover.h"
#include "tensorflow/compiler/xla/status_macros.h"

#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/strcat.h"

#include "tensorflow/core/lib/core/errors.h"

namespace se = ::perftools::gputools;
namespace sep = ::perftools::gputools::executorplugin;
namespace port = ::perftools::gputools::port;

namespace xla {
namespace executorplugin {

/*
 * Run optimization passes on the module. The graph is transformed by
 * each pass in the optimization pipeline. The service subdirectory
 * contains useful optimization passes.
 */
Status ExecutorCompiler::RunHloOptimization(HloModule* hlo_module) {
  HloPassPipeline pipeline("Executor");
  pipeline.AddPass<Inliner>();
  pipeline.AddPass<HloSubcomputationUnification>();
  pipeline.AddPass<HloCSE>(false);

  pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(
      false, [](const Shape&, const Shape&) { return false; });
  pipeline.AddPass<ReshapeMover>();
  pipeline.AddPass<HloConstantFolding>();
  pipeline.AddPass<HloCSE>(true);

  pipeline.AddPass<HloDCE>();
  pipeline.AddPass<FlattenCallGraph>();
  return pipeline.Run(hlo_module).status();
}

StatusOr<std::unique_ptr<Executable>> ExecutorCompiler::Compile(
    std::unique_ptr<HloModule> hlo_module,
    se::StreamExecutor* stream_exec) {
  TF_RET_CHECK(stream_exec != nullptr);

  VLOG(1) << "Generate graph " << hlo_module->name();

  TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get()));

  // Typically you would visit the HLO graph, building up a compiled equivalent
  // In this case we are using an Hlo evaluator at execution time, so we don't
  // need to compile anything

  // Create executable from only the Hlo module
  std::unique_ptr<Executable> executable;
  executable.reset(new ExecutorExecutable(std::move(hlo_module)));

  return std::move(executable);
}

StatusOr<std::vector<std::unique_ptr<Executable>>> ExecutorCompiler::Compile(
    std::vector<std::unique_ptr<HloModule>> hlo_modules,
    std::vector<se::StreamExecutor*> stream_execs) {
  return tensorflow::errors::Unimplemented(
      "Compilation of multiple HLO modules is not supported on Executor.");
}

StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
ExecutorCompiler::CompileAheadOfTime(
    std::vector<std::unique_ptr<HloModule>> hlo_modules,
    const AotCompilationOptions& aot_options) {
  return tensorflow::errors::InvalidArgument(
      "AOT compilation not supported on Executor");
}

se::Platform::Id ExecutorCompiler::PlatformId() const {
  return sep::kExecutorPlatformId;
}

HloCostAnalysis::ShapeSizeFunction
ExecutorCompiler::ShapeSizeBytesFunction() const {
  return ExecutorExecutable::ShapeSizeBytes;
}

}  // namespace executorplugin
}  // namespace xla

REGISTER_MODULE_INITIALIZER(executor_compiler, {
  xla::Compiler::RegisterCompilerFactory(sep::kExecutorPlatformId, []() {
    return xla::MakeUnique<xla::executorplugin::ExecutorCompiler>();
  });
});
tensorflow/compiler/plugin/executor/compiler.h (new file, 62 lines)

@@ -0,0 +1,62 @@

/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_EXECUTOR_COMPILER_H_
#define TENSORFLOW_COMPILER_EXECUTOR_COMPILER_H_

#include <memory>

#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"

#include "tensorflow/compiler/plugin/executor/platform_id.h"

namespace xla {
namespace executorplugin {

class ExecutorCompiler : public Compiler {
 public:
  ExecutorCompiler() {}
  ~ExecutorCompiler() override {}

  StatusOr<std::unique_ptr<Executable>> Compile(
      std::unique_ptr<HloModule> hlo_module,
      perftools::gputools::StreamExecutor* stream_exec) override;

  StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
      std::vector<std::unique_ptr<HloModule>> hlo_module,
      std::vector<perftools::gputools::StreamExecutor*> stream_exec) override;

  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
  CompileAheadOfTime(
      std::vector<std::unique_ptr<HloModule>> module,
      const AotCompilationOptions& options) override;

  HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;

  perftools::gputools::Platform::Id PlatformId() const override;

 private:
  Status RunHloOptimization(HloModule* hlo_module);

  TF_DISALLOW_COPY_AND_ASSIGN(ExecutorCompiler);
};

}  // namespace executorplugin
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_EXECUTOR_COMPILER_H_
tensorflow/compiler/plugin/executor/device.cc (new file, 60 lines)

@@ -0,0 +1,60 @@

/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/jit/kernels/xla_device_launch_op.h"
#include "tensorflow/compiler/jit/xla_device.h"
#include "tensorflow/compiler/jit/xla_device_ops.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"

namespace tensorflow {

const char* const DEVICE_XLA_EXEC = "XLA_EXEC";
const char* const DEVICE_EXEC_XLA_JIT = "XLA_EXEC_JIT";

constexpr std::array<DataType, 5> kExecAllTypes = {
    {DT_INT32, DT_FLOAT, DT_BOOL, DT_DOUBLE, DT_INT64}};

class XlaExaDeviceFactory : public DeviceFactory {
 public:
  Status CreateDevices(const SessionOptions& options, const string& name_prefix,
                       std::vector<Device*>* devices) override;
};

Status XlaExaDeviceFactory::CreateDevices(const SessionOptions& options,
                                          const string& name_prefix,
                                          std::vector<Device*>* devices) {
  static XlaDeviceOpRegistrations* registrations =
      RegisterXlaDeviceKernels(DEVICE_XLA_EXEC, DEVICE_EXEC_XLA_JIT);
  (void)registrations;

  std::unique_ptr<XlaDevice> device;
  TF_RETURN_IF_ERROR(XlaDevice::Create("Executor", DEVICE_XLA_EXEC, 0,
                                       DEVICE_EXEC_XLA_JIT, options,
                                       name_prefix, &device));
  devices->push_back(device.release());
  return Status::OK();
}

REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_EXEC, XlaExaDeviceFactory, 110);

// Kernel registrations

static bool OpFilter(KernelDef* kdef) { return true; }

REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_EXEC, XlaDeviceLaunchOp, kExecAllTypes);
REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_EXEC, kExecAllTypes);
REGISTER_XLA_BACKEND(DEVICE_EXEC_XLA_JIT, kExecAllTypes, OpFilter);

}  // namespace tensorflow
tensorflow/compiler/plugin/executor/executable.cc (new file, 147 lines)

@@ -0,0 +1,147 @@

/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/plugin/executor/executable.h"
#include "tensorflow/compiler/plugin/executor/executor.h"

#include "tensorflow/compiler/xla/service/hlo_evaluator.h"

#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/shape_util.h"

namespace se = ::perftools::gputools;
namespace sep = ::perftools::gputools::executorplugin;

namespace xla {
namespace executorplugin {

ExecutorExecutable::ExecutorExecutable(std::unique_ptr<HloModule> hlo_module)
    : Executable(std::move(hlo_module), ShapeSizeBytes) {}

ExecutorExecutable::~ExecutorExecutable() {}

static se::DeviceMemoryBase AllocateSingleOutput(sep::ExecutorExecutor* executor,
                                                 const Literal& literal) {
  int64 size(xla::ShapeUtil::ByteSizeOf(literal.shape()));
  void* buf = executor->Allocate(size);
  const void* src = literal.InternalData();
  memcpy(buf, src, size);
  return se::DeviceMemoryBase(buf, size);
}

static se::DeviceMemoryBase AllocateOutputBuffer(sep::ExecutorExecutor* executor,
                                                 const Literal& literal) {
  const Shape& shape = literal.shape();
  if (shape.element_type() != xla::TUPLE) {
    return AllocateSingleOutput(executor, literal);
  } else {
    int64 size(xla::ShapeUtil::ByteSizeOf(shape, sizeof(void*)));
    void** buf = reinterpret_cast<void**>(executor->Allocate(size));
    for (int64 n = 0; n < xla::ShapeUtil::TupleElementCount(shape); n++) {
      se::DeviceMemoryBase out =
          AllocateSingleOutput(executor, literal.tuple_literals(n));
      *buf++ = out.opaque();
    }

    return se::DeviceMemoryBase(buf, size);
  }
}

StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteOnStream(
    const ServiceExecutableRunOptions* run_options,
    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
    HloExecutionProfile* hlo_execution_profile) {
  se::Stream* stream = run_options->stream();

  VLOG(1) << "Execute " << module().name();
  if (VLOG_IS_ON(2)) {
    for (const auto& a : arguments) {
      VLOG(2) << "-- argument " << a.opaque();
    }
  }

  uint64 start_micros = tensorflow::Env::Default()->NowMicros();

  HloComputation* computation = module().entry_computation();
  if (computation->num_parameters() != arguments.size()) {
    return tensorflow::errors::Internal(
        "Mismatch between argument count and graph parameter count.");
  }

  // Create the arguments as an vector of XLA literals
  std::vector<std::unique_ptr<Literal>> arg_literals;
  std::vector<Literal*> arg_literals_ptrs;
  for (int64 p = 0; p < computation->num_parameters(); p++) {
    // Create the input literal for the parameter
    HloInstruction* param = computation->parameter_instruction(p);
    arg_literals.emplace_back(Literal::CreateFromShape(param->shape()));
    arg_literals_ptrs.push_back(arg_literals.back().get());

    // Copy in the data from the stream_executor buffers
    void* buffer = arg_literals.back().get()->MutableInternalData();
    memcpy(buffer, arguments[p].opaque(),
           ShapeUtil::ByteSizeOf(param->shape()));
  }

  // Execute the graph using the evaluator
  HloEvaluator evaluator;
  std::unique_ptr<Literal> output;
  TF_ASSIGN_OR_RETURN(output,
                      evaluator.Evaluate(computation, arg_literals_ptrs));

  // Copy the result into the return buffer
  perftools::gputools::StreamExecutor* executor(stream->parent());
  sep::ExecutorExecutor* executorExecutor(
      static_cast<sep::ExecutorExecutor*>(executor->implementation()));

  se::DeviceMemoryBase ret =
      AllocateOutputBuffer(executorExecutor, *(output.get()));

  uint64 end_micros = tensorflow::Env::Default()->NowMicros();

  {
    tensorflow::mutex_lock lock(mutex_);
    const double nanoseconds = (end_micros - start_micros) * 1000.0;
    execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
  }

  return ret;
}

StatusOr<std::unique_ptr<ShapedBuffer>> ExecutorExecutable::ExecuteOnStream(
    const ServiceExecutableRunOptions* run_options,
    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
    HloExecutionProfile* hlo_execution_profile) {
  return tensorflow::errors::Unimplemented(
      "ExecuteOnStream is not yet supported on Executor.");
}

StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteAsyncOnStream(
    const ServiceExecutableRunOptions* run_options,
    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
  return tensorflow::errors::Unimplemented(
      "ExecuteAsyncOnStream is not yet supported on Executor.");
}

/*static*/ int64 ExecutorExecutable::ShapeSizeBytes(const Shape& shape) {
  if (ShapeUtil::IsOpaque(shape)) {
    return sizeof(void*);
  }
  return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
}

}  // namespace executorplugin
}  // namespace xla
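The size rule here is worth spelling out: `ShapeSizeBytes` charges one pointer for opaque shapes and sizes everything else via `ShapeUtil::ByteSizeOf` with a pointer-size argument, which matches how `AllocateOutputBuffer` lays out a tuple as a top-level array of pointers to separately allocated element buffers. A hedged Python model of that accounting (the dtype sizes and 8-byte pointer width are assumptions for illustration):

```python
# Hedged model of the size rule ShapeSizeBytes encodes: opaque shapes
# cost one pointer; a tuple's top-level buffer costs one pointer per
# element; an array costs element_count * element_size.
POINTER_SIZE = 8                                   # assumed 64-bit target
DTYPE_BYTES = {"f32": 4, "f64": 8, "s32": 4, "s64": 8, "pred": 1}

def shape_size_bytes(shape):
    kind, payload = shape
    if kind == "opaque":
        return POINTER_SIZE
    if kind == "tuple":                            # payload: list of element shapes
        return POINTER_SIZE * len(payload)         # top-level buffer of pointers
    dtype, dims = payload                          # payload: (dtype, dimensions)
    n = 1
    for d in dims:
        n *= d
    return n * DTYPE_BYTES[dtype]

print(shape_size_bytes(("array", ("f32", [5, 3]))))                   # 60
print(shape_size_bytes(("tuple", [("array", ("f32", [5, 3]))] * 2)))  # 16
```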
tensorflow/compiler/plugin/executor/executable.h (new file, 65 lines)

@@ -0,0 +1,65 @@

/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_EXECUTABLE_H_
#define TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_EXECUTABLE_H_

#include <cstddef>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"

#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"

namespace xla {
namespace executorplugin {

class ExecutorExecutable : public Executable {
 public:
  ExecutorExecutable(std::unique_ptr<HloModule> hlo_module);
  ~ExecutorExecutable() override;

  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
      const ServiceExecutableRunOptions* run_options,
      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
          arguments,
      HloExecutionProfile* hlo_execution_profile) override;

  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
      const ServiceExecutableRunOptions* run_options,
      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
      HloExecutionProfile* hlo_execution_profile) override;

  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
          arguments) override;

  static int64 ShapeSizeBytes(const Shape& shape);

 private:
  TF_DISALLOW_COPY_AND_ASSIGN(ExecutorExecutable);
};

}  // namespace executorplugin
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_EXECUTABLE_H_
tensorflow/compiler/plugin/executor/executor.cc (new file, 135 lines)

@@ -0,0 +1,135 @@

/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/plugin/executor/executor.h"
#include "tensorflow/compiler/plugin/executor/platform_id.h"

#include "tensorflow/compiler/xla/status_macros.h"

#include <stdlib.h>
#include <string.h>

namespace se = ::perftools::gputools;

namespace perftools {
namespace gputools {
namespace executorplugin {

host::HostStream *AsExecutorStream(Stream *stream) {
  DCHECK(stream != nullptr);
  return dynamic_cast<host::HostStream *>(stream->implementation());
}

ExecutorExecutor::ExecutorExecutor(const PluginConfig &plugin_config)
    : plugin_config_(plugin_config) {}

ExecutorExecutor::~ExecutorExecutor() {}

void *ExecutorExecutor::Allocate(uint64 size) {
  void *buf = new char[size];
  return buf;
}

void *ExecutorExecutor::AllocateSubBuffer(DeviceMemoryBase *parent,
                                          uint64 offset_bytes,
                                          uint64 size_bytes) {
  return parent + offset_bytes;
}

void ExecutorExecutor::Deallocate(DeviceMemoryBase *mem) {
  if (!mem->is_sub_buffer()) {
    delete[] static_cast<char *>(mem->opaque());
  }
}

bool ExecutorExecutor::Memcpy(Stream *stream, void *host_dst,
                              const DeviceMemoryBase &dev_src, uint64 size) {
  AsExecutorStream(stream)->EnqueueTask([this, host_dst, dev_src, size]() {
    port::Status ok = SynchronousMemcpy(host_dst, dev_src, size);
  });
  return true;
}

bool ExecutorExecutor::Memcpy(Stream *stream, DeviceMemoryBase *dev_dst,
                              const void *host_src, uint64 size) {
  AsExecutorStream(stream)->EnqueueTask([this, dev_dst, host_src, size]() {
    port::Status ok = SynchronousMemcpy(dev_dst, host_src, size);
  });
  return true;
}

port::Status ExecutorExecutor::SynchronousMemcpy(DeviceMemoryBase *dev_dst,
                                                 const void *host_src,
                                                 uint64 size) {
  memcpy(dev_dst->opaque(), host_src, size);
  return port::Status::OK();
}

port::Status ExecutorExecutor::SynchronousMemcpy(void *host_dst,
                                                 const DeviceMemoryBase &dev_src,
                                                 uint64 size) {
  memcpy(host_dst, dev_src.opaque(), size);
  return port::Status::OK();
}

bool ExecutorExecutor::HostCallback(Stream *stream,
                                    std::function<void()> callback) {
  AsExecutorStream(stream)->EnqueueTask(callback);
  return true;
}

bool ExecutorExecutor::CreateStreamDependency(Stream *dependent, Stream *other) {
  AsExecutorStream(dependent)->EnqueueTask(
      [other]() { other->BlockHostUntilDone(); });
  AsExecutorStream(dependent)->BlockUntilDone();
  return true;
}

bool ExecutorExecutor::StartTimer(Stream *stream, Timer *timer) {
  dynamic_cast<host::HostTimer *>(timer->implementation())->Start(stream);
  return true;
}

bool ExecutorExecutor::StopTimer(Stream *stream, Timer *timer) {
  dynamic_cast<host::HostTimer *>(timer->implementation())->Stop(stream);
  return true;
}

bool ExecutorExecutor::BlockHostUntilDone(Stream *stream) {
  AsExecutorStream(stream)->BlockUntilDone();
  return true;
}

DeviceDescription *ExecutorExecutor::PopulateDeviceDescription() const {
  internal::DeviceDescriptionBuilder builder;

  builder.set_device_address_bits(64);

  builder.set_name("Executor");
  builder.set_device_vendor("VectorName");
  builder.set_platform_version("1.0");
  builder.set_driver_version("1.0");
  builder.set_runtime_version("1.0");
  builder.set_pci_bus_id("1");
  builder.set_device_memory_size(static_cast<uint64>(4) * 1024 * 1024 * 1024);
  builder.set_clock_rate_ghz(static_cast<float>(CLOCKS_PER_SEC) / 1e9);

  auto built = builder.Build();
  return built.release();
}

}  // namespace executorplugin
}  // namespace gputools
}  // namespace perftools
tensorflow/compiler/plugin/executor/executor.h (new file, 213 lines; the dump below is truncated)

@@ -0,0 +1,213 @@

/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Declares the ExecutorExecutor class, which is a CPU-only implementation of
// the StreamExecutor interface. For now, this is used for testing and to
// examine the performance of host-based StreamExecutor code.
#ifndef TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_EXECUTOR_H_
#define TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_EXECUTOR_H_

#include "tensorflow/stream_executor/host/host_stream.h"
#include "tensorflow/stream_executor/host/host_timer.h"

#include "tensorflow/compiler/xla/shape_util.h"

#include "tensorflow/stream_executor/blas.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/rng.h"
#include "tensorflow/stream_executor/stream_executor.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"

#include <list>
#include <mutex>

namespace perftools {
namespace gputools {
namespace executorplugin {

using Args = tensorflow::gtl::ArraySlice<DeviceMemoryBase>;

class ExecutorExecutor : public internal::StreamExecutorInterface {
 public:
  explicit ExecutorExecutor(const PluginConfig &plugin_config);
  ~ExecutorExecutor() override;

  port::Status Init(int device_ordinal, DeviceOptions device_options) override {
    return port::Status::OK();
  }

  bool GetKernel(const MultiKernelLoaderSpec &spec,
                 KernelBase *kernel) override {
    return false;
  }
  bool Launch(Stream *stream, const ThreadDim &thread_dims,
              const BlockDim &block_dims, const KernelBase &kernel,
              const KernelArgsArrayBase &args) override {
    return false;
  }

  void *Allocate(uint64 size) override;
  void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
                          uint64 size_bytes) override;
  void Deallocate(DeviceMemoryBase *mem) override;

  void *HostMemoryAllocate(uint64 size) override { return new char[size]; }
  void HostMemoryDeallocate(void *mem) override {
    delete[] static_cast<char *>(mem);
  }
  bool HostMemoryRegister(void *mem, uint64 size) override { return true; }
  bool HostMemoryUnregister(void *mem) override { return true; }

  bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &pop_src,
              uint64 size) override;
  bool Memcpy(Stream *stream, DeviceMemoryBase *pop_dst, const void *host_src,
              uint64 size) override;
  bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *pop_dst,
                            const DeviceMemoryBase &host_src,
                            uint64 size) override {
    return false;
  }

  bool MemZero(Stream *stream, DeviceMemoryBase *location,
               uint64 size) override {
    return false;
  }
  bool Memset(Stream *stream, DeviceMemoryBase *location, uint8 pattern,
              uint64 size) override {
    return false;
  }
  bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern,
                uint64 size) override {
    return false;
  }

  // No "synchronize all activity" implemented for this platform at the moment.
  bool SynchronizeAllActivity() override { return false; }
  bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override {
    return false;
  }

  bool SynchronousMemSet(DeviceMemoryBase *location, int value,
                         uint64 size) override {
    return false;
  }

  port::Status SynchronousMemcpy(DeviceMemoryBase *pop_dst,
                                 const void *host_src, uint64 size) override;
  port::Status SynchronousMemcpy(void *host_dst,
                                 const DeviceMemoryBase &pop_src,
                                 uint64 size) override;
  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *pop_dst,
                                               const DeviceMemoryBase &pop_src,
                                               uint64 size) override {
    return port::Status{port::error::UNIMPLEMENTED, ""};
  }

  bool HostCallback(Stream *stream, std::function<void()> callback) override;

  port::Status AllocateEvent(Event *event) override {
    return port::Status{port::error::UNIMPLEMENTED, ""};
  }

  port::Status DeallocateEvent(Event *event) override {
    return port::Status{port::error::UNIMPLEMENTED, ""};
  }

  port::Status RecordEvent(Stream *stream, Event *event) override {
    return port::Status{port::error::UNIMPLEMENTED, ""};
  }

  port::Status WaitForEvent(Stream *stream, Event *event) override {
    return port::Status{port::error::UNIMPLEMENTED, ""};
  }

  Event::Status PollForEventStatus(Event *event) override {
    return Event::Status::kError;
  }

  bool AllocateStream(Stream *stream) override { return true; }
  void DeallocateStream(Stream *stream) override {}
  bool CreateStreamDependency(Stream *dependent, Stream *other) override;

  bool AllocateTimer(Timer *timer) override { return true; }
  void DeallocateTimer(Timer *timer) override {}
  bool StartTimer(Stream *stream, Timer *timer) override;
  bool StopTimer(Stream *stream, Timer *timer) override;

  bool BlockHostUntilDone(Stream *stream) override;

  int PlatformDeviceCount() override { return 1; }

  bool DeviceMemoryUsage(int64 *free, int64 *total) const override {
    return false;
  }

  DeviceDescription *PopulateDeviceDescription() const override;

  port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override {
    return port::Status::OK();
  }

  bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override {
    return true;
  }

  SharedMemoryConfig GetDeviceSharedMemoryConfig() override {
    return SharedMemoryConfig::kDefault;
  }

  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override {
    return port::Status{port::error::UNIMPLEMENTED,
                        "Shared memory not supported"};
  }

  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
      override {
    return nullptr;
  }

  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
      override {
    return nullptr;
  }

  std::unique_ptr<internal::StreamInterface> GetStreamImplementation()
|
||||
override {
|
||||
return std::unique_ptr<internal::StreamInterface>(new host::HostStream());
|
||||
}
|
||||
|
||||
std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override {
|
||||
return std::unique_ptr<internal::TimerInterface>(new host::HostTimer());
|
||||
}
|
||||
|
||||
port::StatusOr<DeviceMemoryBase> ExecuteGraph(const xla::Shape &shape,
|
||||
Args args);
|
||||
|
||||
private:
|
||||
DeviceMemoryBase AllocateSingleOutput(const xla::Shape &shape);
|
||||
|
||||
port::StatusOr<DeviceMemoryBase> AllocateOutputBuffer(
|
||||
const xla::Shape &shape);
|
||||
|
||||
const PluginConfig plugin_config_;
|
||||
};
|
||||
|
||||
} // namespace executorplugin
|
||||
} // namespace gputools
|
||||
} // namespace perftools
|
||||
|
||||
#endif // TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_EXECUTOR_H_
|
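Most of the interface above is deliberately stubbed out: only host memory management and the host-backed stream and timer implementations do real work, which is all this testing-oriented plugin needs. A minimal sketch of exercising the host-memory path (not part of this commit; direct construction like this is assumed to link against the plugin's executor.cc):

```c++
#include "tensorflow/compiler/plugin/executor/executor.h"

namespace se = ::perftools::gputools;
namespace sep = ::perftools::gputools::executorplugin;

void HostMemorySmokeTest() {
  // A default-constructed PluginConfig is assumed to be sufficient here.
  sep::ExecutorExecutor exec((se::PluginConfig()));

  // HostMemoryAllocate/Deallocate are among the few methods with real
  // bodies above (new char[size] / delete[]).
  void* p = exec.HostMemoryAllocate(64);
  exec.HostMemoryDeallocate(p);
}
```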
tensorflow/compiler/plugin/executor/platform.cc (new file, 125 lines)
@ -0,0 +1,125 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/plugin/executor/platform.h"
#include "tensorflow/compiler/plugin/executor/executor.h"
#include "tensorflow/compiler/plugin/executor/platform_id.h"

#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/ptr_util.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/status_macros.h"
#include "tensorflow/stream_executor/lib/stringprintf.h"

namespace se = ::perftools::gputools;
namespace sep = ::perftools::gputools::executorplugin;

namespace perftools {
namespace gputools {
namespace executorplugin {

PLATFORM_DEFINE_ID(kExecutorPlatformId);

ExecutorPlatform::ExecutorPlatform() : name_("Executor") {}

ExecutorPlatform::~ExecutorPlatform() {}

Platform::Id ExecutorPlatform::id() const { return kExecutorPlatformId; }

int ExecutorPlatform::VisibleDeviceCount() const { return 1; }

const string& ExecutorPlatform::Name() const { return name_; }

port::StatusOr<StreamExecutor*> ExecutorPlatform::ExecutorForDevice(
    int ordinal) {
  StreamExecutorConfig config;
  config.ordinal = ordinal;
  config.plugin_config = PluginConfig();
  config.device_options = DeviceOptions::Default();
  return GetExecutor(config);
}

port::StatusOr<StreamExecutor*>
ExecutorPlatform::ExecutorForDeviceWithPluginConfig(
    int device_ordinal, const PluginConfig& plugin_config) {
  StreamExecutorConfig config;
  config.ordinal = device_ordinal;
  config.plugin_config = plugin_config;
  config.device_options = DeviceOptions::Default();
  return GetExecutor(config);
}

port::StatusOr<StreamExecutor*> ExecutorPlatform::GetExecutor(
    const StreamExecutorConfig& config) {
  mutex_lock lock(executors_mutex_);

  port::StatusOr<StreamExecutor*> status = executor_cache_.Get(config);
  if (status.ok()) {
    return status.ValueOrDie();
  }

  port::StatusOr<std::unique_ptr<StreamExecutor>> executor =
      GetUncachedExecutor(config);
  if (!executor.ok()) {
    return executor.status();
  }

  StreamExecutor* naked_executor = executor.ValueOrDie().get();
  SE_RETURN_IF_ERROR(
      executor_cache_.Insert(config, executor.ConsumeValueOrDie()));
  return naked_executor;
}

port::StatusOr<std::unique_ptr<StreamExecutor>>
ExecutorPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
  auto executor = port::MakeUnique<StreamExecutor>(
      this, port::MakeUnique<ExecutorExecutor>(config.plugin_config));
  auto init_status = executor->Init(config.ordinal, config.device_options);
  if (!init_status.ok()) {
    return port::Status{
        port::error::INTERNAL,
        port::Printf(
            "failed initializing StreamExecutor for device ordinal %d: %s",
            config.ordinal, init_status.ToString().c_str())};
  }

  return std::move(executor);
}

void ExecutorPlatform::RegisterTraceListener(
    std::unique_ptr<TraceListener> listener) {
  LOG(FATAL) << "not yet implemented: register executor trace listener";
}

void ExecutorPlatform::UnregisterTraceListener(TraceListener* listener) {
  LOG(FATAL) << "not yet implemented: unregister executor trace listener";
}

static void InitializeExecutorPlatform() {
  std::unique_ptr<se::Platform> platform(new sep::ExecutorPlatform);
  SE_CHECK_OK(se::MultiPlatformManager::RegisterPlatform(std::move(platform)));
}

} // namespace executorplugin
} // namespace gputools
} // namespace perftools

REGISTER_MODULE_INITIALIZER(executor_platform, sep::InitializeExecutorPlatform());

DECLARE_MODULE_INITIALIZER(multi_platform_manager);
// Note that module initialization sequencing is not supported in the
// open-source project, so this will be a no-op there.
REGISTER_MODULE_INITIALIZER_SEQUENCE(executor_platform, multi_platform_manager);
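Once the module initializers above have run, the platform is discoverable by name. A hedged usage sketch (not part of this commit; MultiPlatformManager::PlatformWithName and the ValueOrDie-style accessors are assumed to match this StreamExecutor snapshot):

```c++
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"

namespace se = ::perftools::gputools;

se::StreamExecutor* GetExecutorPluginDevice() {
  // "Executor" is the name_ set in ExecutorPlatform's constructor above.
  se::Platform* platform =
      se::MultiPlatformManager::PlatformWithName("Executor").ValueOrDie();
  // Repeated lookups for the same ordinal hit the ExecutorCache path in
  // GetExecutor() above.
  return platform->ExecutorForDevice(0).ValueOrDie();
}
```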
tensorflow/compiler/plugin/executor/platform.h (new file, 83 lines)
@ -0,0 +1,83 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_PLATFORM_H_
#define TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_PLATFORM_H_

#include <memory>
#include <string>
#include <vector>

#include "tensorflow/stream_executor/executor_cache.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/stream_executor/trace_listener.h"

namespace perftools {
namespace gputools {
namespace executorplugin {

class ExecutorPlatform : public Platform {
 public:
  ExecutorPlatform();
  ~ExecutorPlatform() override;

  Platform::Id id() const override;

  // Device count is less clear-cut for CPUs than accelerators. This call
  // currently returns the number of thread units in the host, as reported by
  // base::NumCPUs().
  int VisibleDeviceCount() const override;

  const string& Name() const override;

  port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override;

  port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig(
      int ordinal, const PluginConfig& config) override;

  port::StatusOr<StreamExecutor*> GetExecutor(
      const StreamExecutorConfig& config) override;

  port::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor(
      const StreamExecutorConfig& config) override;

  void RegisterTraceListener(std::unique_ptr<TraceListener> listener) override;

  void UnregisterTraceListener(TraceListener* listener) override;

 private:
  // This platform's name.
  string name_;

  // mutex that guards the ordinal-to-executor map.
  mutable mutex executors_mutex_;

  // Cache of created StreamExecutors.
  ExecutorCache executor_cache_;

  SE_DISALLOW_COPY_AND_ASSIGN(ExecutorPlatform);
};

} // namespace executorplugin
} // namespace gputools
} // namespace perftools

#endif // TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_PLATFORM_H_
tensorflow/compiler/plugin/executor/platform_id.h (new file, 31 lines)
@ -0,0 +1,31 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_PLATFORM_ID_H_
#define TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_PLATFORM_ID_H_

#include "tensorflow/stream_executor/platform.h"

namespace perftools {
namespace gputools {
namespace executorplugin {

extern const Platform::Id kExecutorPlatformId;

} // namespace executorplugin
} // namespace gputools
} // namespace perftools

#endif // TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_PLATFORM_ID_H_
tensorflow/compiler/plugin/executor/transfer_manager.cc (new file, 187 lines)
@ -0,0 +1,187 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/plugin/executor/transfer_manager.h"
#include "tensorflow/compiler/plugin/executor/platform_id.h"

#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

#include <string>
#include <utility>
#include <vector>

namespace sep = ::perftools::gputools::executorplugin;

namespace xla {
namespace executorplugin {

ExecutorTransferManager::ExecutorTransferManager() {}

se::Platform::Id ExecutorTransferManager::PlatformId() const {
  return se::executorplugin::kExecutorPlatformId;
}

Status ExecutorTransferManager::TransferLiteralFromDevice(
    se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
    const Shape& device_shape, const Shape& literal_shape, Literal* literal) {
  TF_RET_CHECK(ShapeUtil::Compatible(device_shape, literal_shape));

  // Tuples are a special case and contain one or more shapes inside of them to
  // an arbitrary nesting depth.
  if (device_shape.element_type() == TUPLE) {
    *literal->mutable_shape() = literal_shape;
    TF_ASSIGN_OR_RETURN(
        std::vector<se::DeviceMemoryBase> element_buffers,
        ShallowCopyTupleFromDevice(executor, source, device_shape));
    TF_RET_CHECK(element_buffers.size() ==
                 ShapeUtil::TupleElementCount(device_shape));
    for (int64 i = 0; i < element_buffers.size(); ++i) {
      const Shape& element_device_shape = device_shape.tuple_shapes(i);
      const Shape& element_literal_shape = literal_shape.tuple_shapes(i);
      Literal* element_literal = literal->add_tuple_literals();
      // Recursively call TransferFromDevice to copy over the data in the
      // element array.
      TF_RETURN_IF_ERROR(TransferLiteralFromDevice(
          executor, element_buffers[i], element_device_shape,
          element_literal_shape, element_literal));
    }
    return Status::OK();
  }

  *literal->mutable_shape() = device_shape;
  literal->Reserve(ShapeUtil::ElementsIn(device_shape));
  TF_RETURN_IF_ERROR(TransferBufferFromDevice(
      executor, source, ShapeUtil::ByteSizeOf(device_shape),
      literal->MutableInternalData()));
  if (!ShapeUtil::Equal(literal_shape, device_shape)) {
    literal->Swap(
        literal->Relayout(literal_shape.layout()).get());
  }
  TF_RET_CHECK(ShapeUtil::Equal(literal_shape, literal->shape()));
  return Status::OK();
}

StatusOr<std::vector<se::DeviceMemoryBase>>
ExecutorTransferManager::ShallowCopyTupleFromDevice(
    se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
    const Shape& shape) {
  TF_RET_CHECK(ShapeUtil::IsTuple(shape));

  std::vector<void*> element_pointers(ShapeUtil::TupleElementCount(shape),
                                      nullptr);
  int64 tuple_size = ShapeUtil::ByteSizeOf(shape, sizeof(void*));
  auto copy_status = executor->SynchronousMemcpyD2H(source, tuple_size,
                                                    element_pointers.data());
  if (!copy_status.ok()) {
    return AddStatus(
        Status(static_cast<tensorflow::error::Code>(copy_status.code()),
               copy_status.error_message()),
        "failed transfer of tuple buffer " + ShapeUtil::HumanString(shape));
  }

  // Create a DeviceMemoryBase from each void* pointer.
  std::vector<se::DeviceMemoryBase> destination;
  for (int i = 0; i < element_pointers.size(); ++i) {
    if (element_pointers[i] == nullptr &&
        !ShapeUtil::HasZeroElements(shape.tuple_shapes(i))) {
      return FailedPrecondition("tuple contains nullptr at element %d", i);
    }
    int64 buffer_size =
        ShapeUtil::ByteSizeOf(shape.tuple_shapes(i), sizeof(void*));
    destination.emplace_back(element_pointers[i], buffer_size);
  }
  return std::move(destination);
}

Status ExecutorTransferManager::TransferLiteralToDevice(
    se::StreamExecutor* executor, const Literal& literal,
    se::DeviceMemoryBase* destination) {
  const Shape& shape = literal.shape();

  if (ShapeUtil::IsTuple(literal.shape())) {
    std::vector<void*> tuple_elements_on_device;
    for (const Literal& tuple_element : literal.tuple_literals()) {
      se::DeviceMemoryBase allocation = executor->AllocateArray<uint8>(
          GetByteSizeRequirement(tuple_element.shape()));
      TF_RETURN_IF_ERROR(
          TransferLiteralToDevice(executor, tuple_element, &allocation));
      tuple_elements_on_device.push_back(allocation.opaque());
    }
    return TransferBufferToDevice(
        executor, tuple_elements_on_device.size() * sizeof(void*),
        tuple_elements_on_device.data(), destination);
  }

  return TransferBufferToDevice(executor, GetByteSizeRequirement(shape),
                                literal.InternalData(),
                                destination);
}

Status ExecutorTransferManager::TransferLiteralToInfeed(
    se::StreamExecutor* executor, const Literal& literal) {
  const Shape& shape = literal.shape();
  VLOG(1) << "transferring literal shape to infeed: "
          << ShapeUtil::HumanString(shape);

  return Status::OK();
}

Status ExecutorTransferManager::TransferBufferToInfeed(
    se::StreamExecutor* executor, int64 size, const void* source) {
  return Unimplemented("Transfer to Infeed");
}

Status ExecutorTransferManager::TransferLiteralFromOutfeed(
    perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
    Literal* literal) {
  const Shape& shape = literal->shape();
  VLOG(1) << "transferring literal shape from outfeed: "
          << ShapeUtil::HumanString(shape);

  return Status::OK();
}

Status ExecutorTransferManager::ResetDevices(
    tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
        executors) {
  return Unimplemented("Device reset not supported");
}

int64 ExecutorTransferManager::GetByteSizeRequirement(const Shape& shape) {
  return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
}

} // namespace executorplugin
} // namespace xla

static std::unique_ptr<xla::TransferManager> CreateExecutorTransferManager() {
  return xla::MakeUnique<xla::executorplugin::ExecutorTransferManager>();
}

static bool InitModule() {
  xla::TransferManager::RegisterTransferManager(sep::kExecutorPlatformId,
                                                &CreateExecutorTransferManager);
  return true;
}
static bool module_initialized = InitModule();
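The tuple handling above relies on a flat array-of-pointers layout: a tuple buffer holds one opaque pointer per element, which is why GetByteSizeRequirement passes sizeof(void*) to ShapeUtil::ByteSizeOf. A small illustrative sketch of what that convention implies:

```c++
#include "tensorflow/compiler/xla/shape_util.h"

// For a tuple of two f32[4] arrays, the top-level buffer holds just the two
// element pointers; the 16 bytes of each f32[4] live in separate buffers.
void TupleSizeExample() {
  xla::Shape elem = xla::ShapeUtil::MakeShape(xla::F32, {4});
  xla::Shape tuple = xla::ShapeUtil::MakeTupleShape({elem, elem});
  // Top-level requirement: 2 * sizeof(void*); element requirement: 16 bytes.
  int64_t tuple_bytes = xla::ShapeUtil::ByteSizeOf(tuple, sizeof(void*));
  int64_t elem_bytes = xla::ShapeUtil::ByteSizeOf(elem, sizeof(void*));
  (void)tuple_bytes;
  (void)elem_bytes;
}
```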
tensorflow/compiler/plugin/executor/transfer_manager.h (new file, 77 lines)
@ -0,0 +1,77 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_TRANSFER_MANAGER_H_
#define TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_TRANSFER_MANAGER_H_

#include "tensorflow/compiler/xla/service/transfer_manager.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/types.h"

#include <vector>

namespace se = ::perftools::gputools;

namespace xla {
namespace executorplugin {

class ExecutorTransferManager : public TransferManager {
 public:
  ExecutorTransferManager();

  ~ExecutorTransferManager() override {}

  se::Platform::Id PlatformId() const override;

  StatusOr<std::vector<se::DeviceMemoryBase>> ShallowCopyTupleFromDevice(
      se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
      const Shape& shape) override;

  Status TransferLiteralFromDevice(se::StreamExecutor* executor,
                                   const se::DeviceMemoryBase& source,
                                   const Shape& device_shape,
                                   const Shape& literal_shape,
                                   Literal* literal) override;

  Status TransferLiteralToDevice(se::StreamExecutor* executor,
                                 const Literal& literal,
                                 se::DeviceMemoryBase* destination) override;

  Status TransferLiteralToInfeed(se::StreamExecutor* executor,
                                 const Literal& literal) override;

  Status TransferBufferToInfeed(se::StreamExecutor* executor,
                                int64 size, const void* source) override;

  Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
                                    const Shape& literal_shape,
                                    Literal* literal) override;

  Status ResetDevices(
      tensorflow::gtl::ArraySlice<se::StreamExecutor*> executors) override;

  int64 GetByteSizeRequirement(const Shape& shape) override;

 private:
  TF_DISALLOW_COPY_AND_ASSIGN(ExecutorTransferManager);
};

} // namespace executorplugin
} // namespace xla

#endif // TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_TRANSFER_MANAGER_H_
@ -175,6 +175,11 @@ tf_xla_py_test(
    name = "slice_ops_test",
    size = "small",
    srcs = ["slice_ops_test.py"],
    # TODO(b/62962492): Test fails with assertion error.
    tags = [
        "manual",
        "notap",
    ],
    deps = [
        ":xla_test",
        "//tensorflow/python:array_ops",
@ -456,6 +461,11 @@ cuda_py_test(
        "//tensorflow/python:math_ops",
        "//tensorflow/python:nn_ops",
    ],
    # TODO(b/62961789): Test fails with SIGABRT
    tags = [
        "manual",
        "notap",
    ],
)

cc_library(
@ -524,8 +534,12 @@ cuda_py_test(
# --dump_graph_dir, and the config file was written by hand.
#
# Run the following to build a minimal benchmark of the computation on Android:
# $ bazel build -c opt --config=android_arm \
#   third_party/tensorflow/compiler/tests:lstm_layer_inference_benchmark
# $ bazel build -c opt --cxxopt='-std=c++11' --linkopt='-lm' \
#   --cpu=armeabi-v7a \
#   --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
#   --crosstool_top=//external:android/crosstool \
#   //tensorflow/compiler/tests:lstm_layer_inference_benchmark

#
# Currently the resulting binary size is ~190KB
tf_library(
@ -218,7 +218,7 @@ class FtrlOptimizerTest(XLATestCase):
      self.assertAllClose(np.array([-0.24059935, -0.46829352]), var0.eval())
      self.assertAllClose(np.array([-0.02406147, -0.04830509]), var1.eval())

  # When variables are intialized with Zero, FTRL-Proximal has two properties:
  # When variables are initialized with Zero, FTRL-Proximal has two properties:
  # 1. Without L1&L2 but with fixed learning rate, FTRL-Proximal is identical
  # with GradientDescent.
  # 2. Without L1&L2 but with adaptive learning rate, FTRL-Proximal is idential
@ -94,12 +94,14 @@ class BatchMatMulOp : public XlaOpKernel {
      // Slice off individual matrices and reshape to 2D tensors.
      auto x_slice = builder->Slice(
          x_flat, {i, 0, 0},
          {i + 1, x_shape.dim_size(ndims - 2), x_shape.dim_size(ndims - 1)});
          {i + 1, x_shape.dim_size(ndims - 2), x_shape.dim_size(ndims - 1)},
          {1, 1, 1});
      x_slice = builder->Reshape(
          x_slice, {x_shape.dim_size(ndims - 2), x_shape.dim_size(ndims - 1)});
      auto y_slice = builder->Slice(
          y_flat, {i, 0, 0},
          {i + 1, y_shape.dim_size(ndims - 2), y_shape.dim_size(ndims - 1)});
          {i + 1, y_shape.dim_size(ndims - 2), y_shape.dim_size(ndims - 1)},
          {1, 1, 1});
      y_slice = builder->Reshape(
          y_slice, {y_shape.dim_size(ndims - 2), y_shape.dim_size(ndims - 1)});
@ -125,6 +125,7 @@ void BatchToSpace(XlaOpKernelContext* ctx,
  // input_shape[M+1], ..., input_shape[N-1]]
  std::vector<int64> start_indices(input_rank, 0);
  std::vector<int64> end_indices = reshaped_permuted_shape;
  std::vector<int64> strides(input_rank, 1);
  for (int i = 0; i < block_rank; ++i) {
    int64 crop_start = crops.Get<int64>({i, 0});
    int64 crop_end = crops.Get<int64>({i, 1});
@ -139,7 +140,7 @@ void BatchToSpace(XlaOpKernelContext* ctx,
        " end: ", crop_end, " size ", reshaped_permuted_shape[1 + i]));
  }
  xla::ComputationDataHandle output =
      b->Slice(reshaped_permuted, start_indices, end_indices);
      b->Slice(reshaped_permuted, start_indices, end_indices, strides);
  ctx->SetOutput(0, output);
}
@ -172,15 +172,14 @@ class DepthwiseConv2dNativeOp : public XlaOpKernel {
    } else {
      // These will be used to define the bounds of each slice.
      // Within the loop, the input_channel index will be modified.
      gtl::InlinedVector<int64, 4> filter_begin;
      gtl::InlinedVector<int64, 4> filter_limits;
      gtl::InlinedVector<int64, 4> input_begin;
      gtl::InlinedVector<int64, 4> input_limits;
      gtl::InlinedVector<int64, 4> filter_begin(4, 0);
      gtl::InlinedVector<int64, 4> filter_limits(4);
      gtl::InlinedVector<int64, 4> input_begin(4, 0);
      gtl::InlinedVector<int64, 4> input_limits(4);
      gtl::InlinedVector<int64, 4> strides(4, 1);
      for (int i = 0; i < 4; ++i) {
        filter_begin.push_back(0);
        filter_limits.push_back(filter_shape.dim_size(i));
        input_begin.push_back(0);
        input_limits.push_back(input_shape.dim_size(i));
        filter_limits[i] = filter_shape.dim_size(i);
        input_limits[i] = input_shape.dim_size(i);
      }

      std::vector<int64> strides_for_tla{strides_[1], strides_[2]};
@ -209,9 +208,9 @@ class DepthwiseConv2dNativeOp : public XlaOpKernel {
        input_limits[3] = i + 1;

        xla::ComputationDataHandle filter_slice =
            b.Slice(filter, filter_begin, filter_limits);
            b.Slice(filter, filter_begin, filter_limits, strides);
        xla::ComputationDataHandle input_slice =
            b.Slice(input, input_begin, input_limits);
            b.Slice(input, input_begin, input_limits, strides);
        convs.push_back(b.ConvWithGeneralDimensions(
            input_slice, filter_slice, strides_for_tla, xla_padding, dims));
      }
@ -125,7 +125,7 @@ class DiagPartOp : public XlaOpKernel {
    diag = builder->Reshape(diag, {new_size, new_size + 1});

    // Slices out the first column and reshapes to the final shape.
    diag = builder->Slice(diag, {0, 0}, {new_size, 1});
    diag = builder->Slice(diag, {0, 0}, {new_size, 1}, {1, 1});
    diag = builder->Reshape(diag, new_dims);

    ctx->SetOutput(0, diag);
@ -224,8 +224,9 @@ class MatrixDiagPartOp : public XlaOpKernel {
    } else if (actual_size > target_size) {
      std::vector<int64> start(flattened_dims.size(), 0);
      std::vector<int64> limits(flattened_dims.begin(), flattened_dims.end());
      std::vector<int64> strides(flattened_dims.size(), 1);
      limits[flattened_dims.size() - 1] = target_size;
      diag = builder->Slice(diag, start, limits);
      diag = builder->Slice(diag, start, limits, strides);
    }

    // Reshape so the target values are in the first position of the last
@ -238,8 +239,9 @@ class MatrixDiagPartOp : public XlaOpKernel {
    // Slices out the first column and reshapes to the final shape.
    std::vector<int64> start(dims.size(), 0);
    std::vector<int64> limits(dims.begin(), dims.end());
    std::vector<int64> strides(dims.size(), 1);
    limits[last_dim] = 1;
    diag = builder->Slice(diag, start, limits);
    diag = builder->Slice(diag, start, limits, strides);

    // Collapses away the last dimension.
    dims.pop_back();
@ -156,6 +156,8 @@ class DynamicStitchOp : public XlaOpKernel {
                             indices0_shape.dims());
    std::vector<int64> slice_limit(1 + data0_shape.dims() -
                                   indices0_shape.dims());
    std::vector<int64> stride(1 + data0_shape.dims() -
                              indices0_shape.dims(), 1);
    for (int d = indices0_shape.dims(); d < data0_shape.dims(); d++) {
      slice_limit[1 + d - indices0_shape.dims()] = data0_shape.dim_size(d);
    }
@ -168,7 +170,7 @@ class DynamicStitchOp : public XlaOpKernel {
      // And place it in the concat list in the place indicated by
      // the index.
      to_concat[index_num] =
          ctx->builder()->Slice(expression, slice_start, slice_limit);
          ctx->builder()->Slice(expression, slice_start, slice_limit, stride);
    }

    ctx->SetOutput(0, ctx->builder()->ConcatInDim(to_concat, 0));
@ -54,7 +54,9 @@ class SliceOp : public XlaOpKernel {
      for (int i = 0; i < begin.size(); ++i) {
        limits.push_back(begin[i] + size[i]);
      }
      ctx->SetOutput(0, ctx->builder()->Slice(ctx->Input(0), begin, limits));
      std::vector<int64> strides(begin.size(), 1);
      ctx->SetOutput(0, ctx->builder()->Slice(ctx->Input(0), begin, limits,
                                              strides));
    }

  private:
@ -77,14 +77,14 @@ class SplitOp : public XlaOpKernel {

    // The vectors we will use to define the slice. The entry for the
    // split dimensions varies for each output.
    std::vector<int64> begin;
    std::vector<int64> limits;
    std::vector<int64> begin(input_shape.dims(), 0);
    std::vector<int64> limits(input_shape.dims());
    std::vector<int64> strides(input_shape.dims(), 1);
    for (int i = 0; i < input_shape.dims(); ++i) {
      // Initially set up the limits to be the full size of the input:
      // the split dimension is filled in below.
      int64 dim = input_shape.dim_size(i);
      begin.push_back(0);
      limits.push_back(dim);
      limits[i] = dim;
    }

    auto input = ctx->Input(1);
@ -94,7 +94,7 @@ class SplitOp : public XlaOpKernel {
      // Slice out the ith split from the split dimension.
      begin[split_dim] = i * slice_size;
      limits[split_dim] = (i + 1) * slice_size;
      ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits));
      ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits, strides));
    }
  }
};
@ -188,7 +188,7 @@ class SplitVOp : public XlaOpKernel {
    std::vector<int64> begin(input_shape.dims(), 0);
    auto dim_sizes = input_shape.dim_sizes();
    std::vector<int64> limits(dim_sizes.begin(), dim_sizes.end());

    std::vector<int64> strides(input_shape.dims(), 1);
    for (int i = 0; i < num_split; ++i) {
      TensorShape output_shape(input_shape);
      int slice_size = split_sizes_vec[i];
@ -196,7 +196,7 @@ class SplitVOp : public XlaOpKernel {

      // Slice out the ith split from the split dimension.
      limits[split_dim] = begin[split_dim] + slice_size;
      ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits));
      ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits, strides));
      begin[split_dim] = limits[split_dim];
    }
  }
@ -72,55 +72,29 @@ class StridedSliceOp : public XlaOpKernel {
                    &dummy, &dummy, &dummy, &begin, &end, &strides));

    gtl::InlinedVector<int64, 4> dimensions_to_reverse;
    gtl::InlinedVector<int64, 4> slice_begin, slice_end;
    bool simple_strides = true;
    gtl::InlinedVector<int64, 4> slice_begin, slice_end, slice_strides;

    for (int i = 0; i < begin.size(); ++i) {
      simple_strides &= (std::abs(strides[i]) == 1);
      if (strides[i] > 0) {
        slice_begin.push_back(begin[i]);
        slice_end.push_back(end[i]);
        slice_strides.push_back(strides[i]);
      } else {
        // Negative stride: swap begin and end, add 1 because the interval
        // is semi-open, and mark the dimension to be reversed.
        slice_begin.push_back(end[i] + 1);
        slice_end.push_back(begin[i] + 1);
        slice_begin.push_back(input_shape.dim_size(i) - begin[i] - 1);
        slice_end.push_back(input_shape.dim_size(i) - end[i] - 1);
        slice_strides.push_back(-strides[i]);
        dimensions_to_reverse.push_back(i);
      }
    }
    xla::ComputationDataHandle slice =
        ctx->builder()->Slice(ctx->Input(0), slice_begin, slice_end);

    xla::ComputationDataHandle slice = ctx->Input(0);
    if (!dimensions_to_reverse.empty()) {
      slice = ctx->builder()->Rev(slice, dimensions_to_reverse);
    }

    // If at least one of the strides is > 1 (or < -1) then use Slice
    // to pull out each of the strided slices, and Concat to put them
    // together again.
    if (!simple_strides) {
      // Re-adjust the begin and end now that the periphery has been
      // sliced away.
      for (int d = 0; d < strides.size(); ++d) {
        slice_end[d] -= slice_begin[d];
        slice_begin[d] = 0;
      }

      for (int d = 0; d < strides.size(); ++d) {
        int64 stride = std::abs(strides[d]);
        if (stride > 1) {
          std::vector<xla::ComputationDataHandle> to_concat;
          int64 end = slice_end[d];
          for (int64 i = 0; i < end; i += stride) {
            slice_begin[d] = i;
            slice_end[d] = i + 1;
            to_concat.push_back(
                ctx->builder()->Slice(slice, slice_begin, slice_end));
          }
          slice = ctx->builder()->ConcatInDim(to_concat, d);
          slice_begin[d] = 0;
          slice_end[d] = to_concat.size();
        }
      }
    }
    slice = ctx->builder()->Slice(slice, slice_begin, slice_end, slice_strides);

    slice = ctx->builder()->Reshape(slice, final_shape.dim_sizes());
    ctx->SetOutput(0, slice);
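The reverse-then-forward-slice rewrite above is easiest to verify with numbers. A worked example (values chosen for illustration):

```c++
// dim_size = 8, begin = 6, end = 0, stride = -2 selects elements 6, 4, 2.
// After Rev, reversed[j] == input[7 - j], and the rewritten bounds are:
//   slice_begin  = 8 - 6 - 1 = 1
//   slice_end    = 8 - 0 - 1 = 7
//   slice_stride = 2
// The forward slice then visits reversed positions 1, 3, 5, i.e. the
// original elements 6, 4, 2, as required.
```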
@ -318,7 +318,7 @@ class TensorArrayGatherOp : public XlaOpKernel {
    for (int i = 0; i < num_indices; ++i) {
      // Slices the i-th index out of `indices`, and pads it with zeros in the
      // minor dimensions to form an index into the TensorArray storage.
      auto index = b->Slice(indices, {i}, {i + 1});
      auto index = b->Slice(indices, {i}, {i + 1}, {1});

      // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
      auto start_indices = PadIndexWithZeros(b, index, ta_shape.dims() - 1);
@ -381,16 +381,18 @@ class TensorArrayScatterOp : public XlaOpKernel {
    std::vector<int64> value_starts(value_shape.dims(), 0);
    auto value_ends = value_shape.dim_sizes();

    std::vector<int64> value_strides(value_shape.dims(), 1);

    // For every (index, value) pair, update the corresponding TensorArray
    // storage.
    for (int i = 0; i < num_indices; ++i) {
      // Slice out part of the value.
      value_starts[0] = i;
      value_ends[0] = i + 1;
      auto slice = b->Slice(value, value_starts, value_ends);
      auto slice = b->Slice(value, value_starts, value_ends, value_strides);

      // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
      auto index = b->Slice(indices, {i}, {i + 1});
      auto index = b->Slice(indices, {i}, {i + 1}, {1});
      auto start_indices = PadIndexWithZeros(b, index, elem_shape.dims());
      ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices);
    }
@ -66,6 +66,7 @@ class UnpackOp : public XlaOpKernel {

    std::vector<int64> start_indices(input_shape.dims(), 0);
    std::vector<int64> limit_indices(input_shape.dims());
    std::vector<int64> strides(input_shape.dims(), 1);
    for (int i = 0; i < input_shape.dims(); ++i) {
      limit_indices[i] = input_shape.dim_size(i);
    }
@ -73,7 +74,8 @@ class UnpackOp : public XlaOpKernel {
    for (int i = 0; i < num; ++i) {
      start_indices[axis] = i;
      limit_indices[axis] = i + 1;
      auto slice = ctx->builder()->Slice(input, start_indices, limit_indices);
      auto slice = ctx->builder()->Slice(input, start_indices, limit_indices,
                                         strides);
      // Reshape to drop the 'axis' dimension.
      auto result = ctx->builder()->Reshape(slice, output_shape.dim_sizes());
      ctx->SetOutput(i, result);
@ -256,7 +256,8 @@ void ComputationBuilder::CheckSameShape(const ComputationDataHandle& lhs,
ComputationDataHandle ComputationBuilder::Slice(
    const ComputationDataHandle& operand,
    tensorflow::gtl::ArraySlice<int64> start_indices,
    tensorflow::gtl::ArraySlice<int64> limit_indices) {
    tensorflow::gtl::ArraySlice<int64> limit_indices,
    tensorflow::gtl::ArraySlice<int64> stride) {
  if (!first_error_.ok() || !PrepareComputation().ok()) {
    return ComputationDataHandle();
  }
@ -269,6 +270,9 @@ ComputationDataHandle ComputationBuilder::Slice(
  for (int64 index : limit_indices) {
    request.add_limit_indices(index);
  }
  for (int64 index : stride) {
    request.add_stride(index);
  }
  OpRequest op_request;
  *op_request.mutable_computation() = computation_.handle();
  *op_request.mutable_slice_request() = request;
@ -211,9 +211,11 @@ class ComputationBuilder {
  //
  // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D
  // range notation.
  // The stride parameter determines the stride over the slice
  ComputationDataHandle Slice(const ComputationDataHandle& operand,
                              tensorflow::gtl::ArraySlice<int64> start_indices,
                              tensorflow::gtl::ArraySlice<int64> limit_indices);
                              tensorflow::gtl::ArraySlice<int64> limit_indices,
                              tensorflow::gtl::ArraySlice<int64> stride);

  // Enqueues a slice operation onto the computation that slices the 'operand'
  // from dynamic start indices which are passed in 'start_indices'.
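To make the new parameter concrete, a hedged example against this API (the builder setup is assumed; `client` stands in for an existing Client*):

```c++
xla::ComputationBuilder builder(client, "strided_slice_example");
auto v = builder.ConstantR1<float>({0, 1, 2, 3, 4, 5, 6, 7});
// Selects start, start + stride, ... while < limit: here elements 0, 2, 4, 6.
auto every_other = builder.Slice(v, /*start_indices=*/{0},
                                 /*limit_indices=*/{8}, /*stride=*/{2});
// Passing a stride of 1 in every dimension reproduces the old behavior.
```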
@ -1215,11 +1215,7 @@ void Literal::Resize<double>(int64 num_elements, double value) {
template <>
void Literal::Resize<half>(int64 num_elements, half value) {
  CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
  mutable_f16s()->resize(num_elements * sizeof(half));
  auto data = GetMutableArraySlice<half>();
  for (int i = 0; i < num_elements; i++) {
    data[i] = value;
  }
  mutable_f16s()->resize(num_elements, value);
}

template <typename RepeatedFieldT, typename NativeT>
@ -1262,7 +1258,7 @@ LiteralProto Literal::ToProto() const {
    case F16:
      *proto.mutable_f16s() =
          string(reinterpret_cast<const char*>(f16s_.data()),
                 f16s_.size() / sizeof(half));
                 f16s_.size() * sizeof(half));
      break;
    case F32:
      CopyToRepeatedField(proto.mutable_f32s(), f32s());
@ -1318,7 +1314,7 @@ void Literal::CopyFromProto(const LiteralProto& literal_proto) {
      const string& s(literal_proto.f16s());
      CHECK_EQ(0, s.size() % sizeof(half));
      f16s_ = std::vector<half>(s.size() / sizeof(half));
      memcpy(f16s_.data(), s.data(), s.size() / sizeof(half));
      memcpy(f16s_.data(), s.data(), s.size());
      break;
    }
    case F32:
@ -949,5 +949,62 @@ TEST_F(LiteralUtilTest, CopyFromProto_Bool) {
  }
}

// Note that f16 is currently stored in a byte array in little endian byte order
TEST_F(LiteralUtilTest, ToProto_f16) {
  half h1(1.0f);
  half h2(2.0f);

  auto m = Literal::CreateR2<half>({{h1, h2}, {h2, h1}});
  Literal* l = m.get();
  EXPECT_EQ(4, ShapeUtil::ElementsIn(l->shape()));
  EXPECT_EQ(4, l->f16s().size());
  EXPECT_EQ(4, l->f16s_size());

  LiteralProto p = l->ToProto();
  EXPECT_EQ(4, ShapeUtil::ElementsIn(p.shape()));
  EXPECT_EQ(8, p.f16s().size());
  const char* d = p.f16s().data();
  EXPECT_EQ(d[0], 0);
  EXPECT_EQ(d[1], 0x3C);
  EXPECT_EQ(d[2], 0);
  EXPECT_EQ(d[3], 0x40);
  EXPECT_EQ(d[4], 0);
  EXPECT_EQ(d[5], 0x40);
  EXPECT_EQ(d[6], 0);
  EXPECT_EQ(d[7], 0x3C);
}

// Note that f16 is currently stored in a byte array in little endian byte order
TEST_F(LiteralUtilTest, CopyFromProto_f16) {
  half h1(1.0f);
  half h2(2.0f);

  const char half_vals[8] = {
    0x00, 0x3C, 0x00, 0x40, 0x00, 0x40, 0x00, 0x3C
  };
  LiteralProto p;
  p.mutable_shape()->set_element_type(F16);
  p.mutable_shape()->clear_dimensions();
  p.mutable_shape()->add_dimensions(4);
  p.clear_f16s();
  p.set_f16s(half_vals, 8);

  Literal literal(p);
  ASSERT_EQ(4, literal.f16s_size());
  ASSERT_EQ(h1, literal.f16s(0));
  ASSERT_EQ(h2, literal.f16s(1));
  ASSERT_EQ(h2, literal.f16s(2));
  ASSERT_EQ(h1, literal.f16s(3));

  const std::vector<half>& r = literal.f16s();
  ASSERT_EQ(4, r.size());
  ASSERT_EQ(h1, r[0]);
  ASSERT_EQ(h2, r[1]);
  ASSERT_EQ(h2, r[2]);
  ASSERT_EQ(h1, r[3]);
}

} // namespace
} // namespace xla
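The byte constants in these tests fall straight out of the IEEE binary16 encoding (1 sign bit, 5 exponent bits with bias 15, 10 fraction bits), serialized little-endian:

```
1.0 = 0 01111 0000000000 = 0x3C00  ->  bytes 0x00, 0x3C
2.0 = 0 10000 0000000000 = 0x4000  ->  bytes 0x00, 0x40
```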
@ -855,6 +855,7 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
  // Second, construct the slice instruction to perform the negative padding.
  std::vector<int64> start_indices;
  std::vector<int64> end_indices;
  std::vector<int64> strides;
  for (int64 i = 0; i < pad->padding_config().dimensions_size(); ++i) {
    const PaddingConfig::PaddingConfigDimension& padding_dimension =
        pad->padding_config().dimensions(i);
@ -868,16 +869,18 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
    }
    start_indices.push_back(start);
    end_indices.push_back(end);
    strides.push_back(1);
  }

  // Verify that the slice shape matches the pad shape.
  TF_ASSIGN_OR_RETURN(Shape inferred_slice_shape,
                      ShapeInference::InferSliceShape(
                          nonzero_pad_shape, start_indices, end_indices));
                          nonzero_pad_shape, start_indices, end_indices,
                          strides));
  TF_RET_CHECK(ShapeUtil::Compatible(inferred_slice_shape, pad->shape()));

  std::unique_ptr<HloInstruction> slice = HloInstruction::CreateSlice(
      pad->shape(), nonzero_pad, start_indices, end_indices);
      pad->shape(), nonzero_pad, start_indices, end_indices, strides);
  return ReplaceWithNewInstruction(pad, std::move(slice));
}
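A quick numeric check of the rewrite above (shapes assumed for illustration):

```
operand:      f32[10], padding low = -2, high = 3
nonzero pad:  negative amounts clamped to 0  ->  f32[10 + 3] = f32[13]
slice:        start = 2, end = 13, stride = 1  ->  f32[11]
pad->shape(): f32[10 - 2 + 3] = f32[11]   (the TF_RET_CHECK above holds)
```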
@ -520,7 +520,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
      HloInstruction::CreateConstant(Literal::CreateR1<float>({})));
  HloInstruction* empty_slice =
      builder.AddInstruction(HloInstruction::CreateSlice(
          ShapeUtil::MakeShape(F32, {0}), param1, {42}, {42}));
          ShapeUtil::MakeShape(F32, {0}), param1, {42}, {42}, {1}));
  Shape result_shape = ShapeUtil::MakeShape(F32, {3 * kParamLength});
  builder.AddInstruction(HloInstruction::CreateConcatenate(
      result_shape, {empty_literal, param0, param0, empty_slice, param1}, 0));
@ -551,7 +551,7 @@ TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) {
      HloInstruction::CreateConstant(Literal::CreateR1<float>({})));
  HloInstruction* empty_slice =
      builder.AddInstruction(HloInstruction::CreateSlice(
          ShapeUtil::MakeShape(F32, {0}), param0, {42}, {42}));
          ShapeUtil::MakeShape(F32, {0}), param0, {42}, {42}, {1}));
  Shape result_shape = ShapeUtil::MakeShape(F32, {0});
  builder.AddInstruction(HloInstruction::CreateConcatenate(
      result_shape, {empty_literal, empty_slice}, 0));
@ -1132,7 +1132,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) {
      0, ShapeUtil::MakeShape(F32, {dim0, dim1}), "param"));
  builder.AddInstruction(HloInstruction::CreateSlice(
      ShapeUtil::MakeShape(F32, {dim0, dim1}), param, /*start_indices=*/{0, 0},
      /*limit_indices=*/{dim0, dim1}));
      /*limit_indices=*/{dim0, dim1}, /*slices=*/{1, 1}));

  HloModule module(TestName());
  HloComputation* computation = module.AddEntryComputation(builder.Build());
@ -1537,7 +1537,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {

  Shape slice_shape = ShapeUtil::MakeShape(F32, {2, 2, 3, 3});
  HloInstruction* slice = builder.AddInstruction(HloInstruction::CreateSlice(
      slice_shape, broadcast, {0, 1, 2, 3}, {2, 3, 5, 6}));
      slice_shape, broadcast, {0, 1, 2, 3}, {2, 3, 5, 6}, {1, 1, 1, 1}));

  HloModule module(TestName());
  auto computation = module.AddEntryComputation(builder.Build());
@ -731,7 +731,7 @@ TEST_F(BufferAssignmentTest, ReuseNonOperandBuffer) {
  auto negate = builder.AddInstruction(
      HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0));
  auto slice = builder.AddInstruction(
      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}));
      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1}));
  auto broadcast = builder.AddInstruction(
      HloInstruction::CreateBroadcast(f32a100x10_, slice, {1}));

@ -763,7 +763,7 @@ TEST_F(BufferAssignmentTest, NoReuseLiveBuffer) {
  auto negate = builder.AddInstruction(
      HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0));
  auto slice = builder.AddInstruction(
      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}));
      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1}));
  auto broadcast = builder.AddInstruction(
      HloInstruction::CreateBroadcast(f32a100x10_, slice, {1}));
  builder.AddInstruction(HloInstruction::CreateTuple({negate, broadcast}));
@ -800,7 +800,7 @@ TEST_F(BufferAssignmentTest, NoReuseAliasedBuffer) {
  auto tuple_element = builder.AddInstruction(
      HloInstruction::CreateGetTupleElement(f32vec100_, tuple, 0));
  auto slice = builder.AddInstruction(
      HloInstruction::CreateSlice(f32vec10_, tuple_element, {0}, {10}));
      HloInstruction::CreateSlice(f32vec10_, tuple_element, {0}, {10}, {1}));
  auto broadcast = builder.AddInstruction(
      HloInstruction::CreateBroadcast(f32a100x10_, slice, {1}));
  builder.AddInstruction(HloInstruction::CreateTuple({tuple, broadcast}));
@ -835,7 +835,7 @@ TEST_F(BufferAssignmentTest, DoNotReuseOversizedOutputBuffer) {
      HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0));
  // Slice output is 10 elements.
  auto slice = builder.AddInstruction(
      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}));
      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1}));
  // Broadcast output is 40 elements.
  auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast(
      ShapeUtil::MakeShape(F32, {10, 4}), slice, {0}));
@ -867,7 +867,7 @@ TEST_F(BufferAssignmentTest, ReuseOutputBufferIfExactlySized) {
  auto negate = builder.AddInstruction(
      HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0));
  auto slice = builder.AddInstruction(
      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}));
      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1}));
  // Broadcast output is 40 elements.
  auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast(
      ShapeUtil::MakeShape(F32, {10, 10}), slice, {0}));
@ -904,7 +904,7 @@ TEST_F(BufferAssignmentTest, DoNotReuseOversizedOutputBufferInTuple) {
      HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0));
  // Slice output is 10 elements.
  auto slice = builder.AddInstruction(
      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}));
      HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1}));
  // Broadcast output is 40 elements.
  auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast(
      ShapeUtil::MakeShape(F32, {10, 4}), slice, {0}));
@ -588,7 +588,7 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
    if (update_uses_tuple_element1) {
      // Create a slice instruction as an additional user of 'gte1'.
      slice = builder.AddInstruction(
          HloInstruction::CreateSlice(update_shape, gte1, {0}, {3}));
          HloInstruction::CreateSlice(update_shape, gte1, {0}, {3}, {1}));
      update = builder.AddInstruction(HloInstruction::CreateBinary(
          update_shape, HloOpcode::kAdd, update, slice));
    }
@ -55,7 +55,7 @@ class CompileOnlyService : public Service {

  // Override Service methods that require or imply the existence of an
  // execute backend. Note that this does not include TransferToClient, as
  // computing contants produces global data that we may wish to transfer.
  // computing constants produces global data that we may wish to transfer.
  tensorflow::Status Execute(const ExecuteRequest* arg,
                             ExecuteResponse* result) override {
    return Unimplemented("CompileOnlyService does not support execution.");
@ -49,17 +49,18 @@ Status DeviceAssignment::Serialize(DeviceAssignmentProto* proto) const {
  return Status::OK();
}

/* static */ StatusOr<DeviceAssignment> DeviceAssignment::Deserialize(
    const DeviceAssignmentProto& proto) {
/* static */ StatusOr<std::unique_ptr<DeviceAssignment>>
DeviceAssignment::Deserialize(const DeviceAssignmentProto& proto) {
  TF_RET_CHECK(proto.computation_devices_size() == proto.computation_count());
  DeviceAssignment assignment(proto.replica_count(), proto.computation_count());
  auto assignment = MakeUnique<DeviceAssignment>(proto.replica_count(),
                                                 proto.computation_count());
  for (int computation = 0; computation < proto.computation_count();
       ++computation) {
    const auto& computation_device = proto.computation_devices(computation);
    TF_RET_CHECK(computation_device.replica_device_ids_size() ==
                 proto.replica_count());
    for (int replica = 0; replica < proto.replica_count(); ++replica) {
      assignment(replica, computation) =
      (*assignment)(replica, computation) =
          computation_device.replica_device_ids(replica);
    }
  }
@ -49,7 +49,11 @@ class DeviceAssignment : public Array2D<int> {

  // Protocol buffer serialization and deserialization.
  Status Serialize(DeviceAssignmentProto* proto) const;
  static StatusOr<DeviceAssignment> Deserialize(

  // Return a std::unique_ptr<DeviceAssignment> instead of a DeviceAssignment
  // directly because one of the supported TF platforms (mac) does not compile
  // due to a StatusOr of an incomplete type (DeviceAssignment).
  static StatusOr<std::unique_ptr<DeviceAssignment>> Deserialize(
      const DeviceAssignmentProto& proto);
};
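Call sites now receive a pointer and dereference it. A minimal sketch of the new round trip (`original` is assumed to be an existing DeviceAssignment with valid contents):

```c++
xla::DeviceAssignmentProto proto;
TF_CHECK_OK(original.Serialize(&proto));

auto assignment =
    xla::DeviceAssignment::Deserialize(proto).ConsumeValueOrDie();
// Note the extra dereference compared with the old by-value API.
int device_id = (*assignment)(/*replica=*/0, /*computation=*/0);
```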
@ -949,9 +949,20 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
          const IrArray::Index& index) -> StatusOr<llvm::Value*> {
        IrArray::Index sliced_index(index.size());
        for (int i = 0; i < index.size(); ++i) {
          sliced_index[i] = ir_builder_->CreateAdd(
              index[i], llvm::ConstantInt::get(index[i]->getType(),
                                               hlo->slice_starts(i)));
          int64 stride = hlo->slice_stride(i);
          if (stride != 1) {
            sliced_index[i] = ir_builder_->CreateAdd(
                ir_builder_->CreateMul(
                    index[i], llvm::ConstantInt::get(index[i]->getType(),
                                                     stride)),
                llvm::ConstantInt::get(index[i]->getType(),
                                       hlo->slice_starts(i)));
          } else {
            sliced_index[i] = ir_builder_->CreateAdd(
                index[i],
                llvm::ConstantInt::get(index[i]->getType(),
                                       hlo->slice_starts(i)));
          }
        }
        return operand_to_generator.at(hlo->operand(0))(sliced_index);
      };
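In other words, for each dimension d the emitted address computation is the affine map

$$\mathrm{sliced\_index}[d] = \mathrm{index}[d] \cdot \mathrm{slice\_stride}(d) + \mathrm{slice\_starts}(d)$$

with the multiply elided when the stride is 1, preserving the previous codegen for unit-stride slices.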
@ -80,6 +80,7 @@ HloInstruction* MaybePaddedAndSlicedInput(
    std::vector<int64> start_indices(input->shape().dimensions_size(), 0);
    std::vector<int64> limit_indices(input->shape().dimensions().begin(),
                                     input->shape().dimensions().end());
    std::vector<int64> strides(input->shape().dimensions_size(), 1);
    for (size_t i = 0; i < conv_dnums.spatial_dimensions().size(); ++i) {
      int64 dim = conv_dnums.spatial_dimensions(i);
      // If dimension "dim" has negative padding, increase the start index or
@ -92,9 +93,9 @@ HloInstruction* MaybePaddedAndSlicedInput(

    input = computation->AddInstruction(HloInstruction::CreateSlice(
        ShapeInference::InferSliceShape(input->shape(), start_indices,
                                        limit_indices)
                                        limit_indices, strides)
            .ConsumeValueOrDie(),
        input, start_indices, limit_indices));
        input, start_indices, limit_indices, strides));
  }

  return input;
@ -354,6 +355,8 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution(
  std::vector<int64> limit_indices(
      new_backward_conv->shape().dimensions().begin(),
      new_backward_conv->shape().dimensions().end());
  std::vector<int64> strides(new_backward_conv->shape().dimensions_size(),
                             1LL);
  for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) {
    int64 padding_low = backward_conv->window().dimensions(i).padding_low();
    int64 padding_high = backward_conv->window().dimensions(i).padding_high();
@ -373,13 +376,13 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution(
  // Replace the old backward convolution with the slice.
  CHECK(ShapeUtil::Compatible(
      ShapeInference::InferSliceShape(new_backward_conv->shape(), start_indices,
                                      limit_indices)
                                      limit_indices, strides)
          .ConsumeValueOrDie(),
      backward_conv->shape()));
  TF_CHECK_OK(computation->ReplaceWithNewInstruction(
      backward_conv,
      HloInstruction::CreateSlice(backward_conv->shape(), new_backward_conv,
                                  start_indices, limit_indices)));
                                  start_indices, limit_indices, strides)));
  return true;
}
@ -147,6 +147,7 @@ TEST_F(HloConstantFoldingTest, Slice) {
  const int64 dimensions[] = {11, 8, 7, 5, 9};
  const int64 slice_start[] = {4, 2, 3, 1, 5};
  const int64 slice_limits[] = {10, 8, 6, 5, 9};
  const int64 slice_strides[] = {1, 1, 1, 1, 1};
  TF_ASSIGN_OR_ASSERT_OK(auto literal,
                         LiteralTestUtil::CreateRandomLiteral<F32>(
                             ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0));
@ -154,7 +155,7 @@ TEST_F(HloConstantFoldingTest, Slice) {
      HloInstruction::CreateConstant(std::move(literal)));
  Shape shape = ShapeUtil::MakeShape(F32, {6, 6, 3, 4, 4});
  builder.AddInstruction(HloInstruction::CreateSlice(
      shape, literal_instruction, slice_start, slice_limits));
      shape, literal_instruction, slice_start, slice_limits, slice_strides));
  auto module = CreateNewModule();
  auto computation = module->AddEntryComputation(builder.Build());
@ -306,11 +306,13 @@ HloInstruction::CreateCrossReplicaSum(const Shape& shape,
/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSlice(
    const Shape& shape, HloInstruction* operand,
    tensorflow::gtl::ArraySlice<int64> start_indices,
    tensorflow::gtl::ArraySlice<int64> limit_indices) {
    tensorflow::gtl::ArraySlice<int64> limit_indices,
    tensorflow::gtl::ArraySlice<int64> strides) {
  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kSlice, shape));
  instruction->AppendOperand(operand);
  instruction->slice_starts_.assign(start_indices.begin(), start_indices.end());
  instruction->slice_limits_.assign(limit_indices.begin(), limit_indices.end());
  instruction->slice_strides_.assign(strides.begin(), strides.end());
  return instruction;
}
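With the new parameter, building a strided slice looks like the following sketch (test-style setup assumed; the output shape {5} follows from ceil((10 - 0) / 2)):

```cpp
// Hedged example: slice every second element of a length-10 vector.
HloComputation::Builder builder("strided_slice_example");
auto param = builder.AddInstruction(HloInstruction::CreateParameter(
    0, ShapeUtil::MakeShape(F32, {10}), "p"));
builder.AddInstruction(HloInstruction::CreateSlice(
    ShapeUtil::MakeShape(F32, {5}), param,
    /*start_indices=*/{0}, /*limit_indices=*/{10}, /*strides=*/{2}));
```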
@ -852,7 +854,8 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
      return CreateReshape(shape, new_operands[0]);
    case HloOpcode::kSlice:
      CHECK_EQ(new_operands.size(), 1);
      return CreateSlice(shape, new_operands[0], slice_starts_, slice_limits_);
      return CreateSlice(shape, new_operands[0], slice_starts_, slice_limits_,
                         slice_strides_);
    case HloOpcode::kDynamicSlice:
      return CreateDynamicSlice(shape, new_operands[0], new_operands[1],
                                dynamic_slice_sizes_);
@ -1672,6 +1675,8 @@ string HloInstruction::ToCategory() const {
    case FusionKind::kConvBackwardFilter:
    case FusionKind::kConvBackwardInput:
      return "convolution fusion";
    case FusionKind::kCustom:
      return "custom fusion";
  }
}
@ -2339,6 +2344,8 @@ string ToString(HloInstruction::FusionKind kind) {
      return "kConvBackwardFilter";
    case HloInstruction::FusionKind::kConvBackwardInput:
      return "kConvBackwardInput";
    case HloInstruction::FusionKind::kCustom:
      return "kCustom";
  }
}
@ -63,6 +63,9 @@ class HloInstruction {
    kTransposeDot,        // Fused into a dot with transposed operands.
    kConvBackwardFilter,  // Fused into a backward filter convolution.
    kConvBackwardInput,   // Fused into a backward input convolution.

    kCustom,              // Custom category for backend-specific fusions that
                          // do not match any of the more specific ones.
  };

  ~HloInstruction();
@ -174,7 +177,8 @@ class HloInstruction {
  static std::unique_ptr<HloInstruction> CreateSlice(
      const Shape& shape, HloInstruction* operand,
      tensorflow::gtl::ArraySlice<int64> start_indices,
      tensorflow::gtl::ArraySlice<int64> limit_indices);
      tensorflow::gtl::ArraySlice<int64> limit_indices,
      tensorflow::gtl::ArraySlice<int64> strides);

  // Creates a slice instruction, where the first operand is sliced by
  // start indices specified in the second operand, and by size specified in
@ -662,6 +666,15 @@ class HloInstruction {
    return slice_limits_;
  }

  // Returns the stride in the given dimension for a slice node.
  //
  // Precondition: opcode() == HloOpcode::kSlice
  int64 slice_stride(int64 dimension) const {
    CHECK_EQ(HloOpcode::kSlice, opcode_);
    return slice_strides_[dimension];
  }
  const std::vector<int64>& slice_strides() const { return slice_strides_; }

  // Returns the size of the slice in the given dimension for a dynamic
  // slice node.
  //
@ -907,6 +920,7 @@ class HloInstruction {
  // Describes the [begin, end) index range for a slice.
  std::vector<int64> slice_starts_;
  std::vector<int64> slice_limits_;
  std::vector<int64> slice_strides_;

  // The bit sizes for a reduce-precision operation.
  int32 exponent_bits_;
@ -67,7 +67,8 @@ class HloRematerializationTest : public HloTestBase {
                                       /*dimension=*/0));
    auto slice_1 = builder.AddInstruction(HloInstruction::CreateSlice(
        vec1_shape_, concat_1, /*start_indices=*/{0},
        /*limit_indices=*/{1}));
        /*limit_indices=*/{1},
        /*strides=*/{1}));
    auto concat_2 = builder.AddInstruction(HloInstruction::CreateConcatenate(
        ShapeUtil::MakeShape(xla::F32, {1025}), {bcast, slice_1},
        /*dimension=*/0));
@ -75,7 +76,8 @@ class HloRematerializationTest : public HloTestBase {
    // which is necessary to use this computation in a while.
    builder.AddInstruction(HloInstruction::CreateSlice(vec1_shape_, concat_2,
                                                       /*start_indices=*/{0},
                                                       /*limit_indices=*/{1}));
                                                       /*limit_indices=*/{1},
                                                       /*strides=*/{1}));
    return builder.Build();
  }
@ -103,7 +105,8 @@ class HloRematerializationTest : public HloTestBase {
        HloInstruction::CreateBroadcast(vec1024_shape_, param, {}));
    auto slice_1 = builder.AddInstruction(
        HloInstruction::CreateSlice(vec1_shape_, bcast, /*start_indices=*/{0},
                                    /*limit_indices=*/{1}));
                                    /*limit_indices=*/{1},
                                    /*strides=*/{1}));
    auto while_inst = builder.AddInstruction(HloInstruction::CreateWhile(
        vec1_shape_, while_cond, while_body, slice_1));
    auto concat = builder.AddInstruction(HloInstruction::CreateConcatenate(
@ -111,7 +114,8 @@ class HloRematerializationTest : public HloTestBase {
                                         /*dimension=*/0));
    builder.AddInstruction(HloInstruction::CreateSlice(vec1_shape_, concat,
                                                       /*start_indices=*/{0},
                                                       /*limit_indices=*/{1}));
                                                       /*limit_indices=*/{1},
                                                       /*strides=*/{1}));
    return builder.Build();
  }
@ -353,7 +357,7 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
        /*dimension=*/0));
    builder.AddInstruction(HloInstruction::CreateSlice(
        vec1024_shape_, concat, /*start_indices=*/{0},
        /*limit_indices=*/{1024}));
        /*limit_indices=*/{1024}, /*strides=*/{1}));
    subcomputation = module->AddEmbeddedComputation(builder.Build());
  }
@ -469,7 +473,7 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
        /*dimension=*/0));
    builder.AddInstruction(HloInstruction::CreateSlice(
        vec1024_shape_, concat, /*start_indices=*/{0},
        /*limit_indices=*/{1024}));
        /*limit_indices=*/{1024}, /*strides=*/{1}));
    subcomputation = module->AddEmbeddedComputation(builder.Build());
  }
@ -1135,7 +1135,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(

/* static */ StatusOr<Shape> ShapeInference::InferSliceShape(
    const Shape& arg, tensorflow::gtl::ArraySlice<int64> starts,
    tensorflow::gtl::ArraySlice<int64> limits) {
    tensorflow::gtl::ArraySlice<int64> limits,
    tensorflow::gtl::ArraySlice<int64> strides) {
  TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice"));
  VLOG(2) << tensorflow::strings::Printf(
      "slicing shape %s starts={%s} limits={%s}",
@ -1158,13 +1159,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
  for (int64 dimension = 0; dimension < starts.size(); ++dimension) {
    int64 start_index = starts[dimension];
    int64 limit_index = limits[dimension];
    int64 stride = strides[dimension];
    if (start_index < 0) {
      return InvalidArgument("negative start index to slice: %lld",
                             start_index);
    }
    if (limit_index < 0) {
      return InvalidArgument("negative limit index to slice: %lld",
                             limit_index);
    if (stride == 0) {
      return InvalidArgument("Zero stride");
    }
    if (limit_index > arg.dimensions(dimension)) {
      return InvalidArgument(
@ -1172,18 +1173,21 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
          "size (%lld)",
          limit_index, arg.dimensions(dimension));
    }
    if (start_index > limit_index) {
      return InvalidArgument(
          "limit index (%lld) must be greater or equal to "
          "start index (%lld) in slice",
          limit_index, start_index);
    }
    VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension,
                                           start_index);
    VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension,
                                           limit_index);

    sizes.push_back(limits[dimension] - starts[dimension]);
    if (stride > 0) {
      if (start_index > limit_index) {
        return InvalidArgument(
            "limit index (%lld) must be greater or equal to "
            "start index (%lld) in slice with positive stride",
            limit_index, start_index);
      }
      sizes.push_back((limit_index - start_index + stride - 1) / stride);
    } else {
      return InvalidArgument("Negative strides not supported");
    }
  }

  return ShapeUtil::MakeShape(arg.element_type(), sizes);
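The pushed-back size is the ceiling of the index range over the stride; restated:

$$\text{size}_d = \left\lceil \frac{\text{limit}_d - \text{start}_d}{\text{stride}_d} \right\rceil = \left\lfloor \frac{\text{limit}_d - \text{start}_d + \text{stride}_d - 1}{\text{stride}_d} \right\rfloor \quad (\text{stride}_d > 0).$$

For example, starts {15, 0}, limits {20, 13}, strides {2, 4} give ceil(5/2) = 3 and ceil(13/4) = 4, which is exactly the {3, 4} shape asserted by InferSliceShapeRank2WithStridesNotIntegral further down.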
@ -116,7 +116,8 @@ class ShapeInference {
  // e.g. slice f32[32x32] 0:16 0:16 -> f32[16x16]
  static StatusOr<Shape> InferSliceShape(
      const Shape& arg, tensorflow::gtl::ArraySlice<int64> starts,
      tensorflow::gtl::ArraySlice<int64> limits);
      tensorflow::gtl::ArraySlice<int64> limits,
      tensorflow::gtl::ArraySlice<int64> strides);

  // Infers the shape produced by a dynamic slice operation of size specified
  // in 'slice_sizes', with dynamic start indices shape 'start_indices_shape'.
@ -682,16 +682,43 @@ TEST_F(ReduceShapeInferenceTest, ErrorElementTypeVsApplyType) {
TEST_F(ShapeInferenceTest, InferSliceShapeRank2) {
  Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
  auto inferred_status =
      ShapeInference::InferSliceShape(matrix_shape, {32, 0}, {64, 64});
      ShapeInference::InferSliceShape(matrix_shape, {32, 0}, {64, 64}, {1, 1});
  ASSERT_IS_OK(inferred_status.status());
  Shape inferred = inferred_status.ValueOrDie();
  ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {32, 64}), inferred));
}

TEST_F(ShapeInferenceTest, InferSliceShapeRank2WithStrides) {
  Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
  auto inferred_status =
      ShapeInference::InferSliceShape(matrix_shape, {32, 0}, {64, 64}, {2, 4});
  ASSERT_IS_OK(inferred_status.status());
  Shape inferred = inferred_status.ValueOrDie();
  ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {16, 16}), inferred));
}

TEST_F(ShapeInferenceTest, InferSliceShapeRank2WithStridesNotIntegral) {
  Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
  auto inferred_status =
      ShapeInference::InferSliceShape(matrix_shape, {15, 0}, {20, 13}, {2, 4});
  ASSERT_IS_OK(inferred_status.status());
  Shape inferred = inferred_status.ValueOrDie();
  ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {3, 4}), inferred));
}

TEST_F(ShapeInferenceTest, InferInvalidStride) {
  Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
  auto inferred_status =
      ShapeInference::InferSliceShape(matrix_shape, {127, 0}, {129, 2}, {0, 1});
  ASSERT_FALSE(inferred_status.ok());
  ASSERT_EQ(tensorflow::error::INVALID_ARGUMENT,
            inferred_status.status().code());
}

TEST_F(ShapeInferenceTest, InferOobSliceShapeRank2) {
  Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
  auto inferred_status =
      ShapeInference::InferSliceShape(matrix_shape, {127, 0}, {129, 2});
      ShapeInference::InferSliceShape(matrix_shape, {127, 0}, {129, 2}, {1, 1});
  ASSERT_FALSE(inferred_status.ok());
  ASSERT_EQ(tensorflow::error::INVALID_ARGUMENT,
            inferred_status.status().code());
@ -700,7 +727,7 @@ TEST_F(ShapeInferenceTest, InferOobSliceShapeRank2) {
TEST_F(ShapeInferenceTest, InferSliceShapeRank1) {
  Shape vector_shape = ShapeUtil::MakeShape(F32, {17});
  auto inferred_status =
      ShapeInference::InferSliceShape(vector_shape, {2}, {4});
      ShapeInference::InferSliceShape(vector_shape, {2}, {4}, {1});
  ASSERT_TRUE(inferred_status.ok());
  Shape inferred = inferred_status.ValueOrDie();
  ASSERT_TRUE(ShapeUtil::Equal(inferred, ShapeUtil::MakeShape(F32, {2})));
@ -584,7 +584,7 @@ class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest {
    if (add_additional_gte0_user) {
      // Create 'slice' as an additional user of 'input'.
      auto slice = builder.AddInstruction(
          HloInstruction::CreateSlice(update_shape, input, {0}, {3}));
          HloInstruction::CreateSlice(update_shape, input, {0}, {3}, {1}));
      // Modify 'update' to take 'slice' output.
      update = builder.AddInstruction(HloInstruction::CreateBinary(
          update_shape, HloOpcode::kAdd, update, slice));
@ -744,7 +744,8 @@ StatusOr<ComputationDataHandle> UserComputation::AddSliceInstruction(
      Shape new_shape,
      ShapeInference::InferSliceShape(
          operand->output_shape(), AsInt64Slice(slice_request.start_indices()),
          AsInt64Slice(slice_request.limit_indices())));
          AsInt64Slice(slice_request.limit_indices()),
          AsInt64Slice(slice_request.stride())));

  ComputationDataHandle handle = CreateComputationDataHandle();

@ -2393,7 +2394,8 @@ void ComputationLowerer::Visit(
      hlo_instruction = add_instruction(HloInstruction::CreateSlice(
          request.output_shape(), operand,
          AsInt64Slice(slice_request.start_indices()),
          AsInt64Slice(slice_request.limit_indices())));
          AsInt64Slice(slice_request.limit_indices()),
          AsInt64Slice(slice_request.stride())));
      break;
    }
@ -1853,7 +1853,7 @@ TEST_F(ArrayElementwiseOpTest, ImplictBroadcastInFusedExpressions) {

  auto x = builder.Parameter(0, x_literal->shape(), "x");
  auto y = builder.Parameter(1, y_literal->shape(), "y");
  auto slice = builder.Slice(x, {1}, {2});
  auto slice = builder.Slice(x, {1}, {2}, {1});
  builder.Sub(slice, y);

  ComputeAndCompareR1<float>(&builder, {-2, -3}, {x_data.get(), y_data.get()},
@ -365,9 +365,9 @@ XLA_TEST_F(DotOperationTest, BatchMatMul) {
  std::vector<xla::ComputationDataHandle> out_slices;
  for (int i = 0; i < 4; ++i) {
    // Slice off individual matrices and reshape to 2D tensors.
    auto x_slice = builder.Slice(x_flat, {i, 0, 0}, {i + 1, 2, 2});
    auto x_slice = builder.Slice(x_flat, {i, 0, 0}, {i + 1, 2, 2}, {1, 1, 1});
    x_slice = builder.Reshape(x_slice, {0, 1, 2}, {2, 2});
    auto y_slice = builder.Slice(y_flat, {i, 0, 0}, {i + 1, 2, 2});
    auto y_slice = builder.Slice(y_flat, {i, 0, 0}, {i + 1, 2, 2}, {1, 1, 1});
    y_slice = builder.Reshape(y_slice, {0, 1, 2}, {2, 2});

    auto out = builder.Dot(x_slice, y_slice);
@ -210,7 +210,7 @@ XLA_TEST_F(FusionTest, Test) {
      HloInstruction::CreateTernary(ShapeUtil::MakeShape(F32, {2, 3}),
                                    HloOpcode::kSelect, const10, add8, const9));
  auto slice12 = builder.AddInstruction(HloInstruction::CreateSlice(
      ShapeUtil::MakeShape(F32, {2, 1}), select11, {0, 1}, {2, 2}));
      ShapeUtil::MakeShape(F32, {2, 1}), select11, {0, 1}, {2, 2}, {1, 1}));
  // CreateFusionInstruction needs the `instructions_to_fuse` argument in
  // reverse topological order, so the first element in `instructions_to_fuse`
  // must be the root.
@ -36,7 +36,7 @@ XLA_TEST_F(SliceTest, Slice2D) {
  ComputationBuilder builder(client_, "slice_2d");
  auto original = builder.ConstantR2<float>(
      {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}});
  builder.Slice(original, {2, 1}, {4, 3});
  builder.Slice(original, {2, 1}, {4, 3}, {1, 1});

  Array2D<float> expected({{8.0f, 9.0f}, {11.0f, 12.0f}});
  ComputeAndCompareR2<float>(&builder, expected, {}, ErrorSpec(0.000001));
@ -47,7 +47,7 @@ XLA_TEST_F(SliceTest, Slice3D) {
  Array3D<float> array_3d(
      {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}});
  auto original = builder.ConstantR3FromArray3D<float>(array_3d);
  builder.Slice(original, {0, 0, 1}, {2, 1, 2});
  builder.Slice(original, {0, 0, 1}, {2, 1, 2}, {1, 1, 1});

  Array3D<float> expected_3d({{{2.0f}}, {{6.0f}}});
  ComputeAndCompareR3<float>(&builder, expected_3d, {}, ErrorSpec(0.000001));
@ -325,7 +325,7 @@ XLA_TEST_F(ParamsTest, R2_2x2_TryToPassReverseLayoutToParameter) {
  ComputationBuilder builder(client_, TestName());
  auto input = builder.Parameter(0, original, "input");
  // Use the slice operator to get an off-diagonal element.
  builder.Slice(input, {0, 1}, {1, 2});
  builder.Slice(input, {0, 1}, {1, 2}, {1, 1});

  std::unique_ptr<GlobalData> data =
      client_->TransferToServer(*literal).ConsumeValueOrDie();
@ -44,7 +44,7 @@ class SliceTest : public ClientLibraryTestBase {

    ComputationBuilder builder(client_, TestName());
    auto original = builder.ConstantR1<NativeT>(constant);
    builder.Slice(original, {2}, {4});
    builder.Slice(original, {2}, {4}, {1});

    const std::vector<NativeT> expected = {static_cast<NativeT>(2),
                                           static_cast<NativeT>(3)};
@ -55,7 +55,7 @@ class SliceTest : public ClientLibraryTestBase {
XLA_TEST_F(SliceTest, SliceZeroToZeroF32) {
  ComputationBuilder builder(client_, TestName());
  auto original = builder.ConstantR1<float>({});
  builder.Slice(original, {0}, {0});
  builder.Slice(original, {0}, {0}, {1});

  ComputeAndCompareR1<float>(&builder, {}, {});
}
@ -64,7 +64,7 @@ XLA_TEST_F(SliceTest, SliceTenToZeroF32) {
  ComputationBuilder builder(client_, TestName());
  std::vector<float> constant(10, 0.3);
  auto original = builder.ConstantR1<float>(constant);
  builder.Slice(original, {7}, {7});
  builder.Slice(original, {7}, {7}, {1});

  ComputeAndCompareR1<float>(&builder, {}, {});
}
@ -87,7 +87,7 @@ TEST_F(SliceTest, SliceTenToTen) {

  ComputationBuilder builder(client_, TestName());
  auto original = builder.ConstantR1<float>(values);
  builder.Slice(original, {0}, {10});
  builder.Slice(original, {0}, {10}, {1});

  ComputeAndCompareR1<float>(&builder, values, {}, ErrorSpec(0.000001));
}
@ -98,7 +98,7 @@ TEST_F(SliceTest, SliceLastFourOf1024) {

  ComputationBuilder builder(client_, TestName());
  auto original = builder.ConstantR1<float>(values);
  builder.Slice(original, {1024 - 4}, {1024});
  builder.Slice(original, {1024 - 4}, {1024}, {1});

  const std::vector<float> expected = {1020, 1021, 1022, 1023};
  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.000001));
@ -112,7 +112,7 @@ TEST_F(SliceTest, DISABLED_SliceUnaligned1024In4096Values) {

  ComputationBuilder builder(client_, TestName());
  auto original = builder.ConstantR1<float>(values);
  builder.Slice(original, {7}, {7 + 1024});
  builder.Slice(original, {7}, {7 + 1024}, {1});

  std::vector<float> expected(1024);
  std::iota(values.begin(), values.end(), 7.0);
@ -122,7 +122,7 @@ TEST_F(SliceTest, DISABLED_SliceUnaligned1024In4096Values) {
XLA_TEST_F(SliceTest, Slice0x0to0x0F32) {
  ComputationBuilder builder(client_, TestName());
  auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 0));
  builder.Slice(original, {0, 0}, {0, 0});
  builder.Slice(original, {0, 0}, {0, 0}, {1, 1});

  ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 0), {});
}
@ -130,7 +130,7 @@ XLA_TEST_F(SliceTest, Slice0x0to0x0F32) {
XLA_TEST_F(SliceTest, Slice0x20to0x5F32) {
  ComputationBuilder builder(client_, TestName());
  auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 20));
  builder.Slice(original, {0, 15}, {0, 20});
  builder.Slice(original, {0, 15}, {0, 20}, {1, 1});

  ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 5), {});
}
@ -138,7 +138,7 @@ XLA_TEST_F(SliceTest, Slice0x20to0x5F32) {
XLA_TEST_F(SliceTest, Slice3x0to2x0F32) {
  ComputationBuilder builder(client_, TestName());
  auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(3, 0));
  builder.Slice(original, {1, 0}, {3, 0});
  builder.Slice(original, {1, 0}, {3, 0}, {1, 1});

  ComputeAndCompareR2<float>(&builder, Array2D<float>(2, 0), {});
}
@ -153,7 +153,7 @@ XLA_TEST_F(SliceTest, SliceQuadrantOf256x256) {

  ComputationBuilder builder(client_, TestName());
  auto original = builder.ConstantR2FromArray2D<float>(values);
  builder.Slice(original, {128, 128}, {256, 256});
  builder.Slice(original, {128, 128}, {256, 256}, {1, 1});

  Array2D<float> expected(128, 128);
  for (int row = 0; row < 128; ++row) {
@ -171,7 +171,7 @@ TEST_F(SliceTest, Slice_1x4096_To_1x1024) {

  ComputationBuilder builder(client_, TestName());
  auto original = builder.ConstantR2FromArray2D<float>(values);
  builder.Slice(original, {0, 3072}, {1, 4096});
  builder.Slice(original, {0, 3072}, {1, 4096}, {1, 1});

  Array2D<float> expected(1, 1024);
  std::iota(expected.data(), expected.data() + 1024, 3072.0);
@ -192,7 +192,7 @@ TEST_F(SliceTest, Slice_16x4_To_16x2) {
  }
  ComputationBuilder builder(client_, TestName());
  auto original = builder.ConstantR2FromArray2D<float>(values);
  builder.Slice(original, {0, 0}, {16, 2});
  builder.Slice(original, {0, 0}, {16, 2}, {1, 1});
  ComputeAndCompareR2<float>(&builder, expected, {}, ErrorSpec(0.000001));
}

@ -204,7 +204,7 @@ TEST_F(SliceTest, SliceR4ThreeDimsMiddleMinor) {
      ReferenceUtil::Slice4D(values, {{1, 0, 8, 0}}, {{2, 2, 16, 128}});
  ComputationBuilder builder(client_, TestName());
  auto original = builder.ConstantR4FromArray4D(values);
  builder.Slice(original, {1, 0, 8, 0}, {2, 2, 16, 128});
  builder.Slice(original, {1, 0, 8, 0}, {2, 2, 16, 128}, {1, 1, 1, 1});
  ComputeAndCompareR4(&builder, *expected, {}, ErrorSpec(0.000001));
}
@ -213,6 +213,7 @@ struct R2Spec {
  int64 input_dim1;
  std::array<int64, 2> slice_starts;
  std::array<int64, 2> slice_limits;
  std::array<int64, 2> slice_strides;
  Layout layout;
};
@ -228,7 +229,7 @@ TEST_P(SliceR2Test, DoIt) {

  ComputationBuilder builder(client_, TestName());
  auto a = builder.ConstantR2FromArray2D<int32>(input);
  builder.Slice(a, spec.slice_starts, spec.slice_limits);
  builder.Slice(a, spec.slice_starts, spec.slice_limits, spec.slice_strides);

  std::unique_ptr<Array2D<int32>> expected =
      ReferenceUtil::Slice2D(input, spec.slice_starts, spec.slice_limits);
@ -239,19 +240,23 @@ TEST_P(SliceR2Test, DoIt) {
INSTANTIATE_TEST_CASE_P(
    SliceR2TestInstantiation, SliceR2Test,
    ::testing::Values(
        R2Spec {4, 12, {{0, 3}}, {{4, 6}}, LayoutUtil::MakeLayout({0, 1})},
        R2Spec {4, 12, {{0, 3}}, {{4, 6}}, LayoutUtil::MakeLayout({1, 0})},
        R2Spec {16, 4, {{0, 2}}, {{16, 4}}, LayoutUtil::MakeLayout({0, 1})},
        R2Spec {16, 4, {{0, 2}}, {{16, 4}}, LayoutUtil::MakeLayout({1, 0})},
        R2Spec {256, 400, {{0, 300}}, {{256, 400}},
        R2Spec {4, 12, {{0, 3}}, {{4, 6}}, {{1, 1}},
                LayoutUtil::MakeLayout({0, 1})},
        R2Spec {4, 12, {{0, 3}}, {{4, 6}}, {{1, 1}},
                LayoutUtil::MakeLayout({1, 0})},
        R2Spec {500, 400, {{111, 123}}, {{300, 257}},
        R2Spec {16, 4, {{0, 2}}, {{16, 4}}, {{1, 1}},
                LayoutUtil::MakeLayout({0, 1})},
        R2Spec {16, 4, {{0, 2}}, {{16, 4}}, {{1, 1}},
                LayoutUtil::MakeLayout({1, 0})},
        R2Spec {500, 400, {{111, 123}}, {{300, 400}},
        R2Spec {256, 400, {{0, 300}}, {{256, 400}}, {{1, 1}},
                LayoutUtil::MakeLayout({1, 0})},
        R2Spec {384, 512, {{128, 256}}, {{256, 384}},
        R2Spec {500, 400, {{111, 123}}, {{300, 257}}, {{1, 1}},
                LayoutUtil::MakeLayout({1, 0})},
        R2Spec {357, 512, {{111, 256}}, {{301, 384}},
        R2Spec {500, 400, {{111, 123}}, {{300, 400}}, {{1, 1}},
                LayoutUtil::MakeLayout({1, 0})},
        R2Spec {384, 512, {{128, 256}}, {{256, 384}}, {{1, 1}},
                LayoutUtil::MakeLayout({1, 0})},
        R2Spec {357, 512, {{111, 256}}, {{301, 384}}, {{1, 1}},
                LayoutUtil::MakeLayout({1, 0})}
    )
);
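All of the instantiations above use unit strides; for comparison, a sketch of a non-unit stride through the client builder (hypothetical test, not part of this change; the expected values follow from taking every third element):

```cpp
// ceil((10 - 0) / 3) == 4 elements: indices 0, 3, 6, 9.
ComputationBuilder builder(client_, "strided_slice_sketch");
auto original =
    builder.ConstantR1<float>({0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
builder.Slice(original, /*start_indices=*/{0}, /*limit_indices=*/{10},
              /*strides=*/{3});
ComputeAndCompareR1<float>(&builder, {0, 3, 6, 9}, {});
```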
@ -666,7 +666,8 @@ TEST_F(WhileTest, WhileWithPrngScalarResult) {
  auto build_condition = [this, v6s32](int count) {
    ComputationBuilder builder(client_, TestName());
    auto prev = builder.Reshape(
        builder.Slice(builder.Parameter(0, v6s32, "prev"), {0}, {1}), {0}, {});
        builder.Slice(builder.Parameter(0, v6s32, "prev"), {0}, {1}, {1}), {0},
        {});
    builder.Gt(builder.ConstantR0<int32>(count), prev);
    return builder.Build().ConsumeValueOrDie();
  };
@ -195,16 +195,24 @@ bool IsPermutation(tensorflow::gtl::ArraySlice<int64> permutation, int64 rank);
// 2. permutation.size() == input.size().
template <template <typename...> class C, typename T>
std::vector<T> Permute(tensorflow::gtl::ArraySlice<int64> permutation,
                       C<T> input_) {
  tensorflow::gtl::ArraySlice<T> input(input_);
  CHECK(IsPermutation(permutation, input.size()));
  std::vector<T> output(input.size());
                       C<T> input) {
  tensorflow::gtl::ArraySlice<T> data(input);
  CHECK(IsPermutation(permutation, data.size()));
  std::vector<T> output(data.size());
  for (size_t i = 0; i < permutation.size(); ++i) {
    output[permutation[i]] = input[i];
    output[permutation[i]] = data[i];
  }
  return output;
}

// Override of the above that works around compile failures with gcc 7.1.1.
// For details see https://github.com/tensorflow/tensorflow/issues/10843
template <typename T>
std::vector<T> Permute(tensorflow::gtl::ArraySlice<int64> permutation,
                       const std::vector<T>& input) {
  return Permute<std::vector, T>(permutation, input);
}
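A usage sketch of the new overload (values invented for illustration):

```cpp
// gcc 7.1.1 failed to deduce the template template parameter C from a
// plain std::vector argument; the overload forwards explicitly.
std::vector<int> PermuteExample() {
  std::vector<int> input = {10, 20, 30};
  // output[permutation[i]] = input[i], so {10, 20, 30} -> {20, 30, 10}.
  return Permute({2, 0, 1}, input);
}
```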

// Inverts a permutation, i.e., output_permutation[input_permutation[i]] = i.
std::vector<int64> InversePermutation(
    tensorflow::gtl::ArraySlice<int64> input_permutation);
@ -200,7 +200,7 @@ message OpMetadata {
  string op_name = 2;
  // Indicate a file and line that this op is associated to in a user's program.
  //
  // e.g. it could be be the file and line of user code that generated the op.
  // e.g. it could be the file and line of user code that generated the op.
  string source_file = 3;
  int32 source_line = 4;
}
@ -369,6 +369,7 @@ message SliceRequest {
  ComputationDataHandle operand = 2;
  repeated int64 start_indices = 3;
  repeated int64 limit_indices = 4;
  repeated int64 stride = 5;
}
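The new repeated `stride` field rides along with the existing start/limit fields. A hedged sketch of populating the extended message from C++ through the standard generated protobuf API (the operand handle and the concrete indices are assumed):

```cpp
// Sketch: builds the request for x[0:10:2] (start 0, limit 10, stride 2).
SliceRequest MakeStridedSliceRequest(const ComputationDataHandle& operand) {
  SliceRequest slice_request;
  *slice_request.mutable_operand() = operand;
  slice_request.add_start_indices(0);
  slice_request.add_limit_indices(10);
  slice_request.add_stride(2);
  return slice_request;
}
```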

message DynamicSliceRequest {
@ -17,6 +17,7 @@ package org.tensorflow.contrib.android;

import android.content.res.AssetManager;
import android.os.Trace;
import android.os.Build.VERSION;
import android.text.TextUtils;
import android.util.Log;
import java.io.FileInputStream;
@ -370,9 +371,11 @@ public class TensorFlowInferenceInterface {
  private void loadGraph(InputStream is, Graph g) throws IOException {
    final long startMs = System.currentTimeMillis();

    Trace.beginSection("initializeTensorFlow");
    if (VERSION.SDK_INT >= 18) {
      Trace.beginSection("initializeTensorFlow");
      Trace.beginSection("readGraphDef");
    }

    Trace.beginSection("readGraphDef");
    // TODO(ashankar): Can we somehow mmap the contents instead of copying them?
    byte[] graphDef = new byte[is.available()];
    final int numBytesRead = is.read(graphDef);
@ -383,17 +386,22 @@ public class TensorFlowInferenceInterface {
          + " of the graph, expected to read "
          + graphDef.length);
    }
    Trace.endSection();

    Trace.beginSection("importGraphDef");
    if (VERSION.SDK_INT >= 18) {
      Trace.endSection(); // readGraphDef.
      Trace.beginSection("importGraphDef");
    }

    try {
      g.importGraphDef(graphDef);
    } catch (IllegalArgumentException e) {
      throw new IOException("Not a valid TensorFlow Graph serialization: " + e.getMessage());
    }
    Trace.endSection();

    Trace.endSection(); // initializeTensorFlow.
    if (VERSION.SDK_INT >= 18) {
      Trace.endSection(); // importGraphDef.
      Trace.endSection(); // initializeTensorFlow.
    }

    final long endMs = System.currentTimeMillis();
    Log.i(
@ -60,7 +60,7 @@ class BigQueryReader : public ReaderBase {
    BigQueryTablePartition partition;
    if (!partition.ParseFromString(current_work())) {
      return errors::InvalidArgument(
          "Could not parse work as as valid partition.");
          "Could not parse work as valid partition.");
    }
    TF_RETURN_IF_ERROR(bigquery_table_accessor_->SetPartition(partition));
    return Status::OK();
@ -92,7 +92,7 @@ class BigQueryReader(io_ops.ReaderBase):

    Raises:
      TypeError: - If features is neither None nor a dict or
                 - If columns is is neither None nor a list or
                 - If columns is neither None nor a list or
                 - If both features and columns are None or set.
    """
    if (features is None) == (columns is None):
@ -74,7 +74,7 @@ if(WIN32)
  set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} /ignore:4049 /ignore:4197 /ignore:4217 /ignore:4221")
  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /ignore:4049 /ignore:4197 /ignore:4217 /ignore:4221")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
  set(CMAKE_CXX_FLAGS_DEBUG "/D_DEBUG /MDd /Ob0")
  set(CMAKE_CXX_FLAGS_DEBUG "/D_DEBUG /MDd /Ob2")
  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /D_ITERATOR_DEBUG_LEVEL=0")
  set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /D_ITERATOR_DEBUG_LEVEL=0")
  set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /D_ITERATOR_DEBUG_LEVEL=0")
@ -543,7 +543,7 @@ padded.
```python
dataset = tf.contrib.data.Dataset.range(100)
dataset = dataset.map(lambda x: tf.fill([tf.cast(x, tf.int32)], x))
dataset = dataset.padded_batch(4, padded_shapes=[None])
batched_dataset = dataset.padded_batch(4, padded_shapes=[None])

iterator = batched_dataset.make_one_shot_iterator()
next_element = iterator.get_next()
@ -120,7 +120,7 @@ class _TriLPlusVDVTLightweightOperatorPD(object):

    Doesn't actually do the sqrt! Named as such to agree with API.

    To compute (M + V D V.T), we use the the Woodbury matrix identity:
    To compute (M + V D V.T), we use the Woodbury matrix identity:
      inv(M + V D V.T) = inv(M) - inv(M) V inv(C) V.T inv(M)
    where,
      C = inv(D) + V.T inv(M) V.
@ -166,7 +166,7 @@ class _TriLPlusVDVTLightweightOperatorPD(object):
  def _woodbury_sandwiched_term(self):
    """Computes the sandwiched term in the Woodbury identity.

    Computes the "`C`" in the the identity:
    Computes the "`C`" in the identity:
      inv(M + V D V.T) = inv(M) - inv(M) V inv(C) V.T inv(M)
    where,
      C = inv(D) + V.T inv(M) V.
@ -52,7 +52,7 @@ class RelaxedBernoulli(transformed_distribution.TransformedDistribution):
the RelaxedBernoulli can suffer from underflow issues. In many case loss
functions such as these are invariant under invertible transformations of
the random variables. The KL divergence, found in the variational autoencoder
loss, is an example. Because RelaxedBernoullis are sampled by by a Logistic
loss, is an example. Because RelaxedBernoullis are sampled by a Logistic
random variable followed by a `tf.sigmoid` op, one solution is to treat
the Logistic as the random variable and `tf.sigmoid` as downstream. The
KL divergences of two Logistics, which are always followed by a `tf.sigmoid`
@ -47,7 +47,7 @@ def percentile(x,
  """Compute the `q`-th percentile of `x`.

  Given a vector `x`, the `q`-th percentile of `x` is the value `q / 100` of the
  way from the minimum to the maximum in in a sorted copy of `x`.
  way from the minimum to the maximum in a sorted copy of `x`.

  The values and distances of the two nearest neighbors as well as the
  `interpolation` parameter will determine the percentile if the normalized
@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Vectorized Laplace distribution class, directly using LinearOpeartor."""
"""Vectorized Laplace distribution class, directly using LinearOperator."""

from __future__ import absolute_import
from __future__ import division
@ -446,7 +446,7 @@ class Transformer(object):
    # TODO(fkp): return a subgraph?
    op_, op_outputs_ = self.transform_op_handler(info, op)
    if op is op_:
      raise ValueError("In-place tranformation not allowed.")
      raise ValueError("In-place transformation not allowed.")

    # Process op.
    info.transformed_ops[op] = op_
@ -3261,7 +3261,7 @@ def conv2d(x,
      padding: string, `"same"` or `"valid"`.
      data_format: `"channels_last"` or `"channels_first"`.
          Whether to use Theano or TensorFlow data format
          for inputs/kernels/ouputs.
          for inputs/kernels/outputs.
      dilation_rate: tuple of 2 integers.

  Returns:
@ -3309,7 +3309,7 @@ def conv2d_transpose(x,
      padding: string, `"same"` or `"valid"`.
      data_format: `"channels_last"` or `"channels_first"`.
          Whether to use Theano or TensorFlow data format
          for inputs/kernels/ouputs.
          for inputs/kernels/outputs.

  Returns:
      A tensor, result of transposed 2D convolution.
@ -3395,7 +3395,7 @@ def conv3d(x,
      padding: string, `"same"` or `"valid"`.
      data_format: `"channels_last"` or `"channels_first"`.
          Whether to use Theano or TensorFlow data format
          for inputs/kernels/ouputs.
          for inputs/kernels/outputs.
      dilation_rate: tuple of 3 integers.

  Returns:
@ -107,7 +107,7 @@ class Dropout(tf_core_layers.Dropout, Layer):
    self.supports_masking = True
    # Inheritance call order:
    # 1) tf.layers.Dropout, 2) keras.layers.Layer, 3) tf.layers.Layer
    super(Dropout, self).__init__(**kwargs)
    super(Dropout, self).__init__(rate=rate, noise_shape=noise_shape, seed=seed, **kwargs)

  def call(self, inputs, training=None):
    if training is None:
@ -985,7 +985,7 @@ class LSTM(Recurrent):

  References:
      - [Long short-term
        memory](http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf)
        memory](http://www.bioinf.jku.at/publications/older/2604.pdf)
        (original 1997 paper)
      - [Supervised sequence labeling with recurrent neural
        networks](http://www.cs.toronto.edu/~graves/preprint.pdf)
@ -105,7 +105,7 @@ class TestModelSaving(test.TestCase):
    out2 = model.predict(x)
    self.assertAllClose(out, out2, atol=1e-05)

  def test_fuctional_model_saving(self):
  def test_functional_model_saving(self):
    if h5py is None:
      return  # Skip test if models cannot be saved.
@ -121,7 +121,6 @@ cuda_py_test(
        ":layers_py",
        "//third_party/py/numpy",
        "//tensorflow/contrib/framework:framework_py",
        "//tensorflow/contrib/losses:losses_py",
        "//tensorflow/python:array_ops",
        "//tensorflow/python:client",
        "//tensorflow/python:client_testlib",
@ -141,6 +140,7 @@ cuda_py_test(
        "//tensorflow/python:template",
        "//tensorflow/python:variable_scope",
        "//tensorflow/python:variables",
        "//tensorflow/python/ops/losses:losses",
    ],
)
@ -17,12 +17,16 @@
See the @{$python/contrib.layers} guide.

@@avg_pool2d
@@avg_pool3d
@@batch_norm
@@convolution2d
@@convolution3d
@@conv2d_in_plane
@@convolution2d_in_plane
@@conv2d_transpose
@@convolution2d_transpose
@@conv3d_transpose
@@convolution3d_transpose
@@dropout
@@elu
@@embedding_lookup_unique
@ -31,6 +35,7 @@ See the @{$python/contrib.layers} guide.
@@layer_norm
@@linear
@@max_pool2d
@@max_pool3d
@@one_hot_encoding
@@relu
@@relu6
@ -101,6 +106,7 @@ from tensorflow.python.util.all_util import remove_undocumented

_allowed_symbols = ['bias_add',
                    'conv2d',
                    'conv3d',
                    'elu',
                    'feature_column',
                    'legacy_fully_connected',
@ -49,15 +49,20 @@ from tensorflow.python.training import moving_averages
# TODO(b/28426988): Replace legacy_* fns migrated from slim.
# TODO(b/28426988): Remove legacy_* when all uses have migrated to new API.
__all__ = ['avg_pool2d',
           'avg_pool3d',
           'batch_norm',
           'bias_add',
           'conv2d',
           'conv3d',
           'conv2d_in_plane',
           'conv2d_transpose',
           'conv3d_transpose',
           'convolution',
           'convolution2d',
           'convolution2d_in_plane',
           'convolution2d_transpose',
           'convolution3d',
           'convolution3d_transpose',
           'dropout',
           'elu',
           'flatten',
@ -66,6 +71,7 @@ __all__ = ['avg_pool2d',
           'linear',
           'pool',
           'max_pool2d',
           'max_pool3d',
           'one_hot_encoding',
           'relu',
           'relu6',
@ -82,6 +88,8 @@ __all__ = ['avg_pool2d',

DATA_FORMAT_NCHW = 'NCHW'
DATA_FORMAT_NHWC = 'NHWC'
DATA_FORMAT_NCDHW = 'NCDHW'
DATA_FORMAT_NDHWC = 'NDHWC'


@add_arg_scope
@ -132,6 +140,54 @@ def avg_pool2d(inputs,
  return utils.collect_named_outputs(outputs_collections, sc, outputs)


@add_arg_scope
def avg_pool3d(inputs,
               kernel_size,
               stride=2,
               padding='VALID',
               data_format=DATA_FORMAT_NDHWC,
               outputs_collections=None,
               scope=None):
  """Adds a 3D average pooling op.

  It is assumed that the pooling is done per image but not in batch or channels.

  Args:
    inputs: A 5-D tensor of shape `[batch_size, depth, height, width, channels]` if
      `data_format` is `NDHWC`, and `[batch_size, channels, depth, height, width]` if
      `data_format` is `NCDHW`.
    kernel_size: A list of length 3: [kernel_depth, kernel_height, kernel_width] of the
      pooling kernel over which the op is computed. Can be an int if both
      values are the same.
    stride: A list of length 3: [stride_depth, stride_height, stride_width].
      Can be an int if both strides are the same. Note that presently
      both strides must have the same value.
    padding: The padding method, either 'VALID' or 'SAME'.
    data_format: A string. `NDHWC` (default) and `NCDHW` are supported.
    outputs_collections: The collections to which the outputs are added.
    scope: Optional scope for name_scope.

  Returns:
    A `Tensor` representing the results of the pooling operation.

  Raises:
    ValueError: If `data_format` is neither `NDHWC` nor `NCDHW`.
  """
  if data_format not in (DATA_FORMAT_NCDHW, DATA_FORMAT_NDHWC):
    raise ValueError('data_format has to be either NCDHW or NDHWC.')
  with ops.name_scope(scope, 'AvgPool3D', [inputs]) as sc:
    inputs = ops.convert_to_tensor(inputs)
    df = ('channels_first' if data_format and data_format.startswith('NC')
          else 'channels_last')
    layer = pooling_layers.AveragePooling3D(pool_size=kernel_size,
                                            strides=stride,
                                            padding=padding,
                                            data_format=df,
                                            _scope=sc)
    outputs = layer.apply(inputs)
    return utils.collect_named_outputs(outputs_collections, sc, outputs)


def _fused_batch_norm(
    inputs,
    decay=0.999,
@ -985,6 +1041,7 @@ def convolution(inputs,
                                     sc.original_name_scope, outputs)

convolution2d = convolution
convolution3d = convolution


@add_arg_scope
@ -1203,6 +1260,116 @@ def convolution2d_transpose(
                                       sc.original_name_scope, outputs)


@add_arg_scope
def convolution3d_transpose(
    inputs,
    num_outputs,
    kernel_size,
    stride=1,
    padding='SAME',
    data_format=DATA_FORMAT_NDHWC,
    activation_fn=nn.relu,
    normalizer_fn=None,
    normalizer_params=None,
    weights_initializer=initializers.xavier_initializer(),
    weights_regularizer=None,
    biases_initializer=init_ops.zeros_initializer(),
    biases_regularizer=None,
    reuse=None,
    variables_collections=None,
    outputs_collections=None,
    trainable=True,
    scope=None):
  """Adds a convolution3d_transpose with an optional batch normalization layer.

  The function creates a variable called `weights`, representing the
  kernel, that is convolved with the input. If `batch_norm_params` is `None`, a
  second variable called 'biases' is added to the result of the operation.
  Args:
    inputs: A 5-D `Tensor` of type `float` and shape
      `[batch, depth, height, width, in_channels]` for `NDHWC` data format or
      `[batch, in_channels, depth, height, width]` for `NCDHW` data format.
    num_outputs: Integer, the number of output filters.
    kernel_size: A list of length 3 holding the [kernel_depth, kernel_height, kernel_width] of
      of the filters. Can be an int if both values are the same.
    stride: A list of length 3: [stride_depth, stride_height, stride_width].
      Can be an int if both strides are the same. Note that presently
      both strides must have the same value.
    padding: One of 'VALID' or 'SAME'.
    data_format: A string. `NDHWC` (default) and `NCDHW` are supported.
    activation_fn: Activation function. The default value is a ReLU function.
      Explicitly set it to None to skip it and maintain a linear activation.
    normalizer_fn: Normalization function to use instead of `biases`. If
      `normalizer_fn` is provided then `biases_initializer` and
      `biases_regularizer` are ignored and `biases` are not created nor added.
      default set to None for no normalizer function
    normalizer_params: Normalization function parameters.
    weights_initializer: An initializer for the weights.
    weights_regularizer: Optional regularizer for the weights.
    biases_initializer: An initializer for the biases. If None skip biases.
    biases_regularizer: Optional regularizer for the biases.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: Optional list of collections for all the variables or
      a dictionary containing a different list of collection per variable.
    outputs_collections: Collection to add the outputs.
    trainable: Whether or not the variables should be trainable or not.
    scope: Optional scope for variable_scope.
  Returns:
    A tensor representing the output of the operation.
  Raises:
    ValueError: If 'kernel_size' is not a list of length 3.
    ValueError: If `data_format` is neither `NDHWC` nor `NCDHW`.
    ValueError: If `C` dimension of `inputs` is None.
  """
  layer_variable_getter = _build_variable_getter(
      {'bias': 'biases', 'kernel': 'weights'})

  with variable_scope.variable_scope(
      scope, 'Conv3d_transpose', [inputs], reuse=reuse,
      custom_getter=layer_variable_getter) as sc:
    if data_format not in (DATA_FORMAT_NCDHW, DATA_FORMAT_NDHWC):
      raise ValueError('data_format has to be either NCDHW or NDHWC.')

    inputs = ops.convert_to_tensor(inputs)

    df = ('channels_first' if data_format and data_format.startswith('NC')
          else 'channels_last')
    layer = convolutional_layers.Convolution3DTranspose(
        filters=num_outputs,
        kernel_size=kernel_size,
        strides=stride,
        padding=padding,
        data_format=df,
        activation=None,
        use_bias=not normalizer_fn and biases_initializer,
        kernel_initializer=weights_initializer,
        bias_initializer=biases_initializer,
        kernel_regularizer=weights_regularizer,
        bias_regularizer=biases_regularizer,
        activity_regularizer=None,
        trainable=trainable,
        name=sc.name,
        dtype=inputs.dtype.base_dtype,
        _scope=sc,
        _reuse=reuse)
    outputs = layer.apply(inputs)

    # Add variables to collections.
    _add_variable_to_collections(layer.kernel, variables_collections, 'weights')
    if layer.bias:
      _add_variable_to_collections(layer.bias, variables_collections, 'biases')

    if normalizer_fn is not None:
      normalizer_params = normalizer_params or {}
      outputs = normalizer_fn(outputs, **normalizer_params)

    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections,
                                       sc.original_name_scope, outputs)


@add_arg_scope
def dropout(inputs,
            keep_prob=0.5,
@ -1467,7 +1634,8 @@ def fully_connected(inputs,
    ValueError: If x has rank less than 2 or if its last dimension is not set.
  """
  if not isinstance(num_outputs, six.integer_types):
    raise ValueError('num_outputs should be int or long, got %s.', num_outputs)
    raise ValueError(
        'num_outputs should be int or long, got %s.' % (num_outputs,))

  layer_variable_getter = _build_variable_getter({'bias': 'biases',
                                                  'kernel': 'weights'})
@ -1689,6 +1857,55 @@ def max_pool2d(inputs,
    return utils.collect_named_outputs(outputs_collections, sc, outputs)


@add_arg_scope
def max_pool3d(inputs,
               kernel_size,
               stride=2,
               padding='VALID',
               data_format=DATA_FORMAT_NDHWC,
               outputs_collections=None,
               scope=None):
  """Adds a 3D Max Pooling op.

  It is assumed that the pooling is done per image but not in batch or channels.

  Args:
    inputs: A 5-D tensor of shape `[batch_size, depth, height, width, channels]` if
      `data_format` is `NDHWC`, and `[batch_size, channels, depth, height, width]` if
      `data_format` is `NCDHW`.
    kernel_size: A list of length 3: [kernel_depth, kernel_height, kernel_width] of the
      pooling kernel over which the op is computed. Can be an int if both
      values are the same.
    stride: A list of length 3: [stride_depth, stride_height, stride_width].
      Can be an int if both strides are the same. Note that presently
      both strides must have the same value.
    padding: The padding method, either 'VALID' or 'SAME'.
    data_format: A string. `NDHWC` (default) and `NCDHW` are supported.
    outputs_collections: The collections to which the outputs are added.
    scope: Optional scope for name_scope.

  Returns:
    A `Tensor` representing the results of the pooling operation.

  Raises:
    ValueError: If `data_format` is neither `NDHWC` nor `NCDHW`.
    ValueError: If 'kernel_size' is not a 3-D list
  """
  if data_format not in (DATA_FORMAT_NCDHW, DATA_FORMAT_NDHWC):
    raise ValueError('data_format has to be either NCDHW or NDHWC.')
  with ops.name_scope(scope, 'MaxPool3D', [inputs]) as sc:
    inputs = ops.convert_to_tensor(inputs)
    df = ('channels_first' if data_format and data_format.startswith('NC')
          else 'channels_last')
    layer = pooling_layers.MaxPooling3D(pool_size=kernel_size,
                                        strides=stride,
                                        padding=padding,
                                        data_format=df,
                                        _scope=sc)
    outputs = layer.apply(inputs)
    return utils.collect_named_outputs(outputs_collections, sc, outputs)


@add_arg_scope
def pool(inputs,
         kernel_size,
@ -2346,6 +2563,8 @@ linear = functools.partial(fully_connected, activation_fn=None)

# Simple alias.
conv2d = convolution2d
conv3d = convolution3d
conv2d_transpose = convolution2d_transpose
conv3d_transpose = convolution3d_transpose
conv2d_in_plane = convolution2d_in_plane
separable_conv2d = separable_convolution2d
@ -27,7 +27,6 @@ from tensorflow.contrib.framework.python.ops import arg_scope
from tensorflow.contrib.framework.python.ops import variables
from tensorflow.contrib.layers.python.layers import layers as _layers
from tensorflow.contrib.layers.python.layers import regularizers
from tensorflow.contrib.losses.python.losses import loss_ops
from tensorflow.python.client import session
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
@ -49,6 +48,7 @@ from tensorflow.python.ops import state_ops
from tensorflow.python.ops import template
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables as variables_lib
from tensorflow.python.ops.losses import losses
from tensorflow.python.platform import test
@ -121,6 +121,76 @@ class AvgPool2DTest(test.TestCase):
    self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3])


class AvgPool3DTest(test.TestCase):

  def testInvalidDataFormat(self):
    depth, height, width = 3, 6, 9
    images = np.random.uniform(size=(5, depth, height, width, 3))
    with self.assertRaisesRegexp(ValueError,
                                 'data_format has to be either NCDHW or NDHWC.'):
      _layers.avg_pool3d(images, [3, 3, 3], data_format='CDHWN')

  def testCreateAvgPool(self):
    depth, height, width = 3, 6, 9
    images = np.random.uniform(size=(5, depth, height, width, 3))
    output = _layers.avg_pool3d(images, [3, 3, 3])
    self.assertEqual(output.op.name, 'AvgPool3D/AvgPool3D')
    self.assertListEqual(output.get_shape().as_list(), [5, 1, 2, 4, 3])

  def testCreateAvgPoolNCDHW(self):
    depth, height, width = 3, 6, 9
    images = np.random.uniform(size=(5, 2, depth, height, width))
    output = _layers.avg_pool3d(images, [3, 3, 3], data_format='NCDHW')
    self.assertEquals(output.op.name, 'AvgPool3D/transpose_1')
    self.assertListEqual(output.get_shape().as_list(), [5, 2, 1, 2, 4])

  def testCollectOutputs(self):
    depth, height, width = 3, 6, 9
    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
    output = _layers.avg_pool3d(images, [3, 3, 3], outputs_collections='outputs')
    output_collected = ops.get_collection('outputs')[0]
    self.assertEqual(output_collected.aliases, ['AvgPool3D'])
    self.assertEqual(output_collected, output)

  def testCreateSquareAvgPool(self):
    depth, height, width = 3, 6, 9
    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
    output = _layers.avg_pool3d(images, 3)
    self.assertEqual(output.op.name, 'AvgPool3D/AvgPool3D')
    self.assertListEqual(output.get_shape().as_list(), [5, 1, 2, 4, 3])

  def testCreateAvgPoolWithScope(self):
    depth, height, width = 3, 6, 9
    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
    output = _layers.avg_pool3d(images, [3, 3, 3], scope='pool1')
    self.assertEqual(output.op.name, 'pool1/AvgPool3D')

  def testCreateAvgPoolWithSamePadding(self):
    depth, height, width = 3, 6, 9
    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
    output = _layers.avg_pool3d(images, [3, 3, 3], padding='SAME')
    self.assertListEqual(output.get_shape().as_list(), [5, 2, 3, 5, 3])

  def testCreateAvgPoolWithSamePaddingNCDHW(self):
    depth, height, width = 3, 6, 9
    images = random_ops.random_uniform((5, 3, depth, height, width), seed=1)
    output = _layers.avg_pool3d(
        images, [3, 3, 3], padding='SAME', data_format='NCDHW')
    self.assertListEqual(output.get_shape().as_list(), [5, 3, 2, 3, 5])

  def testCreateAvgPoolStrideWithSamePadding(self):
    depth, height, width = 3, 6, 9
    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
    output = _layers.avg_pool3d(images, [3, 3, 3], stride=1, padding='SAME')
    self.assertListEqual(output.get_shape().as_list(), [5, depth, height, width, 3])

  def testGlobalAvgPool(self):
    depth, height, width = 3, 6, 9
    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
    output = _layers.avg_pool3d(images, images.get_shape()[1:4], stride=1)
    self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 1, 3])


class PoolTest(test.TestCase):

  def testCreatePool(self):
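
The expected shapes in these pooling tests follow from the standard output-size formulas, given that the contrib pooling layers default to stride 2 with `'VALID'` padding (as the expected shapes suggest). A quick back-of-envelope check in plain Python:

```python
import math

def pooled_size(size, kernel, stride, padding):
  """Output extent of one spatial dimension after pooling."""
  if padding == 'VALID':
    return (size - kernel) // stride + 1
  return int(math.ceil(size / float(stride)))  # 'SAME'

# depth, height, width = 3, 6, 9 with a 3x3x3 kernel:
print([pooled_size(s, 3, 2, 'VALID') for s in (3, 6, 9)])  # [1, 2, 4]
print([pooled_size(s, 3, 2, 'SAME') for s in (3, 6, 9)])   # [2, 3, 5]
print([pooled_size(s, 3, 1, 'SAME') for s in (3, 6, 9)])   # [3, 6, 9]
```

This matches `[5, 1, 2, 4, 3]`, `[5, 2, 3, 5, 3]`, and the unchanged spatial shape in the stride-1 `'SAME'` test above.
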
@ -1559,23 +1629,23 @@ class FCTest(test.TestCase):
        inputs, 32, scope='fc1', weights_regularizer=regularizer)
    self.assertEqual(
        len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 1)
    self.assertEqual(len(loss_ops.get_regularization_losses()), 1)
    self.assertEqual(len(losses.get_regularization_losses()), 1)
    _layers.fully_connected(
        inputs, 32, scope='fc1', weights_regularizer=regularizer, reuse=True)
    self.assertEqual(
        len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 1)
    self.assertEqual(len(loss_ops.get_regularization_losses()), 1)
    self.assertEqual(len(losses.get_regularization_losses()), 1)

    with variable_scope.variable_scope('outer', reuse=False):
      _layers.fully_connected(inputs, 32, weights_regularizer=regularizer)
    self.assertEqual(
        len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 2)
    self.assertEqual(len(loss_ops.get_regularization_losses()), 2)
    self.assertEqual(len(losses.get_regularization_losses()), 2)
    with variable_scope.variable_scope('outer', reuse=True):
      _layers.fully_connected(inputs, 32, weights_regularizer=regularizer)
    self.assertEqual(
        len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 2)
    self.assertEqual(len(loss_ops.get_regularization_losses()), 2)
    self.assertEqual(len(losses.get_regularization_losses()), 2)

  def testCreateFCWithoutActivation(self):
    height, width = 3, 3
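
The pairs above swap the contrib `loss_ops.get_regularization_losses()` for the core `losses.get_regularization_losses()`; both read the same `REGULARIZATION_LOSSES` graph collection, which is why the expected counts are unchanged. A minimal TF 1.x-style sketch of that equivalence (layer and regularizer choices illustrative):

```python
import tensorflow as tf

inputs = tf.placeholder(tf.float32, [None, 8])
tf.contrib.layers.fully_connected(
    inputs, 32, scope='fc1',
    weights_regularizer=tf.contrib.layers.l2_regularizer(0.1))

# Both lookups see the single 'fc1' regularization term.
assert len(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) == 1
assert len(tf.losses.get_regularization_losses()) == 1
```
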
@ -2771,6 +2841,76 @@ class MaxPool2DTest(test.TestCase):
    self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3])


class MaxPool3DTest(test.TestCase):

  def testInvalidDataFormat(self):
    depth, height, width = 3, 6, 9
    images = np.random.uniform(size=(5, depth, height, width, 3))
    with self.assertRaisesRegexp(ValueError,
                                 'data_format has to be either NCDHW or NDHWC.'):
      _layers.max_pool3d(images, [3, 3, 3], data_format='CDHWN')

  def testCreateMaxPool(self):
    depth, height, width = 3, 6, 9
    images = np.random.uniform(size=(5, depth, height, width, 3)).astype(np.float32)
    output = _layers.max_pool3d(images, [3, 3, 3])
    self.assertEqual(output.op.name, 'MaxPool3D/MaxPool3D')
    self.assertListEqual(output.get_shape().as_list(), [5, 1, 2, 4, 3])

  def testCreateMaxPoolNCDHW(self):
    depth, height, width = 3, 6, 9
    images = np.random.uniform(size=(5, 3, depth, height, width)).astype(np.float32)
    output = _layers.max_pool3d(images, [3, 3, 3], data_format='NCDHW')
    self.assertEquals(output.op.name, 'MaxPool3D/transpose_1')
    self.assertListEqual(output.get_shape().as_list(), [5, 3, 1, 2, 4])

  def testCollectOutputs(self):
    depth, height, width = 3, 6, 9
    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
    output = _layers.max_pool3d(images, [3, 3, 3], outputs_collections='outputs')
    output_collected = ops.get_collection('outputs')[0]
    self.assertEqual(output_collected.aliases, ['MaxPool3D'])
    self.assertEqual(output_collected, output)

  def testCreateSquareMaxPool(self):
    depth, height, width = 3, 6, 9
    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
    output = _layers.max_pool3d(images, 3)
    self.assertEqual(output.op.name, 'MaxPool3D/MaxPool3D')
    self.assertListEqual(output.get_shape().as_list(), [5, 1, 2, 4, 3])

  def testCreateMaxPoolWithScope(self):
    depth, height, width = 3, 6, 9
    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
    output = _layers.max_pool3d(images, [3, 3, 3], scope='pool1')
    self.assertEqual(output.op.name, 'pool1/MaxPool3D')

  def testCreateMaxPoolWithSamePadding(self):
    depth, height, width = 3, 6, 9
    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
    output = _layers.max_pool3d(images, [3, 3, 3], padding='SAME')
    self.assertListEqual(output.get_shape().as_list(), [5, 2, 3, 5, 3])

  def testCreateMaxPoolWithSamePaddingNCDHW(self):
    depth, height, width = 3, 6, 9
    images = random_ops.random_uniform((5, 3, depth, height, width), seed=1)
    output = _layers.max_pool3d(
        images, [3, 3, 3], padding='SAME', data_format='NCDHW')
    self.assertListEqual(output.get_shape().as_list(), [5, 3, 2, 3, 5])

  def testCreateMaxPoolStrideWithSamePadding(self):
    depth, height, width = 3, 6, 9
    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
    output = _layers.max_pool3d(images, [3, 3, 3], stride=1, padding='SAME')
    self.assertListEqual(output.get_shape().as_list(), [5, depth, height, width, 3])

  def testGlobalMaxPool(self):
    depth, height, width = 3, 6, 9
    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
    output = _layers.max_pool3d(images, images.get_shape()[1:4], stride=1)
    self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 1, 3])


class OneHotEncodingTest(test.TestCase):

  def testOneHotEncodingCreate(self):

@ -52,8 +52,8 @@ LABEL_DIMENSION = 3 # Dimensionality of regression labels.

def _train_test_split(features_and_labels):
  features, labels = features_and_labels
  train_set = (features[:len(features) / 2], labels[:len(features) / 2])
  test_set = (features[len(features) / 2:], labels[len(features) / 2:])
  train_set = (features[:int(len(features) / 2)], labels[:int(len(features) / 2)])
  test_set = (features[int(len(features) / 2):], labels[int(len(features) / 2):])
  return train_set, test_set
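
The `int(...)` wrapping matters because under Python 3 `/` is true division and returns a float, which cannot be used as a slice index. A sketch of the arguably more idiomatic fix using floor division (not the form this diff settled on):

```python
def _train_test_split(features_and_labels):
  features, labels = features_and_labels
  half = len(features) // 2  # floor division: an int on both Python 2 and 3
  train_set = (features[:half], labels[:half])
  test_set = (features[half:], labels[half:])
  return train_set, test_set
```
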
@ -438,7 +438,7 @@ def loss_only_head(loss_fn, head_name=None):

  Args:
    loss_fn: a function that takes no argument and returns a list of
        scalar tensors.
    head_name: a name for for the head.
    head_name: a name for the head.

  Returns:
    An instance of `Head` to hold the additional losses.
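
A minimal usage sketch built only from the signature shown above; the import path and the choice of losses to fold in are assumptions, not part of this diff:

```python
import tensorflow as tf
# Assumed import path for the contrib.learn head library.
from tensorflow.contrib.learn.python.learn.estimators import head as head_lib

# Fold graph-level regularization terms into the model as an extra head.
reg_head = head_lib.loss_only_head(
    lambda: tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES),
    head_name='regularization')
```
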
@ -729,7 +729,7 @@ class LinearClassifierTest(test.TestCase):
    self.assertLess(loss, 0.07)

  def testSdcaOptimizerRealValuedFeatures(self):
    """Tests LinearClasssifier with SDCAOptimizer and real valued features."""
    """Tests LinearClassifier with SDCAOptimizer and real valued features."""

    def input_fn():
      return {

@ -776,7 +776,7 @@ class LinearClassifierTest(test.TestCase):
    self.assertLess(loss, 0.05)

  def testSdcaOptimizerBucketizedFeatures(self):
    """Tests LinearClasssifier with SDCAOptimizer and bucketized features."""
    """Tests LinearClassifier with SDCAOptimizer and bucketized features."""

    def input_fn():
      return {

@ -802,7 +802,7 @@ class LinearClassifierTest(test.TestCase):
    self.assertGreater(scores['accuracy'], 0.9)

  def testSdcaOptimizerSparseFeatures(self):
    """Tests LinearClasssifier with SDCAOptimizer and sparse features."""
    """Tests LinearClassifier with SDCAOptimizer and sparse features."""

    def input_fn():
      return {

@ -833,7 +833,7 @@ class LinearClassifierTest(test.TestCase):
    self.assertGreater(scores['accuracy'], 0.9)

  def testSdcaOptimizerWeightedSparseFeatures(self):
    """LinearClasssifier with SDCAOptimizer and weighted sparse features."""
    """LinearClassifier with SDCAOptimizer and weighted sparse features."""

    def input_fn():
      return {

@ -864,7 +864,7 @@ class LinearClassifierTest(test.TestCase):
    self.assertGreater(scores['accuracy'], 0.9)

  def testSdcaOptimizerCrossedFeatures(self):
    """Tests LinearClasssifier with SDCAOptimizer and crossed features."""
    """Tests LinearClassifier with SDCAOptimizer and crossed features."""

    def input_fn():
      return {

@ -897,7 +897,7 @@ class LinearClassifierTest(test.TestCase):
    self.assertGreater(scores['accuracy'], 0.9)

  def testSdcaOptimizerMixedFeatures(self):
    """Tests LinearClasssifier with SDCAOptimizer and a mix of features."""
    """Tests LinearClassifier with SDCAOptimizer and a mix of features."""

    def input_fn():
      return {

@ -1509,7 +1509,7 @@ class LinearRegressorTest(test.TestCase):
    self.assertLess(loss, 0.05)

  def testSdcaOptimizerSparseFeaturesWithL1Reg(self):
    """Tests LinearClasssifier with SDCAOptimizer and sparse features."""
    """Tests LinearClassifier with SDCAOptimizer and sparse features."""

    def input_fn():
      return {

@ -1581,7 +1581,7 @@ class LinearRegressorTest(test.TestCase):
    self.assertLess(l1_reg_weights_norm, no_l1_reg_weights_norm)

  def testSdcaOptimizerBiasOnly(self):
    """Tests LinearClasssifier with SDCAOptimizer and validates bias weight."""
    """Tests LinearClassifier with SDCAOptimizer and validates bias weight."""

    def input_fn():
      """Testing the bias weight when it's the only feature present.

@ -1614,7 +1614,7 @@ class LinearRegressorTest(test.TestCase):
        regressor.get_variable_value('linear/bias_weight')[0], 0.25, err=0.1)

  def testSdcaOptimizerBiasAndOtherColumns(self):
    """Tests LinearClasssifier with SDCAOptimizer and validates bias weight."""
    """Tests LinearClassifier with SDCAOptimizer and validates bias weight."""

    def input_fn():
      """Testing the bias weight when there are other features present.

@ -1676,7 +1676,7 @@ class LinearRegressorTest(test.TestCase):
        regressor.get_variable_value('linear/b/weight')[0], 0.0, err=0.05)

  def testSdcaOptimizerBiasAndOtherColumnsFabricatedCentered(self):
    """Tests LinearClasssifier with SDCAOptimizer and validates bias weight."""
    """Tests LinearClassifier with SDCAOptimizer and validates bias weight."""

    def input_fn():
      """Testing the bias weight when there are other features present.

@ -123,7 +123,7 @@ class ModelFnopsTest(test.TestCase):
      self.assertAllEqual(predictions["probabilities"].eval(),
                          regression_output.value.eval())

  def testEstimatorSpec_export_classsification(self):
  def testEstimatorSpec_export_classification(self):
    predictions = self.create_predictions()
    output_alternatives = {"classification_head": (
        constants.ProblemType.CLASSIFICATION, predictions)}

@ -143,7 +143,7 @@ class ModelFnopsTest(test.TestCase):
      self.assertAllEqual(predictions["classes"].eval(),
                          classification_output.classes.eval())

  def testEstimatorSpec_export_classsification_with_missing_scores(self):
  def testEstimatorSpec_export_classification_with_missing_scores(self):
    predictions = self.create_predictions()
    output_alternatives_predictions = predictions.copy()
    del output_alternatives_predictions["scores"]

@ -165,7 +165,7 @@ class ModelFnopsTest(test.TestCase):
      self.assertAllEqual(predictions["classes"].eval(),
                          classification_output.classes.eval())

  def testEstimatorSpec_export_classsification_with_missing_scores_proba(self):
  def testEstimatorSpec_export_classification_with_missing_scores_proba(self):
    predictions = self.create_predictions()
    output_alternatives_predictions = predictions.copy()
    del output_alternatives_predictions["scores"]

@ -187,7 +187,7 @@ class ModelFnopsTest(test.TestCase):
      self.assertAllEqual(predictions["classes"].eval(),
                          classification_output.classes.eval())

  def testEstimatorSpec_export_classsification_with_missing_classes(self):
  def testEstimatorSpec_export_classification_with_missing_classes(self):
    predictions = self.create_predictions()
    output_alternatives_predictions = predictions.copy()
    del output_alternatives_predictions["classes"]

@ -208,7 +208,7 @@ class ModelFnopsTest(test.TestCase):
                          classification_output.scores.eval())
      self.assertIsNone(classification_output.classes)

  def testEstimatorSpec_export_classsification_with_nonstring_classes(self):
  def testEstimatorSpec_export_classification_with_nonstring_classes(self):
    predictions = self.create_predictions()
    output_alternatives_predictions = predictions.copy()
    output_alternatives_predictions["classes"] = constant_op.constant(

@ -455,7 +455,7 @@ class Experiment(object):
  def train_and_evaluate(self):
    """Interleaves training and evaluation.

    The frequency of evaluation is controlled by the contructor arg
    The frequency of evaluation is controlled by the constructor arg
    `min_eval_frequency`. When this parameter is 0, evaluation happens
    only after training has completed. Note that evaluation cannot happen
    more frequently than checkpoints are taken. If no new snapshots are

@ -515,9 +515,9 @@ class Experiment(object):

    This differs from `train_and_evaluate` as follows:
      1. The procedure will have train and evaluation in turns. The model
      will be trained for a number of steps (usuallly smaller than `train_steps`
      will be trained for a number of steps (usually smaller than `train_steps`
      if provided) and then be evaluated. `train_and_evaluate` will train the
      model for `train_steps` (no small training iteraions).
      model for `train_steps` (no small training iterations).

      2. Due to the different approach this schedule takes, it leads to two
      differences in resource control. First, the resources (e.g., memory) used
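
For context, a sketch of the scheduling knob this docstring describes; the estimator and input functions are hypothetical placeholders built elsewhere:

```python
from tensorflow.contrib.learn import Experiment

experiment = Experiment(
    estimator=my_estimator,            # hypothetical
    train_input_fn=my_train_input_fn,  # hypothetical
    eval_input_fn=my_eval_input_fn,    # hypothetical
    train_steps=10000,
    # Evaluate at most once per new checkpoint, and no more often than
    # every 500 training steps; 0 would defer evaluation until the end.
    min_eval_frequency=500)
experiment.train_and_evaluate()
```
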
@ -63,7 +63,7 @@ def linear_regression(x, y, init_mean=None, init_stddev=1.0):
    x: tensor or placeholder for input features.
    y: tensor or placeholder for labels.
    init_mean: the mean value to use for initialization.
    init_stddev: the standard devation to use for initialization.
    init_stddev: the standard deviation to use for initialization.

  Returns:
    Predictions and loss tensors.

@ -124,7 +124,7 @@ def logistic_regression(x,
        will check if graph contains tensor `class_weight:0`.
        If that is not provided either all ones are used.
    init_mean: the mean value to use for initialization.
    init_stddev: the standard devation to use for initialization.
    init_stddev: the standard deviation to use for initialization.

  Returns:
    Predictions and loss tensors.

@ -462,7 +462,7 @@ def random_tril_matrix(shape,
    remove_upper: Python `bool`.
      If `True`, zero out the strictly upper triangle.
      If `False`, the lower triangle of returned matrix will have desired
      properties, but will not not have the strictly upper triangle zero'd out.
      properties, but will not have the strictly upper triangle zero'd out.

  Returns:
    `Tensor` with desired shape and dtype.
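
What "zero out the strictly upper triangle" means, in numpy terms (an illustration, not this function's implementation):

```python
import numpy as np

m = np.arange(1.0, 10.0).reshape(3, 3)
# np.tril keeps the diagonal and below; strictly-upper entries become 0.
print(np.tril(m))
# [[1. 0. 0.]
#  [4. 5. 0.]
#  [7. 8. 9.]]
```
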
@ -208,7 +208,7 @@ def index_to_string_table_from_tensor(mapping, default_value="UNK", name=None):
  Sample Usages:

  ```python
  mapping_string = tf.constant(["emerson", "lake", "palmer")
  mapping_string = tf.constant(["emerson", "lake", "palmer"])
  indices = tf.constant([1, 5], tf.int64)
  table = tf.contrib.lookup.index_to_string_table_from_tensor(
      mapping_string, default_value="UNKNOWN")

@ -260,7 +260,7 @@ def index_to_string(tensor, mapping, default_value="UNK", name=None):
  For example:

  ```python
  mapping_string = tf.constant(["emerson", "lake", "palmer")
  mapping_string = tf.constant(["emerson", "lake", "palmer"])
  indices = tf.constant([1, 5], tf.int64)
  values = tf.contrib.lookup.index_to_string(
      indices, mapping=mapping_string, default_value="UNKNOWN")
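
Both corrected snippets stop short of actually running the lookup; a complete variant with the session and table initialization added (the printed output assumes the documented default behavior for out-of-range indices):

```python
import tensorflow as tf

mapping_string = tf.constant(["emerson", "lake", "palmer"])
indices = tf.constant([1, 5], tf.int64)
table = tf.contrib.lookup.index_to_string_table_from_tensor(
    mapping_string, default_value="UNKNOWN")
values = table.lookup(indices)

with tf.Session() as sess:
  sess.run(tf.tables_initializer())  # lookup tables need explicit init
  print(sess.run(values))  # [b'lake' b'UNKNOWN']  (index 5 is out of range)
```
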
@ -23,7 +23,7 @@
# Make sure we're in the correct directory, at the root of the source tree.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
WORKSPACE="${SCRIPT_DIR}/../../../"
cd ${WORKSPACE}
cd ${WORKSPACE} || exit 1

DOCKER_IMG_NAME="tf-make-base"
DOCKER_CONTEXT_PATH="${WORKSPACE}tensorflow/contrib/makefile/"
@ -27,7 +27,7 @@ cc_prefix="${CC_PREFIX}"
usage() {
  echo "Usage: $(basename "$0") [-a:c]"
  echo "-a [Architecture] Architecture of target android [default=armeabi-v7a] \
(supported archtecture list: \
(supported architecture list: \
arm64-v8a armeabi armeabi-v7a armeabi-v7a-hard mips mips64 x86 x86_64)"
  echo "-c Clean before building protobuf for target"
  echo "\"NDK_ROOT\" should be defined as an environment variable."

@ -130,7 +130,7 @@ elif [[ ${ARCHITECTURE} == "x86_64" ]]; then
  sysroot_arch="x86_64"
  bin_prefix="x86_64-linux-android"
else
  echo "archtecture ${arcitecture} is not supported." 1>&2
  echo "architecture ${ARCHITECTURE} is not supported." 1>&2
  usage
  exit 1
fi
@ -1,4 +1,4 @@
#!/bin/bash -x -e
#!/bin/bash
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");

@ -15,6 +15,9 @@
# ==============================================================================
# Builds protobuf 3 for iOS.

set -x
set -e

SCRIPT_DIR=$(dirname $0)
source "${SCRIPT_DIR}/build_helper.subr"

@ -30,17 +33,17 @@ fi

JOB_COUNT="${JOB_COUNT:-$(get_job_count)}"

GENDIR=`pwd`/gen/protobuf_ios/
GENDIR=$(pwd)/gen/protobuf_ios/
LIBDIR=${GENDIR}lib
mkdir -p ${LIBDIR}

OSX_VERSION=darwin14.0.0

IPHONEOS_PLATFORM=`xcrun --sdk iphoneos --show-sdk-platform-path`
IPHONEOS_SYSROOT=`xcrun --sdk iphoneos --show-sdk-path`
IPHONESIMULATOR_PLATFORM=`xcrun --sdk iphonesimulator --show-sdk-platform-path`
IPHONESIMULATOR_SYSROOT=`xcrun --sdk iphonesimulator --show-sdk-path`
IOS_SDK_VERSION=`xcrun --sdk iphoneos --show-sdk-version`
IPHONEOS_PLATFORM=$(xcrun --sdk iphoneos --show-sdk-platform-path)
IPHONEOS_SYSROOT=$(xcrun --sdk iphoneos --show-sdk-path)
IPHONESIMULATOR_PLATFORM=$(xcrun --sdk iphonesimulator --show-sdk-platform-path)
IPHONESIMULATOR_SYSROOT=$(xcrun --sdk iphonesimulator --show-sdk-path)
IOS_SDK_VERSION=$(xcrun --sdk iphoneos --show-sdk-version)
MIN_SDK_VERSION=8.0

CFLAGS="-DNDEBUG -Os -pipe -fPIC -fno-exceptions"
Some files were not shown because too many files have changed in this diff.