Merge changes from github.
Change: 137532946

parent f80ef2d696
commit e2d51a87f0

@@ -33,10 +33,10 @@ and discussion.**

People who are a little more adventurous can also try our nightly binaries:

* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac1-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac1-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac1-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac1-slave/))
* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
* [Android](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/lastSuccessfulBuild/artifact/bazel-out/local_linux/bin/tensorflow/examples/android/tensorflow_demo.apk) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/))

#### *Try your first TensorFlow program*

@@ -15,6 +15,7 @@ cmake_policy(SET CMP0022 NEW)

# Options
option(tensorflow_VERBOSE "Enable for verbose output" OFF)
option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
option(tensorflow_ENABLE_SSL_SUPPORT "Enable boringssl support" OFF)
option(tensorflow_ENABLE_GRPC_SUPPORT "Enable gRPC support" ON)
option(tensorflow_BUILD_CC_EXAMPLE "Build the C++ tutorial example" ON)

@@ -48,8 +49,13 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

add_definitions(-DEIGEN_AVOID_STL_ARRAY)
if(WIN32)
  add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC -D__VERSION__=\"MSVC\")
  add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
  add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH -D_ITERATOR_DEBUG_LEVEL=0)
  add_definitions(/bigobj /nologo /EHsc /GF /FC /MP /Gm-)
  # Suppress warnings to reduce build log size.
  add_definitions(/wd4267 /wd4244 /wd4800 /wd4503 /wd4554 /wd4996 /wd4348 /wd4018)
  add_definitions(/wd4099 /wd4146 /wd4267 /wd4305 /wd4307)
  add_definitions(/wd4715 /wd4722 /wd4723 /wd4838 /wd4309 /wd4334)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
endif()

@@ -80,7 +86,16 @@ set(tensorflow_EXTERNAL_LIBRARIES
    ${protobuf_STATIC_LIBRARIES}
)
set(tensorflow_EXTERNAL_DEPENDENCIES
    gif_copy_headers_to_destination png_copy_headers_to_destination jpeg_copy_headers_to_destination jsoncpp farmhash_copy_headers_to_destination highwayhash_copy_headers_to_destination protobuf eigen)
    zlib_copy_headers_to_destination
    gif_copy_headers_to_destination
    png_copy_headers_to_destination
    jpeg_copy_headers_to_destination
    jsoncpp
    farmhash_copy_headers_to_destination
    highwayhash_copy_headers_to_destination
    protobuf
    eigen
)

include_directories(
    # Source and generated code.

@@ -118,19 +133,67 @@ if(UNIX)
  list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS})
endif()

if (tensorflow_ENABLE_GPU)
  if (WIN32)
    find_package(CUDA 8.0 REQUIRED)

    # By default we assume compute capability 3.5 and 5.2. If you change this,
    # change it in CUDA_NVCC_FLAGS and cuda_config.h below.
    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\")
    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr)
    set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include)
    include_directories(${CUDA_INCLUDE})
    add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.5,5.2)

    # Add cuDNN.
    include_directories(${CUDNN_HOME})
    set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDNN_HOME}/lib/x64/cudnn.lib)

    # Create cuda_config.h.
    FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
      "#ifndef CUDA_CUDA_CONFIG_H_\n"
      "#define CUDA_CUDA_CONFIG_H_\n"
      "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
      "#define TF_CUDA_VERSION \"64_80\"\n"
      "#define TF_CUDNN_VERSION \"64_5\"\n"
      "#endif // CUDA_CUDA_CONFIG_H_\n"
    )

    # TensorFlow assumes in various places that header files live in cuda/include.
    # On Windows the CUDA SDK installs them under cuda/version/include, so to avoid
    # changing TensorFlow we copy a few files to cuda/include.
    FILE(COPY
      ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda.h ${CUDA_TOOLKIT_TARGET_DIR}/include/cuComplex.h
      ${CUDA_TOOLKIT_TARGET_DIR}/include/cublas_v2.h ${CUDNN_HOME}/include/cudnn.h
      ${CUDA_TOOLKIT_TARGET_DIR}/include/cufft.h ${CUDA_TOOLKIT_TARGET_DIR}/include/curand.h
      DESTINATION ${tensorflow_source_dir}/third_party/gpus/cuda/include
    )
    include_directories(${tensorflow_source_dir}/third_party/gpus)
    # Add the CUDA libraries to tensorflow_EXTERNAL_LIBRARIES.
    list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
  endif()
endif()

# Let's get to work!
include(tf_core_framework.cmake)
include(tf_tools.cmake)
# NOTE: Disabled until issue #3996 is fixed.
# include(tf_stream_executor.cmake)
if (tensorflow_ENABLE_GPU)
  if (WIN32)
    include(tf_stream_executor.cmake)
  endif()
endif()

include(tf_core_cpu.cmake)
include(tf_models.cmake)
include(tf_core_ops.cmake)
include(tf_core_direct_session.cmake)
include(tf_core_kernels.cmake)
if(tensorflow_ENABLE_GRPC_SUPPORT)
  include(tf_core_distributed_runtime.cmake)
endif()
include(tf_core_kernels.cmake)

include(tf_cc_ops.cmake)
if(tensorflow_BUILD_CC_EXAMPLE)
  include(tf_tutorials.cmake)

@@ -15,14 +15,13 @@ Current Status

The CMake files in this directory can build the core TensorFlow runtime, an
example C++ binary, and a PIP package containing the runtime and Python
bindings. Currently, only CPU builds are supported, but we are working on
providing a GPU build as well.
bindings.

Note: Windows support is in an **alpha** state, and we welcome your feedback.

### Pre-requisites

* CMake version 3.1 or later
* CMake version 3.1 up to 3.6

* [Git](http://git-scm.com)

@@ -45,21 +44,13 @@ Note: Windows support is in an **alpha** state, and we welcome your feedback.
  - [Anaconda 4.1.1 (Python 3.5 64-bit)](https://www.continuum.io/downloads)
  - [Git for Windows version 2.9.2.windows.1](https://git-scm.com/download/win)
  - [swigwin-3.0.10](http://www.swig.org/download.html)

  - [NVidia CUDA Toolkit 8.0](https://developer.nvidia.com/cuda-downloads)
  - [NVidia CUDNN 5.1](https://developer.nvidia.com/cudnn)
* Ubuntu 14.04
  - Makefile generator
  - Docker 1.9.1 (for automated testing)

### Current known limitations

* CPU support only

  - We are in the process of porting the GPU code in
    `tensorflow/stream_executor` to build with CMake and work on non-POSIX
    platforms.

* Additional limitations for the Windows build:

  - The Python package supports **Python 3.5 only**, because that is the only
    version for which standard Python binaries exist and those binaries are
    compatible with the TensorFlow runtime. (On Windows, the standard Python

@@ -114,6 +105,17 @@ Step-by-step Windows build

   D:\temp> "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\amd64\vcvarsall.bat"
   ```

* When building with GPU support, after installing the CUDNN zip file from NVidia,
  append its bin directory to your PATH environment variable.
  If TensorFlow fails to find the CUDA DLLs during initialization, check your PATH
  environment variable: it should contain the directory of the CUDA DLLs and the
  directory of the CUDNN DLL.
  For example:

  ```
  D:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\bin
  D:\local\cuda\bin
  ```

* We assume that `cmake` and `git` are installed and in your `%PATH%`. If
  for example `cmake` is not in your path and it is installed in
  `C:\Program Files (x86)\CMake\bin\cmake.exe`, you can add this directory

@@ -145,9 +147,14 @@ Step-by-step Windows build

   D:\...\build> cmake .. -A x64 -DCMAKE_BUILD_TYPE=Release ^
   More? -DSWIG_EXECUTABLE=C:/tools/swigwin-3.0.10/swig.exe ^
   More? -DPYTHON_EXECUTABLE=C:/Users/%USERNAME%/AppData/Local/Continuum/Anaconda3/python.exe ^
   More? -DPYTHON_LIBRARIES=C:/Users/%USERNAME%/AppData/Local/Continuum/Anaconda3/libs/python35.lib
   More? -DPYTHON_LIBRARIES=C:/Users/%USERNAME%/AppData/Local/Continuum/Anaconda3/libs/python35.lib
   ```

   To build with GPU support, add "^" at the end of the last line above, followed by:
   ```
   More? -Dtensorflow_ENABLE_GPU=ON ^
   More? -DCUDNN_HOME="D:\...\cudnn"
   ```

   Note that the `-DCMAKE_BUILD_TYPE=Release` flag must match the build
   configuration that you choose when invoking `msbuild`. The known-good
   values are `Release` and `RelWithDebInfo`. The `Debug` build type is

@@ -184,6 +191,11 @@ Step-by-step Windows build
     SSL support (for making secure HTTP requests) in the TensorFlow runtime.
     This support is incomplete, and will be used for Google Cloud Storage
     support.

   * `-Dtensorflow_ENABLE_GPU=(ON|OFF)`. Defaults to `OFF`. Include
     GPU support. If GPU is enabled, you need to install the CUDA 8.0 Toolkit and
     CUDNN 5.1. CMake will expect the location of CUDNN in
     `-DCUDNN_HOME=path_you_unzipped_cudnn`.

4. Invoke MSBuild to build TensorFlow.

@@ -202,7 +214,6 @@ Step-by-step Windows build

   D:\...\build> MSBuild /p:Configuration=Release tf_python_build_pip_package.vcxproj
   ```

Linux Continuous Integration build
==================================

@@ -26,7 +26,7 @@ from setuptools import find_packages, setup, Command
from setuptools.command.install import install as InstallCommandBase
from setuptools.dist import Distribution

_VERSION = '0.11.0rc0-cmake-experimental'
_VERSION = '0.11.0rc1-cmake-experimental'

REQUIRED_PACKAGES = [
    'numpy >= 1.11.0',

@@ -21,13 +21,27 @@ file(GLOB_RECURSE tf_core_cpu_exclude_srcs
    "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_factory.cc"
    "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_options.cc"
)

list(REMOVE_ITEM tf_core_cpu_srcs ${tf_core_cpu_exclude_srcs})

# We need to include stubs for the GPU tracer, which are in the exclude glob.
list(APPEND tf_core_cpu_srcs
    "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_tracer.cc"
    "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_tracer.h"
)

if (tensorflow_ENABLE_GPU)
  file(GLOB_RECURSE tf_core_gpu_srcs
      "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/*.cc"
      "${tensorflow_source_dir}/tensorflow/core/platform/default/gpu/cupti_wrapper.cc"
      "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu_device_factory.cc"
  )
  file(GLOB_RECURSE tf_core_gpu_exclude_srcs
      "${tensorflow_source_dir}/tensorflow/core/*test*.cc"
      "${tensorflow_source_dir}/tensorflow/core/*test*.cc"
  )
  list(REMOVE_ITEM tf_core_gpu_srcs ${tf_core_gpu_exclude_srcs})
  list(APPEND tf_core_cpu_srcs ${tf_core_gpu_srcs})
endif()

add_library(tf_core_cpu OBJECT ${tf_core_cpu_srcs})
add_dependencies(tf_core_cpu tf_core_framework)

@@ -38,9 +38,11 @@ add_executable(grpc_tensorflow_server
    $<TARGET_OBJECTS:tf_core_ops>
    $<TARGET_OBJECTS:tf_core_direct_session>
    $<TARGET_OBJECTS:tf_core_distributed_runtime>
    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)

target_link_libraries(grpc_tensorflow_server PUBLIC
    tf_protos_cc
    ${tf_core_gpu_kernels_lib}
    ${tensorflow_EXTERNAL_LIBRARIES}
)

@@ -38,6 +38,7 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
    "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc"
    "${tensorflow_source_dir}/tensorflow/contrib/metrics/kernels/set_kernels.cc"
    "${tensorflow_source_dir}/tensorflow/contrib/metrics/ops/set_ops.cc"
    "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc"
    "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.cc"
    "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.cc"
    "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc"

@@ -83,7 +84,7 @@ list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_exclude_srcs})

if(WIN32)
  file(GLOB_RECURSE tf_core_kernels_windows_exclude_srcs
      # Not currently working on Windows:
      # not working on windows yet
      "${tensorflow_source_dir}/tensorflow/core/kernels/depthwise_conv_op.cc" # Cannot find symbol: tensorflow::LaunchConv2DOp<struct Eigen::ThreadPoolDevice, double>::launch(...).
      "${tensorflow_source_dir}/tensorflow/core/kernels/fact_op.cc"
      "${tensorflow_source_dir}/tensorflow/core/kernels/immutable_constant_op.cc"

@@ -93,14 +94,38 @@ if(WIN32)
      "${tensorflow_source_dir}/tensorflow/core/kernels/sparse_matmul_op.h"
      "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h"
      "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc"
      "${tensorflow_source_dir}/tensorflow/core/kernels/svd*.cc"
      "${tensorflow_source_dir}/tensorflow/core/kernels/avgpooling_op.*"
  )
  list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_windows_exclude_srcs})
endif(WIN32)

file(GLOB_RECURSE tf_core_gpu_kernels_srcs
    "${tensorflow_source_dir}/tensorflow/core/kernels/*.cu.cc"
    "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/*.cu.cc"
)

if(WIN32)
  file(GLOB_RECURSE tf_core_gpu_kernels_exclude_srcs
      # not working on windows yet
      "${tensorflow_source_dir}/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc"
  )
  list(REMOVE_ITEM tf_core_gpu_kernels_srcs ${tf_core_gpu_kernels_exclude_srcs})
endif(WIN32)

add_library(tf_core_kernels OBJECT ${tf_core_kernels_srcs})
add_dependencies(tf_core_kernels tf_core_cpu)

if(WIN32)
  target_compile_options(tf_core_kernels PRIVATE /MP)
  if (tensorflow_ENABLE_GPU)
    set_source_files_properties(${tf_core_gpu_kernels_srcs} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
    set(tf_core_gpu_kernels_lib tf_core_gpu_kernels)
    cuda_add_library(${tf_core_gpu_kernels_lib} ${tf_core_gpu_kernels_srcs})
    set_target_properties(${tf_core_gpu_kernels_lib}
        PROPERTIES DEBUG_POSTFIX ""
        COMPILE_FLAGS "${TF_REGULAR_CXX_FLAGS}"
    )
    add_dependencies(${tf_core_gpu_kernels_lib} tf_core_cpu)
  endif()
endif()

add_dependencies(tf_core_kernels tf_core_cpu)

@@ -302,12 +302,14 @@ add_library(pywrap_tensorflow SHARED
    $<TARGET_OBJECTS:tf_core_direct_session>
    $<$<BOOL:${tensorflow_ENABLE_GRPC_SUPPORT}>:$<TARGET_OBJECTS:tf_core_distributed_runtime>>
    $<TARGET_OBJECTS:tf_core_kernels>
    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
target_include_directories(pywrap_tensorflow PUBLIC
    ${PYTHON_INCLUDE_DIR}
    ${NUMPY_INCLUDE_DIR}
)
target_link_libraries(pywrap_tensorflow
    ${tf_core_gpu_kernels_lib}
    ${tensorflow_EXTERNAL_LIBRARIES}
    tf_protos_cc
    ${PYTHON_LIBRARIES}

@@ -47,11 +47,17 @@ file(GLOB tf_stream_executor_srcs
    "${tensorflow_source_dir}/tensorflow/stream_executor/platform/default/*.h"
)

if (tensorflow_ENABLE_GPU)
  file(GLOB tf_stream_executor_gpu_srcs
      "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*.cc"
  )
  list(APPEND tf_stream_executor_srcs ${tf_stream_executor_gpu_srcs})
endif()

#file(GLOB_RECURSE tf_stream_executor_test_srcs
#    "${tensorflow_source_dir}/tensorflow/stream_executor/*_test.cc"
#    "${tensorflow_source_dir}/tensorflow/stream_executor/*_test.h"
#)
#
#list(REMOVE_ITEM tf_stream_executor_srcs ${tf_stream_executor_test_srcs})

add_library(tf_stream_executor OBJECT ${tf_stream_executor_srcs})

@@ -12,9 +12,11 @@ add_executable(tf_tutorials_example_trainer
    $<TARGET_OBJECTS:tf_cc_ops>
    $<TARGET_OBJECTS:tf_core_ops>
    $<TARGET_OBJECTS:tf_core_direct_session>
    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)

target_link_libraries(tf_tutorials_example_trainer PUBLIC
    tf_protos_cc
    ${tf_core_gpu_kernels_lib}
    ${tensorflow_EXTERNAL_LIBRARIES}
)

@@ -942,6 +942,7 @@ def convolution2d_transpose(
    kernel_size,
    stride=1,
    padding='SAME',
    data_format=DATA_FORMAT_NHWC,
    activation_fn=nn.relu,
    normalizer_fn=None,
    normalizer_params=None,

@@ -961,7 +962,9 @@ def convolution2d_transpose(
  second variable called 'biases' is added to the result of the operation.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    inputs: A 4-D `Tensor` of type `float` and shape
      `[batch, height, width, in_channels]` for `NHWC` data format or
      `[batch, in_channels, height, width]` for `NCHW` data format.
    num_outputs: integer, the number of output filters.
    kernel_size: a list of length 2 holding the [kernel_height, kernel_width]
      of the filters. Can be an int if both values are the same.

@@ -969,6 +972,7 @@ def convolution2d_transpose(
      Can be an int if both strides are the same. Note that presently
      both strides must have the same value.
    padding: one of 'VALID' or 'SAME'.
    data_format: A string. `NHWC` (default) and `NCHW` are supported.
    activation_fn: activation function, set to None to skip it and maintain
      a linear activation.
    normalizer_fn: normalization function to use instead of `biases`. If

@@ -993,14 +997,23 @@ def convolution2d_transpose(

  Raises:
    ValueError: if 'kernel_size' is not a list of length 2.
    ValueError: if `data_format` is neither `NHWC` nor `NCHW`.
    ValueError: if `C` dimension of `inputs` is None.
  """
  with variable_scope.variable_scope(
      scope, 'Conv2d_transpose', [inputs], reuse=reuse) as sc:
    if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
      raise ValueError('data_format has to be either NCHW or NHWC.')
    dtype = inputs.dtype.base_dtype
    kernel_h, kernel_w = utils.two_element_tuple(kernel_size)
    stride_h, stride_w = utils.two_element_tuple(stride)
    num_filters_in = utils.last_dimension(
        inputs.get_shape(), min_rank=4)
    if data_format == DATA_FORMAT_NCHW:
      c_axis, h_axis, w_axis = 1, 2, 3
    else:
      h_axis, w_axis, c_axis = 1, 2, 3
    num_filters_in = inputs.get_shape()[c_axis].value
    if num_filters_in is None:
      raise ValueError('`C` dimension of `inputs` must be known but is None.')
    weights_shape = [kernel_h, kernel_w, num_outputs, num_filters_in]
    weights_collections = utils.get_variable_collections(
        variables_collections, 'weights')

@@ -1015,7 +1028,7 @@ def convolution2d_transpose(

    inputs_shape = array_ops.shape(inputs)
    batch_size = inputs_shape[0]
    height, width = inputs_shape[1], inputs_shape[2]
    height, width = inputs_shape[h_axis], inputs_shape[w_axis]

    def get_deconv_dim(dim_size, stride_size, kernel_size, padding):
      if isinstance(dim_size, ops.Tensor):

@@ -1031,17 +1044,25 @@ def convolution2d_transpose(
    out_height = get_deconv_dim(height, stride_h, kernel_h, padding)
    out_width = get_deconv_dim(width, stride_w, kernel_w, padding)

    output_shape = array_ops.pack(
        [batch_size, out_height, out_width, num_outputs])
    if data_format == DATA_FORMAT_NHWC:
      output_shape = [batch_size, out_height, out_width, num_outputs]
      strides = [1, stride_h, stride_w, 1]
    else:
      output_shape = [batch_size, num_outputs, out_height, out_width]
      strides = [1, 1, stride_h, stride_w]

    output_shape = array_ops.pack(output_shape)
    outputs = nn.conv2d_transpose(inputs, weights, output_shape,
                                  [1, stride_h, stride_w, 1],
                                  padding=padding)
                                  strides,
                                  padding=padding,
                                  data_format=data_format)

    # Infer the static output shape:
    out_shape = inputs.get_shape().as_list()
    out_shape[-1] = num_outputs
    out_shape[1] = get_deconv_dim(out_shape[1], stride_h, kernel_h, padding)
    out_shape[2] = get_deconv_dim(out_shape[2], stride_w, kernel_w, padding)
    out_shape[c_axis] = num_outputs
    out_shape[h_axis] = get_deconv_dim(out_shape[h_axis], stride_h, kernel_h, padding)
    out_shape[w_axis] = get_deconv_dim(out_shape[w_axis], stride_w, kernel_w, padding)
    outputs.set_shape(out_shape)

    if normalizer_fn is not None:

@@ -1057,7 +1078,7 @@ def convolution2d_transpose(
          initializer=biases_initializer,
          regularizer=biases_regularizer,
          collections=biases_collections)
      outputs = nn.bias_add(outputs, biases)
      outputs = nn.bias_add(outputs, biases, data_format=data_format)

    if activation_fn is not None:
      outputs = activation_fn(outputs)
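
A minimal sketch (not part of the diff) of the output-size rule that `get_deconv_dim` applies and that the NCHW tests below expect; `deconv_output_dim` is a hypothetical stand-in, not a TensorFlow API:

```python
def deconv_output_dim(dim_size, stride, kernel, padding):
  # Transposed convolution inverts the forward conv's size reduction:
  # SAME grows by the stride alone; VALID also adds the kernel overhang.
  dim_size *= stride
  if padding == 'VALID':
    dim_size += max(kernel - stride, 0)
  return dim_size

# Matches the test expectations below, e.g. width 2, kernel 4, stride 5,
# VALID padding -> 2 * 5 + max(4 - 5, 0) = 10.
assert deconv_output_dim(2, 5, 4, 'VALID') == 10
assert deconv_output_dim(10, 1, 3, 'SAME') == 10
```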

@@ -588,6 +588,175 @@ class ConvolutionTest(tf.test.TestCase):

class Convolution2dTransposeTests(tf.test.TestCase):

  def testInvalidDataFormat(self):
    height, width = 7, 9
    with self.test_session():
      images = tf.random_uniform((5, height, width, 3), seed=1)
      with self.assertRaisesRegexp(
          ValueError, 'data_format has to be either NCHW or NHWC.'):
        tf.contrib.layers.convolution2d_transpose(
            images, 32, 3, data_format='CHWN')

  def testOutputSizeWithStrideOneSamePaddingNCHW(self):
    # `NCHW` data format is only supported on `GPU` devices.
    if tf.test.is_gpu_available():
      with self.test_session(use_gpu=True) as sess:
        num_filters = 32
        input_size = [5, 3, 10, 12]
        expected_size = [5, num_filters, 10, 12]

        images = tf.random_uniform(input_size, seed=1)
        output = tf.contrib.layers.conv2d_transpose(
            images, num_filters, [3, 3], stride=1,
            padding='SAME', data_format='NCHW')
        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')

        sess.run(tf.initialize_all_variables())
        self.assertListEqual(list(output.eval().shape), expected_size)

  def testOutputSizeWithStrideOneValidPaddingNCHW(self):
    if tf.test.is_gpu_available():
      with self.test_session(use_gpu=True) as sess:
        num_filters = 32
        input_size = [5, 3, 10, 12]
        expected_size = [5, num_filters, 12, 14]

        images = tf.random_uniform(input_size, seed=1)
        output = tf.contrib.layers.conv2d_transpose(
            images, num_filters, [3, 3], stride=1,
            padding='VALID', data_format='NCHW')
        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')

        sess.run(tf.initialize_all_variables())
        self.assertListEqual(list(output.eval().shape), expected_size)

  def testOutputSizeWithStrideTwoValidPaddingNCHW(self):
    if tf.test.is_gpu_available():
      with self.test_session(use_gpu=True) as sess:
        num_filters = 32
        input_size = [5, 3, 9, 11]
        expected_size = [5, num_filters, 19, 23]

        images = tf.random_uniform(input_size, seed=1)
        output = tf.contrib.layers.conv2d_transpose(
            images, num_filters, [3, 3], stride=[2, 2],
            padding='VALID', data_format='NCHW')
        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
        self.assertListEqual(list(output.get_shape().as_list()), expected_size)

        sess.run(tf.initialize_all_variables())
        self.assertListEqual(list(output.eval().shape), expected_size)

  def testOutputSizeWith1x1StrideTwoSamePaddingNCHW(self):
    if tf.test.is_gpu_available():
      with self.test_session(use_gpu=True) as sess:
        num_filters = 1
        input_size = [1, 1, 1, 1]
        expected_size = [1, num_filters, 2, 2]

        images = tf.random_uniform(input_size, seed=1)
        output = tf.contrib.layers.conv2d_transpose(
            images, num_filters, [2, 2], stride=[2, 2],
            padding='SAME', data_format='NCHW')
        self.assertListEqual(list(output.get_shape().as_list()), expected_size)

        sess.run(tf.initialize_all_variables())
        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
        self.assertListEqual(list(output.eval().shape), expected_size)

  def testOutputSizeWith1x1StrideTwoValidPaddingNCHW(self):
    if tf.test.is_gpu_available():
      with self.test_session(use_gpu=True) as sess:
        num_filters = 1
        input_size = [1, 1, 1, 1]
        expected_size = [1, num_filters, 2, 2]

        images = tf.random_uniform(input_size, seed=1)
        output = tf.contrib.layers.conv2d_transpose(
            images, num_filters, [2, 2], stride=[2, 2],
            padding='VALID', data_format='NCHW')
        sess.run(tf.initialize_all_variables())
        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
        self.assertListEqual(list(output.eval().shape), expected_size)

  def testOutputSizeWith2x2StrideTwoSamePaddingNCHW(self):
    if tf.test.is_gpu_available():
      with self.test_session(use_gpu=True) as sess:
        num_filters = 1
        input_size = [1, 1, 2, 2]
        expected_size = [1, num_filters, 4, 4]

        images = tf.random_uniform(input_size, seed=1)
        output = tf.contrib.layers.conv2d_transpose(
            images, num_filters, [2, 2], stride=[2, 2],
            padding='SAME', data_format='NCHW')
        sess.run(tf.initialize_all_variables())
        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
        self.assertListEqual(list(output.eval().shape), expected_size)

  def testOutputSizeWith2x2StrideTwoValidPaddingNCHW(self):
    if tf.test.is_gpu_available():
      with self.test_session(use_gpu=True) as sess:
        num_filters = 1
        input_size = [1, 1, 2, 2]
        expected_size = [1, num_filters, 4, 4]

        images = tf.random_uniform(input_size, seed=1)
        output = tf.contrib.layers.conv2d_transpose(
            images, num_filters, [2, 2], stride=[2, 2],
            padding='VALID', data_format='NCHW')
        sess.run(tf.initialize_all_variables())
        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
        self.assertListEqual(list(output.eval().shape), expected_size)

  def testOutputSizeWithStride2x1NCHW(self):
    if tf.test.is_gpu_available():
      with self.test_session(use_gpu=True) as sess:
        num_filters = 1
        input_size = [1, 1, 3, 2]
        expected_size = [1, num_filters, 6, 5]

        images = tf.random_uniform(input_size, seed=1)
        output = tf.contrib.layers.conv2d_transpose(
            images, num_filters, [2, 4], stride=[2, 1],
            padding='VALID', data_format='NCHW')
        sess.run(tf.initialize_all_variables())
        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
        self.assertListEqual(list(output.eval().shape), expected_size)

  def testOutputSizeWithStride2x4NCHW(self):
    if tf.test.is_gpu_available():
      with self.test_session(use_gpu=True) as sess:
        num_filters = 1
        input_size = [1, 1, 3, 2]
        expected_size = [1, num_filters, 6, 8]

        images = tf.random_uniform(input_size, seed=1)
        output = tf.contrib.layers.conv2d_transpose(
            images, num_filters, [2, 4], stride=[2, 4],
            padding='VALID', data_format='NCHW')
        sess.run(tf.initialize_all_variables())
        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
        self.assertListEqual(list(output.eval().shape), expected_size)

  def testOutputSizeWithStride2x5NCHW(self):
    if tf.test.is_gpu_available():
      with self.test_session(use_gpu=True) as sess:
        num_filters = 1
        input_size = [1, 1, 3, 2]
        expected_size = [1, num_filters, 6, 10]

        images = tf.random_uniform(input_size, seed=1)
        output = tf.contrib.layers.conv2d_transpose(
            images, num_filters, [2, 4], stride=[2, 5],
            padding='VALID', data_format='NCHW')
        sess.run(tf.initialize_all_variables())
        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
        self.assertListEqual(list(output.eval().shape), expected_size)

  def testOutputSizeWithStrideOneSamePadding(self):
    num_filters = 32
    input_size = [5, 10, 12, 3]

@@ -244,7 +244,7 @@ class GraphIOTest(tf.test.TestCase):
      session.run(tf.initialize_local_variables())

      coord = tf.train.Coordinator()
      tf.train.start_queue_runners(session, coord=coord)
      threads = tf.train.start_queue_runners(session, coord=coord)

      self.assertAllEqual(session.run(inputs), [b"ABC"])
      self.assertAllEqual(session.run(inputs), [b"DEF"])

@@ -253,6 +253,7 @@ class GraphIOTest(tf.test.TestCase):
        session.run(inputs)

      coord.request_stop()
      coord.join(threads)

  def test_read_keyed_batch_features_mutual_exclusive_args(self):
    filename = self._create_temp_file("abcde")

@@ -307,6 +308,7 @@ class GraphIOTest(tf.test.TestCase):
      coord.request_stop()

      coord.join(threads)

    parsed_records = [item for sublist in [d["sequence"] for d in data]
                      for item in sublist]
    # Check that the number of records matches expected and all records

@@ -331,7 +333,7 @@ class GraphIOTest(tf.test.TestCase):
      session.run(tf.initialize_local_variables())

      coord = tf.train.Coordinator()
      tf.train.start_queue_runners(session, coord=coord)
      threads = tf.train.start_queue_runners(session, coord=coord)

      self.assertEqual("%s:1" % name, inputs.name)
      file_name_queue_name = "%s/file_name_queue" % name

@@ -352,6 +354,7 @@ class GraphIOTest(tf.test.TestCase):
        session.run(inputs)

      coord.request_stop()
      coord.join(threads)

  def test_read_text_lines_multifile_with_shared_queue(self):
    gfile.Glob = self._orig_glob

@@ -375,7 +378,7 @@ class GraphIOTest(tf.test.TestCase):
      session.run(tf.initialize_local_variables())

      coord = tf.train.Coordinator()
      tf.train.start_queue_runners(session, coord=coord)
      threads = tf.train.start_queue_runners(session, coord=coord)

      self.assertEqual("%s:1" % name, inputs.name)
      shared_file_name_queue_name = "%s/file_name_queue" % name

@@ -398,6 +401,7 @@ class GraphIOTest(tf.test.TestCase):
        session.run(inputs)

      coord.request_stop()
      coord.join(threads)

  def _get_qr(self, name):
    for qr in ops.get_collection(ops.GraphKeys.QUEUE_RUNNERS):

@@ -490,7 +494,7 @@ class GraphIOTest(tf.test.TestCase):
      session.run(tf.initialize_local_variables())

      coord = tf.train.Coordinator()
      tf.train.start_queue_runners(session, coord=coord)
      threads = tf.train.start_queue_runners(session, coord=coord)

      self.assertAllEqual(session.run(inputs), [b"A", b"B", b"C"])
      self.assertAllEqual(session.run(inputs), [b"D", b"E"])

@@ -498,6 +502,7 @@ class GraphIOTest(tf.test.TestCase):
        session.run(inputs)

      coord.request_stop()
      coord.join(threads)

  def test_keyed_read_text_lines(self):
    gfile.Glob = self._orig_glob

@@ -517,7 +522,7 @@ class GraphIOTest(tf.test.TestCase):
      session.run(tf.initialize_local_variables())

      coord = tf.train.Coordinator()
      tf.train.start_queue_runners(session, coord=coord)
      threads = tf.train.start_queue_runners(session, coord=coord)

      self.assertAllEqual(session.run([keys, inputs]),
                          [[filename.encode("utf-8") + b":1"], [b"ABC"]])

@@ -529,6 +534,7 @@ class GraphIOTest(tf.test.TestCase):
        session.run(inputs)

      coord.request_stop()
      coord.join(threads)

  def test_keyed_parse_json(self):
    gfile.Glob = self._orig_glob

@@ -557,7 +563,7 @@ class GraphIOTest(tf.test.TestCase):
      session.run(tf.initialize_local_variables())

      coord = tf.train.Coordinator()
      tf.train.start_queue_runners(session, coord=coord)
      threads = tf.train.start_queue_runners(session, coord=coord)

      key, age = session.run([keys, inputs["age"]])
      self.assertAllEqual(age, [[0]])

@@ -572,6 +578,7 @@ class GraphIOTest(tf.test.TestCase):
        session.run(inputs)

      coord.request_stop()
      coord.join(threads)


if __name__ == "__main__":
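
The recurring change in the graph_io test hunks above follows a single pattern: capture the thread list returned by `tf.train.start_queue_runners` so the coordinator can actually wait for the queue threads to exit. A minimal sketch of the fixed pattern, using the TF 0.11-era API these tests target:

```python
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(session, coord=coord)  # keep the handles

# ... run the graph ...

coord.request_stop()
coord.join(threads)  # without the captured `threads`, join has nothing to wait on
```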

@@ -21,6 +21,7 @@ from __future__ import print_function

import os
import random
import six
import tempfile

import numpy as np

@@ -63,8 +64,8 @@ class ExportTest(tf.test.TestCase):
    # Only the written checkpoints are exported.
    self.assertTrue(tf.gfile.Exists(export_dir + '00000001/export'))
    self.assertTrue(tf.gfile.Exists(export_dir + '00000010/export'))
    self.assertEquals(export_monitor.last_export_dir, os.path.join(export_dir,
                                                                   '00000010'))
    self.assertEquals(export_monitor.last_export_dir,
                      six.b(os.path.join(export_dir, '00000010')))
    # Validate the signature
    signature = self._get_default_signature(export_dir + '00000010/export.meta')
    self.assertTrue(signature.HasField('regression_signature'))

@@ -86,8 +87,8 @@ class ExportTest(tf.test.TestCase):
    # Only the written checkpoints are exported.
    self.assertTrue(tf.gfile.Exists(export_dir + '00000001/export'))
    self.assertTrue(tf.gfile.Exists(export_dir + '00000010/export'))
    self.assertEquals(export_monitor.last_export_dir, os.path.join(export_dir,
                                                                   '00000010'))
    self.assertEquals(export_monitor.last_export_dir,
                      six.b(os.path.join(export_dir, '00000010')))
    # Validate the signature
    signature = self._get_default_signature(export_dir + '00000010/export.meta')
    self.assertTrue(signature.HasField('generic_signature'))
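
The `six.b(...)` wrapping above is needed because the export monitor now reports its last export directory as bytes, so the expected value must be encoded the same way. A small illustration (the path is a made-up example):

```python
import os
import six

expected = six.b(os.path.join('/tmp/export', '00000010'))
# On Python 3, six.b encodes with latin-1; on Python 2 it is a no-op on str.
assert expected == '/tmp/export/00000010'.encode('latin-1')
```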

@@ -351,6 +351,10 @@ class BFCAllocator : public VisitableAllocator {
inline int Log2FloorNonZero(uint64 n) {
#if defined(__GNUC__)
  return 63 ^ __builtin_clzll(n);
#elif defined(PLATFORM_WINDOWS)
  unsigned long index;
  _BitScanReverse64(&index, n);
  return index;
#else
  int r = 0;
  while (n > 0) {
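
Both intrinsic branches above compute the index of the highest set bit of a nonzero 64-bit value. A quick Python cross-check of that identity (a sketch, not part of the commit):

```python
def log2_floor_nonzero(n):
  assert 0 < n < 2**64
  # Same result as 63 ^ __builtin_clzll(n) and as _BitScanReverse64's index.
  return n.bit_length() - 1

for n in (1, 2, 3, 2**32, 2**63 - 1):
  clzll = 64 - n.bit_length()  # leading zeros of a 64-bit n
  assert log2_floor_nonzero(n) == 63 ^ clzll
```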

@@ -873,7 +873,9 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
  if (visible_device_list.empty()) {
    visible_gpu_order.resize(gpu_manager->VisibleDeviceCount());
    // By default, visible to virtual mapping is unchanged.
    std::iota(visible_gpu_order.begin(), visible_gpu_order.end(), 0);
    int deviceNo = 0;
    std::generate(visible_gpu_order.begin(), visible_gpu_order.end(),
                  [&deviceNo]{ return deviceNo++; });
  } else {
    std::vector<string> order_str = str_util::Split(visible_device_list, ',');
    for (int i = 0; i < order_str.size(); ++i) {

@@ -254,6 +254,10 @@ CUPTIManager *GetCUPTIManager() {
  return manager;
}

#ifdef _MSC_VER
#define __thread __declspec(thread)
#endif

// TODO(pbar) Move this to platform specific header file?
// Static thread local variable for POD types.
#define TF_STATIC_THREAD_LOCAL_POD(_Type_, _var_) \

@@ -16,8 +16,10 @@ limitations under the License.
#include "tensorflow/core/common_runtime/gpu/pool_allocator.h"

#include <errno.h>
#ifndef _MSC_VER
#include <strings.h>
#include <sys/mman.h>  // for munmap
#endif

#include <map>
#include <utility>

@@ -126,7 +126,7 @@ Allocator* ProcessState::GetGPUAllocator(const GPUOptions& options, int gpu_id,
    gpu::StreamExecutor* se =
        gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
    int bus_id = se->GetDeviceDescription().numa_node();
    if (bus_id < static_cast<int64>(gpu_visitors_.size())) {
    if (bus_id >= 0 && bus_id < static_cast<int64>(gpu_visitors_.size())) {
      for (auto v : gpu_visitors_[bus_id]) {
        gpu_allocators_[gpu_id]->AddAllocVisitor(v);
      }

@@ -152,7 +152,7 @@ class Allocator {
  // allocated by this allocator.
  virtual size_t RequestedSize(void* ptr) {
    CHECK(false) << "allocator doesn't track sizes";
    return 0;
    return size_t(0);
  }

  // Returns the allocated size of the buffer at 'ptr' if known,

@@ -149,6 +149,7 @@ class DeviceBase {
  // attributes requested. See allocator.h for more details.
  virtual Allocator* GetAllocator(AllocatorAttributes /*attr*/) {
    LOG(FATAL) << "GetAllocator() is not implemented.";
    return nullptr;
  }

  // Return the Allocator implementation to use based on the allocator

@@ -180,6 +181,8 @@ class DeviceBase {

  virtual const DeviceAttributes& attributes() const {
    LOG(FATAL) << "Device does not implement attributes()";
    static DeviceAttributes dummy;
    return dummy;
  }

  // Materializes the given TensorProto into 'tensor' stored in Device

@@ -348,6 +348,15 @@ TEST(Tensor_Float, Reshape) {
}

TEST(Tensor_Scalar, Basics) {
  {
    Tensor t(DT_BOOL, TensorShape({}));
    EXPECT_EQ(1, t.NumElements());
    auto Tt = t.scalar<bool>();
    EXPECT_EQ(1, Tt.size());
    EXPECT_EQ(0, Tt.rank());
    t.scalar<bool>()() = true;
    EXPECT_TRUE(Tt());
  }
  {
    Tensor t(DT_FLOAT, TensorShape({}));
    EXPECT_EQ(1, t.NumElements());

@@ -16,6 +16,7 @@ limitations under the License.
#if GOOGLE_CUDA

#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

namespace tensorflow {
namespace functor {

@@ -31,6 +32,28 @@ struct SelectFunctor<GPUDevice, T> {
  }
};

template <typename T>
struct SelectScalarFunctor<GPUDevice, T> {
  void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
                  typename TTypes<bool>::ConstScalar cond,
                  typename TTypes<T>::ConstFlat then_flat,
                  typename TTypes<T>::ConstFlat else_flat) {
#if !defined(EIGEN_HAS_INDEX_LIST)
    Eigen::array<int, 1> rank1{1};
#else
    Eigen::IndexList<Eigen::type2index<1>> rank1;
#endif
    const int size = then_flat.dimension(0);
    Eigen::array<int, 1> broadcast_dims{size};

    To32Bit(out).device(d) = cond.reshape(rank1)
                                 .broadcast(broadcast_dims)
                                 .select(then_flat, else_flat);
  }
};

template <typename T>
struct BatchSelectFunctor<GPUDevice, T> {
  void operator()(const GPUDevice& d,

@@ -68,6 +91,7 @@ struct BatchSelectFunctor<GPUDevice, T> {

#define SELECT_FUNCTOR(T)                            \
  template struct SelectFunctor<GPUDevice, T>;       \
  template struct SelectScalarFunctor<GPUDevice, T>; \
  template struct BatchSelectFunctor<GPUDevice, T>;

SELECT_FUNCTOR(Eigen::half);

@@ -41,6 +41,11 @@ class SelectOp : public OpKernel {
    OP_REQUIRES_OK(ctx, ctx->input("t", &then));
    OP_REQUIRES_OK(ctx, ctx->input("e", &else_));

    if (TensorShapeUtils::IsScalar(cond->shape())) {
      ComputeScalar(ctx, cond, then, else_);
      return;
    }

    bool broadcasting = (TensorShapeUtils::IsVector(cond->shape()) &&
                         !TensorShapeUtils::IsVector(then->shape()));

@@ -108,6 +113,25 @@ class SelectOp : public OpKernel {
    }
  }

  void ComputeScalar(OpKernelContext* ctx, const Tensor* cond,
                     const Tensor* then, const Tensor* else_) {
    OP_REQUIRES(
        ctx, then->shape().IsSameSize(else_->shape()),
        errors::InvalidArgument(
            "'then' and 'else' must have the same size, but received: ",
            then->shape().DebugString(), " vs. ",
            else_->shape().DebugString()));

    Tensor* output = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, then->shape(), &output));

    if (output->NumElements() > 0) {
      functor::SelectScalarFunctor<Device, T> func;
      TTypes<bool>::ConstScalar cond_scalar = cond->scalar<bool>();
      func(ctx->eigen_device<Device>(), output->flat<T>(), cond_scalar,
           then->flat<T>(), else_->flat<T>());
    }
  }

 private:
  TF_DISALLOW_COPY_AND_ASSIGN(SelectOp);
};
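
Semantically, the scalar-`cond` path added above selects one branch wholesale instead of applying an elementwise mask; the GPU functor realizes the same thing by broadcasting the scalar before an elementwise select. A numpy sketch of the equivalence (illustration only, not TensorFlow code):

```python
import numpy as np

cond = np.array(True)  # scalar condition
then_, else_ = np.arange(4.0), np.zeros(4)

wholesale = then_ if bool(cond) else else_  # CPU specialization: pick a branch
broadcast = np.where(np.broadcast_to(cond, then_.shape), then_, else_)  # GPU style
np.testing.assert_array_equal(wholesale, broadcast)
```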

@@ -152,6 +176,17 @@ struct SelectFunctor<CPUDevice, T> {
  }
};

// CPU specialization of the Select functor for a scalar condition.
template <typename T>
struct SelectScalarFunctor<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
                  TTypes<bool>::ConstScalar cond,
                  typename TTypes<T>::ConstFlat then_flat,
                  typename TTypes<T>::ConstFlat else_flat) {
    out.device(d) = cond() ? then_flat : else_flat;
  }
};

template <typename T>
struct BatchSelectFunctor<CPUDevice, T> {
  void operator()(const CPUDevice& d,

@@ -719,6 +719,14 @@ struct SelectFunctor {
                  typename TTypes<T>::ConstFlat else_flat);
};

template <typename Device, typename T>
struct SelectScalarFunctor {
  void operator()(const Device& d, typename TTypes<T>::Flat out,
                  typename TTypes<bool>::ConstScalar cond,
                  typename TTypes<T>::ConstFlat then_flat,
                  typename TTypes<T>::ConstFlat else_flat);
};

template <typename Device, typename T>
struct BatchSelectFunctor {
  void operator()(const Device& d,

@@ -21,7 +21,11 @@ limitations under the License.
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

#if !defined(_MSC_VER)
#define UNROLL _Pragma("unroll")
#else
#define UNROLL
#endif

namespace tensorflow {

@@ -25,8 +25,25 @@ limitations under the License.
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"

#if GOOGLE_CUDA
#include "tensorflow/core/platform/stream_executor.h"
#endif // GOOGLE_CUDA

namespace tensorflow {

#if GOOGLE_CUDA
namespace {
template <typename Scalar>
perftools::gputools::DeviceMemory<Scalar> AsDeviceMemory(
    const Scalar* cuda_memory) {
  perftools::gputools::DeviceMemoryBase wrapped(
      const_cast<Scalar*>(cuda_memory));
  perftools::gputools::DeviceMemory<Scalar> typed(wrapped);
  return typed;
}
}  // namespace
#endif // GOOGLE_CUDA

template <class Scalar>
class MatrixTriangularSolveOp : public LinearAlgebraOp<Scalar> {
 public:

@@ -60,7 +77,9 @@ class MatrixTriangularSolveOp : public LinearAlgebraOp<Scalar> {
  int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final {
    double rows = static_cast<double>(input_matrix_shapes[0].dim_size(0));
    double num_rhss = static_cast<double>(input_matrix_shapes[1].dim_size(1));
    double cost = rows * rows * num_rhss;
    double cost = rows * rows * num_rhss *
                  (Eigen::TensorOpCost::AddCost<Scalar>() +
                   Eigen::TensorOpCost::MulCost<Scalar>());
    return cost >= static_cast<double>(kint64max) ? kint64max
                                                  : static_cast<int64>(cost);
  }
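
The revised cost model above charges one add and one multiply per inner-loop step, i.e. roughly rows² × num_rhss multiply-accumulates per solve. A toy rendering of the formula (the per-op costs are illustrative placeholders, not Eigen's actual values):

```python
def solve_cost_per_unit(rows, num_rhss, add_cost=1, mul_cost=1):
  # ~rows^2 multiply-adds per right-hand-side column.
  return rows * rows * num_rhss * (add_cost + mul_cost)

assert solve_cost_per_unit(1000, 10) == 2 * 10**7
```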
@ -103,6 +122,121 @@ class MatrixTriangularSolveOp : public LinearAlgebraOp<Scalar> {
|
||||
TF_DISALLOW_COPY_AND_ASSIGN(MatrixTriangularSolveOp);
|
||||
};
|
||||
|
||||
|
||||
#ifdef GOOGLE_CUDA
template <class Scalar>
class MatrixTriangularSolveOpGPU : public LinearAlgebraOp<Scalar> {
 public:
  typedef LinearAlgebraOp<Scalar> Base;

  explicit MatrixTriangularSolveOpGPU(OpKernelConstruction* context)
      : Base(context), lower_(true), adjoint_(false) {
    OP_REQUIRES_OK(context, context->GetAttr("lower", &lower_));
    OP_REQUIRES_OK(context, context->GetAttr("adjoint", &adjoint_));
  }

  using TensorShapes = typename Base::TensorShapes;
  using Matrix = typename Base::Matrix;
  using MatrixMap = typename Base::MatrixMap;
  using MatrixMaps = typename Base::MatrixMaps;
  using ConstMatrixMap = typename Base::ConstMatrixMap;
  using ConstMatrixMaps = typename Base::ConstMatrixMaps;

  virtual void ValidateInputMatrixShapes(
      OpKernelContext* context,
      const TensorShapes& input_matrix_shapes) const final {
    Base::ValidateSquareSolver(context, input_matrix_shapes);
  }

  TensorShapes GetOutputMatrixShapes(
      const TensorShapes& input_matrix_shapes) const final {
    return TensorShapes({TensorShape({input_matrix_shapes[0].dim_size(1),
                                      input_matrix_shapes[1].dim_size(1)})});
  }

  int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final {
    double rows = static_cast<double>(input_matrix_shapes[0].dim_size(0));
    double num_rhss = static_cast<double>(input_matrix_shapes[1].dim_size(1));
    double cost = rows * rows * num_rhss *
                  (Eigen::TensorOpCost::AddCost<Scalar>() +
                   Eigen::TensorOpCost::MulCost<Scalar>());
    return cost >= static_cast<double>(kint64max) ? kint64max
                                                  : static_cast<int64>(cost);
  }

  void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
                     MatrixMaps* outputs) final {
    const ConstMatrixMap& matrix = inputs[0];
    const ConstMatrixMap& rhs = inputs[1];
    MatrixMap& output = outputs->at(0);

    if (matrix.rows() == 0 || rhs.cols() == 0) {
      // To be consistent with the MatrixInverse op, we define the solution
      // for an empty set of equations as the empty matrix.
      return;
    }

    auto matrix_ptr = AsDeviceMemory(matrix.data());
    auto rhs_ptr = AsDeviceMemory(rhs.data());
    auto out_ptr = AsDeviceMemory(output.data());

    auto* stream = context->op_device_context()->stream();
    uint64 rhs_elems = rhs.rows() * rhs.cols();
    bool copy_status =
        stream->ThenMemcpyD2D(&out_ptr, rhs_ptr, sizeof(Scalar) * rhs_elems)
            .ok();
    if (!copy_status) {
      context->SetStatus(
          errors::Internal("Failed to copy rhs into output before solve"));
    }

    // cuBLAS computes
    //   output = matrix \ rhs
    // where matrix, rhs and output are assumed to be in column-major order.
    // We want the output in row-major order, so we can instead compute
    //   output' = rhs' / matrix'    (' stands for transpose)
    // Upper/lower needs to be swapped for this.

    perftools::gputools::blas::UpperLower upper_lower_matrix;
    perftools::gputools::blas::Transpose transpose_matrix;
    if (lower_) {
      upper_lower_matrix = perftools::gputools::blas::UpperLower::kUpper;
    } else {
      upper_lower_matrix = perftools::gputools::blas::UpperLower::kLower;
    }
    if (adjoint_) {
      transpose_matrix = perftools::gputools::blas::Transpose::kTranspose;
    } else {
      transpose_matrix = perftools::gputools::blas::Transpose::kNoTranspose;
    }
    uint64 leading_dim_matrix = matrix.cols();
    uint64 leading_dim_output = output.cols();
    uint64 colmajor_rows = output.cols();
    uint64 colmajor_cols = output.rows();
    bool blas_launch_status =
        stream
            ->ThenBlasTrsm(perftools::gputools::blas::Side::kRight /*side*/,
                           upper_lower_matrix /*uplo*/,
                           transpose_matrix /*trans*/,
                           perftools::gputools::blas::Diagonal::kNonUnit /*diag*/,
                           colmajor_rows /*m*/, colmajor_cols /*n*/,
                           Scalar(1.0) /*alpha*/,
                           matrix_ptr, leading_dim_matrix /*lda*/,
                           &out_ptr, leading_dim_output /*ldb*/)
            .ok();
    if (!blas_launch_status) {
      context->SetStatus(errors::Internal("Blas TRSM launch failed"));
    }
  }

 private:
  bool lower_;
  bool adjoint_;

  TF_DISALLOW_COPY_AND_ASSIGN(MatrixTriangularSolveOpGPU);
};
#endif  // GOOGLE_CUDA
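
The transpose trick in the comment above is easy to check numerically. A minimal NumPy sketch (not part of the commit; NumPy stands in for the cuBLAS TRSM call):

```python
import numpy as np

rng = np.random.default_rng(0)
a = np.tril(rng.standard_normal((4, 4))) + 4.0 * np.eye(4)  # lower triangular
b = rng.standard_normal((4, 3))

x = np.linalg.solve(a, b)           # output = matrix \ rhs (left solve)

# Transposed, right-hand-side formulation: x' = b' @ inv(a'). Note that a' is
# *upper* triangular when a is lower, which is why the kernel swaps
# kUpper/kLower before launching the solve.
x_t = b.T @ np.linalg.inv(a.T)
np.testing.assert_allclose(x_t.T, x, rtol=1e-10)
```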

REGISTER_LINALG_OP("MatrixTriangularSolve", (MatrixTriangularSolveOp<float>),
                   float);
REGISTER_LINALG_OP("MatrixTriangularSolve", (MatrixTriangularSolveOp<double>),
@ -112,4 +246,30 @@ REGISTER_LINALG_OP("BatchMatrixTriangularSolve",
REGISTER_LINALG_OP("BatchMatrixTriangularSolve",
                   (MatrixTriangularSolveOp<double>), double);

#ifdef GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(
    Name("MatrixTriangularSolve")
        .Device(DEVICE_GPU)
        .TypeConstraint<float>("T"),
    MatrixTriangularSolveOpGPU<float>);

REGISTER_KERNEL_BUILDER(
    Name("MatrixTriangularSolve")
        .Device(DEVICE_GPU)
        .TypeConstraint<double>("T"),
    MatrixTriangularSolveOpGPU<double>);

REGISTER_KERNEL_BUILDER(
    Name("BatchMatrixTriangularSolve")
        .Device(DEVICE_GPU)
        .TypeConstraint<float>("T"),
    MatrixTriangularSolveOpGPU<float>);

REGISTER_KERNEL_BUILDER(
    Name("BatchMatrixTriangularSolve")
        .Device(DEVICE_GPU)
        .TypeConstraint<double>("T"),
    MatrixTriangularSolveOpGPU<double>);
#endif  // GOOGLE_CUDA

}  // namespace tensorflow

@ -115,10 +115,12 @@ class AllSampler : public RangeSampler {

  int64 Sample(random::SimplePhilox* rnd) const override {
    LOG(FATAL) << "Should not be called";
    return 0;
  }

  float Probability(int64 value) const override {
    LOG(FATAL) << "Should not be called";
    return 0;
  }

  void SampleBatchGetExpectedCountAvoid(
@ -55,7 +55,10 @@ string JoinPathImpl(std::initializer_list<StringPiece> paths) {
// the first part of the output.
std::pair<StringPiece, StringPiece> SplitPath(StringPiece path) {
  auto pos = path.rfind('/');

#ifdef PLATFORM_WINDOWS
  if (pos == StringPiece::npos)
    pos = path.rfind('\\');
#endif
  // Handle the case with no '/' in 'path'.
  if (pos == StringPiece::npos)
    return std::make_pair(StringPiece(path.data(), 0), path);
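
For intuition, a rough Python rendering of the split logic above (a hypothetical helper, not part of the commit; the root-directory handling that follows the shown hunk is omitted):

```python
def split_path(path: str, windows: bool = False) -> tuple:
    pos = path.rfind('/')
    if windows and pos == -1:       # the PLATFORM_WINDOWS fallback above
        pos = path.rfind('\\')
    if pos == -1:                   # no separator: empty dirname, whole basename
        return "", path
    return path[:pos], path[pos + 1:]
```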

@ -913,7 +913,8 @@ REGISTER_OP("Select")
    .SetShapeFn([](InferenceContext* c) {
      // The inputs 'then' and 'else' must have the same shape.
      ShapeHandle data = c->input(1);
-     TF_RETURN_IF_ERROR(c->Merge(data, c->input(2), &data));
+     ShapeHandle other = c->input(2);
+     TF_RETURN_IF_ERROR(c->Merge(data, other, &data));

      // The input 'cond' must either have the same shape as 'then' and
      // 'else', or be a vector if 'then' and 'else' are at least vectors.
@ -929,30 +930,49 @@ REGISTER_OP("Select")
      const int32 cond_rank = c->Rank(cond);
      const int32 data_rank = c->Rank(data);

-     if (cond_rank != 1) {
-       // If the rank of 'cond' is != 1, the shape must match 'then' and 'else'
-       TF_RETURN_IF_ERROR(c->Merge(data, cond, &data));
-     }
-     if (data_rank != 0) {
-       // If 'then' and 'else' are not scalars, 'cond' must be at least
-       // a vector, and its first dimension must match that of 'else'.
-       TF_RETURN_IF_ERROR(c->WithRankAtLeast(cond, 1, &cond));
-       if (cond_rank == 1) {
-         TF_RETURN_IF_ERROR(c->Merge(cond, c->Vector(c->Dim(data, 0)), &cond));
-       }
-     }
+     if (cond_rank == 0) {
+       // 'cond' is a scalar: 't' and 'e' can have any shape.
+       c->set_output(0, data);
+       return Status::OK();
+     }
+
+     if (cond_rank != 1) {
+       // 'cond' is neither a scalar nor a vector, so its shape must match
+       // 'then' and 'else'.
+       TF_RETURN_IF_ERROR(c->Merge(data, cond, &data));
+       c->set_output(0, data);
+       return Status::OK();
+     }
+
+     if (data_rank == 0) {
+       // If 'then' and 'else' are scalars, 'cond' must be a scalar as well.
+       TF_RETURN_IF_ERROR(c->Merge(data, cond, &data));
+       c->set_output(0, data);
+       return Status::OK();
+     }
+
+     if (cond_rank == 1) {
+       // 'cond' is a vector and 'then' is not a scalar: 'cond' must match
+       // the first dimension of 'then' and 'else'.
+       TF_RETURN_IF_ERROR(c->Merge(cond, c->Vector(c->Dim(data, 0)), &cond));
+       c->set_output(0, data);
+       return Status::OK();
+     }

      c->set_output(0, data);
      return Status::OK();
    })
    .Doc(R"doc(
Selects elements from `t` or `e`, depending on `condition`.

-The `t`, and `e` tensors must all have the same shape,
-and the output will also have that shape. The `condition` tensor
-must be a scalar if `t` and `e` are scalars. If `t` and `e` are vectors
-or higher rank, then `condition` must be either a vector with size
-matching the first dimension of `t`, or must have the same shape as `t`.
+The `t`, and `e` tensors must all have the same shape, and the
+output will also have that shape.
+
+The `condition` tensor must be a scalar if `t` and `e` are scalars.
+If `t` and `e` are vectors or higher rank, then `condition` must be either a
+scalar, a vector with size matching the first dimension of `t`, or must have
+the same shape as `t`.

The `condition` tensor acts as a mask that chooses, based on the value at each
element, whether the corresponding element / row in the output should be

@ -188,7 +188,10 @@ TEST(MathOpsTest, Select_ShapeFn) {
  ShapeInferenceTestOp op("Select");
  INFER_OK(op, "?;?;?", "in1|in2");

+ // scalar case
+ INFER_OK(op, "[];[1];?", "in1");
  INFER_OK(op, "[];?;?", "in1|in2");

  INFER_OK(op, "[1];?;?",
           "in1|in2");  // When cond is vector, t/e may not match it.
  INFER_OK(op, "[1,2];?;?", "in1|in2?");
@ -200,8 +203,8 @@ TEST(MathOpsTest, Select_ShapeFn) {
  INFER_OK(op, "?;[1,2];?", "in1");
  INFER_OK(op, "?;?;[1,2]", "in2");

+ INFER_OK(op, "[1];[];?", "in1");
- INFER_ERROR("Shapes must be equal rank, but are 1 and 0", op, "[];[1];?");
- INFER_ERROR("Shapes must be equal rank, but are 0 and 1", op, "[1];[];?");
+ INFER_ERROR("Shapes must be equal rank, but are 1 and 2", op, "[];[1];[1,2]");
  INFER_ERROR("Shapes must be equal rank, but are 1 and 2", op, "[1,2];[1];?");
  INFER_OK(op, "[2];[?];[?]", "in1|in2");

@ -20,9 +20,11 @@ limitations under the License.

#include <stddef.h>
#include <stdint.h>

#if defined(WIN32)
#include "extras/CUPTI/include/cupti.h"
#else
#include "cuda/extras/CUPTI/include/cupti.h"
#endif

namespace perftools {
namespace gputools {
namespace profiler {
@ -261,6 +261,14 @@ class Env {
  virtual Status GetSymbolFromLibrary(void* handle, const char* symbol_name,
                                      void** symbol) = 0;

  // \brief Build the name of a dynamic library.
  //
  // "name" should be the name of the library.
  // "version" should be the version of the library, or NULL.
  // Returns the name that LoadLibrary() can use.
  virtual string FormatLibraryFileName(const string& name,
                                       const string& version) = 0;

 private:
  std::unique_ptr<FileSystemRegistry> file_system_registry_;
  TF_DISALLOW_COPY_AND_ASSIGN(Env);
@ -318,7 +326,10 @@ class EnvWrapper : public Env {
                              void** symbol) override {
    return target_->GetSymbolFromLibrary(handle, symbol_name, symbol);
  }

  string FormatLibraryFileName(const string& name,
                               const string& version) override {
    return target_->FormatLibraryFileName(name, version);
  }

 private:
  Env* target_;
};
@ -25,8 +25,6 @@ namespace internal {
Status LoadLibrary(const char* library_filename, void** handle);
Status GetSymbolFromLibrary(void* handle, const char* symbol_name,
                            void** symbol);
// Returns the filename of a dynamically linked library formatted according to
// platform naming conventions.
string FormatLibraryFileName(const string& name, const string& version);

}  // namespace internal
@ -20,7 +20,8 @@ limitations under the License.
// mobile.

#if !defined(PLATFORM_POSIX) && !defined(PLATFORM_GOOGLE) && \
-   !defined(PLATFORM_POSIX_ANDROID) && !defined(PLATFORM_GOOGLE_ANDROID)
+   !defined(PLATFORM_POSIX_ANDROID) && !defined(PLATFORM_GOOGLE_ANDROID) && \
+   !defined(PLATFORM_WINDOWS)

// Choose which platform we are on.
#if defined(ANDROID) || defined(__ANDROID__)
@ -119,6 +119,10 @@ class PosixEnv : public Env {
    return tensorflow::internal::GetSymbolFromLibrary(handle, symbol_name,
                                                      symbol);
  }

  string FormatLibraryFileName(const string& name, const string& version) {
    return tensorflow::internal::FormatLibraryFileName(name, version);
  }
};

}  // namespace
@ -22,7 +22,7 @@ limitations under the License.
#if defined(PLATFORM_GOOGLE)
#include "tensorflow/core/platform/google/stacktrace.h"
#elif defined(PLATFORM_POSIX) || defined(PLATFORM_POSIX_ANDROID) || \
-   defined(PLATFORM_GOOGLE_ANDROID)
+   defined(PLATFORM_GOOGLE_ANDROID) || defined(PLATFORM_WINDOWS)
#include "tensorflow/core/platform/default/stacktrace.h"
#else
#error Define the appropriate PLATFORM_<foo> macro for this platform
@ -26,6 +26,7 @@ limitations under the License.

#include <thread>
#include <vector>
+#include <string>

#include "tensorflow/core/lib/core/error_codes.pb.h"
#include "tensorflow/core/platform/load_library.h"
@ -52,7 +53,20 @@ class StdThread : public Thread {

class WindowsEnv : public Env {
 public:
- WindowsEnv() {}
+ WindowsEnv() : GetSystemTimePreciseAsFileTime_(NULL) {
+   // The GetSystemTimePreciseAsFileTime function is only available in the
+   // latest versions of Windows. For that reason, we try to look it up in
+   // kernel32.dll at runtime and use an alternative option if the function
+   // is not available.
+   HMODULE module = GetModuleHandle("kernel32.dll");
+   if (module != NULL) {
+     auto func = (FnGetSystemTimePreciseAsFileTime)GetProcAddress(
+         module, "GetSystemTimePreciseAsFileTime");
+     GetSystemTimePreciseAsFileTime_ = func;
+   }
+ }

  ~WindowsEnv() override {
    LOG(FATAL) << "Env::Default() must not be destroyed";
  }
@ -62,11 +76,32 @@ class WindowsEnv : public Env {
  }

  uint64 NowMicros() override {
-   FILETIME temp;
-   GetSystemTimeAsFileTime(&temp);
-   uint64 now_ticks =
-       (uint64)temp.dwLowDateTime + ((uint64)(temp.dwHighDateTime) << 32LL);
-   return now_ticks / 10LL;
+   if (GetSystemTimePreciseAsFileTime_ != NULL) {
+     // GetSystemTimePreciseAsFileTime is only available in the latest
+     // versions of Windows, so its existence was checked at construction
+     // time. All std::chrono clocks on Windows proved to return values
+     // that may repeat, which is not good enough for some uses.
+     constexpr int64_t kUnixEpochStartTicks = 116444736000000000i64;
+     constexpr int64_t kFtToMicroSec = 10;
+
+     // This interface needs to return system time and not just any
+     // microseconds, because it is often used as an argument to
+     // TimedWait() on a condition variable.
+     FILETIME system_time;
+     GetSystemTimePreciseAsFileTime_(&system_time);
+
+     LARGE_INTEGER li;
+     li.LowPart = system_time.dwLowDateTime;
+     li.HighPart = system_time.dwHighDateTime;
+     // Subtract the Unix epoch start.
+     li.QuadPart -= kUnixEpochStartTicks;
+     // Convert to microseconds.
+     li.QuadPart /= kFtToMicroSec;
+     return li.QuadPart;
+   }
+   using namespace std::chrono;
+   return duration_cast<microseconds>(
+       system_clock::now().time_since_epoch()).count();
  }

  void SleepForMicroseconds(int64 micros) override { Sleep(micros / 1000); }
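
To make the arithmetic above concrete, here is the same FILETIME-to-Unix-microseconds conversion as a small Python sketch. The constant is the 11644473600 seconds between 1601-01-01 (the FILETIME epoch) and 1970-01-01, expressed in 100 ns ticks:

```python
K_UNIX_EPOCH_START_TICKS = 116444736000000000  # 11644473600 s * 1e7 ticks/s
K_FT_TO_MICRO_SEC = 10                          # 100 ns ticks per microsecond

def filetime_to_unix_micros(high: int, low: int) -> int:
    # FILETIME is two 32-bit halves of a 64-bit tick count.
    ticks = (high << 32) | low
    return (ticks - K_UNIX_EPOCH_START_TICKS) // K_FT_TO_MICRO_SEC
```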

@ -94,19 +129,53 @@ class WindowsEnv : public Env {
    });
  }

  Status LoadLibrary(const char* library_filename, void** handle) override {
-   return errors::Unimplemented("WindowsEnv::LoadLibrary");
+   std::string file_name = library_filename;
+   std::replace(file_name.begin(), file_name.end(), '/', '\\');
+
+   HMODULE hModule = LoadLibraryEx(file_name.c_str(), NULL,
+                                   LOAD_WITH_ALTERED_SEARCH_PATH);
+   if (!hModule) {
+     return errors::NotFound(file_name + " not found");
+   }
+   *handle = hModule;
+   return Status::OK();
  }

  Status GetSymbolFromLibrary(void* handle, const char* symbol_name,
                              void** symbol) override {
-   return errors::Unimplemented("WindowsEnv::GetSymbolFromLibrary");
+   FARPROC found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
+   if (found_symbol == NULL) {
+     return errors::NotFound(std::string(symbol_name) + " not found");
+   }
+   *symbol = (void*)found_symbol;
+   return Status::OK();
  }

+ string FormatLibraryFileName(const string& name,
+                              const string& version) override {
+   string filename;
+   if (version.size() == 0) {
+     filename = name + ".dll";
+   } else {
+     filename = name + version + ".dll";
+   }
+   return filename;
+ }

 private:
  typedef VOID(WINAPI* FnGetSystemTimePreciseAsFileTime)(LPFILETIME);
  FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_;
};

}  // namespace

REGISTER_FILE_SYSTEM("", WindowsFileSystem);
REGISTER_FILE_SYSTEM("file", LocalWinFileSystem);

Env* Env::Default() {
  static Env* default_env = new WindowsEnv;
  return default_env;

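The Windows `FormatLibraryFileName` override above is a one-liner in spirit; a hypothetical Python illustration of the same mapping (names here are illustrative only):

```python
def format_library_file_name(name: str, version: str) -> str:
    # ("foo", "")  -> "foo.dll";  ("foo", "1") -> "foo1.dll"
    return name + version + ".dll" if version else name + ".dll"
```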
tensorflow/core/platform/windows/error.cc (new file, 33 lines)
@ -0,0 +1,33 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/platform/windows/error.h"

namespace tensorflow {
namespace internal {

std::string GetWindowsErrorMessage(DWORD err) {
  LPSTR buffer = NULL;
  DWORD flags = FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
                FORMAT_MESSAGE_IGNORE_INSERTS;
  FormatMessageA(flags, NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
                 reinterpret_cast<LPSTR>(&buffer), 0, NULL);
  std::string message = buffer;
  LocalFree(buffer);
  return message;
}

}  // namespace internal
}  // namespace tensorflow
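
Python exposes the same `FormatMessage` machinery through `ctypes`, which makes for a quick way to see what strings this helper produces (Windows-only sketch, not part of the commit):

```python
import ctypes  # ctypes.FormatError wraps the Win32 FormatMessage API

def get_windows_error_message(err: int) -> str:
    # e.g. err=2 -> "The system cannot find the file specified."
    return ctypes.FormatError(err)
```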
tensorflow/core/platform/windows/error.h (new file, 32 lines)
@ -0,0 +1,32 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_PLATFORM_WINDOWS_ERROR_H_
#define TENSORFLOW_CORE_PLATFORM_WINDOWS_ERROR_H_

#include <string>

#include <Windows.h>

namespace tensorflow {
namespace internal {

std::string GetWindowsErrorMessage(DWORD err);

}  // namespace internal
}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_PLATFORM_WINDOWS_ERROR_H_

@ -15,25 +15,27 @@ limitations under the License.

#include "tensorflow/core/platform/net.h"

#include <cerrno>
#include <cstdlib>
#include <unordered_set>

#include <sys/types.h>
-#include <winsock.h>
+#include <winsock2.h>

#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/windows/error.h"

#undef ERROR

#pragma comment(lib, "Ws2_32.lib")

namespace tensorflow {
namespace internal {

namespace {

bool IsPortAvailable(int* port, bool is_tcp) {
  const int protocol = is_tcp ? IPPROTO_TCP : 0;
- const int fd = socket(AF_INET, is_tcp ? SOCK_STREAM : SOCK_DGRAM, protocol);
+ SOCKET sock = socket(AF_INET, is_tcp ? SOCK_STREAM : SOCK_DGRAM, protocol);

  struct sockaddr_in addr;
  int addr_len = static_cast<int>(sizeof(addr));
@ -41,17 +43,20 @@ bool IsPortAvailable(int* port, bool is_tcp) {

  CHECK_GE(*port, 0);
  CHECK_LE(*port, 65535);
- if (fd < 0) {
-   LOG(ERROR) << "socket() failed: " << strerror(errno);
+ if (sock == INVALID_SOCKET) {
+   LOG(ERROR) << "socket() failed: "
+              << GetWindowsErrorMessage(WSAGetLastError());
    return false;
  }

- // SO_REUSEADDR lets us start up a server immediately after it exists.
- int one = 1;
- if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (const char*)&one, sizeof(one)) <
-     0) {
-   LOG(ERROR) << "setsockopt() failed: " << strerror(errno);
-   closesocket(fd);
+ // SO_REUSEADDR lets us start up a server immediately after it exits.
+ const int one = 1;
+ int result = setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+                         reinterpret_cast<const char*>(&one), sizeof(one));
+ if (result == SOCKET_ERROR) {
+   LOG(ERROR) << "setsockopt() failed: "
+              << GetWindowsErrorMessage(WSAGetLastError());
+   closesocket(sock);
    return false;
  }

@ -59,18 +64,23 @@ bool IsPortAvailable(int* port, bool is_tcp) {
  addr.sin_family = AF_INET;
  addr.sin_addr.s_addr = INADDR_ANY;
  addr.sin_port = htons((uint16_t)*port);
- if (bind(fd, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
-   LOG(WARNING) << "bind(port=" << *port << ") failed: " << strerror(errno);
-   closesocket(fd);
+ result = bind(sock, (struct sockaddr*)&addr, sizeof(addr));
+ if (result == SOCKET_ERROR) {
+   LOG(WARNING) << "bind(port=" << *port << ") failed: "
+                << GetWindowsErrorMessage(WSAGetLastError());
+   closesocket(sock);
    return false;
  }

  // Get the bound port number.
- if (getsockname(fd, (struct sockaddr*)&addr, &addr_len) < 0) {
-   LOG(WARNING) << "getsockname() failed: " << strerror(errno);
-   closesocket(fd);
+ result = getsockname(sock, (struct sockaddr*)&addr, &addr_len);
+ if (result == SOCKET_ERROR) {
+   LOG(WARNING) << "getsockname() failed: "
+                << GetWindowsErrorMessage(WSAGetLastError());
+   closesocket(sock);
    return false;
  }

  CHECK_LE(addr_len, sizeof(addr));
  actual_port = ntohs(addr.sin_port);
  CHECK_GT(actual_port, 0);
@ -79,7 +89,8 @@ bool IsPortAvailable(int* port, bool is_tcp) {
  } else {
    CHECK_EQ(*port, actual_port);
  }
- closesocket(fd);
+ closesocket(sock);
  return true;
}

@ -89,6 +100,12 @@ const int kMaximumTrials = 1000;

}  // namespace

int PickUnusedPortOrDie() {
+ WSADATA wsaData;
+ if (WSAStartup(MAKEWORD(2, 2), &wsaData) != NO_ERROR) {
+   LOG(ERROR) << "Error at WSAStartup()";
+   return false;
+ }
+
  static std::unordered_set<int> chosen_ports;

  // Type of port to first pick in the next iteration.
@ -121,6 +138,7 @@ int PickUnusedPortOrDie() {
  }

  chosen_ports.insert(port);
+ WSACleanup();
  return port;
}
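
The C++ code probes candidate ports one by one with `IsPortAvailable`. A simpler variant of the same idea, for context, is to let the OS assign a free port by binding to port 0 (a sketch, not equivalent to the retry-and-remember logic above):

```python
import socket

def pick_unused_port() -> int:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind(("", 0))             # port 0: the OS chooses a free port
        return s.getsockname()[1]
```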

@ -19,8 +19,8 @@ limitations under the License.
#ifdef SNAPPY
#include <snappy.h>
#endif
#include <WinSock2.h>
#pragma comment(lib, "Ws2_32.lib")

#include <Windows.h>

#include "tensorflow/core/platform/cpu_info.h"
#include "tensorflow/core/platform/demangle.h"
@ -37,10 +37,13 @@ namespace port {
void InitMain(const char* usage, int* argc, char*** argv) {}

string Hostname() {
- char hostname[1024];
- gethostname(hostname, sizeof hostname);
- hostname[sizeof hostname - 1] = 0;
- return string(hostname);
+ char name[1024];
+ DWORD name_size = sizeof(name);
+ name[0] = 0;
+ if (::GetComputerNameA(name, &name_size)) {
+   name[name_size] = 0;
+ }
+ return name;
}

int NumSchedulableCPUs() {

@ -30,6 +30,7 @@ limitations under the License.
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/posix/error.h"
+#include "tensorflow/core/platform/windows/error.h"
#include "tensorflow/core/platform/windows/windows_file_system.h"

// TODO(mrry): Prevent this Windows.h #define from leaking out of our headers.
@ -39,19 +40,71 @@ namespace tensorflow {

namespace {

// RAII helpers for HANDLEs.
const auto CloseHandleFunc = [](HANDLE h) { ::CloseHandle(h); };
typedef std::unique_ptr<void, decltype(CloseHandleFunc)> UniqueCloseHandlePtr;

inline Status IOErrorFromWindowsError(const string& context, DWORD err) {
  return IOError(
      context + string(" : ") + internal::GetWindowsErrorMessage(err), err);
}

// PLEASE NOTE: hfile is expected to be an async handle
// (i.e. opened with FILE_FLAG_OVERLAPPED).
SSIZE_T pread(HANDLE hfile, char* src, size_t num_bytes, uint64_t offset) {
  assert(num_bytes <= std::numeric_limits<DWORD>::max());
  OVERLAPPED overlapped = {0};
  ULARGE_INTEGER offset_union;
  offset_union.QuadPart = offset;

  overlapped.Offset = offset_union.LowPart;
  overlapped.OffsetHigh = offset_union.HighPart;
  overlapped.hEvent = ::CreateEvent(NULL, TRUE, FALSE, NULL);

  if (NULL == overlapped.hEvent) {
    return -1;
  }

  SSIZE_T result = 0;

  unsigned long bytes_read = 0;
  DWORD last_error = ERROR_SUCCESS;

  BOOL read_result = ::ReadFile(hfile, src, static_cast<DWORD>(num_bytes),
                                &bytes_read, &overlapped);
  if ((FALSE == read_result) &&
      ((last_error = GetLastError()) != ERROR_IO_PENDING)) {
    result = (last_error == ERROR_HANDLE_EOF) ? 0 : -1;
  } else {
    if (ERROR_IO_PENDING == last_error) {
      // The read is in flight; wait for it to complete.
      BOOL overlapped_result = ::GetOverlappedResult(hfile, &overlapped,
                                                     &bytes_read, TRUE);
      if (FALSE == overlapped_result) {
        result = (::GetLastError() == ERROR_HANDLE_EOF) ? 0 : -1;
      } else {
        result = bytes_read;
      }
    } else {
      // The read completed synchronously; bytes_read already has the result.
      result = bytes_read;
    }
  }

  ::CloseHandle(overlapped.hEvent);

  return result;
}
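
The contract this helper implements is POSIX `pread`: read at an absolute offset without moving any file cursor. A sketch of a caller-side retry loop around that contract (CPython's `os.pread` is POSIX-only and shown purely for the semantics):

```python
import os

def pread_all(fd: int, num_bytes: int, offset: int) -> bytes:
    chunks = []
    while num_bytes > 0:
        chunk = os.pread(fd, num_bytes, offset)
        if not chunk:               # empty read means EOF
            break
        chunks.append(chunk)
        offset += len(chunk)
        num_bytes -= len(chunk)
    return b"".join(chunks)
```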

// read() based random-access file.
class WindowsRandomAccessFile : public RandomAccessFile {
 private:
  string filename_;
- FILE* file_;
+ HANDLE hfile_;

 public:
- WindowsRandomAccessFile(const string& fname, FILE* f)
-     : filename_(fname), file_(f) {}
+ WindowsRandomAccessFile(const string& fname, HANDLE hfile)
+     : filename_(fname), hfile_(hfile) {}
  ~WindowsRandomAccessFile() override {
-   if (file_ != NULL) {
-     // Ignoring any potential errors
-     fclose(file_);
+   if (hfile_ != NULL && hfile_ != INVALID_HANDLE_VALUE) {
+     ::CloseHandle(hfile_);
    }
  }

@ -59,13 +112,10 @@ class WindowsRandomAccessFile : public RandomAccessFile {
              char* scratch) const override {
    Status s;
    char* dst = scratch;
-   int seek_result = fseek(file_, offset, SEEK_SET);
-   if (seek_result) {
-     return IOError(filename_, errno);
-   }
    while (n > 0 && s.ok()) {
-     size_t r = fread(dst, 1, n, file_);
+     SSIZE_T r = pread(hfile_, dst, n, offset);
      if (r > 0) {
+       offset += r;
        dst += r;
        n -= r;
      } else if (r == 0) {
@ -84,104 +134,246 @@ class WindowsRandomAccessFile : public RandomAccessFile {
class WindowsWritableFile : public WritableFile {
 private:
  string filename_;
- FILE* file_;
+ HANDLE hfile_;

 public:
- WindowsWritableFile(const string& fname, FILE* f)
-     : filename_(fname), file_(f) {}
+ WindowsWritableFile(const string& fname, HANDLE hFile)
+     : filename_(fname), hfile_(hFile) {}

  ~WindowsWritableFile() override {
-   if (file_ != NULL) {
-     // Ignoring any potential errors
-     fclose(file_);
+   if (hfile_ != NULL && hfile_ != INVALID_HANDLE_VALUE) {
+     WindowsWritableFile::Close();
    }
  }

  Status Append(const StringPiece& data) override {
-   size_t r = fwrite(data.data(), 1, data.size(), file_);
-   if (r != data.size()) {
-     return IOError(filename_, errno);
+   DWORD bytes_written = 0;
+   DWORD data_size = static_cast<DWORD>(data.size());
+   BOOL write_result = ::WriteFile(hfile_, data.data(), data_size,
+                                   &bytes_written, NULL);
+   if (FALSE == write_result) {
+     return IOErrorFromWindowsError(
+         "Failed to WriteFile: " + filename_, ::GetLastError());
    }
+
+   assert(size_t(bytes_written) == data.size());
    return Status::OK();
  }

  Status Close() override {
-   Status result;
-   if (fclose(file_) != 0) {
-     result = IOError(filename_, errno);
+   assert(INVALID_HANDLE_VALUE != hfile_);
+
+   Status result = Flush();
+   if (!result.ok()) {
+     return result;
    }
-   file_ = NULL;
-   return result;
+
+   if (FALSE == ::CloseHandle(hfile_)) {
+     return IOErrorFromWindowsError(
+         "CloseHandle failed for: " + filename_, ::GetLastError());
+   }
+
+   hfile_ = INVALID_HANDLE_VALUE;
+   return Status::OK();
  }

  Status Flush() override {
-   if (fflush(file_) != 0) {
-     return IOError(filename_, errno);
+   if (FALSE == ::FlushFileBuffers(hfile_)) {
+     return IOErrorFromWindowsError(
+         "FlushFileBuffers failed for: " + filename_, ::GetLastError());
    }
    return Status::OK();
  }

  Status Sync() override {
-   Status s;
-   if (fflush(file_) != 0) {
-     s = IOError(filename_, errno);
-   }
-   return s;
+   return Flush();
  }
};

class WinReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
 private:
  const std::string filename_;
  HANDLE hfile_;
  HANDLE hmap_;

  const void* const address_;
  const uint64 length_;

 public:
  WinReadOnlyMemoryRegion(const std::string& filename, HANDLE hfile,
                          HANDLE hmap, const void* address, uint64 length)
      : filename_(filename), hfile_(hfile), hmap_(hmap), address_(address),
        length_(length) {}

  ~WinReadOnlyMemoryRegion() {
    BOOL ret = ::UnmapViewOfFile(address_);
    assert(ret);

    ret = ::CloseHandle(hmap_);
    assert(ret);

    ret = ::CloseHandle(hfile_);
    assert(ret);
  }

  const void* data() override { return address_; }
  uint64 length() override { return length_; }
};

}  // namespace

Status WindowsFileSystem::NewRandomAccessFile(
    const string& fname, std::unique_ptr<RandomAccessFile>* result) {
  string translated_fname = TranslateName(fname);
  result->reset();
- Status s;
- FILE* f = fopen(translated_fname.c_str(), "r");
- if (f == NULL) {
-   s = IOError(fname, errno);
- } else {
-   result->reset(new WindowsRandomAccessFile(translated_fname, f));
- }
- return s;
+
+ // Open the file for read-only random access.
+ // Random access is to disable read-ahead, as the system reads too much data.
+ // Open in async mode, which makes Windows allow more parallelism even if we
+ // need to do sync I/O on top of it.
+ DWORD file_flags = FILE_ATTRIBUTE_READONLY | FILE_FLAG_RANDOM_ACCESS |
+                    FILE_FLAG_OVERLAPPED;
+ // Shared access is necessary for tests to pass; almost all tests would work
+ // with a possible exception of fault_injection.
+ DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
+
+ HANDLE hfile = ::CreateFileA(translated_fname.c_str(), GENERIC_READ,
+                              share_mode, NULL, OPEN_EXISTING, file_flags,
+                              NULL);
+
+ if (INVALID_HANDLE_VALUE == hfile) {
+   string context = "NewRandomAccessFile failed to Create/Open: " + fname;
+   return IOErrorFromWindowsError(context, ::GetLastError());
+ }
+
+ result->reset(new WindowsRandomAccessFile(translated_fname, hfile));
+ return Status::OK();
}

Status WindowsFileSystem::NewWritableFile(
    const string& fname, std::unique_ptr<WritableFile>* result) {
  string translated_fname = TranslateName(fname);
- Status s;
- FILE* f = fopen(translated_fname.c_str(), "w");
- if (f == NULL) {
-   result->reset();
-   s = IOError(fname, errno);
- } else {
-   result->reset(new WindowsWritableFile(translated_fname, f));
- }
- return s;
+ result->reset();
+
+ DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
+ HANDLE hfile = ::CreateFileA(translated_fname.c_str(), GENERIC_WRITE,
+                              share_mode, NULL, CREATE_ALWAYS,
+                              FILE_ATTRIBUTE_NORMAL, NULL);
+
+ if (INVALID_HANDLE_VALUE == hfile) {
+   string context = "Failed to create a NewWriteableFile: " + fname;
+   return IOErrorFromWindowsError(context, ::GetLastError());
+ }
+
+ result->reset(new WindowsWritableFile(translated_fname, hfile));
+ return Status::OK();
}

Status WindowsFileSystem::NewAppendableFile(
    const string& fname, std::unique_ptr<WritableFile>* result) {
  string translated_fname = TranslateName(fname);
- Status s;
- FILE* f = fopen(translated_fname.c_str(), "a");
- if (f == NULL) {
-   result->reset();
-   s = IOError(fname, errno);
- } else {
-   result->reset(new WindowsWritableFile(translated_fname, f));
- }
- return s;
+ result->reset();
+
+ DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
+ HANDLE hfile = ::CreateFileA(translated_fname.c_str(), GENERIC_WRITE,
+                              share_mode, NULL, OPEN_ALWAYS,
+                              FILE_ATTRIBUTE_NORMAL, NULL);
+
+ if (INVALID_HANDLE_VALUE == hfile) {
+   string context = "Failed to create a NewAppendableFile: " + fname;
+   return IOErrorFromWindowsError(context, ::GetLastError());
+ }
+
+ UniqueCloseHandlePtr file_guard(hfile, CloseHandleFunc);
+
+ DWORD file_ptr = ::SetFilePointer(hfile, NULL, NULL, FILE_END);
+ if (INVALID_SET_FILE_POINTER == file_ptr) {
+   string context = "Failed to create a NewAppendableFile: " + fname;
+   return IOErrorFromWindowsError(context, ::GetLastError());
+ }
+
+ result->reset(new WindowsWritableFile(translated_fname, hfile));
+ file_guard.release();
+
+ return Status::OK();
}

Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile(
    const string& fname, std::unique_ptr<ReadOnlyMemoryRegion>* result) {
- return errors::Unimplemented(
-     "WindowsFileSystem::NewReadOnlyMemoryRegionFromFile");
+ string translated_fname = TranslateName(fname);
+ result->reset();
+ Status s = Status::OK();
+
+ // Open the file for read-only random access.
+ DWORD file_flags = FILE_ATTRIBUTE_READONLY | FILE_FLAG_RANDOM_ACCESS;
+
+ // Open in async mode, which makes Windows allow more parallelism even if we
+ // need to do sync I/O on top of it.
+ file_flags |= FILE_FLAG_OVERLAPPED;
+
+ DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
+ HANDLE hfile = ::CreateFileA(translated_fname.c_str(), GENERIC_READ,
+                              share_mode, NULL, OPEN_EXISTING, file_flags,
+                              NULL);
+
+ if (INVALID_HANDLE_VALUE == hfile) {
+   return IOErrorFromWindowsError(
+       "NewReadOnlyMemoryRegionFromFile failed to Create/Open: " + fname,
+       ::GetLastError());
+ }
+
+ UniqueCloseHandlePtr file_guard(hfile, CloseHandleFunc);
+
+ // Use mmap when virtual address-space is plentiful.
+ uint64_t file_size;
+ s = GetFileSize(translated_fname, &file_size);
+ if (s.ok()) {
+   // Will not map empty files.
+   if (file_size == 0) {
+     return IOError(
+         "NewReadOnlyMemoryRegionFromFile failed to map empty file: " + fname,
+         EINVAL);
+   }
+
+   HANDLE hmap = ::CreateFileMappingA(hfile, NULL, PAGE_READONLY,
+                                      0,  // Whole file at its present length.
+                                      0,
+                                      NULL);  // Mapping name.
+
+   if (!hmap) {
+     string context = "Failed to create file mapping for "
+                      "NewReadOnlyMemoryRegionFromFile: " + fname;
+     return IOErrorFromWindowsError(context, ::GetLastError());
+   }
+
+   UniqueCloseHandlePtr map_guard(hmap, CloseHandleFunc);
+
+   const void* mapped_region = ::MapViewOfFileEx(
+       hmap, FILE_MAP_READ,
+       0,  // High DWORD of access start.
+       0,  // Low DWORD.
+       file_size,
+       NULL);  // Let the OS choose the mapping.
+
+   if (!mapped_region) {
+     string context = "Failed to MapViewOfFile for "
+                      "NewReadOnlyMemoryRegionFromFile: " + fname;
+     return IOErrorFromWindowsError(context, ::GetLastError());
+   }
+
+   result->reset(new WinReadOnlyMemoryRegion(fname, hfile, hmap,
+                                             mapped_region, file_size));
+
+   map_guard.release();
+   file_guard.release();
+ }
+
+ return s;
}
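
The map-the-whole-file pattern above has a direct analogue in Python's `mmap` module, which also refuses to map empty files, matching the explicit `file_size == 0` check in the C++ (a sketch for context only):

```python
import mmap

def map_file_readonly(path: str) -> mmap.mmap:
    with open(path, "rb") as f:
        # Length 0 means "map the whole file"; raises ValueError for an
        # empty file, mirroring the EINVAL return above.
        return mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
```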

bool WindowsFileSystem::FileExists(const string& fname) {
- return _access(TranslateName(fname).c_str(), 0) == 0;
+ constexpr int kOk = 0;
+ return _access(TranslateName(fname).c_str(), kOk) == 0;
}

Status WindowsFileSystem::GetChildren(const string& dir,
@ -189,27 +381,39 @@ Status WindowsFileSystem::GetChildren(const string& dir,
  string translated_dir = TranslateName(dir);
  result->clear();

+ string pattern = translated_dir;
+ if (!pattern.empty() && pattern.back() != '\\' && pattern.back() != '/') {
+   pattern += "\\*";
+ } else {
+   pattern += '*';
+ }
+
  WIN32_FIND_DATA find_data;
- HANDLE find_handle = FindFirstFile(translated_dir.c_str(), &find_data);
+ HANDLE find_handle = ::FindFirstFileA(pattern.c_str(), &find_data);
  if (find_handle == INVALID_HANDLE_VALUE) {
-   // TODO(mrry): Convert to a more specific error.
-   return errors::Unknown("Error code: ", GetLastError());
+   string context = "FindFirstFile failed for: " + translated_dir;
+   return IOErrorFromWindowsError(context, ::GetLastError());
  }
- result->push_back(find_data.cFileName);
- while (FindNextFile(find_handle, &find_data)) {
-   result->push_back(find_data.cFileName);
- }
- if (!FindClose(find_handle)) {
-   // TODO(mrry): Convert to a more specific error.
-   return errors::Unknown("Error closing find handle: ", GetLastError());
+
+ do {
+   const StringPiece basename = find_data.cFileName;
+   if (basename != "." && basename != "..") {
+     result->push_back(find_data.cFileName);
+   }
+ } while (::FindNextFileA(find_handle, &find_data));
+
+ if (!::FindClose(find_handle)) {
+   string context = "FindClose failed for: " + translated_dir;
+   return IOErrorFromWindowsError(context, ::GetLastError());
  }

  return Status::OK();
}

Status WindowsFileSystem::DeleteFile(const string& fname) {
  Status result;
  if (unlink(TranslateName(fname).c_str()) != 0) {
-   result = IOError(fname, errno);
+   result = IOError("Failed to delete a file: " + fname, errno);
  }
  return result;
}
@ -217,7 +421,7 @@ Status WindowsFileSystem::DeleteFile(const string& fname) {
Status WindowsFileSystem::CreateDir(const string& name) {
  Status result;
  if (_mkdir(TranslateName(name).c_str()) != 0) {
-   result = IOError(name, errno);
+   result = IOError("Failed to create a directory: " + name, errno);
  }
  return result;
}
@ -225,42 +429,52 @@ Status WindowsFileSystem::CreateDir(const string& name) {
Status WindowsFileSystem::DeleteDir(const string& name) {
  Status result;
  if (_rmdir(TranslateName(name).c_str()) != 0) {
-   result = IOError(name, errno);
+   result = IOError("Failed to remove a directory: " + name, errno);
  }
  return result;
}

Status WindowsFileSystem::GetFileSize(const string& fname, uint64* size) {
- Status s;
- struct _stat sbuf;
- if (_stat(TranslateName(fname).c_str(), &sbuf) != 0) {
-   *size = 0;
-   s = IOError(fname, errno);
- } else {
-   *size = sbuf.st_size;
- }
- return s;
+ string translated_fname = TranslateName(fname);
+ Status result;
+ WIN32_FILE_ATTRIBUTE_DATA attrs;
+ if (TRUE == ::GetFileAttributesExA(translated_fname.c_str(),
+                                    GetFileExInfoStandard, &attrs)) {
+   ULARGE_INTEGER file_size;
+   file_size.HighPart = attrs.nFileSizeHigh;
+   file_size.LowPart = attrs.nFileSizeLow;
+   *size = file_size.QuadPart;
+ } else {
+   string context = "Can not get size for: " + fname;
+   result = IOErrorFromWindowsError(context, ::GetLastError());
+ }
+ return result;
}
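
The `ULARGE_INTEGER` union above just reassembles the two 32-bit halves that `GetFileAttributesExA` reports into one 64-bit size; in plain arithmetic:

```python
def file_size_from_parts(n_file_size_high: int, n_file_size_low: int) -> int:
    # Equivalent of ULARGE_INTEGER{HighPart, LowPart}.QuadPart.
    return (n_file_size_high << 32) | n_file_size_low

assert file_size_from_parts(1, 0) == 2**32
```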

Status WindowsFileSystem::RenameFile(const string& src, const string& target) {
  Status result;
- if (rename(TranslateName(src).c_str(), TranslateName(target).c_str()) != 0) {
-   result = IOError(src, errno);
+ // rename() is not capable of replacing an existing file, as it is on Linux,
+ // so use the OS API directly.
+ if (!::MoveFileExA(TranslateName(src).c_str(), TranslateName(target).c_str(),
+                    MOVEFILE_REPLACE_EXISTING)) {
+   string context(strings::StrCat("Failed to rename: ", src, " to: ", target));
+   result = IOErrorFromWindowsError(context, ::GetLastError());
  }
  return result;
}

Status WindowsFileSystem::Stat(const string& fname, FileStatistics* stat) {
- Status s;
+ Status result;
  struct _stat sbuf;
  if (_stat(TranslateName(fname).c_str(), &sbuf) != 0) {
-   s = IOError(fname, errno);
+   result = IOError(fname, errno);
  } else {
    stat->mtime_nsec = sbuf.st_mtime * 1e9;
    stat->length = sbuf.st_size;
    stat->is_directory = PathIsDirectory(TranslateName(fname).c_str());
  }
- return s;
+ return result;
}

}  // namespace tensorflow

@ -64,7 +64,14 @@ class WindowsFileSystem : public FileSystem {
  }
};

Status IOError(const string& context, int err_number);

class LocalWinFileSystem : public WindowsFileSystem {
 public:
  string TranslateName(const string& name) const override {
    StringPiece scheme, host, path;
    ParseURI(name, &scheme, &host, &path);
    return path.ToString();
  }
};

}  // namespace tensorflow

@ -20,7 +20,7 @@ limitations under the License.

#define TF_MAJOR_VERSION 0
#define TF_MINOR_VERSION 11
-#define TF_PATCH_VERSION 0rc0
+#define TF_PATCH_VERSION 0rc1

// TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
// "-beta", "-rc", "-rc.1")
@ -21,7 +21,7 @@ Some examples use the `pandas` library for data processing (`sudo pip install pa
* [Deep Neural Network with Customized Decay Function](iris_custom_decay_dnn.py)

## Specialized Models
-* [Building a Random Forest Model](random_forest.py)
+* [Building a Random Forest Model](random_forest_mnist.py)
* [Building a Wide & Deep Model](wide_n_deep_tutorial.py)
* [Building a Residual Network Model](resnet.py)

@ -84,7 +84,6 @@ py_test(
    args = [
        "--fake_data",
        "--max_steps=10",
-       "--train_dir=/tmp/mnist",
    ],
    main = "fully_connected_feed.py",
    srcs_version = "PY2AND3",
@ -117,7 +117,7 @@ def run_training():
  """Train MNIST for a number of steps."""
  # Get the sets of images and labels for training, validation, and
  # test on MNIST.
- data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)
+ data_sets = input_data.read_data_sets(FLAGS.input_data_dir, FLAGS.fake_data)

  # Tell TensorFlow that the model will be built into the default Graph.
  with tf.Graph().as_default():
@ -146,13 +146,13 @@ def run_training():
    init = tf.initialize_all_variables()

    # Create a saver for writing training checkpoints.
-   saver = tf.train.Saver()
+   saver = tf.train.Saver(write_version=tf.train.SaverDef.V2)

    # Create a session for running Ops on the Graph.
    sess = tf.Session()

    # Instantiate a SummaryWriter to output summaries and the Graph.
-   summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)
+   summary_writer = tf.train.SummaryWriter(FLAGS.log_dir, sess.graph)

    # And then after everything is built:

@ -190,7 +190,7 @@ def run_training():

      # Save a checkpoint and evaluate the model periodically.
      if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
-       checkpoint_file = os.path.join(FLAGS.train_dir, 'checkpoint')
+       checkpoint_file = os.path.join(FLAGS.log_dir, 'model.ckpt')
        saver.save(sess, checkpoint_file, global_step=step)
        # Evaluate against the training set.
        print('Training Data Eval:')
@ -216,6 +216,9 @@ def run_training():


def main(_):
+ if tf.gfile.Exists(FLAGS.log_dir):
+   tf.gfile.DeleteRecursively(FLAGS.log_dir)
+ tf.gfile.MakeDirs(FLAGS.log_dir)
  run_training()


@ -252,10 +255,16 @@ if __name__ == '__main__':
      help='Batch size. Must divide evenly into the dataset sizes.'
  )
  parser.add_argument(
-     '--train_dir',
+     '--input_data_dir',
      type=str,
-     default='data',
-     help='Directory to put the training data.'
+     default='/tmp/tensorflow/mnist/input_data',
+     help='Directory to put the input data.'
  )
+ parser.add_argument(
+     '--log_dir',
+     type=str,
+     default='/tmp/tensorflow/mnist/logs/fully_connected_feed',
+     help='Directory to put the log data.'
+ )
  parser.add_argument(
      '--fake_data',
@ -72,7 +72,7 @@ def main(_):

if __name__ == '__main__':
  parser = argparse.ArgumentParser()
- parser.add_argument('--data_dir', type=str, default='/tmp/data',
-                     help='Directory for storing data')
+ parser.add_argument('--data_dir', type=str,
+                     default='/tmp/tensorflow/mnist/input_data',
+                     help='Directory for storing input data')
  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
@ -137,9 +137,9 @@ def train():

  # Merge all the summaries and write them out to /tmp/mnist_logs (by default)
  merged = tf.summary.merge_all()
- train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train',
+ train_writer = tf.train.SummaryWriter(FLAGS.log_dir + '/train',
                                        sess.graph)
- test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test')
+ test_writer = tf.train.SummaryWriter(FLAGS.log_dir + '/test')
  tf.initialize_all_variables().run()

  # Train the model, and also write summaries.
@ -180,9 +180,9 @@ def train():


def main(_):
- if tf.gfile.Exists(FLAGS.summaries_dir):
-   tf.gfile.DeleteRecursively(FLAGS.summaries_dir)
- tf.gfile.MakeDirs(FLAGS.summaries_dir)
+ if tf.gfile.Exists(FLAGS.log_dir):
+   tf.gfile.DeleteRecursively(FLAGS.log_dir)
+ tf.gfile.MakeDirs(FLAGS.log_dir)
  train()


@ -197,10 +197,9 @@ if __name__ == '__main__':
                      help='Initial learning rate')
  parser.add_argument('--dropout', type=float, default=0.9,
                      help='Keep probability for training dropout.')
- parser.add_argument('--data_dir', type=str, default='/tmp/data',
-                     help='Directory for storing data')
- parser.add_argument('--summaries_dir', type=str, default='/tmp/mnist_logs',
-                     help='Summaries directory')
+ parser.add_argument('--data_dir', type=str,
+                     default='/tmp/tensorflow/mnist/input_data',
+                     help='Directory for storing input data')
+ parser.add_argument('--log_dir', type=str,
+                     default='/tmp/tensorflow/mnist/logs/mnist_with_summaries',
+                     help='Summaries log directory')
  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

@ -11,8 +11,8 @@ the full softmax loss.

At inference time, you can compute full softmax probabilities with the
expression `tf.nn.softmax(tf.matmul(inputs, tf.transpose(weights)) + biases)`.

-See our [Candidate Sampling Algorithms Reference]
-(../../extras/candidate_sampling.pdf)
+See our
+[Candidate Sampling Algorithms Reference](../../extras/candidate_sampling.pdf)

Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.
@ -17,7 +17,7 @@ for k in 0..in_channels-1
        filter[di, dj, k, q]

Must have `strides[0] = strides[3] = 1`. For the most common case of the same
-horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+horizontal and vertical strides, `strides = [1, stride, stride, 1]`.

##### Args:

@ -42,8 +42,7 @@ with an otherwise unused class.
    where a sampled class equals one of the target classes. If set to
    `True`, this is a "Sampled Logistic" loss instead of NCE, and we are
    learning to generate log-odds instead of log probabilities. See
-   our [Candidate Sampling Algorithms Reference]
-   (../../extras/candidate_sampling.pdf).
+   our [Candidate Sampling Algorithms Reference](../../extras/candidate_sampling.pdf).
    Default is False.
* <b>`partition_strategy`</b>: A string specifying the partitioning strategy, relevant
    if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
@ -11,8 +11,8 @@ each component is divided by the weighted, squared sum of inputs within
    sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
    output = input / (bias + alpha * sqr_sum) ** beta

-For details, see [Krizhevsky et al., ImageNet classification with deep
-convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+For details, see
+[Krizhevsky et al., ImageNet classification with deep convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).

##### Args:

@ -22,7 +22,7 @@ In detail, with the default NHWC format,
        filter[di, dj, q, k]

Must have `strides[0] = strides[3] = 1`. For the most common case of the same
-horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+horizontal and vertical strides, `strides = [1, stride, stride, 1]`.

##### Args:

@ -63,37 +63,37 @@ Then, select the correct binary to install:

```bash
# Ubuntu/Linux 64-bit, CPU only, Python 2.7
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl

# Ubuntu/Linux 64-bit, GPU enabled, Python 2.7
-# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below.
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
+# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl

# Mac OS X, CPU only, Python 2.7:
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py2-none-any.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py2-none-any.whl

# Mac OS X, GPU enabled, Python 2.7:
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc0-py2-none-any.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc1-py2-none-any.whl

# Ubuntu/Linux 64-bit, CPU only, Python 3.4
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl

# Ubuntu/Linux 64-bit, GPU enabled, Python 3.4
-# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below.
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl
+# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl

# Ubuntu/Linux 64-bit, CPU only, Python 3.5
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl

# Ubuntu/Linux 64-bit, GPU enabled, Python 3.5
-# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below.
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl
+# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl

# Mac OS X, CPU only, Python 3.4 or 3.5:
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py3-none-any.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py3-none-any.whl

# Mac OS X, GPU enabled, Python 3.4 or 3.5:
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc0-py3-none-any.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc1-py3-none-any.whl
```

Install TensorFlow:

@ -159,37 +159,37 @@ Now, install TensorFlow just as you would for a regular Pip installation. First

```bash
# Ubuntu/Linux 64-bit, CPU only, Python 2.7
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl

# Ubuntu/Linux 64-bit, GPU enabled, Python 2.7
-# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below.
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
+# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl

# Mac OS X, CPU only, Python 2.7:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py2-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py2-none-any.whl

# Mac OS X, GPU enabled, Python 2.7:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc0-py2-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc1-py2-none-any.whl

# Ubuntu/Linux 64-bit, CPU only, Python 3.4
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl

# Ubuntu/Linux 64-bit, GPU enabled, Python 3.4
-# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below.
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl
+# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl

# Ubuntu/Linux 64-bit, CPU only, Python 3.5
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl

# Ubuntu/Linux 64-bit, GPU enabled, Python 3.5
-# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below.
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl
+# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl

# Mac OS X, CPU only, Python 3.4 or 3.5:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py3-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py3-none-any.whl

# Mac OS X, GPU enabled, Python 3.4 or 3.5:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc0-py3-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc1-py3-none-any.whl
```

Finally install TensorFlow:

@ -298,37 +298,37 @@ select the correct binary to install:
|
||||
|
||||
```bash
|
||||
# Ubuntu/Linux 64-bit, CPU only, Python 2.7
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
|
||||
|
||||
# Ubuntu/Linux 64-bit, GPU enabled, Python 2.7
|
||||
# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below.
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
|
||||
# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
|
||||
|
||||
# Mac OS X, CPU only, Python 2.7:
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py2-none-any.whl
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py2-none-any.whl
|
||||
|
||||
# Mac OS X, GPU enabled, Python 2.7:
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc0-py2-none-any.whl
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc1-py2-none-any.whl
|
||||
|
||||
# Ubuntu/Linux 64-bit, CPU only, Python 3.4
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl
|
||||
|
||||
# Ubuntu/Linux 64-bit, GPU enabled, Python 3.4
|
||||
# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below.
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl
|
||||
# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl
|
||||
|
||||
# Ubuntu/Linux 64-bit, CPU only, Python 3.5
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl
|
||||
|
||||
# Ubuntu/Linux 64-bit, GPU enabled, Python 3.5
|
||||
# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below.
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl
|
||||
# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl
|
||||
|
||||
# Mac OS X, CPU only, Python 3.4 or 3.5:
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py3-none-any.whl
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py3-none-any.whl
|
||||
|
||||
# Mac OS X, GPU enabled, Python 3.4 or 3.5:
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc0-py3-none-any.whl
|
||||
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc1-py3-none-any.whl
|
||||
```
|
||||
|
||||
Finally install TensorFlow:
|
||||
@ -396,13 +396,13 @@ code.
|
||||
code.
|
||||
|
||||
We also have tags with `latest` replaced by a released version (e.g.,
|
||||
`0.11.0-gpu`).
|
||||
`0.11.0rc1-gpu`).
|
||||
|
||||
With Docker the installation is as follows:
|
||||
|
||||
* Install Docker on your machine.
|
||||
* Create a [Docker
|
||||
group](http://docs.docker.com/engine/installation/ubuntulinux/#create-a-docker-group)
|
||||
group](https://docs.docker.com/engine/installation/linux/ubuntulinux/#/create-a-docker-group)
|
||||
to allow launching containers without `sudo`.
|
||||
* Launch a Docker container with the TensorFlow image. The image
|
||||
gets downloaded automatically on first launch.
|
||||
@ -780,7 +780,7 @@ $ bazel build -c opt --config=cuda //tensorflow/tools/pip_package:build_pip_pack
|
||||
$ bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg
|
||||
|
||||
# The name of the .whl file will depend on your platform.
|
||||
$ sudo pip install /tmp/tensorflow_pkg/tensorflow-0.11.0rc0-py2-none-any.whl
|
||||
$ sudo pip install /tmp/tensorflow_pkg/tensorflow-0.11.0rc1-py2-none-any.whl
|
||||
```
|
||||
|
||||
## Setting up TensorFlow for Development
|
||||
|
@ -222,12 +222,12 @@ To define a feature column for a categorical feature, we can create a
|
||||
feature values of a column and there are only a few of them, you can use
|
||||
`sparse_column_with_keys`. Each key in the list will get assigned an
|
||||
auto-incremental ID starting from 0. For example, for the `gender` column we can
|
||||
assign the feature string "female" to an integer ID of 0 and "male" to 1 by
|
||||
assign the feature string "Female" to an integer ID of 0 and "Male" to 1 by
|
||||
doing:
|
||||
|
||||
```python
|
||||
gender = tf.contrib.layers.sparse_column_with_keys(
|
||||
column_name="gender", keys=["female", "male"])
|
||||
column_name="gender", keys=["Female", "Male"])
|
||||
```
|
||||
|
||||
What if we don't know the set of possible values in advance? Not a problem. We
|
||||
|
@ -16,7 +16,8 @@ large-scale regression and classification problems with sparse input features
|
||||
you're interested in learning more about how Wide & Deep Learning works, please
|
||||
check out our [research paper](http://arxiv.org/abs/1606.07792).
|
||||
|
||||

|
||||
![Wide & Deep Spectrum of Models]
|
||||
(../../images/wide_n_deep.svg "Wide & Deep")
|
||||
|
||||
The figure above shows a comparison of a wide model (logistic regression with
|
||||
sparse features and transformations), a deep model (feed-forward neural network
|
||||
@ -85,7 +86,9 @@ part and the deep part of the model.
|
||||
import tensorflow as tf
|
||||
|
||||
# Categorical base columns.
|
||||
gender = tf.contrib.layers.sparse_column_with_keys(column_name="gender", keys=["female", "male"])
|
||||
gender = tf.contrib.layers.sparse_column_with_keys(column_name="gender", keys=["Female", "Male"])
|
||||
race = tf.contrib.layers.sparse_column_with_keys(column_name="race", keys=[
|
||||
"Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"])
|
||||
education = tf.contrib.layers.sparse_column_with_hash_bucket("education", hash_bucket_size=1000)
|
||||
relationship = tf.contrib.layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size=100)
|
||||
workclass = tf.contrib.layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100)
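As a rough sketch of where these base columns end up (editorial illustration, not part of this diff; the estimator arguments follow the `tf.contrib.learn` API of this release, and `model_dir` plus the hidden-unit sizes are placeholders):

```python
import tempfile
import tensorflow as tf

wide_columns = [gender, race, education, relationship, workclass]
# Deep features: embed each sparse column into a dense 8-dimensional vector.
deep_columns = [tf.contrib.layers.embedding_column(c, dimension=8)
                for c in wide_columns]

m = tf.contrib.learn.DNNLinearCombinedClassifier(
    model_dir=tempfile.mkdtemp(),
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[100, 50])
```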
@@ -391,4 +391,5 @@ def maybe_download_and_extract():
    print()
    statinfo = os.stat(filepath)
    print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
    tarfile.open(filepath, 'r:gz').extractall(dest_directory)

  tarfile.open(filepath, 'r:gz').extractall(dest_directory)

@@ -339,7 +339,7 @@ def main(_):
      tf.scalar_summary("Validation Loss", mvalid.cost)

    with tf.name_scope("Test"):
      test_input = PTBInput(config=config, data=test_data, name="TestInput")
      test_input = PTBInput(config=eval_config, data=test_data, name="TestInput")
      with tf.variable_scope("Model", reuse=True, initializer=initializer):
        mtest = PTBModel(is_training=False, config=eval_config,
                         input_=test_input)

@@ -347,7 +347,7 @@ def main(_):
    sv = tf.train.Supervisor(logdir=FLAGS.save_path)
    with sv.managed_session() as session:
      for i in range(config.max_max_epoch):
        lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
        lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
        m.assign_lr(session, config.learning_rate * lr_decay)

        print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
@@ -213,7 +213,7 @@ tf_py_test(
    additional_deps = ["//tensorflow:tensorflow_py"],
)

tf_py_test(
cuda_py_test(
    name = "matrix_triangular_solve_op_test",
    size = "small",
    srcs = ["matrix_triangular_solve_op_test.py"],

@@ -21,6 +21,7 @@ from __future__ import print_function
import numpy as np
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
from tensorflow.python.client import device_lib


class Conv2DTransposeTest(tf.test.TestCase):

@@ -157,6 +158,119 @@ class Conv2DTransposeTest(tf.test.TestCase):
    err_tolerance = 0.0005
    self.assertLess(err, err_tolerance)

  def testConv2DTransposeSingleStrideNCHW(self):
    # `NCHW` data format is only supported on GPU devices.
    if tf.test.is_gpu_available():
      with self.test_session(use_gpu=True):
        strides = [1, 1, 1, 1]

        # Input, output: [batch, depth, height, width]
        x_shape = [2, 3, 6, 4]
        y_shape = [2, 2, 6, 4]

        # Filter: [kernel_height, kernel_width, output_depth, input_depth]
        f_shape = [3, 3, 2, 3]

        x = tf.constant(1.0, shape=x_shape, name="x", dtype=tf.float32)
        f = tf.constant(1.0, shape=f_shape, name="filter", dtype=tf.float32)

        output = tf.nn.conv2d_transpose(x, f, y_shape, strides=strides,
                                        padding="SAME", data_format='NCHW')

        value = output.eval()
        for n in xrange(x_shape[0]):
          for k in xrange(f_shape[2]):
            for w in xrange(y_shape[3]):
              for h in xrange(y_shape[2]):
                target = 4 * 3.0
                h_in = h > 0 and h < y_shape[2] - 1
                w_in = w > 0 and w < y_shape[3] - 1
                if h_in and w_in:
                  target += 5 * 3.0
                elif h_in or w_in:
                  target += 2 * 3.0
                self.assertAllClose(target, value[n, k, h, w])

  def testConv2DTransposeSameNCHW(self):
    # `NCHW` data format is only supported on GPU devices.
    if tf.test.is_gpu_available():
      with self.test_session(use_gpu=True):
        strides = [1, 1, 2, 2]

        # Input, output: [batch, depth, height, width]
        x_shape = [2, 3, 6, 4]
        y_shape = [2, 2, 12, 8]

        # Filter: [kernel_height, kernel_width, output_depth, input_depth]
        f_shape = [3, 3, 2, 3]

        x = tf.constant(1.0, shape=x_shape, name="x", dtype=tf.float32)
        f = tf.constant(1.0, shape=f_shape, name="filter", dtype=tf.float32)

        output = tf.nn.conv2d_transpose(x, f, y_shape, strides=strides,
                                        padding="SAME", data_format='NCHW')

        value = output.eval()
        for n in xrange(x_shape[0]):
          for k in xrange(f_shape[2]):
            for w in xrange(y_shape[3]):
              for h in xrange(y_shape[2]):
                target = 3.0
                # We add a case for locations divisible by the stride.
                h_in = h % strides[2] == 0 and h > 0 and h < y_shape[2] - 1
                w_in = w % strides[3] == 0 and w > 0 and w < y_shape[3] - 1
                if h_in and w_in:
                  target += 9.0
                elif h_in or w_in:
                  target += 3.0
                self.assertAllClose(target, value[n, k, h, w])

  def testConv2DTransposeValidNCHW(self):
    # `NCHW` data format is only supported on GPU devices.
    if tf.test.is_gpu_available():
      with self.test_session(use_gpu=True):
        strides = [1, 1, 2, 2]

        # Input, output: [batch, depth, height, width]
        x_shape = [2, 3, 6, 4]
        y_shape = [2, 2, 13, 9]

        # Filter: [kernel_height, kernel_width, output_depth, input_depth]
        f_shape = [3, 3, 2, 3]

        x = tf.constant(1.0, shape=x_shape, name="x", dtype=tf.float32)
        f = tf.constant(1.0, shape=f_shape, name="filter", dtype=tf.float32)
        output = tf.nn.conv2d_transpose(x, f, y_shape, strides=strides,
                                        padding="VALID", data_format='NCHW')

        value = output.eval()
        cache_values = np.zeros(y_shape, dtype=np.float32)
        # The amount of padding added
        pad = 1
        for n in xrange(x_shape[0]):
          for k in xrange(f_shape[2]):
            for w in xrange(pad, y_shape[3] - pad):
              for h in xrange(pad, y_shape[2] - pad):
                target = 3.0
                # We add a case for locations divisible by the stride.
                h_in = h % strides[2] == 0 and h > pad and h < y_shape[2] - 1 - pad
                w_in = w % strides[3] == 0 and w > pad and w < y_shape[3] - 1 - pad
                if h_in and w_in:
                  target += 9.0
                elif h_in or w_in:
                  target += 3.0
                cache_values[n, k, h, w] = target

            # copy values in the border
            cache_values[n, k, :, 0] = cache_values[n, k, :, 1]
            cache_values[n, k, :, -1] = cache_values[n, k, :, -2]
            cache_values[n, k, 0, :] = cache_values[n, k, 1, :]
            cache_values[n, k, -1, :] = cache_values[n, k, -2, :]

        self.assertAllClose(cache_values, value)


if __name__ == "__main__":
  tf.test.main()

@@ -1356,6 +1356,18 @@ class SelectOpTest(tf.test.TestCase):
    elif x.dtype == np.float64:
      self.assertAllClose(jacob_t, jacob_n, rtol=1e-5, atol=1e-5)

  def testScalar(self):
    c = True
    x = np.random.rand(1, 3, 2) * 100
    y = np.random.rand(1, 3, 2) * 100
    for t in [np.float16, np.float32, np.float64, np.int32, np.int64,
              np.complex64, np.complex128]:
      xt = x.astype(t)
      yt = y.astype(t)
      self._compare(c, xt, yt, use_gpu=False)
      if t in [np.float16, np.float32, np.float64]:
        self._compare(c, xt, yt, use_gpu=True)

  def testBasic(self):
    c = np.random.randint(0, 2, 6).astype(np.bool).reshape(1, 3, 2)
    x = np.random.rand(1, 3, 2) * 100

@@ -24,15 +24,17 @@ import tensorflow as tf
class MatrixTriangularSolveOpTest(tf.test.TestCase):

  def _verifySolveAllWays(self, x, y, batch_dims=None):
    for lower in True, False:
      for adjoint in True, False:
        self._verifySolve(x,
                          y,
                          lower=lower,
                          adjoint=adjoint,
                          batch_dims=batch_dims)
    for use_gpu in True, False:
      for lower in True, False:
        for adjoint in True, False:
          self._verifySolve(x,
                            y,
                            lower=lower,
                            adjoint=adjoint,
                            batch_dims=batch_dims,
                            use_gpu=use_gpu)

  def _verifySolve(self, x, y, lower=True, adjoint=False, batch_dims=None):
  def _verifySolve(self, x, y, lower=True, adjoint=False, batch_dims=None, use_gpu=False):
    for np_type in [np.float32, np.float64]:
      a = x.astype(np_type)
      b = y.astype(np_type)

@@ -52,7 +54,7 @@ class MatrixTriangularSolveOpTest(tf.test.TestCase):
        a_np = np.tile(a_np, batch_dims + [1, 1])
        b = np.tile(b, batch_dims + [1, 1])

      with self.test_session():
      with self.test_session(use_gpu=use_gpu):
        tf_ans = tf.matrix_triangular_solve(a, b, lower=lower, adjoint=adjoint)
        out = tf_ans.eval()
        np_ans = np.linalg.solve(a_np, b)
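For readers unfamiliar with the op under test, here is a tiny standalone sketch of what `tf.matrix_triangular_solve` computes (editorial example; the numbers are arbitrary):

```python
import tensorflow as tf

# Solve L x = b by forward substitution, where L is lower triangular.
L = tf.constant([[2.0, 0.0],
                 [1.0, 3.0]])
b = tf.constant([[4.0],
                 [8.0]])
with tf.Session() as sess:
    # Row 1: 2*x0 = 4 -> x0 = 2; row 2: 1*2 + 3*x1 = 8 -> x1 = 2.
    print(sess.run(tf.matrix_triangular_solve(L, b, lower=True)))
```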
@@ -264,6 +264,42 @@ class EluTest(tf.test.TestCase):
      print("elu (float64) gradient err = ", err)
      self.assertLess(err, 1e-6)

  def testGradGradFloat32(self):
    with self.test_session():
      x = tf.constant(
          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
          shape=[2, 5], name="x")
      y = tf.nn.elu(x, name="elu")
      z = tf.gradients(y, x)
      x_init = np.asarray(
          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
          dtype=np.float32, order="F")
      err = tf.test.compute_gradient_error(x,
                                           [2, 5],
                                           z[0],
                                           [2, 5],
                                           x_init_value=x_init)
      print("elu (float32) gradient of gradient err = ", err)
      self.assertLess(err, 1e-4)

  def testGradGradFloat64(self):
    with self.test_session():
      x = tf.constant(
          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
          shape=[2, 5], dtype=tf.float64, name="x")
      y = tf.nn.elu(x, name="elu")
      z = tf.gradients(y, x)
      x_init = np.asarray(
          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
          dtype=np.float64, order="F")
      err = tf.test.compute_gradient_error(x,
                                           [2, 5],
                                           z[0],
                                           [2, 5],
                                           x_init_value=x_init)
      print("elu (float64) gradient of gradient err = ", err)
      self.assertLess(err, 1e-6)


if __name__ == "__main__":
  tf.test.main()

@@ -1795,7 +1795,7 @@ def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
  performed
  instead:
  ```prettyprint
  tf.cumprod([a, b, c], exclusive=True) ==> [0, a, a * b]
  tf.cumprod([a, b, c], exclusive=True) ==> [1, a, a * b]
  ```

  By setting the `reverse` kwarg to `True`, the cumprod is performed in the

@@ -1807,7 +1807,7 @@ def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):

  The `reverse` and `exclusive` kwargs can also be combined:
  ```prettyprint
  tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 0]
  tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 1]
  ```

  Args:
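The corrected identities can be checked with concrete values (an editorial sketch using a=1, b=2, c=3; an exclusive cumulative product starts from the multiplicative identity 1, not 0):

```python
import tensorflow as tf

t = tf.constant([1.0, 2.0, 3.0])
with tf.Session() as sess:
    print(sess.run(tf.cumprod(t, exclusive=True)))                # [1. 1. 2.]
    print(sess.run(tf.cumprod(t, exclusive=True, reverse=True)))  # [6. 3. 1.]
```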
@@ -25,7 +25,7 @@ from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import gen_nn_ops

from tensorflow.python.ops import gen_math_ops

@ops.RegisterGradient("Conv2DBackpropInput")
def _Conv2DBackpropInputGrad(op, grad):

@@ -268,6 +268,14 @@ def _ReluGrad(op, grad):
  return gen_nn_ops._relu_grad(grad, op.outputs[0])


@ops.RegisterGradient("EluGrad")
def _EluGradGrad(op, grad):
  x = op.inputs[1]
  return (gen_nn_ops._elu_grad(grad, op.outputs[0]),
          gen_math_ops.select(x < 0.,
                              gen_nn_ops._elu_grad(grad, op.outputs[0] + 1),
                              array_ops.zeros(shape=array_ops.shape(x),
                                              dtype=x.dtype)))


@ops.RegisterGradient("Relu6")
def _Relu6Grad(op, grad):
  return gen_nn_ops._relu6_grad(grad, op.inputs[0])
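The new `EluGrad` gradient above can be exercised end to end by differentiating ELU twice (an editorial sketch; analytically the second derivative is exp(x) for x < 0 and 0 for x > 0):

```python
import tensorflow as tf

x = tf.constant([-1.0, 0.5])
y = tf.nn.elu(x)
dy = tf.gradients(y, x)[0]    # first derivative of ELU
d2y = tf.gradients(dy, x)[0]  # second derivative; uses the EluGrad gradient
with tf.Session() as sess:
    print(sess.run([dy, d2y]))
```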
@@ -1010,6 +1010,7 @@ def conv2d_transpose(value,
                     output_shape,
                     strides,
                     padding="SAME",
                     data_format="NHWC",
                     name=None):
  """The transpose of `conv2d`.

@@ -1020,7 +1021,8 @@ def conv2d_transpose(value,

  Args:
    value: A 4-D `Tensor` of type `float` and shape
      `[batch, height, width, in_channels]`.
      `[batch, height, width, in_channels]` for `NHWC` data format or
      `[batch, in_channels, height, width]` for `NCHW` data format.
    filter: A 4-D `Tensor` with the same type as `value` and shape
      `[height, width, output_channels, in_channels]`. `filter`'s
      `in_channels` dimension must match that of `value`.

@@ -1030,6 +1032,7 @@ def conv2d_transpose(value,
      dimension of the input tensor.
    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
      See the [comment here](https://www.tensorflow.org/api_docs/python/nn.html#convolution)
    data_format: A string. 'NHWC' and 'NCHW' are supported.
    name: Optional name for the returned tensor.

  Returns:

@@ -1041,9 +1044,12 @@ def conv2d_transpose(value,
  """
  with ops.name_scope(name, "conv2d_transpose",
                      [value, filter, output_shape]) as name:
    if data_format not in ("NCHW", "NHWC"):
      raise ValueError("data_format has to be either NCHW or NHWC.")
    value = ops.convert_to_tensor(value, name="value")
    filter = ops.convert_to_tensor(filter, name="filter")
    if not value.get_shape()[3].is_compatible_with(filter.get_shape()[3]):
    axis = 3 if data_format == "NHWC" else 1
    if not value.get_shape()[axis].is_compatible_with(filter.get_shape()[3]):
      raise ValueError("input channels does not match filter's input channels, "
                       "{} != {}".format(value.get_shape()[3],
                                         filter.get_shape()[3]))

@@ -1055,10 +1061,10 @@ def conv2d_transpose(value,

    if isinstance(output_shape, (list, np.ndarray)):
      # output_shape's shape should be == [4] if reached this point.
      if not filter.get_shape()[2].is_compatible_with(output_shape[3]):
      if not filter.get_shape()[2].is_compatible_with(output_shape[axis]):
        raise ValueError(
            "output_shape does not match filter's output channels, "
            "{} != {}".format(output_shape[3], filter.get_shape()[2]))
            "{} != {}".format(output_shape[axis], filter.get_shape()[2]))

    if padding != "VALID" and padding != "SAME":
      raise ValueError("padding must be either VALID or SAME:"

@@ -1069,6 +1075,7 @@ def conv2d_transpose(value,
                                     out_backprop=value,
                                     strides=strides,
                                     padding=padding,
                                     data_format=data_format,
                                     name=name)
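Put together, the new `data_format` argument can be used like this (an editorial sketch, not from the diff; `NCHW` is currently GPU-only, and the shapes are arbitrary):

```python
import tensorflow as tf

# batch=2, in_channels=3, height=6, width=4, laid out as NCHW.
x = tf.constant(1.0, shape=[2, 3, 6, 4])
# Filter: [kernel_height, kernel_width, output_channels, in_channels].
f = tf.constant(1.0, shape=[3, 3, 2, 3])
y = tf.nn.conv2d_transpose(x, f, output_shape=[2, 2, 12, 8],
                           strides=[1, 1, 2, 2], padding="SAME",
                           data_format="NCHW")
```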
@@ -68,7 +68,7 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
      Must be positive. See the decay computation above.
    decay_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number. The decay rate.
    staircase: Boolean. It `True` decay the learning rate at discrete intervals
    staircase: Boolean. If `True` decay the learning rate at discrete intervals
    name: String. Optional name of the operation. Defaults to
      'ExponentialDecay'.
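For reference, `staircase` is used like this (an editorial sketch with made-up hyperparameters):

```python
import tensorflow as tf

global_step = tf.Variable(0, trainable=False)
# With staircase=True the exponent global_step/1000 is truncated to an
# integer, so the rate drops in discrete steps rather than continuously.
learning_rate = tf.train.exponential_decay(0.1, global_step,
                                           decay_steps=1000,
                                           decay_rate=0.96, staircase=True)
```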
@@ -15,7 +15,10 @@ limitations under the License.

#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"

#if !defined(PLATFORM_WINDOWS)
#include <dirent.h>
#endif

#include <limits.h>
#include <stddef.h>
#include <stdio.h>

@@ -25,11 +28,13 @@ limitations under the License.
#include <IOKit/kext/KextManager.h>
#include <mach-o/dyld.h>
#else
#if !defined(PLATFORM_WINDOWS)
#include <link.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#endif
#include <unistd.h>
#endif
#include <sys/stat.h>
#endif
#include <algorithm>
#include <memory>
#include <vector>

@@ -135,7 +140,7 @@ void Diagnostician::LogDiagnosticInformation() {
              << "(" << port::Hostname() << ")";
  }
  CFRelease(kext_infos);
#else
#elif !defined(PLATFORM_WINDOWS)
  if (access(kDriverVersionPath, F_OK) != 0) {
    LOG(INFO) << "kernel driver does not appear to be running on this host "
              << "(" << port::Hostname() << "): "

@@ -158,7 +163,7 @@ void Diagnostician::LogDiagnosticInformation() {

/* static */ void Diagnostician::LogDriverVersionInformation() {
  LOG(INFO) << "hostname: " << port::Hostname();

#ifndef PLATFORM_WINDOWS
  if (VLOG_IS_ON(1)) {
    const char *value = getenv("LD_LIBRARY_PATH");
    string library_path = value == nullptr ? "" : value;

@@ -180,17 +185,17 @@ void Diagnostician::LogDiagnosticInformation() {
      closedir(dir);
    }
  }

  port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
  LOG(INFO) << "libcuda reported version is: "
            << DriverVersionStatusToString(dso_version);

  port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
  LOG(INFO) << "kernel reported version is: "
      << DriverVersionStatusToString(kernel_version);
            << DriverVersionStatusToString(kernel_version);
#endif

  // OS X kernel driver does not report version accurately
#if !defined(__APPLE__)
#if !defined(__APPLE__) && !defined(PLATFORM_WINDOWS)
  if (kernel_version.ok() && dso_version.ok()) {
    WarnOnDsoKernelMismatch(dso_version, kernel_version);
  }

@@ -227,6 +232,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
    result = StringToDriverVersion(version);
  }
#else
#if !defined(PLATFORM_WINDOWS)
  // Callback used when iterating through DSOs. Looks for the driver-interfacing
  // DSO and yields its version number into the callback data, when found.
  auto iterate_phdr =

@@ -258,6 +264,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
  };

  dl_iterate_phdr(iterate_phdr, &result);
#endif
#endif

  return result;

@@ -3200,6 +3200,7 @@ bool CudnnSupport::DoNormalize(
    Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
    const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) {
  LOG(FATAL) << "not yet implemented";  // TODO(leary)
  return false;
}

bool CudnnSupport::DoNormalizeWithDimensions(

@@ -19,8 +19,8 @@ limitations under the License.
#include <stdint.h>
#include <stdlib.h>
#include <set>
#include "tensorflow/stream_executor/platform/port.h"

#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/stream_executor/dso_loader.h"
#include "tensorflow/stream_executor/lib/casts.h"

@@ -38,6 +38,14 @@ limitations under the License.
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/lib/inlined_vector.h"

#if defined(PLATFORM_WINDOWS)
// TODO: in windows ARRAYSIZE is defined in winnt.h but including it
// here creates a conflict with cuda.h - for now define it here.
#define ARRAYSIZE(a) \
  ((sizeof(a) / sizeof(*(a))) / \
   static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
#endif

bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
bool FLAGS_gpuexec_cuda_device_0_only = false;

@@ -18,8 +18,12 @@ limitations under the License.
#if defined(__APPLE__)
#include <mach-o/dyld.h>
#endif
#if defined(PLATFORM_WINDOWS)
#include <windows.h>
#define PATH_MAX MAX_PATH
#else
#include <unistd.h>

#endif
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_event.h"

@@ -204,7 +208,12 @@ static string GetBinaryDir(bool strip_exe) {
  _NSGetExecutablePath(unresolved_path, &buffer_size);
  CHECK_ERR(realpath(unresolved_path, exe_path) ? 1 : -1);
#else
  CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
#if defined(PLATFORM_WINDOWS)
  HMODULE hModule = GetModuleHandle(NULL);
  GetModuleFileName(hModule, exe_path, MAX_PATH);
#else
  CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
#endif
#endif
  // Make sure it's null-terminated:
  exe_path[sizeof(exe_path) - 1] = 0;

@@ -908,8 +917,10 @@ static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) {
  // could use the file::* utilities).
  FILE *file = fopen(filename.c_str(), "r");
  if (file == nullptr) {
#if !defined(PLATFORM_WINDOWS)
    LOG(ERROR) << "could not open file to read NUMA node: " << filename
               << "\nYour kernel may have been built without NUMA support.";
#endif
    return kUnknownNumaNode;
  }
@@ -15,8 +15,6 @@ limitations under the License.

#include "tensorflow/stream_executor/cuda/cuda_rng.h"

#include <dlfcn.h>

#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
#include "tensorflow/stream_executor/cuda/cuda_helpers.h"

@@ -18,13 +18,17 @@ limitations under the License.

#include "tensorflow/stream_executor/dso_loader.h"

#include <dlfcn.h>
#include <limits.h>
#if defined(__APPLE__)
#include <mach-o/dyld.h>
#endif
#include <stdlib.h>
#if defined(PLATFORM_WINDOWS)
#include <windows.h>
#define PATH_MAX MAX_PATH
#else
#include <unistd.h>
#endif
#include <initializer_list>
#include <vector>

@@ -45,7 +49,7 @@ string GetCudaVersion() { return TF_CUDA_VERSION; }
string GetCudnnVersion() { return TF_CUDNN_VERSION; }

/* static */ port::Status DsoLoader::GetCublasDsoHandle(void** dso_handle) {
  return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName(
  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
                                      "cublas", GetCudaVersion()),
                                  GetCudaLibraryDirPath()),
                      dso_handle);

@@ -55,35 +59,42 @@ string GetCudnnVersion() { return TF_CUDNN_VERSION; }
  // libcudnn is versioned differently than the other libraries and may have a
  // different version number than other CUDA libraries. See b/22397368 for
  // some details about the complications surrounding this.
  return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName(
  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
                                      "cudnn", GetCudnnVersion()),
                                  GetCudaLibraryDirPath()),
                      dso_handle);
}

/* static */ port::Status DsoLoader::GetCufftDsoHandle(void** dso_handle) {
  return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName(
  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
                                      "cufft", GetCudaVersion()),
                                  GetCudaLibraryDirPath()),
                      dso_handle);
}

/* static */ port::Status DsoLoader::GetCurandDsoHandle(void** dso_handle) {
  return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName(
  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
                                      "curand", GetCudaVersion()),
                                  GetCudaLibraryDirPath()),
                      dso_handle);
}

/* static */ port::Status DsoLoader::GetLibcudaDsoHandle(void** dso_handle) {
#if defined(PLATFORM_WINDOWS)
  return GetDsoHandle(
      FindDsoPath(tensorflow::internal::FormatLibraryFileName("cuda", "1"),
      FindDsoPath(port::Env::Default()->FormatLibraryFileName("nvcuda", ""),
                  GetCudaDriverLibraryPath()),
      dso_handle);
#else
  return GetDsoHandle(
      FindDsoPath(port::Env::Default()->FormatLibraryFileName("cuda", "1"),
                  GetCudaDriverLibraryPath()),
      dso_handle);
#endif
}

/* static */ port::Status DsoLoader::GetLibcuptiDsoHandle(void** dso_handle) {
  return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName(
  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
                                      "cupti", GetCudaVersion()),
                                  GetCudaCuptiLibraryPath()),
                      dso_handle);

@@ -101,8 +112,6 @@ string GetCudnnVersion() { return TF_CUDNN_VERSION; }
    return port::Status(port::error::INVALID_ARGUMENT,
                        "Only LoadKind::kLocal is currently supported");
  }
  int dynload_flags =
      RTLD_LAZY | (load_kind == LoadKind::kLocal ? RTLD_LOCAL : RTLD_GLOBAL);
  string path_string = path.ToString();
  port::Status s =
      port::Env::Default()->LoadLibrary(path_string.c_str(), dso_handle);

@@ -125,6 +134,9 @@ string GetCudnnVersion() { return TF_CUDNN_VERSION; }
  char unresolved_path[buffer_size];
  _NSGetExecutablePath(unresolved_path, &buffer_size);
  CHECK_ERR(realpath(unresolved_path, exe_path) ? 1 : -1);
#elif defined(PLATFORM_WINDOWS)
  HMODULE hModule = GetModuleHandle(NULL);
  GetModuleFileName(hModule, exe_path, MAX_PATH);
#else
  CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
#endif

@@ -159,6 +171,9 @@ static std::vector<string>* CreatePrimordialRpaths() {
}

/* static */ bool DsoLoader::TrySymbolicDereference(string* candidate) {
#if defined(PLATFORM_WINDOWS)
  return false;
#else
  char buf[PATH_MAX];
  char* result = realpath(candidate->c_str(), buf);
  if (result == nullptr) {

@@ -168,6 +183,7 @@ static std::vector<string>* CreatePrimordialRpaths() {
            << result << "\"";
  *candidate = result;
  return true;
#endif
}

/* static */ string DsoLoader::FindDsoPath(port::StringPiece library_name,

@@ -206,6 +222,8 @@ static std::vector<string>* CreatePrimordialRpaths() {
/* static */ string DsoLoader::GetCudaDriverLibraryPath() {
#if defined(__APPLE__)
  return "external/local_config_cuda/cuda/driver/lib";
#elif defined(PLATFORM_WINDOWS)
  return "";
#else
  return "external/local_config_cuda/cuda/driver/lib64";
#endif

@@ -15,8 +15,13 @@ limitations under the License.

#include "tensorflow/stream_executor/lib/process_state.h"

#if defined(PLATFORM_WINDOWS)
#include <direct.h>
#include <stdlib.h>
#include <WinSock2.h>
#else
#include <unistd.h>

#endif
#include <memory>

namespace perftools {

@@ -27,7 +32,7 @@ string Hostname() {
  char hostname[1024];
  gethostname(hostname, sizeof hostname);
  hostname[sizeof hostname - 1] = 0;
  return hostname;
  return std::string(hostname);
}

bool GetCurrentDirectory(string* dir) {

@@ -16,6 +16,10 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATIC_THREADLOCAL_H_
#define TENSORFLOW_STREAM_EXECUTOR_LIB_STATIC_THREADLOCAL_H_

#ifdef _MSC_VER
#define __thread __declspec(thread)
#endif

// For POD types in TLS mode, s_obj_VAR is the thread-local variable.
#define SE_STATIC_THREAD_LOCAL_POD(_Type_, _var_) \
  static __thread _Type_ s_obj_##_var_; \
@@ -81,7 +81,7 @@ def ParseEventFilesSpec(logdir):
    else:
      run_name = None
      path = specification
    if not io_wrapper.IsGCSPath(path):
    if not (io_wrapper.IsGCSPath(path) or path.startswith('hdfs://')):
      path = os.path.realpath(path)
    files[path] = run_name
  return files

@@ -563,7 +563,7 @@ def _py_wrap_cc_impl(ctx):
  for dep in ctx.attr.deps:
    inputs += dep.cc.transitive_headers
  inputs += ctx.files._swiglib
  swig_include_dirs = set([f.root.path for f in inputs if f.root.path])
  swig_include_dirs = set(_get_repository_roots(ctx, inputs))
  swig_include_dirs += sorted([f.dirname for f in ctx.files._swiglib])
  args = ["-c++",
          "-python",

@@ -616,6 +616,35 @@ _py_wrap_cc = rule(
    implementation = _py_wrap_cc_impl,
)

def _get_repository_roots(ctx, files):
  """Returns abnormal root directories under which files reside.

  When running a ctx.action, source files within the main repository are all
  relative to the current directory; however, files that are generated or exist
  in remote repositories will have their root directory be a subdirectory,
  e.g. bazel-out/local-fastbuild/genfiles/external/jpeg_archive. This function
  returns the set of these devious directories, ranked and sorted by popularity
  in order to hopefully minimize the number of I/O system calls within the
  compiler, because includes have quadratic complexity.
  """
  result = {}
  for f in files:
    root = f.root.path
    if root:
      if root not in result:
        result[root] = 0
      result[root] -= 1
    work = f.owner.workspace_root
    if work:
      if root:
        root += "/"
      root += work
    if root:
      if root not in result:
        result[root] = 0
      result[root] -= 1
  return [k for v, k in sorted([(v, k) for k, v in result.items()])]

# Bazel rule for collecting the header files that a target depends on.
def _transitive_hdrs_impl(ctx):
  outputs = set()
@@ -47,10 +47,6 @@
# TF_BUILD_BAZEL_CLEAN, if set to any non-empty and non-0 value, directs the
# script to perform bazel clean prior to main build and test steps.
#
# TF_BUILD_SERIAL_INSTALL_TESTS, if set to any non-empty and non-0 value,
# will force the Python install tests to run serially, overriding the
# concurrent testing behavior.
#
# TF_GPU_COUNT, Set the number of GPUs in the system. We run only this many
# concurrent tests when running GPU tests.
#

@@ -411,21 +407,21 @@ SKIP_COUNTER=0
FAILED_TESTS=""
FAILED_TEST_LOGS=""

N_JOBS=$(grep -c ^processor /proc/cpuinfo)
if [[ -z ${N_JOBS} ]]; then
  # Try the Mac way of getting number of CPUs
  N_JOBS=$(sysctl -n hw.ncpu)
fi

if [[ -z ${N_JOBS} ]]; then
  N_JOBS=8
  echo "Cannot determine the number of processors"
  echo "Using default concurrent job counter ${N_JOBS}"
fi

if [[ ! -z "${TF_BUILD_SERIAL_INSTALL_TESTS}" ]] &&
   [[ "${TF_BUILD_SERIAL_INSTALL_TESTS}" != "0" ]]; then
if [[ "${IS_GPU}" == "1" ]]; then
  N_JOBS=$TF_GPU_COUNT
else
  N_JOBS=$(grep -c ^processor /proc/cpuinfo)
  if [[ -z ${N_JOBS} ]]; then
    # Try the Mac way of getting number of CPUs
    N_JOBS=$(sysctl -n hw.ncpu)
  fi

  # If still cannot determine the number of CPUs, pick 8.
  if [[ -z ${N_JOBS} ]]; then
    N_JOBS=8
    echo "Cannot determine the number of processors"
    echo "Using default concurrent job counter ${N_JOBS}"
  fi
fi

echo "Running Python tests-on-install with ${N_JOBS} concurrent jobs..."

@@ -485,9 +481,14 @@ while true; do
    TEST_LOGS="${TEST_LOGS} ${TEST_LOG}"

    # Launch test asynchronously
    "${SCRIPT_DIR}/../gpu_build/parallel_gpu_execute.sh" \
    if [[ "${IS_GPU}" == "1" ]]; then
      "${SCRIPT_DIR}/../gpu_build/parallel_gpu_execute.sh" \
        "${SCRIPT_DIR}/py_test_delegate.sh" \
        "${PYTHON_BIN_PATH}" "${PY_TEST_DIR}/${TEST_BASENAME}" "${TEST_LOG}" &
    else
      "${SCRIPT_DIR}/py_test_delegate.sh" \
        "${PYTHON_BIN_PATH}" "${PY_TEST_DIR}/${TEST_BASENAME}" "${TEST_LOG}" &
        "${PYTHON_BIN_PATH}" "${PY_TEST_DIR}/${TEST_BASENAME}" "${TEST_LOG}" &
    fi

    if [[ "${TEST_COUNTER}" -ge "${N_PAR_TESTS}" ]]; then
      # Run in exclusive mode
tensorflow/tools/ci_build/builds/test_tutorials.sh (Normal file → Executable file)
@@ -146,7 +146,7 @@ test_mnist_with_summaries() {

  run_in_directory "${TEST_DIR}" "${LOG_FILE}" \
    tensorflow/examples/tutorials/mnist/mnist_with_summaries.py \
    --data_dir="${TUT_TEST_DATA_DIR}/mnist" --summaries_dir="${SUMMARIES_DIR}"
    --data_dir="${TUT_TEST_DATA_DIR}/mnist" --log_dir="${SUMMARIES_DIR}"

  # Verify final accuracy
  FINAL_ACCURACY=$(grep "Accuracy at step" "${LOG_FILE}" \

@@ -103,10 +103,8 @@ WORKSPACE="${WORKSPACE:-$(upsearch WORKSPACE)}"
BUILD_TAG="${BUILD_TAG:-tf_ci}"

# Add extra params for cuda devices and libraries for GPU container.
if [ "${CONTAINER_TYPE}" == "gpu" ]; then
  # GPU pip tests-on-install concurrency is limited to the number of GPUs.
  GPU_EXTRA_PARAMS="${GPU_EXTRA_PARAMS} -e TF_BUILD_SERIAL_INSTALL_TESTS=1"
else
  # And clear them if we are not building for GPU.
if [ "${CONTAINER_TYPE}" != "gpu" ]; then
  GPU_EXTRA_PARAMS=""
fi

@@ -16,7 +16,14 @@
#
# Builds the test server for distributed (GRPC) TensorFlow
#
# Usage: build_server.sh <docker_image_name> [--test]
# Usage: build_server.sh <docker_image_name> <whl_url> [--test]
#
# Arguments:
#   docker_image_name: Name of the docker image to build.
#     E.g.: tensorflow/tf_grpc_test_server:0.11.0rc1
#
#   whl_url: URL from which the TensorFlow whl file will be downloaded.
#     E.g.: https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
#
# The optional flag --test lets the script use the Dockerfile for the
# testing GRPC server. Without the flag, the script will build the non-test

@@ -33,22 +40,35 @@ die() {
}

# Check arguments
if [[ $# != 1 ]] && [[ $# != 2 ]]; then
  die "Usage: $0 <docker_image_name> [--test]"
if [[ $# -lt 2 ]]; then
  die "Usage: $0 <docker_image_name> <whl_url> [--test]"
fi

DOCKER_IMG_NAME=$1
shift
WHL_URL=$2
shift 2

# Current script directory
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

DOCKER_FILE="${DIR}/server/Dockerfile"
BUILD_DIR=$(mktemp -d)
echo ""
echo "Using whl file URL: ${WHL_URL}"
echo "Building in temporary directory: ${BUILD_DIR}"

cp -r ${DIR}/* "${BUILD_DIR}"/ || \
  die "Failed to copy files to ${BUILD_DIR}"

DOCKER_FILE="${BUILD_DIR}/server/Dockerfile"
if [[ $1 == "--test" ]]; then
  DOCKER_FILE="${DIR}/server/Dockerfile.test"
  DOCKER_FILE="${BUILD_DIR}/server/Dockerfile.test"
fi
echo "Using Docker file: ${DOCKER_FILE}"

# Download whl file into the build context directory.
wget -P "${BUILD_DIR}" ${WHL_URL} || \
  die "Failed to download tensorflow whl file from URL: ${WHL_URL}"

if [[ ! -f "${DOCKER_FILE}" ]]; then
  die "ERROR: Unable to find dockerfile: ${DOCKER_FILE}"
fi

@@ -56,5 +76,8 @@ echo "Dockerfile: ${DOCKER_FILE}"

# Call docker build
docker build --no-cache -t "${DOCKER_IMG_NAME}" \
  -f "${DOCKER_FILE}" \
  "${DIR}"
  -f "${DOCKER_FILE}" "${BUILD_DIR}" || \
  die "Failed to build docker image: ${DOCKER_IMG_NAME}"

# Clean up docker build context directory.
rm -rf "${BUILD_DIR}"
@@ -34,9 +34,10 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py

# Install TensorFlow CPU version from nightly build
RUN pip --no-cache-dir install \
    https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
# Install TensorFlow wheel
COPY tensorflow-*.whl /
RUN pip install /tensorflow-*.whl && \
    rm -f /tensorflow-*.whl

# Copy files, including the GRPC server binary at
# server/grpc_tensorflow_server.py

@@ -40,9 +40,10 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
# Install python pandas for the census wide&deep test
RUN pip install --upgrade pandas==0.18.1

# Install TensorFlow CPU version.
RUN pip --no-cache-dir install \
    https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
# Install TensorFlow wheel
COPY tensorflow-*.whl /
RUN pip install /tensorflow-*.whl && \
    rm -f /tensorflow-*.whl

# Copy files, including the GRPC server binary at
# server/grpc_tensorflow_server.py

@@ -33,7 +33,7 @@ RUN pip --no-cache-dir install \
    && \
    python -m ipykernel.kernelspec

ENV TENSORFLOW_VERSION 0.11.0rc0
ENV TENSORFLOW_VERSION 0.11.0rc1

# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
# These lines will be edited automatically by parameterized_docker_build.sh. #

@@ -33,7 +33,7 @@ RUN pip --no-cache-dir install \
    && \
    python -m ipykernel.kernelspec

ENV TENSORFLOW_VERSION 0.11.0rc0
ENV TENSORFLOW_VERSION 0.11.0rc1

# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
# These lines will be edited automatically by parameterized_docker_build.sh. #

@@ -17,7 +17,7 @@ RUN ./install_google_cloud_sdk.bash --disable-prompts --install-dir=/var/gcloud

# Install nightly TensorFlow pip
RUN pip install \
    https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
    https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl

# Copy test files
RUN mkdir -p /gcs-smoke/python

@@ -81,7 +81,6 @@ fi
cat ${LOG_FILE}
echo ""


# Clean up the newly created tfrecord file in GCS bucket.
# First, activate gcloud service account
"${GCLOUD_BIN}" auth activate-service-account \

@@ -96,13 +95,3 @@ fi
"${GSUTIL_BIN}" rm "${NEW_TFREC_URL}" && \
  echo "Cleaned up new tfrecord file in GCS: ${NEW_TFREC_URL}" || \
  die "FAIL: Unable to clean up new tfrecord file in GCS: ${NEW_TFREC_URL}"

# Also clean up newly created GCS dir.
NEW_DIR_URL=$(grep "Creating dir" "${LOG_FILE}" | \
  awk '{print $NF}')
if [[ -z ${NEW_DIR_URL} ]]; then
  die "FAIL: Unable to determine the URL to the new directory created in GCS."
fi
"${GSUTIL_BIN}" rm -r "${NEW_DIR_URL}" && \
  echo "Cleaned up new directory created in GCS: ${NEW_DIR_URL}" || \
  die "FAIL: Unable to clean up new directory created in GCS: ${NEW_DIR_URL}"

@@ -35,7 +35,6 @@ flags.DEFINE_integer("num_examples", 10, "Number of examples to generate")

FLAGS = flags.FLAGS


def create_examples(num_examples, input_mean):
  """Create ExampleProto's containing data."""
  ids = np.arange(num_examples).reshape([num_examples, 1])

@@ -64,12 +63,48 @@ def create_dir_test():
  print("%s directory exists: %s" % (dir_name, dir_exists))

  # List contents of just created directory.
  starttime = int(round(time.time() * 1000))
  print("Listing directory %s." % dir_name)
  starttime = int(round(time.time() * 1000))
  print(file_io.list_directory(dir_name))
  elapsed = int(round(time.time() * 1000)) - starttime
  print("Listed directory %s in %s milliseconds" % (dir_name, elapsed))

  # Delete directory.
  print("Deleting directory %s." % dir_name)
  starttime = int(round(time.time() * 1000))
  file_io.delete_recursively(dir_name)
  elapsed = int(round(time.time() * 1000)) - starttime
  print("Deleted directory %s in %s milliseconds" % (dir_name, elapsed))

def create_object_test():
  """Verifies file_io's object manipulation methods."""
  starttime = int(round(time.time() * 1000))
  dir_name = "%s/tf_gcs_test_%s" % (FLAGS.gcs_bucket_url, starttime)
  print("Creating dir %s." % dir_name)
  file_io.create_dir(dir_name)

  # Create a file in this directory.
  file_name = "%s/test_file.txt" % dir_name
  print("Creating file %s." % file_name)
  file_io.write_string_to_file(file_name, "test file creation.")

  list_files_pattern = "%s/test_file*.txt" % dir_name
  print("Getting files matching pattern %s." % list_files_pattern)
  files_list = file_io.get_matching_files(list_files_pattern)
  print(files_list)

  assert len(files_list) == 1
  assert files_list[0] == file_name

  # Cleanup test files.
  print("Deleting file %s." % file_name)
  file_io.delete_file(file_name)

  # Delete directory.
  print("Deleting directory %s." % dir_name)
  file_io.delete_recursively(dir_name)


if __name__ == "__main__":
  # Sanity check on the GCS bucket URL.
  if not FLAGS.gcs_bucket_url or not FLAGS.gcs_bucket_url.startswith("gs://"):

@@ -132,4 +167,5 @@ if __name__ == "__main__":
    print("Successfully caught the expected OutOfRangeError while "
          "reading one more record than is available")

  create_dir_test()
  create_dir_test()
  create_object_test()

@@ -147,7 +147,7 @@ def get_git_version(git_base_path):
  """
  unknown_label = b"unknown"
  try:
    val = subprocess.check_output(["git", "-C", git_base_path, "describe",
    val = subprocess.check_output(["git", str("--git-dir="+git_base_path+"/.git"), str("--work-tree="+git_base_path), "describe",
                                   "--long", "--dirty", "--tags"]).strip()
    return val if val else unknown_label
  except subprocess.CalledProcessError:

@@ -107,7 +107,8 @@ function main() {
  mkdir -p ${TMPDIR}/third_party
  pushd ${RUNFILES%org_tensorflow}
  for header in $(find protobuf -name \*.h); do
    cp --parents "$header" ${TMPDIR}/google;
    mkdir -p "${TMPDIR}/google/$(dirname ${header})"
    cp "$header" "${TMPDIR}/google/$(dirname ${header})/"
  done
  popd
  cp -R $RUNFILES/third_party/eigen3 ${TMPDIR}/third_party

@@ -26,7 +26,7 @@ from setuptools import find_packages, setup, Command
from setuptools.command.install import install as InstallCommandBase
from setuptools.dist import Distribution

_VERSION = '0.11.0rc0'
_VERSION = '0.11.0rc1'

REQUIRED_PACKAGES = [
    'numpy >= 1.11.0',
tensorflow/tools/swig/.gitignore (vendored, new file)
@@ -0,0 +1 @@
swig_path

@@ -98,9 +98,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):

  native.http_archive(
      name = "protobuf",
      url = "http://github.com/google/protobuf/archive/c2b3e70efd2038a54ef8973771ac58192885125e.tar.gz",
      sha256 = "eafc1bc4c27970d62effe64ba6610823fdd66711f440d8ca4a168167786a2fcb",
      strip_prefix = "protobuf-c2b3e70efd2038a54ef8973771ac58192885125e",
      url = "http://github.com/google/protobuf/archive/008b5a228b37c054f46ba478ccafa5e855cb16db.tar.gz",
      sha256 = "2737ad055eb8a9bc63ed068e32c4ea280b62d8236578cb4d4120eb5543f759ab",
      strip_prefix = "protobuf-008b5a228b37c054f46ba478ccafa5e855cb16db",
  )

  native.new_http_archive(

@@ -1,3 +1,6 @@
#ifdef _WIN32
#define sleep(seconds) Sleep(1000*seconds)
#endif // _WIN32
#include "unsupported/Eigen/CXX11/Tensor"

#ifdef _WIN32

@@ -113,29 +113,33 @@ function setup_python {
    echo -e "\n\nERROR: Problem getting python include path. Is distutils installed?"
    exit 1
  fi
  local python_lib_path
  # Split python_path into an array of paths, this allows path containing spaces
  IFS=','
  python_lib_path=($(python_path))
  unset IFS
  echo "Found possible Python library paths:"
  for x in "${python_lib_path[@]}"; do
    echo "  $x"
  done
  set -- "${python_lib_path[@]}"
  echo "Please input the desired Python library path to use. Default is ["$1"]"
  read b || true
  if [ "$b" == "" ]; then
    python_lib="$(default_python_path "${python_lib_path[0]}")"
    echo $python_lib
  else
    if test -d "$b" -a -x "$b"; then
      python_lib="$b"

  if [ -z "$PYTHON_LIB_PATH" ]; then
    local python_lib_path
    # Split python_path into an array of paths, this allows path containing spaces
    IFS=','
    python_lib_path=($(python_path))
    unset IFS
    echo "Found possible Python library paths:"
    for x in "${python_lib_path[@]}"; do
      echo "  $x"
    done
    set -- "${python_lib_path[@]}"
    echo "Please input the desired Python library path to use. Default is ["$1"]"
    read b || true
    if [ "$b" == "" ]; then
      PYTHON_LIB_PATH="$(default_python_path "${python_lib_path[0]}")"
      echo $PYTHON_LIB_PATH
    else
      echo -e "\n\nERROR: The path you have entered does not exist."
      exit 1
      PYTHON_LIB_PATH="$b"
    fi
  fi
  if test -d "$PYTHON_LIB_PATH" -a -x "$PYTHON_LIB_PATH"; then
    python_lib="$PYTHON_LIB_PATH"
  else
    echo -e "\n\nERROR: Invalid python library path: ${PYTHON_LIB_PATH}."
    exit 1
  fi

  local numpy_include=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; import numpy; print(numpy.get_include());')
  if [ "$numpy_include" == "" ]; then