Merge changes from github.

Change: 136750267
2016-10-20 12:09:18 -08:00 · 2016-10-20 12:09:18 -08:00 · c5ab3dd177
commit c5ab3dd177
parent 8532897352
106 changed files with 1572 additions and 502 deletions
--- a/README.md
+++ b/README.md
@ -61,6 +61,6 @@ Hello, TensorFlow!
 * [TensorFlow website](http://tensorflow.org)
 * [TensorFlow whitepaper](http://download.tensorflow.org/paper/whitepaper2015.pdf)
 * [TensorFlow Model Zoo](https://github.com/tensorflow/models)
-* [TensorFlow MOOC on Udacity] (https://www.udacity.com/course/deep-learning--ud730)
+* [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730)

 The TensorFlow community has created amazing things with TensorFlow, please see the [resources section of tensorflow.org](https://www.tensorflow.org/versions/master/resources#community) for an incomplete list.
--- a/RELEASE.md
+++ b/RELEASE.md
@ -84,7 +84,7 @@ Snyder, @jpangburn, Jules Gagnon-Marchand, Karen Brems, @kborer, Kirill Bobyrev,
 Laurent Mazare, Longqi Yang, Malith Yapa, Maniteja Nandana, Martin Englund,
 Matthias Winkelmann, @mecab, Mu-Ik Jeon, Nand Dalal, Niels Ole Salscheider,
 Nikhil Mishra, Park Jiin, Pieter De Rijk, @raix852, Ritwik Gupta, Sahil Sharma,
-@Sangheum, @SergejsRk, Shinichiro Hamaji, Simon Denel, @Steve, @suiyuan2009,
+Sangheum Hwang, @SergejsRk, Shinichiro Hamaji, Simon Denel, @Steve, @suiyuan2009,
 Tiago Jorge, Tijmen Tieleman, @tvn, @tyfkda, Wang Yang, Wei-Ting Kuo, Wenjian
 Huang, Yan Chen, @YenChenLin, Yuan (Terry) Tang, Yuncheng Li, Yunfeng Wang, Zack
 Polizzi, @zhongzyd, Ziming Dong, @perhapszzy
--- a/34
+++ b/34
@ -8,8 +8,22 @@ pushd `dirname $0` #> /dev/null
 SOURCE_BASE_DIR=`pwd -P`
 popd > /dev/null

+PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"
+# Reports whether this script is running under MSYS on Windows.
+# Globals: PLATFORM (read) - lower-cased output of `uname -s`.
+# Returns: 0 when PLATFORM starts with "msys_nt", non-zero otherwise.
+function is_windows() {
+  # On windows, the shell script is actually running in msys.
+  # An anchored glob is used on purpose: as a regex, `msys_nt*` means
+  # "msys_n" followed by zero or more "t" and matches anywhere in the string.
+  [[ "${PLATFORM}" == msys_nt* ]]
+}
+
 function bazel_clean_and_fetch() {
-  bazel clean --expunge
+  # bazel clean --expunge currently doesn't work on Windows
+  # TODO(pcloudy): Re-enable it after bazel clean --expunge is fixed.
+  if ! is_windows; then
+    bazel clean --expunge
+  fi
  bazel fetch //tensorflow/...
 }

@ -35,6 +49,12 @@ while true; do
  # Retry
 done

+if is_windows; then
+  TF_NEED_GCP=0
+  TF_NEED_HDFS=0
+  TF_NEED_CUDA=0
+fi
+
 while [ "$TF_NEED_GCP" == "" ]; do
  read -p "Do you wish to build TensorFlow with "\
 "Google Cloud Platform support? [y/N] " INPUT
@ -89,12 +109,16 @@ fi

 ## Find swig path
 if [ -z "$SWIG_PATH" ]; then
-  SWIG_PATH=`type -p swig 2> /dev/null`
+  SWIG_PATH=`type -p swig 2> /dev/null || true`
 fi
 if [[ ! -e "$SWIG_PATH" ]]; then
  echo "Can't find swig.  Ensure swig is in \$PATH or set \$SWIG_PATH."
  exit 1
 fi
+# Convert swig path to Windows style before writing into swig_path
+if is_windows; then
+  SWIG_PATH="$(cygpath -m "$SWIG_PATH")"
+fi
 echo "$SWIG_PATH" > tensorflow/tools/swig/swig_path

 # Invoke python_config and set up symlinks to python includes
@ -104,7 +128,7 @@ echo "$SWIG_PATH" > tensorflow/tools/swig/swig_path
 # git hash propagation
 GEN_GIT_SOURCE=tensorflow/tools/git/gen_git_source.py
 chmod a+x ${GEN_GIT_SOURCE}
-${PYTHON_BIN_PATH} ${GEN_GIT_SOURCE} --configure ${SOURCE_BASE_DIR}
+"${PYTHON_BIN_PATH}" ${GEN_GIT_SOURCE} --configure "${SOURCE_BASE_DIR}"

 ## Set up Cuda-related environment settings

@ -255,8 +279,8 @@ while true; do
    CUDA_DNN_LIB_PATH="lib64/libcudnn.so${TF_CUDNN_EXT}"
    CUDA_DNN_LIB_ALT_PATH="libcudnn.so${TF_CUDNN_EXT}"
  elif [ "$OSNAME" == "Darwin" ]; then
-    CUDA_DNN_LIB_PATH="lib/libcudnn${TF_CUDNN_EXT}.dylib"
-    CUDA_DNN_LIB_ALT_PATH="libcudnn${TF_CUDNN_EXT}.dylib"
+    CUDA_DNN_LIB_PATH="lib/libcudnn${TF_CUDNN_EXT}"
+    CUDA_DNN_LIB_ALT_PATH="libcudnn${TF_CUDNN_EXT}"
  fi

  if [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_ALT_PATH}" -o -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_PATH}" ]; then
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@ -44,6 +44,12 @@ config_setting(
    visibility = ["//visibility:public"],
 )

+config_setting(
+    name = "windows",
+    values = {"cpu": "x64_windows_msvc"},
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
    name = "ios",
    values = {
@ -109,6 +115,7 @@ filegroup(
        "//tensorflow/contrib/ndlstm:all_files",
        "//tensorflow/contrib/opt:all_files",
        "//tensorflow/contrib/rnn:all_files",
+        "//tensorflow/contrib/seq2seq:all_files",
        "//tensorflow/contrib/session_bundle:all_files",
        "//tensorflow/contrib/session_bundle/example:all_files",
        "//tensorflow/contrib/slim:all_files",
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@ -965,8 +965,6 @@ typedef struct TF_Library TF_Library;
 // Pass "library_filename" to a platform-specific mechanism for dynamically
 // loading a library. The rules for determining the exact location of the
 // library are platform-specific and are not documented here.
-// Expects the symbols "RegisterOps", "RegisterKernels", and "GetOpList", to be
-// defined in the library.
 //
 // On success, place OK in status and return the newly created library handle.
 // The caller owns the library handle.
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@ -32,6 +32,7 @@ py_library(
        "//tensorflow/contrib/opt:opt_py",
        "//tensorflow/contrib/quantization:quantization_py",
        "//tensorflow/contrib/rnn:rnn_py",
+        "//tensorflow/contrib/seq2seq:seq2seq_py",
        "//tensorflow/contrib/slim",
        "//tensorflow/contrib/slim:nets",
        "//tensorflow/contrib/tensor_forest:tensor_forest_py",
--- a/tensorflow/contrib/init.py
+++ b/tensorflow/contrib/init.py
@ -37,6 +37,7 @@ from tensorflow.contrib import metrics
 from tensorflow.contrib import opt
 from tensorflow.contrib import quantization
 from tensorflow.contrib import rnn
+from tensorflow.contrib import seq2seq
 from tensorflow.contrib import slim
 from tensorflow.contrib import tensor_forest
 from tensorflow.contrib import tensorboard
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@ -48,7 +48,9 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 add_definitions(-DEIGEN_AVOID_STL_ARRAY)
 if(WIN32)
  add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC -D__VERSION__=\"MSVC\")
-  set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} /MP)
+  # Suppress warnings to reduce build log size.
+  add_definitions(/wd4267 /wd4244 /wd4800 /wd4503 /wd4554 /wd4996 /wd4348 /wd4018)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
 endif()

 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
@ -56,6 +58,7 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
 endif()

 # External dependencies
+include(zlib)
 include(gif)
 include(png)
 include(jpeg)
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@ -19,7 +19,7 @@ endif()
  
 ExternalProject_Add(grpc
    PREFIX grpc
-    DEPENDS protobuf
+    DEPENDS protobuf zlib
    GIT_REPOSITORY ${GRPC_URL}
    GIT_TAG ${GRPC_TAG}
    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
--- a/tensorflow/contrib/cmake/external/png.cmake
+++ b/tensorflow/contrib/cmake/external/png.cmake
@ -19,6 +19,7 @@ set(png_HEADERS

 ExternalProject_Add(png
    PREFIX png
+    DEPENDS zlib
    URL ${png_URL}
    URL_HASH ${png_HASH}
    INSTALL_DIR ${png_INSTALL}
@ -28,6 +29,7 @@ ExternalProject_Add(png
        -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
        -DCMAKE_INSTALL_PREFIX:STRING=${png_INSTALL}
 	-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+	-DZLIB_ROOT:STRING=${ZLIB_INSTALL}
 )

 ## put png includes in the directory where they are expected
--- a/tensorflow/contrib/cmake/external/protobuf.cmake
+++ b/tensorflow/contrib/cmake/external/protobuf.cmake
@ -15,6 +15,7 @@ endif()

 ExternalProject_Add(protobuf
    PREFIX protobuf
+    DEPENDS zlib
    GIT_REPOSITORY ${PROTOBUF_URL}
    GIT_TAG ${PROTOBUF_TAG}
    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
@ -29,4 +30,5 @@ ExternalProject_Add(protobuf
        -DCMAKE_BUILD_TYPE:STRING=Release
        -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+	-DZLIB_ROOT:STRING=${ZLIB_INSTALL}
 )
--- a/tensorflow/contrib/cmake/external/zlib.cmake
+++ b/tensorflow/contrib/cmake/external/zlib.cmake
@ -0,0 +1,46 @@
+include (ExternalProject)
+
+# Settings for fetching zlib from git and building/installing it under the
+# CMake binary directory:
+#   zlib_INCLUDE_DIR - directory the zlib headers are copied into (see below)
+#   ZLIB_URL/ZLIB_TAG - repository and exact commit to check out
+#   ZLIB_BUILD       - source/build tree of the external project
+#   ZLIB_INSTALL     - install prefix passed to zlib's own CMake build
+set(zlib_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/zlib_archive)
+set(ZLIB_URL https://github.com/madler/zlib)
+set(ZLIB_BUILD ${CMAKE_CURRENT_BINARY_DIR}/zlib/src/zlib)
+set(ZLIB_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/zlib/install)
+set(ZLIB_TAG 50893291621658f355bc5b4d450a8d06a563053d)
+
+# Library file expected under the install prefix; the file name differs
+# between Windows and other platforms.
+if(WIN32)
+  set(zlib_STATIC_LIBRARIES
+      ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlib.lib)
+else()
+  set(zlib_STATIC_LIBRARIES
+      ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/libz.a)
+endif()
+
+# Headers under the install prefix that get copied to zlib_INCLUDE_DIR.
+set(ZLIB_HEADERS
+    "${ZLIB_INSTALL}/include/zconf.h"
+    "${ZLIB_INSTALL}/include/zlib.h"
+)
+
+ExternalProject_Add(zlib
+    PREFIX zlib
+    GIT_REPOSITORY ${ZLIB_URL}
+    GIT_TAG ${ZLIB_TAG}
+    INSTALL_DIR ${ZLIB_INSTALL}
+    BUILD_IN_SOURCE 1
+    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+    CMAKE_CACHE_ARGS
+        -DCMAKE_BUILD_TYPE:STRING=Release
+        -DCMAKE_INSTALL_PREFIX:STRING=${ZLIB_INSTALL}
+	-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+)
+
+# put zlib includes in the directory where they are expected
+# Step 1: create ${zlib_INCLUDE_DIR}; this target depends on the zlib
+# external project.
+add_custom_target(zlib_create_destination_dir
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${zlib_INCLUDE_DIR}
+    DEPENDS zlib)
+
+# Step 2: a target that other targets can depend on to get the headers copied.
+add_custom_target(zlib_copy_headers_to_destination
+    DEPENDS zlib_create_destination_dir)
+
+# Attach one copy command per header in ZLIB_HEADERS to the target above.
+foreach(header_file ${ZLIB_HEADERS})
+    add_custom_command(TARGET zlib_copy_headers_to_destination PRE_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy ${header_file} ${zlib_INCLUDE_DIR})
+endforeach()
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@ -41,9 +41,6 @@ if(NOT NUMPY_INCLUDE_DIR)
  endif(${NUMPY_NOT_FOUND})
 endif(NOT NUMPY_INCLUDE_DIR)

-# 3. Resolve the installed version of zlib (for libz.so).
-find_package(ZLIB REQUIRED)
-

 ########################################################
 # Build the Python directory structure.
--- a/tensorflow/contrib/distributions/python/kernel_tests/beta_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/beta_test.py
@ -17,7 +17,7 @@ from __future__ import division
 from __future__ import print_function

 import numpy as np
-from scipy import stats
+from scipy import stats, special
 import tensorflow as tf


@ -308,6 +308,40 @@ class BetaTest(tf.test.TestCase):
      self.assertAllClose(tf.nn.softplus(a).eval(), dist.a.eval())
      self.assertAllClose(tf.nn.softplus(b).eval(), dist.b.eval())

+  def testBetaBetaKL(self):
+    # Checks tf.contrib.distributions.kl for every pairing of Beta and
+    # BetaWithSoftplusAB against a closed-form value built from scipy.special,
+    # for two batch shapes.
+    with self.test_session() as sess:
+      for shape in [(10,), (4,5)]:
+        # Random strictly positive parameters (offset by 1e-4 away from zero).
+        a1 = 6.0*np.random.random(size=shape) + 1e-4
+        b1 = 6.0*np.random.random(size=shape) + 1e-4 
+        a2 = 6.0*np.random.random(size=shape) + 1e-4
+        b2 = 6.0*np.random.random(size=shape) + 1e-4 
+        # Take inverse softplus of values to test BetaWithSoftplusAB
+        a1_sp = np.log(np.exp(a1) - 1.0)
+        b1_sp = np.log(np.exp(b1) - 1.0)
+        a2_sp = np.log(np.exp(a2) - 1.0)
+        b2_sp = np.log(np.exp(b2) - 1.0)
+
+        d1 = tf.contrib.distributions.Beta(a=a1, b=b1)
+        d2 = tf.contrib.distributions.Beta(a=a2, b=b2)
+        d1_sp = tf.contrib.distributions.BetaWithSoftplusAB(a=a1_sp, b=b1_sp)
+        d2_sp = tf.contrib.distributions.BetaWithSoftplusAB(a=a2_sp, b=b2_sp)
+
+        # Reference value: ln(B(a2, b2) / B(a1, b1)) plus the digamma terms.
+        kl_expected = (special.betaln(a2, b2) - special.betaln(a1, b1)
+                     + (a1 - a2)*special.digamma(a1)
+                     + (b1 - b2)*special.digamma(b1)
+                     + (a2 - a1 + b2 - b1)*special.digamma(a1 + b1))
+
+        # All four combinations must agree with the same reference value.
+        for dist1 in [d1, d1_sp]:
+          for dist2 in [d2, d2_sp]:
+            kl = tf.contrib.distributions.kl(dist1, dist2)
+            kl_val = sess.run(kl)
+            self.assertEqual(kl.get_shape(), shape)
+            self.assertAllClose(kl_val, kl_expected)
+        
+        # Make sure KL(d1||d1) is 0
+        kl_same = sess.run(tf.contrib.distributions.kl(d1, d1))
+        self.assertAllClose(kl_same, np.zeros_like(kl_expected))
+

 if __name__ == "__main__":
  tf.test.main()
--- a/tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py
@ -222,5 +222,34 @@ class CategoricalTest(tf.test.TestCase):
      dist = tf.contrib.distributions.Categorical(tf.log(histograms) - 50.)
      self.assertAllEqual(dist.mode().eval(), [[1, 0]])

+  def testCategoricalCategoricalKL(self):
+    # Checks tf.contrib.distributions.kl between two Categorical distributions
+    # against sum(p * (ln p - ln q)) computed with NumPy, for several category
+    # counts and batch sizes.
+    def np_softmax(logits):
+      # NumPy softmax over the last axis, used to build the reference value.
+      exp_logits = np.exp(logits)
+      return exp_logits / exp_logits.sum(axis=-1, keepdims=True)
+
+    with self.test_session() as sess:
+      for categories in [2, 4]:
+        for batch_size in [1, 10]:
+          a_logits = np.random.randn(batch_size, categories)
+          b_logits = np.random.randn(batch_size, categories)
+
+          a = tf.contrib.distributions.Categorical(logits=a_logits)
+          b = tf.contrib.distributions.Categorical(logits=b_logits)
+
+          kl = tf.contrib.distributions.kl(a, b)
+          kl_val = sess.run(kl)
+          # Make sure KL(a||a) is 0
+          kl_same = sess.run(tf.contrib.distributions.kl(a, a))
+
+          prob_a = np_softmax(a_logits)
+          prob_b = np_softmax(b_logits)
+          kl_expected = np.sum(
+              prob_a * (np.log(prob_a) - np.log(prob_b)), axis=-1)
+
+          # One KL value per batch entry.
+          self.assertEqual(kl.get_shape(), (batch_size,))
+          self.assertAllClose(kl_val, kl_expected)
+          self.assertAllClose(kl_same, np.zeros_like(kl_expected))
+
+
 if __name__ == "__main__":
  tf.test.main()
--- a/tensorflow/contrib/distributions/python/ops/beta.py
+++ b/tensorflow/contrib/distributions/python/ops/beta.py
@ -22,6 +22,7 @@ import numpy as np

 from tensorflow.contrib.distributions.python.ops import distribution
 from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.contrib.framework.python.framework import tensor_util as contrib_tensor_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@ -299,3 +300,39 @@ class BetaWithSoftplusAB(Beta):
          allow_nan_stats=allow_nan_stats,
          name=ns)
    self._parameters = parameters
+
+
+def _kl_beta_beta(d1, d2, name=None):
+  """Calculate the batched KL divergence KL(d1 || d2) with d1 and d2 Beta.
+
+  Args:
+    d1: instance of a Beta distribution object.
+    d2: instance of a Beta distribution object.
+    name: (optional) Name to use for created operations.
+      default is "kl_beta_beta".
+
+  Returns:
+    Batchwise KL(d1 || d2)
+  """
+  inputs = [d1.a, d1.b, d1.a_b_sum, d2.a_b_sum]
+  with ops.name_scope(name, "kl_beta_beta", inputs):
+    # ln(B(a', b') / B(a, b))
+    log_betas = (math_ops.lgamma(d2.a) + math_ops.lgamma(d2.b)
+                - math_ops.lgamma(d2.a_b_sum) + math_ops.lgamma(d1.a_b_sum)
+                - math_ops.lgamma(d1.a) - math_ops.lgamma(d1.b))
+    # (a - a')*psi(a) + (b - b')*psi(b) + (a' - a + b' - b)*psi(a + b)
+    digammas = ((d1.a - d2.a)*math_ops.digamma(d1.a)
+              + (d1.b - d2.b)*math_ops.digamma(d1.b)
+              + (d2.a_b_sum - d1.a_b_sum)*math_ops.digamma(d1.a_b_sum))
+    return log_betas + digammas
+
+
+# Register KL divergences.
+# Every ordered pair of classes from kl_classes (a class paired with itself
+# included) is registered with the same implementation, _kl_beta_beta.
+kl_classes = [
+    Beta,
+    BetaWithSoftplusAB,
+]
+
+for beta_aa in kl_classes:
+  for beta_bb in kl_classes:
+    kullback_leibler.RegisterKL(beta_aa, beta_bb)(_kl_beta_beta)
--- a/tensorflow/contrib/distributions/python/ops/categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/categorical.py
@ -20,6 +20,7 @@ from __future__ import print_function

 from tensorflow.contrib.distributions.python.ops import distribution
 from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@ -223,3 +224,24 @@ class Categorical(distribution.Distribution):
    ret = math_ops.cast(ret, self.dtype)
    ret.set_shape(self.get_batch_shape())
    return ret
+
+
+@kullback_leibler.RegisterKL(Categorical, Categorical)
+def _kl_categorical_categorical(a, b, name=None):
+  """Calculate the batched KL divergence KL(a || b) with a and b Categorical.
+
+  Args:
+    a: instance of a Categorical distribution object.
+    b: instance of a Categorical distribution object.
+    name: (optional) Name to use for created operations.
+      default is "kl_categorical_categorical".
+
+  Returns:
+    Batchwise KL(a || b)
+  """
+  with ops.name_scope(
+    name, "kl_categorical_categorical", [a.logits, b.logits]):
+    # sum(p*ln(p/q))
+    return math_ops.reduce_sum(
+        nn_ops.softmax(a.logits)*(nn_ops.log_softmax(a.logits)
+            - nn_ops.log_softmax(b.logits)), reduction_indices=[-1])
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@ -162,7 +162,7 @@ def _fused_batch_norm(
      updates = tf.group(*update_ops)
      total_loss = control_flow_ops.with_dependencies([updates], total_loss)

-  One can set update_collections=None to force the updates in place, but that
+  One can set updates_collections=None to force the updates in place, but that
  can have speed penalty, specially in distributed settings.

  Args:
@ -204,24 +204,36 @@ def _fused_batch_norm(
  Raises:
    ValueError: if `data_format` is neither `NHWC` nor `NCHW`.
    ValueError: if the rank of `inputs` is undefined.
-    ValueError: if rank or last dimension of `inputs` is undefined.
+    ValueError: if the rank of `inputs` is neither 2 nor 4.
+    ValueError: if the `C` dimension of `inputs` is undefined.
  """
  if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
    raise ValueError('data_format has to be either NCHW or NHWC.')
  with variable_scope.variable_scope(
      scope, 'BatchNorm', [inputs], reuse=reuse) as sc:
    inputs = ops.convert_to_tensor(inputs)
+    original_shape = inputs.get_shape()
+    original_rank = original_shape.ndims
+    if original_rank is None:
+      raise ValueError('Inputs %s has undefined rank' % inputs.name)
+    elif original_rank not in [2, 4]:
+      raise ValueError('Inputs %s has unsupported rank. \
+          Expected 2 or 4 but got %d' % (inputs.name, original_rank))
+    if original_rank == 2:
+      channels = inputs.get_shape()[-1].value
+      if channels is None:
+        raise ValueError('`C` dimension must be known but is None')
+      new_shape = [-1, channels, 1, 1] if data_format == DATA_FORMAT_NCHW else \
+          [-1, 1, 1, channels]
+      inputs = array_ops.reshape(inputs, new_shape)
    inputs_shape = inputs.get_shape()
-    inputs_rank = inputs_shape.ndims
-    if inputs_rank is None:
-      raise ValueError('Inputs %s has undefined rank.' % inputs.name)
    dtype = inputs.dtype.base_dtype
    if data_format == DATA_FORMAT_NHWC:
      params_shape = inputs_shape[-1:]
    else:
      params_shape = inputs_shape[1:2]
    if not params_shape.is_fully_defined():
-      raise ValueError('Inputs %s has undefined last dimension %s.' %
+      raise ValueError('Inputs %s has undefined `C` dimension %s.' %
                       (inputs.name, params_shape))

    # Allocate parameters for the beta and gamma of the normalization.
@ -277,31 +289,31 @@ def _fused_batch_norm(
        trainable=False,
        collections=moving_variance_collections)

+    def _fused_batch_norm_training():
+      return nn.fused_batch_norm(
+          inputs, gamma, beta, epsilon=epsilon, data_format=data_format)
+    def _fused_batch_norm_inference():
+      return nn.fused_batch_norm(
+          inputs,
+          gamma,
+          beta,
+          mean=moving_mean,
+          variance=moving_variance,
+          epsilon=epsilon,
+          is_training=False,
+          data_format=data_format)
+    outputs, mean, variance = utils.smart_cond(is_training,
+                                               _fused_batch_norm_training,
+                                               _fused_batch_norm_inference)
+
    # If `is_training` doesn't have a constant value, because it is a `Tensor`,
    # a `Variable` or `Placeholder` then is_training_value will be None and
-    # `needs_moments` will be true.
+    # `need_updates` will be true.
    is_training_value = utils.constant_value(is_training)
-    need_moments = is_training_value is None or is_training_value
-    if need_moments:
-      # Calculate the moments based on the individual batch.
-      def _fused_batch_norm_training():
-        return nn.fused_batch_norm(
-            inputs, gamma, beta, epsilon=epsilon, data_format=data_format)
-      def _fused_batch_norm_inference():
-        return nn.fused_batch_norm(
-            inputs,
-            gamma,
-            beta,
-            mean=moving_mean,
-            variance=moving_variance,
-            epsilon=epsilon,
-            is_training=False,
-            data_format=data_format)
-      outputs, mean, variance = utils.smart_cond(is_training,
-                                                 _fused_batch_norm_training,
-                                                 _fused_batch_norm_inference)
-      moving_vars_fn = lambda: (moving_mean, moving_variance)
+    need_updates = is_training_value is None or is_training_value
+    if need_updates:
      if updates_collections is None:
+        _no_updates = lambda: outputs
        def _force_updates():
          """Internal function forces updates moving_vars if is_training."""
          update_moving_mean = moving_averages.assign_moving_average(
@ -310,12 +322,10 @@ def _fused_batch_norm(
              moving_variance, variance, decay)
          with ops.control_dependencies(
              [update_moving_mean, update_moving_variance]):
-            return array_ops.identity(mean), array_ops.identity(variance)
-        mean, variance = utils.smart_cond(is_training, _force_updates,
-                                          moving_vars_fn)
-        with ops.control_dependencies([mean, variance]):
-          outputs = array_ops.identity(outputs)
+            return array_ops.identity(outputs)
+        outputs = utils.smart_cond(is_training, _force_updates, _no_updates)
      else:
+        moving_vars_fn = lambda: (moving_mean, moving_variance)
        def _delay_updates():
          """Internal function that delay updates moving_vars if is_training."""
          update_moving_mean = moving_averages.assign_moving_average(
@ -328,22 +338,10 @@ def _fused_batch_norm(
                                                        moving_vars_fn)
        ops.add_to_collections(updates_collections, update_mean)
        ops.add_to_collections(updates_collections, update_variance)
-        # Use computed moments during training and moving_vars otherwise.
-        vars_fn = lambda: (mean, variance)
-        mean, variance = utils.smart_cond(is_training, vars_fn, moving_vars_fn)
-    else:
-      mean, variance = moving_mean, moving_variance
-      outputs, _, _ = nn.fused_batch_norm(
-          inputs,
-          gamma,
-          beta,
-          mean=moving_mean,
-          variance=moving_variance,
-          epsilon=epsilon,
-          is_training=False,
-          data_format=data_format)

    outputs.set_shape(inputs_shape)
+    if original_shape.ndims == 2:
+      outputs = array_ops.reshape(outputs, original_shape)
    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections,
@ -610,6 +608,7 @@ def bias_add(inputs,
             variables_collections=None,
             outputs_collections=None,
             trainable=True,
+             data_format=DATA_FORMAT_NHWC,
             scope=None):
  """Adds a bias to the inputs.

@ -629,16 +628,34 @@ def bias_add(inputs,
    outputs_collections: collections to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
+    data_format: A string. 'NHWC' and 'NCHW' are supported.
    scope: Optional scope for variable_scope.

  Returns:
    a tensor representing the result of adding biases to the inputs.
+
+  Raises:
+    ValueError: if `data_format` is neither `NHWC` nor `NCHW`.
+    ValueError: if `data_format` is `NCHW` and rank of `inputs` is not 4.
+    ValueError: if the rank of `inputs` is undefined.
+    ValueError: if the `C` dimension of `inputs` is undefined.
  """
+  if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
+    raise ValueError('data_format has to be either NCHW or NHWC.')
  with variable_scope.variable_scope(scope, 'BiasAdd', [inputs],
                                     reuse=reuse) as sc:
    inputs = ops.convert_to_tensor(inputs)
    dtype = inputs.dtype.base_dtype
-    num_features = utils.last_dimension(inputs.get_shape(), min_rank=2)
+    inputs_shape = inputs.get_shape()
+    inputs_rank = inputs_shape.ndims
+    if inputs_rank is None:
+      raise ValueError('Dims of shape must be known but is None')
+    elif inputs_rank != 4 and data_format == DATA_FORMAT_NCHW:
+      raise ValueError('Data format NCHW only supports 4D Tensor')
+    axis = 1 if data_format==DATA_FORMAT_NCHW else -1
+    num_features = inputs_shape[axis].value
+    if num_features is None:
+      raise ValueError('`C` dimension must be known but is None')
    biases_collections = utils.get_variable_collections(variables_collections,
                                                        'biases')
    biases = variables.model_variable('biases',
@ -648,7 +665,7 @@ def bias_add(inputs,
                                      regularizer=regularizer,
                                      collections=biases_collections,
                                      trainable=trainable)
-    outputs = nn.bias_add(inputs, biases)
+    outputs = nn.bias_add(inputs, biases, data_format=data_format)
    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections,
--- a/tensorflow/contrib/layers/python/layers/utils.py
+++ b/tensorflow/contrib/layers/python/layers/utils.py
@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function

 from collections import namedtuple
+from collections import OrderedDict
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
@ -99,15 +100,15 @@ def get_tensor_alias(tensor):


 def convert_collection_to_dict(collection):
-  """Returns a dict of Tensors using get_tensor_alias as key.
+  """Returns an OrderedDict of Tensors using get_tensor_alias as key.

  Args:
    collection: A collection.

  Returns:
-    A dictionary of {get_tensor_alias(tensor): tensor}
+    An OrderedDict of {get_tensor_alias(tensor): tensor}
  """
-  return {get_tensor_alias(t): t for t in ops.get_collection(collection)}
+  return OrderedDict((get_tensor_alias(t), t) for t in ops.get_collection(collection))


 def constant_value(value_or_tensor_or_var, dtype=None):
--- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/batch.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/batch.py
@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/in_memory_source.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/in_memory_source.py
@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/tensorflow/contrib/metrics/python/ops/confusion_matrix_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/confusion_matrix_ops.py
@ -33,27 +33,34 @@ def confusion_matrix(predictions, labels, num_classes=None, dtype=dtypes.int32,
  Calculate the Confusion Matrix for a pair of prediction and
  label 1-D int arrays.

-  Considering a prediction array such as: `[1, 2, 3]`
-  And a label array such as: `[2, 2, 3]`
-
-  The confusion matrix returned would be the following one:
-
-  ```python
-      [[0, 0, 0, 0]
-       [0, 0, 1, 0]
-       [0, 0, 1, 0]
-       [0, 0, 0, 1]]
-  ```
-
-  If `weights` is not None, then the confusion matrix elements are the
-  corresponding `weights` elements.
-
-  Where the matrix rows represent the prediction labels and the columns
+  The matrix rows represent the prediction labels and the columns
  represents the real labels. The confusion matrix is always a 2-D array
-  of shape [n, n], where n is the number of valid labels for a given
+  of shape `[n, n]`, where `n` is the number of valid labels for a given
  classification task. Both prediction and labels must be 1-D arrays of
  the same shape in order for this function to work.

+  If `num_classes` is None, then `num_classes` will be set to one plus
+  the maximum value in either predictions or labels.
+  Class labels are expected to start at 0. E.g., if `num_classes` was
+  three, then the possible labels would be `[0, 1, 2]`.
+
+  If `weights` is not `None`, then each prediction contributes its
+  corresponding weight to the total value of the confusion matrix cell.
+
+  For example:
+
+  ```python
+    tf.contrib.metrics.confusion_matrix([1, 2, 4], [2, 2, 4]) ==>
+        [[0 0 0 0 0]
+         [0 0 1 0 0]
+         [0 0 1 0 0]
+         [0 0 0 0 0]
+         [0 0 0 0 1]]
+  ```
+
+  Note that the possible labels are assumed to be `[0, 1, 2, 3, 4]`,
+  resulting in a 5x5 confusion matrix.
+
  Args:
    predictions: A 1-D array representing the predictions for a given
                 classification.
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@ -3080,6 +3080,7 @@ def aggregate_metric_map(names_to_tuples):
  This function is useful for pairing metric names with their associated value
  and update ops when the list of metrics is long. For example:

+  ```python
    metrics_to_values, metrics_to_updates = slim.metrics.aggregate_metric_map({
        'Mean Absolute Error': new_slim.metrics.streaming_mean_absolute_error(
            predictions, labels, weights),
@ -3090,6 +3091,7 @@ def aggregate_metric_map(names_to_tuples):
        'RMSE Log': new_slim.metrics.streaming_root_mean_squared_error(
            predictions, labels, weights),
    })
+  ```

  Args:
    names_to_tuples: a map of metric names to tuples, each of which contain the
--- a/tensorflow/contrib/seq2seq/BUILD
+++ b/tensorflow/contrib/seq2seq/BUILD
@ -0,0 +1,52 @@
+# Description:
+#   contains parts of TensorFlow that are experimental or unstable and which are not supported.
+#   This package holds the contrib seq2seq Python library and its tests.
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+# Python library: the package __init__ plus every module under python/ops.
+py_library(
+    name = "seq2seq_py",
+    srcs = ["__init__.py"] + glob(["python/ops/*.py"]),
+    srcs_version = "PY2AND3",
+)
+
+# Test target for python/kernel_tests/layers_test.py.
+cuda_py_test(
+    name = "layers_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/layers_test.py"],
+    additional_deps = [
+        ":seq2seq_py",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+# Test target for python/kernel_tests/loss_test.py.
+cuda_py_test(
+    name = "loss_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/loss_test.py"],
+    additional_deps = [
+        ":seq2seq_py",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+# Every file in this package except METADATA and OWNERS files.
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
--- a/tensorflow/contrib/seq2seq/README.md
+++ b/tensorflow/contrib/seq2seq/README.md
@ -0,0 +1,9 @@
+# TensorFlow contrib seq2seq layers and losses
+
+## Layers
+
+Information to be added.
+
+## Losses
+
+Information to be added.
--- a/tensorflow/contrib/seq2seq/init.py
+++ b/tensorflow/contrib/seq2seq/init.py
@ -0,0 +1,26 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Ops for building neural network seq2seq layers and losses."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+# pylint: disable=unused-import,wildcard-import,line-too-long
+from tensorflow.contrib.seq2seq.python.ops import layers
+from tensorflow.contrib.seq2seq.python.ops import loss
--- a/tensorflow/contrib/seq2seq/python/init.py
+++ b/tensorflow/contrib/seq2seq/python/init.py
@ -0,0 +1,19 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python sources of the contrib seq2seq package."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/layers_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/layers_test.py
@ -0,0 +1,36 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for contrib.seq2seq.python.ops.layers."""
+# pylint: disable=unused-import,g-bad-import-order
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+# pylint: enable=unused-import
+
+import tensorflow as tf
+
+
+class LayersTest(tf.test.TestCase):
+
+  def testRNNDecoder(self):
+    pass
+
+  def testRNNDecoderAttention(self):
+    pass
+
+
+if __name__ == '__main__':
+  tf.test.main()
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
@ -0,0 +1,33 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for contrib.seq2seq.python.ops.loss."""
+# pylint: disable=unused-import,g-bad-import-order
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+# pylint: enable=unused-import
+
+import tensorflow as tf
+
+
+class LossTest(tf.test.TestCase):
+
+  def testLoss(self):
+    pass
+
+
+if __name__ == '__main__':
+  tf.test.main()
--- a/tensorflow/contrib/seq2seq/python/ops/layers.py
+++ b/tensorflow/contrib/seq2seq/python/ops/layers.py
@ -0,0 +1,35 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Seq2seq layer operations for use in neural networks.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import array_ops
+
+
+__all__ = ["rnn_decoder",
+           "rnn_decoder_attention"]
+
+
+def rnn_decoder(*args, **kwargs):
+  pass
+
+
+def rnn_decoder_attention(*args, **kwargs):
+  pass
--- a/tensorflow/contrib/seq2seq/python/ops/loss.py
+++ b/tensorflow/contrib/seq2seq/python/ops/loss.py
@ -0,0 +1,30 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Seq2seq loss operations for use in neural networks.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import array_ops
+
+
+__all__ = ["seq2seq_loss"]
+
+
+def seq2seq_loss(*args, **kwargs):
+  pass
--- a/tensorflow/contrib/session_bundle/README.md
+++ b/tensorflow/contrib/session_bundle/README.md
@ -34,12 +34,10 @@ definition that's needed for serving.
 *   `assets` -- Asset file directory
    *   Holds auxiliary files for the graph (e.g., vocabularies)
 *   `export.meta` -- MetaGraph Definition
-    *   Binary [`tensorflow::MetaGraphDef`]
-        (https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/protobuf/meta_graph.proto)
+    *   Binary [`tensorflow::MetaGraphDef`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/protobuf/meta_graph.proto)
 *   `export-?????-of-?????`
    *   A checkpoint of the Graph Variables
-    *   Outputs from Python [`Saver`]
-        (https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python/training/saver.py)
+    *   Outputs from Python [`Saver`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python/training/saver.py)
        with `sharded=True`.

 ## Exporting (Python code)
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@ -64,6 +64,7 @@ load(
    "//tensorflow:tensorflow.bzl",
    "if_android",
    "if_ios",
+    "if_not_windows",
    "tf_copts",
    "tf_cc_test",
    "tf_cc_tests",
@ -140,6 +141,7 @@ cc_library(
        "platform/protobuf.h",
        "platform/types.h",
    ] + glob(tf_additional_proto_hdrs()),
+    copts = tf_copts(),
    deps = [
        ":protos_all_cc",
        "//tensorflow/core/platform/default/build_config:proto_parsing",
@ -294,8 +296,6 @@ tf_cuda_library(
        "util/example_proto_fast_parsing.h",
        "util/example_proto_helper.h",
        "util/guarded_philox_random.h",
-        "util/memmapped_file_system.h",
-        "util/memmapped_file_system_writer.h",
        "util/mirror_pad_mode.h",
        "util/padding.h",
        "util/port.h",
@ -312,7 +312,13 @@ tf_cuda_library(
        "util/use_cudnn.h",
        "util/util.h",
        "util/work_sharder.h",
-    ],
+    ] + select({
+        "//tensorflow:windows": [],
+        "//conditions:default": [
+            "util/memmapped_file_system.h",
+            "util/memmapped_file_system_writer.h",
+        ],
+    }),
    visibility = ["//visibility:public"],
    deps = [":framework_internal"],
 )
@ -513,7 +519,6 @@ cc_library(
        "//tensorflow/core/kernels:control_flow_ops",
        "//tensorflow/core/kernels:ctc_ops",
        "//tensorflow/core/kernels:data_flow",
-        "//tensorflow/core/kernels:fact_op",
        "//tensorflow/core/kernels:function_ops",
        "//tensorflow/core/kernels:image",
        "//tensorflow/core/kernels:io",
@ -524,7 +529,6 @@ cc_library(
        "//tensorflow/core/kernels:nn",
        "//tensorflow/core/kernels:parameterized_truncated_normal_op",
        "//tensorflow/core/kernels:parsing",
-        "//tensorflow/core/kernels:quantized_ops",
        "//tensorflow/core/kernels:random_ops",
        "//tensorflow/core/kernels:required",
        "//tensorflow/core/kernels:sdca_ops",
@ -533,7 +537,12 @@ cc_library(
        "//tensorflow/core/kernels:string",
        "//tensorflow/core/kernels:training_ops",
        "//tensorflow/models/embedding:word2vec_kernels",
-    ],
+    ] + if_not_windows([
+        "//tensorflow/core/kernels:fact_op",
+        "//tensorflow/core/kernels:array_not_windows",
+        "//tensorflow/core/kernels:math_not_windows",
+        "//tensorflow/core/kernels:quantized_ops",
+    ]),
 )

 tf_cuda_library(
@ -874,12 +883,12 @@ cc_library(
 # Libraries with GPU facilities that are useful for writing kernels.
 cc_library(
    name = "gpu_lib",
-    srcs = [
+    srcs = if_not_windows([
        "common_runtime/gpu/gpu_event_mgr.cc",
-    ],
-    hdrs = [
+    ]),
+    hdrs = if_not_windows([
        "common_runtime/gpu/gpu_event_mgr.h",
-    ],
+    ]),
    copts = tf_copts(),
    visibility = ["//visibility:public"],
    deps = [
@ -889,8 +898,7 @@ cc_library(
        ":lib_internal",
        ":proto_text",
        ":protos_all_cc",
-        ":stream_executor",
-    ],
+    ] + if_not_windows([":stream_executor"]),
 )

 cc_library(
@ -950,26 +958,47 @@ tf_proto_library_cc(

 cc_library(
    name = "lib_internal",
-    srcs = glob(
-        [
-            "lib/**/*.h",
-            "lib/**/*.cc",
-            "platform/*.h",
-            "platform/*.cc",
-            "platform/profile_utils/**/*.h",
-            "platform/profile_utils/**/*.cc",
-        ] + tf_additional_lib_srcs(),
-        exclude =
+    srcs = select({
+        "//tensorflow:windows": glob(
            [
+                "lib/**/*.h",
+                "lib/**/*.cc",
+                "platform/*.h",
+                "platform/*.cc",
+            ],
+            exclude = [
                "**/*test*",
                "platform/**/cuda.h",
                "platform/**/stream_executor.h",
-            ] +
-            # Protobuf deps already included through the ":lib_proto_parsing"
-            # dependency.
-            tf_additional_proto_srcs(),
+                "platform/load_library.cc",
+            ],
+        ),
+        "//conditions:default": glob(
+            [
+                "lib/**/*.h",
+                "lib/**/*.cc",
+                "platform/*.h",
+                "platform/*.cc",
+                "platform/profile_utils/**/*.h",
+                "platform/profile_utils/**/*.cc",
+            ],
+            exclude = [
+                "**/*test*",
+                "platform/**/cuda.h",
+                "platform/**/stream_executor.h",
+            ],
+        ),
+    }) + tf_additional_lib_srcs(
+        exclude = [
+            "**/*test*",
+            "platform/**/cuda.h",
+            "platform/**/stream_executor.h",
+        ] +
+        # Protobuf deps already included through the ":lib_proto_parsing"
+        # dependency.
+        tf_additional_proto_srcs(),
    ),
-    hdrs = glob(tf_additional_lib_hdrs()) + [
+    hdrs = tf_additional_lib_hdrs() + [
        "lib/core/blocking_counter.h",
        "lib/core/refcount.h",
        "lib/gif/gif_io.h",
@ -1039,6 +1068,7 @@ tf_version_info_genrule()
 cc_library(
    name = "version_lib",
    srcs = ["util/version_info.cc"],
+    copts = tf_copts(),
 )

 tf_cuda_library(
@ -1060,8 +1090,18 @@ tf_cuda_library(
            "util/reporter.h",
            "util/reporter.cc",
            "framework/fake_input.*",
+            "util/memmapped_file_system.*",
+            "util/memmapped_file_system_writer.*",
        ],
-    ),
+    ) + select({
+        "//tensorflow:windows": [],
+        "//conditions:default": [
+            "util/memmapped_file_system.h",
+            "util/memmapped_file_system.cc",
+            "util/memmapped_file_system_writer.h",
+            "util/memmapped_file_system_writer.cc",
+        ],
+    }),
    hdrs = [
        "framework/op_segment.h",
        "framework/rendezvous.h",  # only needed for tests
@ -1335,7 +1375,7 @@ tf_cuda_library(

 tf_cuda_library(
    name = "gpu_runtime",
-    srcs = [
+    srcs = if_not_windows([
        "common_runtime/gpu/gpu_bfc_allocator.cc",
        "common_runtime/gpu/gpu_debug_allocator.cc",
        "common_runtime/gpu/gpu_device.cc",
@ -1347,8 +1387,8 @@ tf_cuda_library(
        "common_runtime/gpu/pool_allocator.cc",
        "common_runtime/gpu/process_state.cc",
        "common_runtime/gpu_device_context.h",
-    ],
-    hdrs = [
+    ]),
+    hdrs = if_not_windows([
        "common_runtime/gpu/gpu_bfc_allocator.h",
        "common_runtime/gpu/gpu_debug_allocator.h",
        "common_runtime/gpu/gpu_device.h",
@ -1357,7 +1397,7 @@ tf_cuda_library(
        "common_runtime/gpu/gpu_util.h",
        "common_runtime/gpu/pool_allocator.h",
        "common_runtime/gpu/process_state.h",
-    ],
+    ]),
    copts = tf_copts(),
    linkstatic = 1,
    deps = [
@ -1369,9 +1409,10 @@ tf_cuda_library(
        ":lib",
        ":lib_internal",
        ":protos_all_cc",
-        ":stream_executor",
        "//third_party/eigen3",
-    ],
+    ] + if_not_windows([
+        ":stream_executor",
+    ]),
    alwayslink = 1,
 )

--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@ -457,7 +457,7 @@ Status MaxPoolShape(shape_inference::InferenceContext* c) {
  TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides));
  if (strides.size() != 4) {
    return errors::InvalidArgument(
-        "AvgPool requires the stride attribute to contain 4 values, but "
+        "MaxPool requires the stride attribute to contain 4 values, but "
        "got: ",
        strides.size());
  }
@ -466,7 +466,7 @@ Status MaxPoolShape(shape_inference::InferenceContext* c) {
  TF_RETURN_IF_ERROR(c->GetAttr("ksize", &kernel_sizes));
  if (kernel_sizes.size() != 4) {
    return errors::InvalidArgument(
-        "AvgPool requires the ksize attribute to contain 4 values, but got: ",
+        "MaxPool requires the ksize attribute to contain 4 values, but got: ",
        kernel_sizes.size());
  }

--- a/tensorflow/core/framework/op.h
+++ b/tensorflow/core/framework/op.h
@ -171,12 +171,6 @@ class OpListOpRegistry : public OpRegistryInterface {
  std::unordered_map<string, const OpRegistrationData*> index_;
 };

-// Treats 'registry_ptr' as a pointer to OpRegistry, and calls
-// registry_ptr->Register(op_def) for each op_def that has been registered with
-// the current library's global op registry (obtained by calling
-// OpRegistry::Global().
-extern "C" void RegisterOps(void* registry_ptr);
-
 // Support for defining the OpDef (specifying the semantics of the Op and how
 // it should be created) and registering it in the OpRegistry::Global()
 // registry.  Usage:
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@ -1105,11 +1105,6 @@ void* GlobalKernelRegistry();
 Status FindKernelDef(DeviceType device_type, const NodeDef& node_def,
                     const KernelDef** def, string* kernel_class_name);

-// Treats 'registry_ptr' as a pointer to KernelRegistry. For each kernel 'k'
-// registered with the current library's global kernel registry (obtained by
-// calling GlobalKernelRegistry()), inserts 'k' into registry_ptr.
-extern "C" void RegisterKernels(void* registry_ptr);
-
 // Writes a list of all registered kernels to LOG(INFO), to help users debug
 // missing kernel errors.
 void LogAllRegisteredKernels();
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@ -25,6 +25,7 @@ package_group(

 load(
    "//tensorflow:tensorflow.bzl",
+    "if_not_windows",
    "tf_cc_test",
    "tf_cc_tests",
    "tf_copts",
@ -386,6 +387,41 @@ cc_header_only_library(

 # OpKernel libraries ----------------------------------------------------------

+ARRAY_DEPS = [
+    ":batch_space_ops",
+    ":bounds_check",
+    ":concat_lib",
+    ":cuda_device_array",
+    ":depth_space_ops",
+    ":extract_image_patches_op",
+    ":fill_functor",
+    ":gather_functor",
+    ":ops_util",
+    ":split_lib",
+    ":strided_slice_op",
+    ":transpose_functor",
+    "//tensorflow/core:array_grad",
+    "//tensorflow/core:array_ops_op_lib",
+    "//tensorflow/core:core_cpu",
+    "//tensorflow/core:framework",
+    "//tensorflow/core:gpu_runtime",
+    "//tensorflow/core:lib",
+    "//tensorflow/core:lib_internal",
+    "//tensorflow/core:proto_text",
+    "//tensorflow/core:protos_all_cc",
+    "//tensorflow/core/debug:debug_io_utils",
+    "//third_party/eigen3",
+]
+
+tf_kernel_libraries(
+    name = "array_not_windows",
+    prefixes = [
+        "debug_ops",
+        "immutable_constant_op",
+    ],
+    deps = ARRAY_DEPS,
+)
+
 tf_kernel_libraries(
    name = "array",
    prefixes = [
@ -393,7 +429,6 @@ tf_kernel_libraries(
        "bitcast_op",
        "concat_op",
        "constant_op",
-        "debug_ops",
        "diag_op",
        "matrix_band_part_op",
        "matrix_diag_op",
@ -402,7 +437,6 @@ tf_kernel_libraries(
        "gather_op",
        "gather_nd_op",
        "identity_op",
-        "immutable_constant_op",
        "listdiff_op",
        "mirror_pad_op",
        "one_hot_op",
@ -421,31 +455,7 @@ tf_kernel_libraries(
        "unpack_op",
        "where_op",
    ],
-    deps = [
-        ":batch_space_ops",
-        ":bounds_check",
-        ":concat_lib",
-        ":cuda_device_array",
-        ":depth_space_ops",
-        ":extract_image_patches_op",
-        ":fill_functor",
-        ":gather_functor",
-        ":ops_util",
-        ":split_lib",
-        ":strided_slice_op",
-        ":transpose_functor",
-        "//tensorflow/core:array_grad",
-        "//tensorflow/core:array_ops_op_lib",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:gpu_runtime",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:proto_text",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/debug:debug_io_utils",
-        "//third_party/eigen3",
-    ],
+    deps = ARRAY_DEPS,
 )

 tf_cc_test(
@ -1264,6 +1274,27 @@ tf_cc_tests(
    ],
 )

+MATH_DEPS = [
+    ":bounds_check",
+    ":fill_functor",
+    ":transpose_functor",
+    "//tensorflow/core:core_cpu",
+    "//tensorflow/core:framework",
+    "//tensorflow/core:lib",
+    "//tensorflow/core:lib_internal",
+    "//tensorflow/core:math_grad",
+    "//tensorflow/core:math_ops_op_lib",
+    "//third_party/eigen3",
+]
+
+tf_kernel_libraries(
+    name = "math_not_windows",
+    prefixes = [
+        "sparse_matmul_op",
+    ],
+    deps = MATH_DEPS,
+)
+
 tf_kernel_libraries(
    name = "math",
    prefixes = [
@ -1281,20 +1312,8 @@ tf_kernel_libraries(
        "segment_reduction_ops",
        "scan_ops",
        "sequence_ops",
-        "sparse_matmul_op",
-    ],
-    deps = [
-        ":bounds_check",
-        ":fill_functor",
-        ":transpose_functor",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:math_grad",
-        "//tensorflow/core:math_ops_op_lib",
-        "//third_party/eigen3",
    ],
+    deps = MATH_DEPS,
 )

 tf_cuda_cc_test(
@ -1574,7 +1593,6 @@ tf_kernel_libraries(
        ":conv_2d",
        ":conv_ops",
        ":depthwise_conv_grad_op",
-        ":depthwise_conv_op",
        ":dilation_ops",
        ":fused_batch_norm_util_gpu",
        ":ops_util",
@ -1585,7 +1603,9 @@ tf_kernel_libraries(
        "//tensorflow/core:nn_grad",
        "//tensorflow/core:nn_ops_op_lib",
        "//third_party/eigen3",
-    ],
+    ] + if_not_windows([
+        ":depthwise_conv_op",
+    ]),
 )

 tf_cuda_cc_test(
--- a/tensorflow/core/kernels/decode_raw_op.cc
+++ b/tensorflow/core/kernels/decode_raw_op.cc
@ -93,6 +93,7 @@ class DecodeRawOp : public OpKernel {
      Name("DecodeRaw").Device(DEVICE_CPU).TypeConstraint<type>("out_type"), \
      DecodeRawOp<type>)

+REGISTER(Eigen::half);
 REGISTER(float);
 REGISTER(double);
 REGISTER(int32);
--- a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
@ -67,4 +67,8 @@ template class InvVarianceToVariance<float>;
 }  // namespace functor
 }  // namespace tensorflow

+#else
+
+#include "tensorflow/core/kernels/fused_batch_norm_op.h"
+
 #endif  // GOOGLE_CUDA
--- a/tensorflow/core/kernels/gather_functor.cc
+++ b/tensorflow/core/kernels/gather_functor.cc
@ -45,4 +45,8 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
 }  // namespace functor
 }  // namespace tensorflow

+#else
+
+#include "tensorflow/core/kernels/gather_functor.h"
+
 #endif  // GOOGLE_CUDA
--- a/tensorflow/core/kernels/scatter_functor.cc
+++ b/tensorflow/core/kernels/scatter_functor.cc
@ -55,4 +55,8 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DECLARE_GPU_SPECS);
 }  // namespace functor
 }  // namespace tensorflow

+#else
+
+#include "tensorflow/core/kernels/scatter_functor.h"
+
 #endif  // GOOGLE_CUDA
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@ -351,13 +351,16 @@ REGISTER_OP("FusedBatchNormGrad")
    .Attr("T: numbertype")
    .Attr("epsilon: float = 0.0001")
    .Attr("data_format: string = 'NHWC'")
+    .Attr("is_training: bool = true")
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle y_backprop;
      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &y_backprop));
      ShapeHandle x;
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &x));

+      bool is_training;
      string data_format;
+      c->GetAttr("is_training", &is_training);
      c->GetAttr("data_format", &data_format);
      DimensionHandle channel_dim = (data_format == "NHWC")
                                        ? c->Dim(y_backprop, 3)
@ -386,8 +389,16 @@ REGISTER_OP("FusedBatchNormGrad")
      c->set_output(0, x_backprop);
      c->set_output(1, c->Vector(channel_dim));
      c->set_output(2, c->Vector(channel_dim));
-      c->set_output(3, c->Vector(0));
-      c->set_output(4, c->Vector(0));
+      // Set the correct shapes for reserve_spaces
+      // so that gradients can be computed when
+      // the op is in a symbolic condition.
+      if (is_training) {
+        c->set_output(3, c->Vector(0));
+        c->set_output(4, c->Vector(0));
+      } else {
+        c->set_output(3, c->Vector(channel_dim));
+        c->set_output(4, c->Vector(channel_dim));
+      }
      return Status::OK();
    })
    .Doc(R"doc(
@ -412,6 +423,8 @@ T: The data type for the elements of input and output Tensors.
 epsilon: A small float number added to the variance of x.
 data_format: The data format for y_backprop, x, x_backprop.
             Either "NHWC" (default) or "NCHW".
+is_training: A bool value to indicate the operation is for training (default)
+             or inference.
 )doc");

 // --------------------------------------------------------------------------
@ -1835,7 +1848,7 @@ pooling_ratio: Pooling ratio for each dimension of `value`, currently only
  respectively.
 pseudo_random: When set to True, generates the pooling sequence in a
  pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-  Graham, Fractional Max-Pooling] (http://arxiv.org/abs/1412.6071) for
+  Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
  difference between pseudorandom and random.
 overlapping: When set to True, it means when pooling, the values at the boundary
  of adjacent pooling cells are used by both cells. For example:
@ -1925,7 +1938,7 @@ pooling_ratio: Pooling ratio for each dimension of `value`, currently only
  respectively.
 pseudo_random: When set to True, generates the pooling sequence in a
  pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-  Graham, Fractional Max-Pooling] (http://arxiv.org/abs/1412.6071) for
+  Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
  difference between pseudorandom and random.
 overlapping: When set to True, it means when pooling, the values at the boundary
  of adjacent pooling cells are used by both cells. For example:
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@ -6976,7 +6976,7 @@ op {
    default_value {
      b: false
    }
-    description: "When set to True, generates the pooling sequence in a\npseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin\nGraham, Fractional Max-Pooling] (http://arxiv.org/abs/1412.6071) for\ndifference between pseudorandom and random."
+    description: "When set to True, generates the pooling sequence in a\npseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin\nGraham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for\ndifference between pseudorandom and random."
  }
  attr {
    name: "overlapping"
@ -7110,7 +7110,7 @@ op {
    default_value {
      b: false
    }
-    description: "When set to True, generates the pooling sequence in a\npseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin\nGraham, Fractional Max-Pooling] (http://arxiv.org/abs/1412.6071) for\ndifference between pseudorandom and random."
+    description: "When set to True, generates the pooling sequence in a\npseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin\nGraham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for\ndifference between pseudorandom and random."
  }
  attr {
    name: "overlapping"
--- a/tensorflow/core/ops/parsing_ops.cc
+++ b/tensorflow/core/ops/parsing_ops.cc
@ -27,7 +27,7 @@ using shape_inference::ShapeHandle;
 REGISTER_OP("DecodeRaw")
    .Input("bytes: string")
    .Output("output: out_type")
-    .Attr("out_type: {float,double,int32,uint8,int16,int8,int64}")
+    .Attr("out_type: {half,float,double,int32,uint8,int16,int8,int64}")
    .Attr("little_endian: bool = true")
    .SetShapeFn([](InferenceContext* c) {
      // Note: last dimension is data dependent.
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@ -91,17 +91,31 @@ def tf_proto_library(name, srcs = [], has_services = None,
      visibility = visibility,
  )

-def tf_additional_lib_hdrs():
-  return [
-      "platform/default/*.h",
-      "platform/posix/*.h",
-  ]
+def tf_additional_lib_hdrs(exclude = []):
+  return select({
+    "//tensorflow:windows" : native.glob([
+        "platform/default/*.h",
+        "platform/windows/*.h",
+        "platform/posix/error.h",
+      ], exclude = exclude),
+    "//conditions:default" : native.glob([
+        "platform/default/*.h",
+        "platform/posix/*.h",
+      ], exclude = exclude),
+  })

-def tf_additional_lib_srcs():
-  return [
-      "platform/default/*.cc",
-      "platform/posix/*.cc",
-  ]
+def tf_additional_lib_srcs(exclude = []):
+  return select({
+    "//tensorflow:windows" : native.glob([
+        "platform/default/*.cc",
+        "platform/windows/*.cc",
+        "platform/posix/error.cc",
+      ], exclude = exclude),
+    "//conditions:default" : native.glob([
+        "platform/default/*.cc",
+        "platform/posix/*.cc",
+      ], exclude = exclude),
+  })

 def tf_additional_minimal_lib_srcs():
  return [
--- a/tensorflow/core/platform/windows/env.cc
+++ b/tensorflow/core/platform/windows/env.cc
@ -32,6 +32,8 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/windows/windows_file_system.h"

+#pragma comment(lib, "Shlwapi.lib")
+
 namespace tensorflow {

 namespace {
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@ -20,6 +20,7 @@ limitations under the License.
 #include <snappy.h>
 #endif
 #include <WinSock2.h>
+#pragma comment(lib, "Ws2_32.lib")

 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/demangle.h"
--- a/tensorflow/core/util/tensor_bundle/BUILD
+++ b/tensorflow/core/util/tensor_bundle/BUILD
@ -9,6 +9,8 @@ licenses(["notice"])  # Apache 2.0

 exports_files(["LICENSE"])

+load("//tensorflow:tensorflow.bzl", "tf_copts")
+
 # To be exported to tensorflow/core:android_srcs.
 filegroup(
    name = "android_srcs",
@ -24,7 +26,7 @@ cc_library(
    name = "tensor_bundle",
    srcs = ["tensor_bundle.cc"],
    hdrs = ["tensor_bundle.h"],
-    copts = ["-Wno-sign-compare"],
+    copts = tf_copts() + ["-Wno-sign-compare"],
    deps = [
        ":naming",
        "//tensorflow/core:core_cpu_internal",
--- a/tensorflow/examples/learn/BUILD
+++ b/tensorflow/examples/learn/BUILD
@ -91,6 +91,15 @@ py_binary(
    ],
 )

+py_binary(
+    name = "resnet",
+    srcs = ["resnet.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
 py_binary(
    name = "text_classification",
    srcs = ["text_classification.py"],
@ -160,6 +169,7 @@ sh_test(
        ":iris_val_based_early_stopping",
        ":iris_with_pipeline",
        ":random_forest_mnist",
+        ":resnet",
        ":text_classification",
        ":text_classification_builtin_rnn_model",
        ":text_classification_character_cnn",
--- a/tensorflow/examples/learn/README.md
+++ b/tensorflow/examples/learn/README.md
@ -23,6 +23,7 @@ Some examples use the `pandas` library for data processing (`sudo pip install pa
 ## Specialized Models
 * [Building a Random Forest Model](random_forest.py)
 * [Building a Wide & Deep Model](wide_n_deep_tutorial.py)
+* [Building a Residual Network Model](resnet.py)

 ## Text classification

--- a/tensorflow/examples/learn/examples_test.sh
+++ b/tensorflow/examples/learn/examples_test.sh
@ -13,7 +13,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.

-# This script exercises the examples of using SkFlow.
+# This script exercises the examples of using TF.Learn.

 DIR="$TEST_SRCDIR"

@ -28,20 +28,18 @@ then
  DIR="$DIR"/"$TEST_WORKSPACE"
 fi

-SKFLOW_EXAMPLE_BASE_DIR=$DIR/tensorflow/examples/learn
+TFLEARN_EXAMPLE_BASE_DIR=$DIR/tensorflow/examples/learn


 function test() {
+  # Runs the example named by $1 from the examples base directory, passing the
+  # unquoted $2 (if any) as extra argument(s). Prints the outcome and returns 0
+  # when the example exits with status 0; otherwise exits the whole script
+  # with status 1.
+  # NOTE(review): a function named `test` shadows the shell's builtin `test`
+  # command for the rest of this script (`[ ... ]` is unaffected).
  echo "Test "$1":"
-  $SKFLOW_EXAMPLE_BASE_DIR/$1 $2
+  $TFLEARN_EXAMPLE_BASE_DIR/$1 $2
  if [ $? -eq 0 ]
  then
    echo "Test passed."
-    echo
    return 0
  else
    echo "Test failed."
-    echo
    exit 1
  fi
 }
@ -53,6 +51,7 @@ test iris_custom_decay_dnn
 test iris_run_config
 test iris_val_based_early_stopping
 test iris_with_pipeline
+test resnet
 test text_classification --test_with_fake_data
 test text_classification_builtin_rnn_model --test_with_fake_data
 test text_classification_cnn --test_with_fake_data
--- a/tensorflow/examples/skflow/resnet.py
+++ b/tensorflow/examples/skflow/resnet.py
@ -28,10 +28,9 @@ from collections import namedtuple
 from math import sqrt
 import os

-from sklearn import metrics
 import tensorflow as tf
 from tensorflow.contrib import learn
-from tensorflow.examples.tutorials.mnist import input_data
+from tensorflow.contrib.layers import batch_norm, convolution2d


 def res_net(x, y, activation=tf.nn.relu):
@ -69,8 +68,9 @@ def res_net(x, y, activation=tf.nn.relu):

  # First convolution expands to 64 channels
  with tf.variable_scope('conv_layer1'):
-    net = learn.ops.conv2d(x, 64, [7, 7], batch_norm=True,
-                           activation=activation, bias=False)
+    net = convolution2d(x, 64, 7,
+                        normalizer_fn=batch_norm,
+                        activation_fn=activation)

  # Max pool
  net = tf.nn.max_pool(
@ -78,9 +78,8 @@ def res_net(x, y, activation=tf.nn.relu):

  # First chain of resnets
  with tf.variable_scope('conv_layer2'):
-    net = learn.ops.conv2d(net, groups[0].num_filters,
-                           [1, 1], [1, 1, 1, 1],
-                           padding='VALID', bias=True)
+    net = convolution2d(net, groups[0].num_filters, 1,
+                        padding='VALID')

  # Create the bottleneck groups, each of which contains `num_blocks`
  # bottleneck groups.
@ -90,30 +89,24 @@ def res_net(x, y, activation=tf.nn.relu):

      # 1x1 convolution responsible for reducing dimension
      with tf.variable_scope(name + '/conv_in'):
-        conv = learn.ops.conv2d(net, group.bottleneck_size,
-                                [1, 1], [1, 1, 1, 1],
-                                padding='VALID',
-                                activation=activation,
-                                batch_norm=True,
-                                bias=False)
+        conv = convolution2d(net, group.bottleneck_size, 1,
+                             padding='VALID',
+                             activation_fn=activation,
+                             normalizer_fn=batch_norm)

      with tf.variable_scope(name + '/conv_bottleneck'):
-        conv = learn.ops.conv2d(conv, group.bottleneck_size,
-                                [3, 3], [1, 1, 1, 1],
-                                padding='SAME',
-                                activation=activation,
-                                batch_norm=True,
-                                bias=False)
+        conv = convolution2d(conv, group.bottleneck_size, 3,
+                             padding='SAME',
+                             activation_fn=activation,
+                             normalizer_fn=batch_norm)

      # 1x1 convolution responsible for restoring dimension
      with tf.variable_scope(name + '/conv_out'):
        input_dim = net.get_shape()[-1].value
-        conv = learn.ops.conv2d(conv, input_dim,
-                                [1, 1], [1, 1, 1, 1],
-                                padding='VALID',
-                                activation=activation,
-                                batch_norm=True,
-                                bias=False)
+        conv = convolution2d(conv, input_dim, 1,
+                             padding='VALID',
+                             activation_fn=activation,
+                             normalizer_fn=batch_norm)

      # shortcut connections that turn the network into its counterpart
      # residual function (identity shortcut)
@ -123,10 +116,10 @@ def res_net(x, y, activation=tf.nn.relu):
      # upscale to the next group size
      next_group = groups[group_i + 1]
      with tf.variable_scope('block_%d/conv_upscale' % group_i):
-        net = learn.ops.conv2d(net, next_group.num_filters,
-                               [1, 1], [1, 1, 1, 1],
-                               bias=False,
-                               padding='SAME')
+        net = convolution2d(net, next_group.num_filters, 1,
+                            activation_fn=None,
+                            biases_initializer=None,
+                            padding='SAME')
    except IndexError:
      pass

@ -138,21 +131,38 @@ def res_net(x, y, activation=tf.nn.relu):
  net_shape = net.get_shape().as_list()
  net = tf.reshape(net, [-1, net_shape[1] * net_shape[2] * net_shape[3]])

-  return learn.models.logistic_regression(net, y)
+  target = tf.one_hot(y, depth=10, dtype=tf.float32)
+  return learn.models.logistic_regression(net, target)
+
+def res_net_model(x, y):
+  """Model function passed to `learn.Estimator` for the residual network.
+
+  Args:
+    x: Input features, forwarded unchanged to `res_net`.
+    y: Class labels, forwarded to `res_net` and cast to int64 for comparison
+      with the predicted class.
+
+  Returns:
+    A `(predictions, loss, train_op)` tuple. `predictions` is a dict with keys
+    'prob' (first output of `res_net`), 'class' (argmax of 'prob' along
+    axis 1) and 'accuracy' (elementwise equality of 'class' and `y`).
+  """
+  prediction, loss = res_net(x, y)
+  predicted = tf.argmax(prediction, 1)
+  accuracy = tf.equal(predicted, tf.cast(y, tf.int64))
+  predictions = {'prob': prediction, 'class': predicted, 'accuracy': accuracy}
+  # Adagrad with a fixed learning rate, stepping the graph's global step.
+  train_op = tf.contrib.layers.optimize_loss(
+      loss, tf.contrib.framework.get_global_step(),
+      optimizer='Adagrad', learning_rate=0.001)
+  return predictions, loss, train_op

 # Download and load MNIST data.
-mnist = input_data.read_data_sets('MNIST_data')
+mnist = learn.datasets.load_dataset('mnist')

-# Restore model if graph is saved into a folder.
-if os.path.exists('models/resnet/graph.pbtxt'):
-  classifier = learn.TensorFlowEstimator.restore('models/resnet/')
+# Create a new resnet classifier.
+classifier = learn.Estimator(model_fn=res_net_model)

-while True:
-  # Train model and save summaries into logdir.
-  classifier.fit(
-      mnist.train.images, mnist.train.labels, logdir='models/resnet/')
+tf.logging.set_verbosity(tf.logging.INFO)  # Show training logs. (avoid silence)

-  # Calculate accuracy.
-  score = metrics.accuracy_score(
-      mnist.test.labels, classifier.predict(mnist.test.images, batch_size=64))
-  print('Accuracy: {0:f}'.format(score))
+# Train the model for 1000 steps using mini-batches of 100 examples.
+classifier.fit(
+    mnist.train.images, mnist.train.labels, batch_size=100, steps=1000)
+
+# Calculate accuracy.
+result = classifier.evaluate(
+    x=mnist.test.images, y=mnist.test.labels,
+    metrics={
+        'accuracy': learn.metric_spec.MetricSpec(
+            metric_fn=tf.contrib.metrics.streaming_accuracy,
+            prediction_key='accuracy'),
+    })
+score = result['accuracy']
+print('Accuracy: {0:f}'.format(score))
--- a/tensorflow/examples/skflow/BUILD
+++ b/tensorflow/examples/skflow/BUILD
@ -25,15 +25,6 @@ py_binary(
    ],
 )

-py_binary(
-    name = "resnet",
-    srcs = ["resnet.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
 filegroup(
    name = "all_files",
    srcs = glob(
--- a/tensorflow/examples/tutorials/mnist/mnist_softmax.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
@ -46,7 +46,7 @@ def main(_):

  # The raw formulation of cross-entropy,
  #
-  #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)),
+  #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
  #                                 reduction_indices=[1]))
  #
  # can be numerically unstable.
--- a/tensorflow/g3doc/api_docs/python/client.md
+++ b/tensorflow/g3doc/api_docs/python/client.md
@ -52,8 +52,7 @@ with tf.Session() as sess:
  sess.run(...)
 ```

-The [`ConfigProto`]
-(https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
+The [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
 protocol buffer exposes various configuration options for a
 session. For example, to create a session that uses soft constraints
 for device placement, and log the resulting placement decisions,
@ -84,8 +83,8 @@ the session constructor.


 *  <b>`target`</b>: (Optional.) The execution engine to connect to.
-    Defaults to using an in-process engine. See [Distributed Tensorflow]
-    (https://www.tensorflow.org/how_tos/distributed/index.html)
+    Defaults to using an in-process engine. See
+    [Distributed Tensorflow](https://www.tensorflow.org/how_tos/distributed/index.html)
    for more examples.
 *  <b>`graph`</b>: (Optional.) The `Graph` to be launched (described above).
 *  <b>`config`</b>: (Optional.) A [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.nn.sampled_softmax_loss.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.nn.sampled_softmax_loss.md
@ -11,8 +11,8 @@ the full softmax loss.
 At inference time, you can compute full softmax probabilities with the
 expression `tf.nn.softmax(tf.matmul(inputs, tf.transpose(weights)) + biases)`.

-See our [Candidate Sampling Algorithms Reference]
-(../../extras/candidate_sampling.pdf)
+See our
+[Candidate Sampling Algorithms Reference](../../extras/candidate_sampling.pdf)

 Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
 ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.Graph.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.Graph.md
@ -632,8 +632,8 @@ Note that this is unrelated to the

 The GraphDef version information of this graph.

-For details on the meaning of each version, see [`GraphDef`]
-(https://www.tensorflow.org/code/tensorflow/core/framework/graph.proto).
+For details on the meaning of each version, see
+[`GraphDef`](https://www.tensorflow.org/code/tensorflow/core/framework/graph.proto).

 ##### Returns:

--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.nn.fractional_max_pool.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.nn.fractional_max_pool.md
@ -46,7 +46,7 @@ For more details on fractional max pooling, see this paper:
 *  <b>`pseudo_random`</b>: An optional `bool`. Defaults to `False`.
    When set to True, generates the pooling sequence in a
    pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-    Graham, Fractional Max-Pooling] (http://arxiv.org/abs/1412.6071) for
+    Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
    difference between pseudorandom and random.
 *  <b>`overlapping`</b>: An optional `bool`. Defaults to `False`.
    When set to True, it means when pooling, the values at the boundary
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.nce_loss.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.nce_loss.md
@ -2,11 +2,10 @@

 Computes and returns the noise-contrastive estimation training loss.

-See [Noise-contrastive estimation: A new estimation principle for
-unnormalized statistical models]
-(http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
-Also see our [Candidate Sampling Algorithms Reference]
-(../../extras/candidate_sampling.pdf)
+See
+[Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
+Also see our
+[Candidate Sampling Algorithms Reference](../../extras/candidate_sampling.pdf)

 Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
 so your labels must be sorted in order of decreasing frequency to achieve
@ -44,8 +43,7 @@ with an otherwise unused class.
      where a sampled class equals one of the target classes.  If set to
      `True`, this is a "Sampled Logistic" loss instead of NCE, and we are
      learning to generate log-odds instead of log probabilities.  See
-      our [Candidate Sampling Algorithms Reference]
-      (../../extras/candidate_sampling.pdf).
+      our [Candidate Sampling Algorithms Reference](../../extras/candidate_sampling.pdf).
      Default is False.
 *  <b>`partition_strategy`</b>: A string specifying the partitioning strategy, relevant
      if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.parse_example.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.parse_example.md
@ -2,8 +2,8 @@

 Parses `Example` protos into a `dict` of tensors.

-Parses a number of serialized [`Example`]
-(https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
+Parses a number of serialized
+[`Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
 protos given in `serialized`.

 `example_names` may contain descriptive names for the corresponding serialized
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.RMSPropOptimizer.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.RMSPropOptimizer.md
@ -1,7 +1,6 @@
 Optimizer that implements the RMSProp algorithm.

-See the [paper]
-(http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+See the [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).

 - - -

--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.atrous_conv2d.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.atrous_conv2d.md
@ -32,11 +32,10 @@ Convolutional Nets and Fully Connected CRFs](http://arxiv.org/abs/1412.7062).
 The same operation is investigated further in [Multi-Scale Context Aggregation
 by Dilated Convolutions](http://arxiv.org/abs/1511.07122). Previous works
 that effectively use atrous convolution in different ways are, among others,
-[OverFeat: Integrated Recognition, Localization and Detection using
-Convolutional Networks](http://arxiv.org/abs/1312.6229) and [Fast Image
-Scanning with Deep Max-Pooling Convolutional Neural Networks]
-(http://arxiv.org/abs/1302.1700). Atrous convolution is also closely related
-to the so-called noble identities in multi-rate signal processing.
+[OverFeat: Integrated Recognition, Localization and Detection using Convolutional Networks](http://arxiv.org/abs/1312.6229)
+and [Fast Image Scanning with Deep Max-Pooling Convolutional Neural Networks](http://arxiv.org/abs/1302.1700).
+Atrous convolution is also closely related to the so-called noble identities in
+multi-rate signal processing.

 There are many different ways to implement atrous convolution (see the refs
 above). The implementation here reduces
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.fractional_avg_pool.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.fractional_avg_pool.md
@ -22,7 +22,7 @@ pooling region.
 *  <b>`pseudo_random`</b>: An optional `bool`. Defaults to `False`.
    When set to True, generates the pooling sequence in a
    pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-    Graham, Fractional Max-Pooling] (http://arxiv.org/abs/1412.6071) for
+    Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
    difference between pseudorandom and random.
 *  <b>`overlapping`</b>: An optional `bool`. Defaults to `False`.
    When set to True, it means when pooling, the values at the boundary
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.local_response_normalization.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.local_response_normalization.md
@ -11,8 +11,8 @@ each component is divided by the weighted, squared sum of inputs within
        sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
    output = input / (bias + alpha * sqr_sum) ** beta

-For details, see [Krizhevsky et al., ImageNet classification with deep
-convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+For details, see
+[Krizhevsky et al., ImageNet classification with deep convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).

 ##### Args:

--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.Session.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.Session.md
@ -36,8 +36,7 @@ with tf.Session() as sess:
  sess.run(...)
 ```

-The [`ConfigProto`]
-(https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
+The [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
 protocol buffer exposes various configuration options for a
 session. For example, to create a session that uses soft constraints
 for device placement, and log the resulting placement decisions,
@ -68,8 +67,8 @@ the session constructor.


 *  <b>`target`</b>: (Optional.) The execution engine to connect to.
-    Defaults to using an in-process engine. See [Distributed Tensorflow]
-    (https://www.tensorflow.org/how_tos/distributed/index.html)
+    Defaults to using an in-process engine. See
+    [Distributed Tensorflow](https://www.tensorflow.org/how_tos/distributed/index.html)
    for more examples.
 *  <b>`graph`</b>: (Optional.) The `Graph` to be launched (described above).
 *  <b>`config`</b>: (Optional.) A [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.image.resize_images.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.image.resize_images.md
@ -8,12 +8,9 @@ the same as `size`.  To avoid distortions see

 `method` can be one of:

-*   <b>`ResizeMethod.BILINEAR`</b>: [Bilinear interpolation.]
-    (https://en.wikipedia.org/wiki/Bilinear_interpolation)
-*   <b>`ResizeMethod.NEAREST_NEIGHBOR`</b>: [Nearest neighbor interpolation.]
-    (https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation)
-*   <b>`ResizeMethod.BICUBIC`</b>: [Bicubic interpolation.]
-    (https://en.wikipedia.org/wiki/Bicubic_interpolation)
+*   <b>`ResizeMethod.BILINEAR`</b>: [Bilinear interpolation.](https://en.wikipedia.org/wiki/Bilinear_interpolation)
+*   <b>`ResizeMethod.NEAREST_NEIGHBOR`</b>: [Nearest neighbor interpolation.](https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation)
+*   <b>`ResizeMethod.BICUBIC`</b>: [Bicubic interpolation.](https://en.wikipedia.org/wiki/Bicubic_interpolation)
 *   <b>`ResizeMethod.AREA`</b>: Area interpolation.

 ##### Args:
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.scan.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.scan.md
@ -36,9 +36,9 @@ For example, if `elems` is `(t1, [t2, t3])` and `initializer` is


 *  <b>`fn`</b>: The callable to be performed.  It accepts two arguments.  The first
-    will have the same (possibly nested) structure as `elems`.  The second
    will have the same structure as `initializer` if one is provided,
-    otherwise it will have the same structure as `elems`.  Its output
+    otherwise it will have the same structure as `elems`.  The second
+    will have the same (possibly nested) structure as `elems`.  Its output
    must have the same structure as `initializer` if one is provided,
    otherwise it must have the same structure as `elems`.
 *  <b>`elems`</b>: A tensor or (possibly nested) sequence of tensors, each of which
--- a/tensorflow/g3doc/get_started/os_setup.md
+++ b/tensorflow/g3doc/get_started/os_setup.md
@ -1089,3 +1089,25 @@ This can be resolved by create a symbolic link:
 ```bash
 ln -sf /usr/local/cuda/lib/libcuda.dylib /usr/local/cuda/lib/libcuda.1.dylib
 ```
+
+### Mac OS X: RuntimeError: Broken toolchain: cannot link a simple C program
+
+On Mac OS X, when installing TensorFlow you might see many warnings and errors, ending with a `Broken toolchain: cannot link a simple C program` message:
+
+```
+>>> sudo pip install --upgrade $TF_BINARY_URL
+
+...<lots more warnings and errors>
+
+You have not agreed to the Xcode license agreements, please run 'xcodebuild -license' (for user-level acceptance) or 'sudo xcodebuild -license' (for system-wide acceptance) from within a Terminal window to review and agree to the Xcode license agreements.
+
+...<more stack trace output>
+
+  File "numpy/core/setup.py", line 653, in get_mathlib_info
+
+    raise RuntimeError("Broken toolchain: cannot link a simple C program")
+
+RuntimeError: Broken toolchain: cannot link a simple C program
+```
+
+This typically happens because the Xcode build tools are installed but the Xcode license agreements have not yet been accepted.  To resolve it, accept the license agreements by opening Xcode, or by running `xcodebuild -license` from the command line.
--- a/tensorflow/g3doc/resources/xla_prerelease.md
+++ b/tensorflow/g3doc/resources/xla_prerelease.md
@ -1733,7 +1733,7 @@ degenerate dimensions to produce a 4x3x2 array result.
    for floats. However, if the range of the data is limited, floating-point
    addition is close enough to being associative for most practical uses. It
    is possible to conceive some complete un-associative reductions, however,
-    and these will produce wrong results in TLA reductions.
+    and these will produce wrong results in XLA reductions.

 ## C++ interface

--- a/tensorflow/go/genop/generate.sh
+++ b/tensorflow/go/genop/generate.sh
@ -16,7 +16,8 @@

 set -e

-go get github.com/golang/protobuf/{proto,protoc-gen-go}
+go get github.com/golang/protobuf/proto
+go get github.com/golang/protobuf/protoc-gen-go

 cd $(dirname $0)
 TF_DIR=${GOPATH}/src/github.com/tensorflow/tensorflow
@ -32,7 +33,7 @@ then
    echo "bazel build -c opt @protobuf//:protoc"
    exit 1
  fi
-  PROTOC=PATH_PROTOC
+  PROTOC=$PATH_PROTOC
 fi

 # Ensure that protoc-gen-go is available in $PATH
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@ -10,6 +10,7 @@ licenses(["notice"])  # Apache 2.0

 exports_files(["LICENSE"])

+load("//tensorflow:tensorflow.bzl", "if_not_windows")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 load("//tensorflow:tensorflow.bzl", "tf_py_test")
@ -39,9 +40,11 @@ py_library(
        ":platform_test",
        ":summary",
        ":training",
-        "//tensorflow/contrib:contrib_py",
+        ":ops",
        "//tensorflow/python/debug:debug_py",
-    ],
+    ] + if_not_windows([
+        "//tensorflow/contrib:contrib_py",
+    ]),
 )

 py_library(
@ -1434,7 +1437,7 @@ cuda_py_test(

 cuda_py_test(
    name = "gradient_checker_test",
-    size = "small",
+    size = "medium",
    srcs = ["ops/gradient_checker_test.py"],
    additional_deps = [
        ":array_ops",
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@ -1088,8 +1088,7 @@ class Session(BaseSession):
    sess.run(...)
  ```

-  The [`ConfigProto`]
-  (https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
+  The [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
  protocol buffer exposes various configuration options for a
  session. For example, to create a session that uses soft constraints
  for device placement, and log the resulting placement decisions,
@ -1127,8 +1126,8 @@ class Session(BaseSession):

    Args:
      target: (Optional.) The execution engine to connect to.
-        Defaults to using an in-process engine. See [Distributed Tensorflow]
-        (https://www.tensorflow.org/how_tos/distributed/index.html)
+        Defaults to using an in-process engine. See
+        [Distributed Tensorflow](https://www.tensorflow.org/how_tos/distributed/index.html)
        for more examples.
      graph: (Optional.) The `Graph` to be launched (described above).
      config: (Optional.) A [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@ -69,6 +69,8 @@ class DType(object):

  @@as_numpy_dtype
  @@as_datatype_enum
+  
+  @@limits
  """

  def __init__(self, type_enum):
@ -222,6 +224,22 @@ class DType(object):
      except:
        raise TypeError("Cannot find maximum value of %s." % self)

+  @property
+  def limits(self, clip_negative=True):
+    """Return intensity limits, i.e. (min, max) tuple, of the dtype.
+
+    The limits are looked up in the module-level `dtype_range` table using
+    `self.as_numpy_dtype` as the key.
+
+    NOTE(review): this is declared as a `@property`, so attribute access
+    (`dtype.limits`) cannot supply `clip_negative`; it then always takes its
+    default of True and the returned minimum is always 0.
+
+    Args:
+      clip_negative : bool, optional
+          If True, clip the negative range (i.e. return 0 for min intensity)
+          even if the image dtype allows negative values.
+
+    Returns:
+      min, max : tuple
+        Lower and upper intensity limits.
+
+    Raises:
+      KeyError: If `self.as_numpy_dtype` has no entry in `dtype_range`.
+    """
+    # These local names shadow the builtins `min`/`max` inside this method.
+    min, max = dtype_range[self.as_numpy_dtype]
+    if clip_negative:
+      min = 0
+    return min, max
+
  def is_compatible_with(self, other):
    """Returns True if the `other` DType will be converted to this DType.

@ -277,6 +295,19 @@ class DType(object):
  def size(self):
    return np.dtype(self.as_numpy_dtype).itemsize

+# Define data type range of numpy dtype
+# Maps a numpy scalar type to the (min, max) pair looked up by `DType.limits`.
+# NOTE(review): the float32/float64 entries are (-1, 1) rather than the
+# representable range of those types -- presumably a nominal image intensity
+# range; confirm. There is no np.float16 entry.
+dtype_range = {np.bool_: (False, True),
+               np.bool8: (False, True),
+               np.uint8: (0, 255),
+               np.uint16: (0, 65535),
+               np.int8: (-128, 127),
+               np.int16: (-32768, 32767),
+               np.int64: (-2**63, 2**63 - 1),
+               np.uint64: (0, 2**64 - 1),
+               np.int32: (-2**31, 2**31 - 1),
+               np.uint32: (0, 2**32 - 1),
+               np.float32: (-1, 1),
+               np.float64: (-1, 1)}

 # Define standard wrappers for the types_pb2.DataType enum.
 resource = DType(types_pb2.DT_RESOURCE)
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@ -2130,8 +2130,8 @@ class Graph(object):
  def graph_def_versions(self):
    """The GraphDef version information of this graph.

-    For details on the meaning of each version, see [`GraphDef`]
-    (https://www.tensorflow.org/code/tensorflow/core/framework/graph.proto).
+    For details on the meaning of each version, see
+    [`GraphDef`](https://www.tensorflow.org/code/tensorflow/core/framework/graph.proto).

    Returns:
      A `VersionDef`.
--- a/tensorflow/python/framework/python_op_gen_main.cc
+++ b/tensorflow/python/framework/python_op_gen_main.cc
@ -89,6 +89,7 @@ void PrintAllPythonOps(const std::vector<string>& hidden_ops,

 int main(int argc, char* argv[]) {
  tensorflow::port::InitMain(argv[0], &argc, &argv);
+
  // Usage:
  //   gen_main [ @FILENAME | OpName[,OpName]* ] (0 | 1)
  if (argc == 2) {
--- a/tensorflow/python/kernel_tests/decode_raw_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_raw_op_test.py
@ -19,6 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import numpy as np
 import tensorflow as tf


@ -57,5 +58,19 @@ class DecodeRawOpTest(tf.test.TestCase):
          "size of int16"):
        decode.eval(feed_dict={in_bytes: ["123", "456"]})

+  def testToFloat16(self):
+    """Checks that `tf.decode_raw` decodes float16 bytes back to their values."""
+    with self.test_session():
+      in_bytes = tf.placeholder(tf.string, shape=[None])
+      decode = tf.decode_raw(in_bytes, out_type=tf.float16)
+      # Static shape is unknown in both dimensions before any data is fed.
+      self.assertEqual([None, None], decode.get_shape().as_list())
+
+      # Serialize a 1x4 float16 matrix and feed its raw bytes as one string.
+      expected_result = np.matrix([[1, -2, -3, 4]], dtype=np.float16)
+      result = decode.eval(
+        feed_dict={
+          in_bytes: [expected_result.tobytes()]
+        })
+
+      self.assertAllEqual(expected_result, result)
+
 if __name__ == "__main__":
  tf.test.main()
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@ -417,9 +417,9 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,

  Args:
    fn: The callable to be performed.  It accepts two arguments.  The first
-      will have the same (possibly nested) structure as `elems`.  The second
      will have the same structure as `initializer` if one is provided,
-      otherwise it will have the same structure as `elems`.  Its output
+      otherwise it will have the same structure as `elems`.  The second
+      will have the same (possibly nested) structure as `elems`.  Its output
      must have the same structure as `initializer` if one is provided,
      otherwise it must have the same structure as `elems`.
    elems: A tensor or (possibly nested) sequence of tensors, each of which
--- a/tensorflow/python/ops/image_ops.py
+++ b/tensorflow/python/ops/image_ops.py
@ -147,6 +147,8 @@ type and representation (RGB or HSV).
@@adjust_hue
@@random_hue

+@@adjust_gamma
+
@@adjust_saturation
@@random_saturation

@ -163,6 +165,7 @@ from __future__ import division
 from __future__ import print_function

 from tensorflow.python.framework import common_shapes
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
@ -737,12 +740,9 @@ def resize_images(images,

  `method` can be one of:

-  *   <b>`ResizeMethod.BILINEAR`</b>: [Bilinear interpolation.]
-      (https://en.wikipedia.org/wiki/Bilinear_interpolation)
-  *   <b>`ResizeMethod.NEAREST_NEIGHBOR`</b>: [Nearest neighbor interpolation.]
-      (https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation)
-  *   <b>`ResizeMethod.BICUBIC`</b>: [Bicubic interpolation.]
-      (https://en.wikipedia.org/wiki/Bicubic_interpolation)
+  *   <b>`ResizeMethod.BILINEAR`</b>: [Bilinear interpolation.](https://en.wikipedia.org/wiki/Bilinear_interpolation)
+  *   <b>`ResizeMethod.NEAREST_NEIGHBOR`</b>: [Nearest neighbor interpolation.](https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation)
+  *   <b>`ResizeMethod.BICUBIC`</b>: [Bicubic interpolation.](https://en.wikipedia.org/wiki/Bicubic_interpolation)
  *   <b>`ResizeMethod.AREA`</b>: Area interpolation.

  Args:
@ -1005,6 +1005,46 @@ def adjust_contrast(images, contrast_factor):
    return convert_image_dtype(adjusted, orig_dtype, saturate=True)


+def adjust_gamma(image, gamma=1, gain=1):
+  """Performs Gamma Correction on the input image.
+
+  Also known as Power Law Transform. Each pixel is divided by the intensity
+  range of the image's dtype, raised to the power `gamma`, and scaled back:
+  `Out = (In / scale)**gamma * scale * gain`, where
+  `scale = image.dtype.limits[1] - image.dtype.limits[0]`.
+
+  Args:
+    image : A Tensor.
+    gamma : A scalar. Non negative real number.
+    gain  : A scalar. The constant multiplier.
+
+  Returns:
+    A Tensor. Gamma corrected output image. The computation is carried out on
+    a float32 conversion of `image`, and the result is returned without being
+    converted back to the dtype of `image`.
+
+  Raises:
+    ValueError: If `gamma` is negative.
+
+  Notes:
+    For gamma greater than 1, the histogram will shift towards left and
+    the output image will be darker than the input image.
+    For gamma less than 1, the histogram will shift towards right and
+    the output image will be brighter than the input image.
+
+  References:
+    [1] http://en.wikipedia.org/wiki/Gamma_correction
+  """
+
+  # NOTE(review): the scope `name` bound here is not used below.
+  with ops.op_scope([image, gamma, gain], None, 'adjust_gamma') as name:
+    # Convert pixel value to DT_FLOAT for computing adjusted image
+    img = ops.convert_to_tensor(image, name='img', dtype=dtypes.float32)
+    # Keep image dtype for computing the scale of corresponding dtype
+    image = ops.convert_to_tensor(image, name='image')
+
+    # The check runs after both conversions above have been added to the graph.
+    if gamma < 0:
+      raise ValueError("Gamma should be a non-negative real number")
+    # scale = max(dtype) - min(dtype)
+    scale = constant_op.constant(image.dtype.limits[1] - image.dtype.limits[0], dtype=dtypes.float32)
+    # According to the definition of gamma correction
+    adjusted_img = (img / scale) ** gamma * scale * gain
+
+    return adjusted_img
+    
+
 ops.RegisterShape('AdjustContrast')(common_shapes.call_cpp_shape_fn)
 ops.RegisterShape('AdjustContrastv2')(common_shapes.call_cpp_shape_fn)
 ops.RegisterShape('DrawBoundingBoxes')(common_shapes.call_cpp_shape_fn)
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@ -164,6 +164,80 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
      self.assertFalse(rgb_unknown.get_shape())


+class AdjustGamma(test_util.TensorFlowTestCase):
+  """Tests for `image_ops.adjust_gamma`."""
+
+  def _random_image(self):
+    """Returns a random 8x8 float32 array with values drawn from [0, 255)."""
+    return np.random.uniform(0, 255, (8, 8)).astype(np.float32)
+
+  def test_adjust_gamma_one(self):
+    """Same image should be returned for gamma equal to one"""
+    with self.test_session():
+      x_np = self._random_image()
+      x = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.adjust_gamma(x, gamma=1)
+      self.assertAllClose(y.eval(), x_np, 1e-6)
+
+  def test_adjust_gamma_zero(self):
+    """White image should be returned for gamma equal to zero"""
+    with self.test_session():
+      x_np = self._random_image()
+      x = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.adjust_gamma(x, gamma=0)
+
+      # "White" is the upper bound of the range registered for x's dtype.
+      white = dtypes.dtype_range[x.dtype.as_numpy_dtype][1]
+      y_np = np.full(x_np.shape, white, dtype=x_np.dtype)
+
+      self.assertAllClose(y.eval(), y_np, 1e-6)
+
+  def test_adjust_gamma_negative(self):
+    """ValueError should be raised for a negative gamma"""
+    with self.test_session():
+      x_np = np.arange(0, 255, 4, np.uint8).reshape(8, 8)
+      with self.assertRaises(ValueError):
+        image_ops.adjust_gamma(x_np, gamma=-1)
+
+  def test_adjust_gamma_less_one(self):
+    """Verifying the output with expected results for gamma
+    correction with gamma equal to half"""
+    with self.test_session():
+      x_np = np.arange(0, 255, 4, np.uint8).reshape(8, 8)
+      y = image_ops.adjust_gamma(x_np, gamma=0.5)
+      y_tf = np.trunc(y.eval())
+
+      y_np = np.array([[  0,  31,  45,  55,  63,  71,  78,  84],
+          [ 90,  95, 100, 105, 110, 115, 119, 123],
+          [127, 131, 135, 139, 142, 146, 149, 153],
+          [156, 159, 162, 165, 168, 171, 174, 177],
+          [180, 183, 186, 188, 191, 194, 196, 199],
+          [201, 204, 206, 209, 211, 214, 216, 218],
+          [221, 223, 225, 228, 230, 232, 234, 236],
+          [238, 241, 243, 245, 247, 249, 251, 253]], dtype=np.float32)
+
+      self.assertAllClose(y_tf, y_np, 1e-6)
+
+  def test_adjust_gamma_greater_one(self):
+    """Verifying the output with expected results for gamma
+    correction with gamma equal to two"""
+    with self.test_session():
+      x_np = np.arange(0, 255, 4, np.uint8).reshape(8, 8)
+      y = image_ops.adjust_gamma(x_np, gamma=2)
+      y_tf = np.trunc(y.eval())
+
+      y_np = np.array([[  0,   0,   0,   0,   1,   1,   2,   3],
+          [  4,   5,   6,   7,   9,  10,  12,  14],
+          [ 16,  18,  20,  22,  25,  27,  30,  33],
+          [ 36,  39,  42,  45,  49,  52,  56,  60],
+          [ 64,  68,  72,  76,  81,  85,  90,  95],
+          [100, 105, 110, 116, 121, 127, 132, 138],
+          [144, 150, 156, 163, 169, 176, 182, 189],
+          [196, 203, 211, 218, 225, 233, 241, 249]], dtype=np.float32)
+
+      self.assertAllClose(y_tf, y_np, 1e-6)
+
+
 class AdjustHueTest(test_util.TensorFlowTestCase):

  def testAdjustNegativeHue(self):
--- a/tensorflow/python/ops/nn.py
+++ b/tensorflow/python/ops/nn.py
@ -1029,6 +1029,8 @@ def fused_batch_norm(x, scale, offset,  # pylint: disable=invalid-name
    mean = constant_op.constant([])
  if variance is None:
    variance = constant_op.constant([])
+  # Add 1e-12 to epsilon when epsilon <= 1e-5 to prevent CUDNN exception.
+  epsilon = epsilon if epsilon > 1e-5 else epsilon + 1e-12
  y, batch_mean, batch_var, _, _ = gen_nn_ops.fused_batch_norm(
      x,
      scale,
@ -1271,10 +1273,8 @@ def nce_loss(weights,
  """Computes and returns the noise-contrastive estimation training loss.

  See [Noise-contrastive estimation: A new estimation principle for
-  unnormalized statistical models]
-  (http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
-  Also see our [Candidate Sampling Algorithms Reference]
-  (../../extras/candidate_sampling.pdf)
+  unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
+  Also see our [Candidate Sampling Algorithms Reference](../../extras/candidate_sampling.pdf)

  Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
  so your labels must be sorted in order of decreasing frequency to achieve
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@ -210,36 +210,34 @@ def _BiasAddGradGrad(op, received_grad):
  Args:
    op: BiasAddGrad op for which we are calculating gradients.
    received_grad: The gradients passed to the BiasAddGrad op.
-    
+
  Returns:
    A single gradient Tensor for the input to BiasAddGrad (which
    is the gradient of the bias term in BiasAdd)
  """
-  
+
  try:
    data_format = op.get_attr("data_format")
  except ValueError:
    data_format = None
-  
+
  shape = array_ops.shape(op.inputs[0])
  rank = array_ops.rank(op.inputs[0])
  bias_shape = array_ops.shape(received_grad)
-  
+
  if data_format == b"NCHW":
    expanded_shape = array_ops.concat(
      0,
      [array_ops.ones_like(shape[:-3]), bias_shape, array_ops.ones_like(shape[-2:])]
    )
-    
    tile_mults = array_ops.concat(0, [shape[:-3], [1], shape[-2:]])
-    
  else:
    expanded_shape = array_ops.concat(0, [array_ops.ones_like(shape[:-1]), bias_shape])
    tile_mults = array_ops.concat(0, [shape[:-1], [1]])
-  
+
  expanded_grad = array_ops.reshape(received_grad, expanded_shape)
  return array_ops.tile(expanded_grad, tile_mults)
-  
+

@ops.RegisterGradient("BiasAddV1")
 def _BiasAddGradV1(unused_bias_op, received_grad):
@ -498,7 +496,8 @@ def _FusedBatchNormGrad(op, *grad):
      op.outputs[3],
      op.outputs[4],
      epsilon=op.get_attr("epsilon"),
-      data_format=op.get_attr("data_format"))
+      data_format=op.get_attr("data_format"),
+      is_training=op.get_attr("is_training"))


@ops.RegisterGradient("L2Loss")
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@ -842,9 +842,9 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
  that effectively use atrous convolution in different ways are, among others,
  [OverFeat: Integrated Recognition, Localization and Detection using
  Convolutional Networks](http://arxiv.org/abs/1312.6229) and [Fast Image
-  Scanning with Deep Max-Pooling Convolutional Neural Networks]
-  (http://arxiv.org/abs/1302.1700). Atrous convolution is also closely related
-  to the so-called noble identities in multi-rate signal processing.
+  Scanning with Deep Max-Pooling Convolutional Neural Networks](http://arxiv.org/abs/1302.1700).
+  Atrous convolution is also closely related to the so-called noble identities
+  in multi-rate signal processing.

  There are many different ways to implement atrous convolution (see the refs
  above). The implementation here reduces
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@ -153,8 +153,7 @@ def parse_example(serialized, features, name=None, example_names=None):
  # pylint: disable=line-too-long
  """Parses `Example` protos into a `dict` of tensors.

-  Parses a number of serialized [`Example`]
-  (https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
+  Parses a number of serialized [`Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
  protos given in `serialized`.

  `example_names` may contain descriptive names for the corresponding serialized
@ -549,8 +548,7 @@ def parse_single_sequence_example(
  # pylint: disable=line-too-long
  """Parses a single `SequenceExample` proto.

-  Parses a single serialized [`SequenceExample`]
-  (https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
+  Parses a single serialized [`SequenceExample`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
  proto given in `serialized`.

  This op parses a serialize sequence example into a tuple of dictionaries
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@ -609,9 +609,6 @@ def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
      most TensorFlow data is batch-major, so by default this function
      accepts input and emits output in batch-major form.
    dtype: (optional) The data type for the initial state.  Required if
-      initial_state is not provided.
-    sequence_length: An int32/int64 vector, size `[batch_size]`,
-      containing the actual lengths for each of the sequences.
      either of the initial states are not provided.
    scope: VariableScope for the created subgraph; defaults to "BiRNN"

--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@ -94,48 +94,100 @@ def einsum(axes, *inputs):
  """
  A generalized contraction between tensors of arbitrary dimension.

-  Like numpy.einsum.
-  """
+  Like `numpy.einsum`, but does not support:
+  * Ellipses (subscripts like `ij...,jk...->ik...`)
+  * Subscripts where an axis appears more than once for a single input (e.g. `ijj,jk->ik`).

-  match = re.match('([a-z,]+)->([a-z]+)', axes)
-  assert match, \
-    "Indices have incorrect format: %s" % axes
+  Args:
+    axes: a `str` describing the contraction, in the same format as `numpy.einsum`.
+    inputs: the inputs to contract (each one a `Tensor`), whose shapes should be consistent with `axes`.
+
+  Returns:
+    The contracted `Tensor`, with shape determined by `axes`.
+
+  Raises:
+    ValueError: If the format of `axes` is incorrect,
+                or the number of inputs implied by `axes` does not match `len(inputs)`,
+                or an axis appears in the output subscripts but not in any of the inputs,
+                or the number of dimensions of an input differs from the number of indices in its subscript,
+                or the input shapes are inconsistent along a particular axis.
+  """
+  if '...' in axes:
+    raise ValueError("Subscripts with ellipses are not yet supported.")
+
+  match = re.match('([a-z,]+)(->[a-z]*)?', axes)
+  if not match:
+    raise ValueError(
+      "Indices have incorrect format: %s" % axes
+    )

  inputs = list(inputs)
  idx_in = match.group(1).split(',')
-  idx_out = match.group(2)
  idx_all = set(''.join(idx_in))
+  indices = ''.join(sorted(idx_all))

+  if match.group(2):
+    idx_out = match.group(2)[2:]

-  assert len(idx_in) == len(inputs), \
-    "Expected %d inputs but only got %d" % (len(idx_in), len(inputs))
+  else:
+    # infer the output subscripts if not given, assume alphabetical order
+    counts = {ax: 0 for ax in indices}
+    for axes_ in idx_in:
+      for ax in axes_:
+        counts[ax] += 1

-  # transpose inputs so axes are in alphabetical order
+    idx_out = ''.join(sorted(
+      ax for ax in indices
+      if counts[ax] == 1
+    ))
+
+  if len(idx_in) != len(inputs):
+    raise ValueError(
+      "Expected %d inputs but got %d" % (len(idx_in), len(inputs))
+    )
+
+  missing_idx = set(idx_out).difference(idx_all)
+  if missing_idx:
+    raise ValueError(
+      "Unknown ouput axes: %s" % missing_idx
+    )
+
+  axis_order = {}
+  for ax in indices:
+    if ax not in idx_out:
+      axis_order[ax] = len(axis_order)
+  for ax in idx_out:
+    axis_order[ax] = len(axis_order)
+
+  # transpose inputs so axes are in order
  for i, (input_, axes_) in enumerate(zip(inputs, idx_in)):
-    assert input_.get_shape().ndims == len(axes_), \
-      "Input %d with axes %s has incorrect" \
-      " number of dimensions (expected %d, got %d)" % (
-        i, axes_, len(axes_), input_.get_shape().ndims
+    if input_.get_shape().ndims != len(axes_):
+      raise ValueError(
+        "Input %d with axes %s has incorrect" \
+        " number of dimensions (expected %d, got %d)" % (
+          i, axes_, len(axes_), input_.get_shape().ndims
+        )
      )

-    sorted_idx = sorted(axes_)
+    sorted_idx = sorted(axes_, key=axis_order.get)
+
+    if len(set(axes_)) != len(axes_):
+      raise ValueError(
+        "Subscript not supported: an axis appears more than once: %s" % axes_
+      )

    if list(axes_) != sorted_idx:
      permuted = [axes_.find(ax) for ax in sorted_idx]
      inputs[i] = array_ops.transpose(input_, permuted)
      idx_in[i] = sorted_idx

-  missing_idx = set(idx_out).difference(idx_all)
-  assert not missing_idx, \
-    "Unknown ouput axes: %s" % missing_idx
-
  reduction_idx = []
  shapes = [[dim if dim else -1
             for dim in tensor.get_shape().as_list()]
            for tensor in inputs]

  # validate shapes for broadcasting
-  for j, ax in enumerate(sorted(idx_all)):
+  for j, ax in enumerate(sorted(idx_all, key=axis_order.get)):
    dims = []
    for i, idx in enumerate(idx_in):
      if ax not in idx:
@ -145,8 +197,10 @@ def einsum(axes, *inputs):
        if isinstance(dim, int) and dim > 1:
          dims.append(dim)

-    assert len(set(dims)) <= 1, \
-      "Dimension mismatch on axis: %s" % ax
+    if len(set(dims)) > 1:
+      raise ValueError(
+        "Dimension mismatch on axis: %s" % ax
+      )

    if ax not in idx_out:
      reduction_idx.append(j)
--- a/tensorflow/python/ops/special_math_ops_test.py
+++ b/tensorflow/python/ops/special_math_ops_test.py
@ -113,39 +113,77 @@ class LBetaTestGpu(LBetaTest):

 class EinsumTest(tf.test.TestCase):

-  # standard cases
  simple_cases = [
    'ij,jk->ik',
    'ijk,jklm->il',
    'ij,jk,kl->il',
    'ijk->i',
-  ]
-
-  # where axes are not in order
-  misordered_cases = [
+    'ijk->kji',
    'ji,kj->ik',
+
    'ikl,kji->kl',
    'klj,lki->ij',
-  ]
-
-  # more than two arguments
-  multiarg_cases = [
+    'ijk,ilj->kli',
+    'kij,mkb->ijmb',
    'ijk,ijl,ikl->i',
    'i,ijk,j->k',
    'ij,ij,jk,kl->il',
+    'ij,kj,il,jm->ml',
+
+    'a,ab,abc->abc',
+    'a,b,ab->ab',
+    'ab,ab,c->',
+    'ab,ab,c->c',
+    'ab,ab,cd,cd->',
+    'ab,ab,cd,cd->ac',
+    'ab,ab,cd,cd->cd',
+    'ab,ab,cd,cd,ef,ef->',
+
+    'ab,cd,ef->abcdef',
+    'ab,cd,ef->acdf',
+    'ab,cd,de->abcde',
+    'ab,cd,de->be',
+    'ab,bcd,cd->abcd',
+    'ab,bcd,cd->abd',
+
+    'eb,cb,fb->cef',
+    'abcd,ad',
+    'bd,db,eac->ace',
+    'ba,ac,da->bcd',
+
+    'ab,ab',
+    'ab,ba',
+    'abc,abc',
+    'abc,bac',
+    'abc,cba',
+
+    'dba,ead,cad->bce',
+    'aef,fbc,dca->bde',
+  ]
+
+  long_cases = [
+    'bca,cdb,dbf,afc->',
+    'efc,dbc,acf,fd->abe',
+    'ea,fb,gc,hd,abcd->efgh',
+    'ea,fb,abcd,gc,hd->efgh',
+    'abhe,hidj,jgba,hiab,gab',
  ]

  invalid_cases = [
    # bad formats
+    '',
    'ijk ijk',
-    'ij,jk,kl'
-    'ij->',
+    'ij.jk->ik',
+    'ij...,jk...->ik...',

    # axis in output that does not exist
    'ij,jk->im',

    # incorrect number of dimensions
    'ij,jkl->kl',
+
+    # this is allowed in numpy but not implemented here yet
+    'iij,jk'
  ]

  dim_mismatch_cases = [
@ -158,28 +196,18 @@ class EinsumTest(tf.test.TestCase):
    for case in self.simple_cases:
      self.run_test(case)

-  def test_misordered(self):
-    for case in self.misordered_cases:
-      self.run_test(case)
-
-  def test_multiarg(self):
-    for case in self.multiarg_cases:
+  def test_long(self):
+    for case in self.long_cases:
      self.run_test(case)

  def test_invalid(self):
    for axes in self.invalid_cases:
-      result = None
      inputs = [
        tf.placeholder(tf.float32, shape=(3,4)),
        tf.placeholder(tf.float32, shape=(3,4)),
      ]
-
-      try:
-        result = tf.einsum(axes, *inputs)
-      except AssertionError as e:
-        print(e)
-      assert result is None, \
-        "An exception should have been thrown."
+      with self.assertRaises(ValueError):
+        _ = tf.einsum(axes, *inputs)

  def test_dim_mismatch(self):
    for axes, input_shapes in self.dim_mismatch_cases:
@ -187,12 +215,8 @@ class EinsumTest(tf.test.TestCase):
        tf.placeholder(tf.float32, shape=shape)
        for shape in input_shapes
      ]
-      result = None
-      try:
-        result = tf.einsum(axes, *inputs)
-      except AssertionError:
-        pass
-      assert result is None, "An exception should have been thrown."
+      with self.assertRaises(ValueError):
+        _ = tf.einsum(axes, *inputs)

  def run_test(self, axes):
    all_axes = {ax: np.random.randint(4, 12)
--- a/tensorflow/python/training/rmsprop.py
+++ b/tensorflow/python/training/rmsprop.py
@ -50,8 +50,7 @@ from tensorflow.python.training import training_ops
 class RMSPropOptimizer(optimizer.Optimizer):
  """Optimizer that implements the RMSProp algorithm.

-  See the [paper]
-  (http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+  See the [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).

  @@__init__
  """
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@ -33,8 +33,6 @@ limitations under the License.

 #include "tensorflow/stream_executor/cuda/cuda_blas.h"

-#include <dlfcn.h>
-
 #include <complex>

 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
@ -44,6 +42,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/status_macros.h"
@ -71,14 +70,20 @@ namespace dynload {
      static auto status = internal::CachedDsoLoader::GetCublasDsoHandle(); \
      return status.ValueOrDie();                                           \
    }                                                                       \
-    static FuncPointerT DynLoad() {                                         \
-      static void *f = dlsym(GetDsoHandle(), kName);                        \
-      CHECK(f != nullptr) << "could not find " << kName                     \
-                          << " in cuBLAS DSO; dlerror: " << dlerror();      \
+    static FuncPointerT LoadOrDie() {                                       \
+      void *f;                                                              \
+      port::Status s = port::Env::Default()->GetSymbolFromLibrary(          \
+          GetDsoHandle(), kName, &f);                                       \
+      CHECK(s.ok()) << "could not find " << kName                           \
+                    << " in cuBLAS DSO; dlerror: " << s.error_message();    \
      return reinterpret_cast<FuncPointerT>(f);                             \
    }                                                                       \
+    static FuncPointerT DynLoad() {                                         \
+      static FuncPointerT f = LoadOrDie();                                  \
+      return f;                                                             \
+    }                                                                       \
    template <typename... Args>                                             \
-    cublasStatus_t operator()(CUDAExecutor * parent, Args... args) {        \
+    cublasStatus_t operator()(CUDAExecutor *parent, Args... args) {         \
      cuda::ScopedActivateExecutorContext sac{parent};                      \
      return DynLoad()(args...);                                            \
    }                                                                       \
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@ -15,7 +15,6 @@ limitations under the License.

 #include "tensorflow/stream_executor/cuda/cuda_dnn.h"

-#include <dlfcn.h>
 #include <functional>
 #include <memory>

@ -137,36 +136,47 @@ void* GetDsoHandle() {
  return result.ValueOrDie();
 }

-// Calls cudnnGetVersion in the loaded DSO.
-size_t cudnnGetVersion() {
-  static void* f = dlsym(GetDsoHandle(), "cudnnGetVersion");
+static void* DynLoadGetVersionOrDie() {
+  void* f = nullptr;  // stays null if the lookup fails without writing f
+  port::Status s = port::Env::Default()->GetSymbolFromLibrary(
+      GetDsoHandle(), "cudnnGetVersion", &f);
   if (f == nullptr) {
     LOG(FATAL) << "could not find cudnnGetVersion in cudnn DSO; dlerror: "
-               << dlerror();
+               << s.error_message();
   }
+  return f;
+}
+
+// Calls cudnnGetVersion in the loaded DSO.
+size_t cudnnGetVersion() {
+  static void* f = DynLoadGetVersionOrDie();
   auto callable = reinterpret_cast<size_t (*)(void)>(f);
   return callable();
 }

-#define PERFTOOLS_GPUTOOLS_CUDNN_WRAP(__name)                        \
-  struct DynLoadShim__##__name {                                     \
-    static const char* kName;                                        \
-    typedef std::add_pointer<decltype(::__name)>::type FuncPointerT; \
-    static FuncPointerT DynLoad() {                                  \
-      static void* f = dlsym(GetDsoHandle(), kName);                 \
-      if (f == nullptr) {                                            \
-        LOG(FATAL) << "could not find " << kName                     \
-                   << " in cudnn DSO; dlerror: " << dlerror();       \
-      }                                                              \
-      return reinterpret_cast<FuncPointerT>(f);                      \
-    }                                                                \
-    template <typename... Args>                                      \
-    cudnnStatus_t operator()(CUDAExecutor* parent, Args... args) {   \
-      cuda::ScopedActivateExecutorContext sac{parent};               \
-      cudnnStatus_t retval = DynLoad()(args...);                     \
-      return retval;                                                 \
-    }                                                                \
-  } __name;                                                          \
+#define PERFTOOLS_GPUTOOLS_CUDNN_WRAP(__name)                           \
+  struct DynLoadShim__##__name {                                        \
+    static const char* kName;                                           \
+    typedef std::add_pointer<decltype(::__name)>::type FuncPointerT;    \
+    static FuncPointerT LoadOrDie() {                                   \
+      void* f;                                                          \
+      port::Status s = port::Env::Default()->GetSymbolFromLibrary(      \
+          GetDsoHandle(), kName, &f);                                   \
+      CHECK(s.ok()) << "could not find " << kName                       \
+                    << " in cudnn DSO; dlerror: " << s.error_message(); \
+      return reinterpret_cast<FuncPointerT>(f);                         \
+    }                                                                   \
+    static FuncPointerT DynLoad() {                                     \
+      static FuncPointerT f = LoadOrDie();                              \
+      return f;                                                         \
+    }                                                                   \
+    template <typename... Args>                                         \
+    cudnnStatus_t operator()(CUDAExecutor* parent, Args... args) {      \
+      cuda::ScopedActivateExecutorContext sac{parent};                  \
+      cudnnStatus_t retval = DynLoad()(args...);                        \
+      return retval;                                                    \
+    }                                                                   \
+  } __name;                                                             \
  const char* DynLoadShim__##__name::kName = #__name;

 // clang-format off
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@ -15,7 +15,6 @@ limitations under the License.

 #include "tensorflow/stream_executor/cuda/cuda_driver.h"

-#include <dlfcn.h>
 #include <map>
 #include <stdint.h>
 #include <stdlib.h>
@ -61,12 +60,18 @@ namespace dynload {
      static auto status = internal::CachedDsoLoader::GetLibcudaDsoHandle(); \
      return status.ValueOrDie();                                            \
    }                                                                        \
-    static FuncPointerT DynLoad() {                                          \
-      static void *f = dlsym(GetDsoHandle(), kName);                         \
-      CHECK(f != nullptr) << "could not find " << kName                      \
-                          << "in libcuda DSO; dlerror: " << dlerror();       \
+    static FuncPointerT LoadOrDie() {                                        \
+      void *f;                                                               \
+      port::Status s = port::Env::Default()->GetSymbolFromLibrary(           \
+          GetDsoHandle(), kName, &f);                                        \
+      CHECK(s.ok()) << "could not find " << kName                            \
+                    << " in libcuda DSO; dlerror: " << s.error_message();    \
      return reinterpret_cast<FuncPointerT>(f);                              \
    }                                                                        \
+    static FuncPointerT DynLoad() {                                          \
+      static FuncPointerT f = LoadOrDie();                                   \
+      return f;                                                              \
+    }                                                                        \
    template <typename... Args>                                              \
    CUresult operator()(Args... args) {                                      \
      return DynLoad()(args...);                                             \
--- a/tensorflow/stream_executor/cuda/cuda_fft.cc
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@ -15,8 +15,6 @@ limitations under the License.

 #include "tensorflow/stream_executor/cuda/cuda_fft.h"

-#include <dlfcn.h>
-
 #include <complex>

 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
@ -26,6 +24,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/platform/logging.h"
@ -55,13 +54,20 @@ namespace dynload {
       return status.ValueOrDie();                                          \
     }                                                                      \
-    static FuncPointerT DynLoad() {                                        \
-      static void *f = dlsym(GetDsoHandle(), kName);                       \
-      CHECK(f != nullptr) << "could not find " << kName                    \
-                          << " in cuFFT DSO; dlerror: " << dlerror();      \
+    static FuncPointerT LoadOrDie() {                                      \
+      void *f;                                                             \
+      port::Status s = port::Env::Default()->GetSymbolFromLibrary(         \
+          GetDsoHandle(), kName, &f);                                      \
+      CHECK(s.ok()) << "could not find " << kName                          \
+                    << " in cuFFT DSO; dlerror: " << s.error_message();    \
       return reinterpret_cast<FuncPointerT>(f);                            \
     }                                                                      \
+    static FuncPointerT DynLoad() {                                        \
+      /* Resolved once; later calls reuse the cached pointer. */           \
+      static FuncPointerT f = LoadOrDie();                                 \
+      return f;                                                            \
+    }                                                                      \
     template <typename... Args>                                            \
-    cufftResult operator()(CUDAExecutor * parent, Args... args) {          \
+    cufftResult operator()(CUDAExecutor *parent, Args... args) {           \
       cuda::ScopedActivateExecutorContext sac{parent};                     \
       return DynLoad()(args...);                                           \
     }                                                                      \
--- a/tensorflow/stream_executor/cuda/cuda_rng.cc
+++ b/tensorflow/stream_executor/cuda/cuda_rng.cc
@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/platform/logging.h"
@ -72,14 +73,20 @@ namespace dynload {
      static auto status = internal::CachedDsoLoader::GetCurandDsoHandle(); \
      return status.ValueOrDie();                                           \
    }                                                                       \
-    static FuncPointerT DynLoad() {                                         \
-      static void *f = dlsym(GetDsoHandle(), kName);                        \
-      CHECK(f != nullptr) << "could not find " << kName                     \
-                          << " in curand DSO; dlerror: " << dlerror();      \
+    static FuncPointerT LoadOrDie() {                                       \
+      void *f;                                                              \
+      port::Status s = port::Env::Default()->GetSymbolFromLibrary(          \
+          GetDsoHandle(), kName, &f);                                       \
+      CHECK(s.ok()) << "could not find " << kName                           \
+                    << " in curand DSO; dlerror: " << s.error_message();    \
      return reinterpret_cast<FuncPointerT>(f);                             \
    }                                                                       \
+    static FuncPointerT DynLoad() {                                         \
+      static FuncPointerT f = LoadOrDie();                                  \
+      return f;                                                             \
+    }                                                                       \
    template <typename... Args>                                             \
-    curandStatus_t operator()(CUDAExecutor * parent, Args... args) {        \
+    curandStatus_t operator()(CUDAExecutor *parent, Args... args) {         \
      cuda::ScopedActivateExecutorContext sac{parent};                      \
      return DynLoad()(args...);                                            \
    }                                                                       \
--- a/tensorflow/stream_executor/dso_loader.cc
+++ b/tensorflow/stream_executor/dso_loader.cc
@ -29,9 +29,9 @@ limitations under the License.
 #include <vector>

 #include "tensorflow/core/platform/load_library.h"
+#include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/str_util.h"
-#include "tensorflow/stream_executor/lib/str_util.h"
 #include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/platform/logging.h"
@ -97,19 +97,22 @@ string GetCudnnVersion() { return TF_CUDNN_VERSION; }
 /* static */ port::Status DsoLoader::GetDsoHandle(port::StringPiece path,
                                                   void** dso_handle,
                                                   LoadKind load_kind) {
-  int dynload_flags =
-      RTLD_LAZY | (load_kind == LoadKind::kLocal ? RTLD_LOCAL : RTLD_GLOBAL);
+  // LoadLibrary below is called without load-kind flags, so anything other
+  if (load_kind != LoadKind::kLocal) {  // than kLocal is rejected up front.
+    return port::Status(port::error::INVALID_ARGUMENT,
+                        "Only LoadKind::kLocal is currently supported");
+  }
   string path_string = path.ToString();
-  *dso_handle = dlopen(path_string.c_str(), dynload_flags);
-  if (*dso_handle == nullptr) {
+  port::Status s =
+      port::Env::Default()->LoadLibrary(path_string.c_str(), dso_handle);
+  if (!s.ok()) {
     LOG(INFO) << "Couldn't open CUDA library " << path
               << ". LD_LIBRARY_PATH: " << getenv("LD_LIBRARY_PATH");
-    return port::Status(
-        port::error::FAILED_PRECONDITION,
-        port::StrCat("could not dlopen DSO: ", path, "; dlerror: ", dlerror()));
+    return port::Status(port::error::FAILED_PRECONDITION,
+                        port::StrCat("could not dlopen DSO: ", path,
+                                     "; dlerror: ", s.error_message()));
   }
-  LOG(INFO) << "successfully opened CUDA library " << path
-            << (load_kind == LoadKind::kLocal ? " locally" : " globally");
+  LOG(INFO) << "successfully opened CUDA library " << path << " locally";
   return port::Status::OK();
 }

--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@ -141,6 +141,12 @@ def if_not_mobile(a):
      "//conditions:default": a,
  })

+# Returns `a` unless the "//tensorflow:windows" condition matches, in which
+# case an empty list is selected instead.
+def if_not_windows(a):
+  return select({
+      "//tensorflow:windows": [],
+      "//conditions:default": a,
+  })
+
 def tf_copts():
  return (["-DEIGEN_AVOID_STL_ARRAY",
           "-Iexternal/gemmlowp",
@ -156,6 +162,10 @@ def tf_copts():
                  "-O2",
              ],
              "//tensorflow:darwin": [],
+              "//tensorflow:windows": [
+                "/DLANG_CXX11",
+                "/D__VERSION__=\\\"MSVC\\\"",
+              ],
              "//tensorflow:ios": ["-std=c++11"],
              "//conditions:default": ["-pthread"]}))

@ -565,12 +575,15 @@ def _py_wrap_cc_impl(ctx):
  args += ["-outdir", py_out.dirname]
  args += [src.path]
  outputs = [cc_out, py_out]
-  ctx.action(executable=ctx.executable.swig_binary,
-             arguments=args,
+  # TODO(pcloudy): Move args to arguments after
+  # https://github.com/bazelbuild/bazel/issues/1926 is fixed
+  ctx.action(command=" ".join(["tensorflow/tools/swig/swig.sh"] + args),
+             arguments=[],
             mnemonic="PythonSwig",
             inputs=sorted(set([src]) + cc_includes + ctx.files.swig_includes +
                         ctx.attr.swig_deps.files),
             outputs=outputs,
+             use_default_shell_env=True,
             progress_message="SWIGing {input}".format(input=src.path))
  return struct(files=set(outputs))

@ -593,12 +606,6 @@ _py_wrap_cc = rule(
        )),
        "module_name": attr.string(mandatory = True),
        "py_module_name": attr.string(mandatory = True),
-        "swig_binary": attr.label(
-            default = Label("//tensorflow:swig"),
-            cfg = "host",
-            executable = True,
-            allow_files = True,
-        ),
    },
    outputs = {
        "cc_out": "%{module_name}.cc",
@ -743,6 +750,7 @@ def tf_py_wrap_cc(name, srcs, swig_includes=[], deps=[], copts=[], **kwargs):
  # Convert a rule name such as foo/bar/baz to foo/bar/_baz.so
  # and use that as the name for the rule producing the .so file.
  cc_library_name = "/".join(name.split("/")[:-1] + ["_" + module_name + ".so"])
+  cc_library_pyd_name = "/".join(name.split("/")[:-1] + ["_" + module_name + ".pyd"])
  extra_deps = []
  _py_wrap_cc(name=name + "_py_wrap",
              srcs=srcs,
@ -755,6 +763,8 @@ def tf_py_wrap_cc(name, srcs, swig_includes=[], deps=[], copts=[], **kwargs):
          "-Wl,-exported_symbols_list",
          "//tensorflow:tf_exported_symbols.lds"
      ],
+      "//tensorflow:windows": [
+      ],
      "//conditions:default": [
          "-Wl,--version-script",
          "//tensorflow:tf_version_script.lds"
@ -763,6 +773,8 @@ def tf_py_wrap_cc(name, srcs, swig_includes=[], deps=[], copts=[], **kwargs):
      "@local_config_cuda//cuda:darwin": [
        "//tensorflow:tf_exported_symbols.lds"
      ],
+      "//tensorflow:windows": [
+      ],
      "//conditions:default": [
        "//tensorflow:tf_version_script.lds"
      ]
@ -779,10 +791,19 @@ def tf_py_wrap_cc(name, srcs, swig_includes=[], deps=[], copts=[], **kwargs):
      linkstatic=1,
      linkshared=1,
      deps=deps + extra_deps)
+  native.genrule(
+      name = "gen_" + cc_library_pyd_name,
+      srcs = [":" + cc_library_name],
+      outs = [cc_library_pyd_name],
+      cmd = "cp $< $@",
+  )
  native.py_library(name=name,
                    srcs=[":" + name + ".py"],
                    srcs_version="PY2AND3",
-                    data=[":" + cc_library_name])
+                    data=select({
+                      "//tensorflow:windows": [":" + cc_library_pyd_name],
+                      "//conditions:default": [":" + cc_library_name],
+                    }))

 def tf_py_test(name, srcs, size="medium", data=[], main=None, args=[],
               tags=[], shard_count=1, additional_deps=[], flaky=0):
--- a/tensorflow/tools/ci_build/install/install_deb_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_deb_packages.sh
@ -44,5 +44,10 @@ apt-get install -y --no-install-recommends \
    wget \
    zip \
    zlib1g-dev
+
+# Install ca-certificates, and update the certificate store.
+# -y answers the confirmation prompt automatically, like the preceding
+# "apt-get install -y" call; without it an unattended run can abort at the
+# prompt.
+apt-get install -y ca-certificates-java
+update-ca-certificates -f
+
 apt-get clean
 rm -rf /var/lib/apt/lists/*
--- a/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh
+++ b/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh
@ -95,4 +95,14 @@ if [[ -z ${NEW_TFREC_URL} ]]; then
 fi
 "${GSUTIL_BIN}" rm "${NEW_TFREC_URL}" && \
    echo "Cleaned up new tfrecord file in GCS: ${NEW_TFREC_URL}" || \
-    die "FAIL: Unable to clean up new tfrecord file in GCS: ${NEW_TFREC_URL}"
+    die "FAIL: Unable to clean up new tfrecord file in GCS: ${NEW_TFREC_URL}"
+
+# Also clean up newly created GCS dir. Its URL is taken from the last field of
+# the "Creating dir" line in the log file.
+NEW_DIR_URL=$(grep "Creating dir" "${LOG_FILE}" | \
+                awk '{print $NF}')
+if [[ -z ${NEW_DIR_URL} ]]; then
+  die "FAIL: Unable to determine the URL to the new directory created in GCS."
+fi
+# Use if/else rather than "A && B || C" so that die runs only when the removal
+# itself fails, not when the echo does.
+if "${GSUTIL_BIN}" rm -r "${NEW_DIR_URL}"; then
+  echo "Cleaned up new directory created in GCS: ${NEW_DIR_URL}"
+else
+  die "FAIL: Unable to clean up new directory created in GCS: ${NEW_DIR_URL}"
+fi
--- a/tensorflow/tools/gcs_test/python/gcs_smoke.py
+++ b/tensorflow/tools/gcs_test/python/gcs_smoke.py
@ -19,10 +19,12 @@ from __future__ import print_function

 import random
 import sys
+import time

 import numpy as np
 import tensorflow as tf
 from tensorflow.core.example import example_pb2
+from tensorflow.python.lib.io import file_io

 flags = tf.app.flags
 flags.DEFINE_string("gcs_bucket_url", "",
@ -48,6 +50,25 @@ def create_examples(num_examples, input_mean):
    examples.append(ex)
  return examples

+def create_dir_test():
+  """Verifies file_io directory handling methods.
+
+  Creates a timestamp-suffixed directory under FLAGS.gcs_bucket_url, checks
+  that it is reported as a directory, and lists its contents, printing the
+  elapsed time of the create and list calls.
+
+  Raises:
+    RuntimeError: if the directory is not reported as existing after creation.
+  """
+
+  # The millisecond timestamp is also used as the directory name suffix.
+  starttime_ms = int(round(time.time() * 1000))
+  dir_name = "%s/tf_gcs_test_%s" % (FLAGS.gcs_bucket_url, starttime_ms)
+  print("Creating dir %s" % dir_name)
+  file_io.create_dir(dir_name)
+  elapsed_ms = int(round(time.time() * 1000)) - starttime_ms
+  print("Created directory in: %d milliseconds" % elapsed_ms)
+  # Check that the directory exists; report the result before failing so the
+  # output still records what was observed.
+  dir_exists = file_io.is_directory(dir_name)
+  print("%s directory exists: %s" % (dir_name, dir_exists))
+  if not dir_exists:
+    raise RuntimeError("Directory %s does not exist after creation" % dir_name)
+
+  # List contents of just created directory.
+  starttime_ms = int(round(time.time() * 1000))
+  print("Listing directory %s." % dir_name)
+  print(file_io.list_directory(dir_name))
+  elapsed_ms = int(round(time.time() * 1000)) - starttime_ms
+  print("Listed directory %s in %s milliseconds" % (dir_name, elapsed_ms))

 if __name__ == "__main__":
  # Sanity check on the GCS bucket URL.
@ -110,3 +131,5 @@ if __name__ == "__main__":
      except tf.errors.OutOfRangeError:
        print("Successfully caught the expected OutOfRangeError while "
              "reading one more record than is available")
+
+    create_dir_test()
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@ -112,9 +112,14 @@ def configure(src_base_path, debug=False):
    if src is None:
      open(os.path.join(gen_path, target), "w").write("")
    else:
-      if hasattr(os, 'symlink'):
-        os.symlink(src, os.path.join(gen_path, target))
-      else:
+      try:
+        # In python 3.5, symlink function exists even on Windows. But requires
+        # Windows Admin privileges, otherwise an OSError will be thrown.
+        if hasattr(os, 'symlink'):
+          os.symlink(src, os.path.join(gen_path, target))
+        else:
+          shutil.copy2(src, os.path.join(gen_path, target))
+      except OSError:
        shutil.copy2(src, os.path.join(gen_path, target))

  json.dump(spec, open(os.path.join(gen_path, "spec.json"), "w"), indent=2)
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@ -20,15 +20,17 @@ py_binary(
    deps = ["//tensorflow:tensorflow_py"],
 )

-sh_binary(
-    name = "build_pip_package",
-    srcs = ["build_pip_package.sh"],
+# On Windows, python binary is a zip file of runfiles tree.
+# Add everything to its data dependency for generating a runfiles tree
+# for building the pip package on Windows.
+py_binary(
+    name = "simple_console_for_windows",
+    srcs = ["simple_console_for_windows.py"],
    data = [
        "MANIFEST.in",
        "README",
        "setup.py",
        ":other_headers",
-        ":simple_console",
        "//tensorflow:tensorflow_py",
        "//tensorflow/contrib/ndlstm:all_files",
        "//tensorflow/contrib/session_bundle:all_files",
@ -44,13 +46,55 @@ sh_binary(
        "//tensorflow/models/image/alexnet:all_files",
        "//tensorflow/models/image/cifar10:all_files",
        "//tensorflow/models/image/imagenet:all_files",
-        "//tensorflow/models/image/mnist:convolutional",
        "//tensorflow/models/rnn:package",
        "//tensorflow/models/rnn/ptb:package",
        "//tensorflow/models/rnn/translate:package",
        "//tensorflow/python:util_example_parser_configuration",
        "//tensorflow/python/debug:all_files",
        "//tensorflow/python/saved_model:all_files",
-        "//tensorflow/tensorboard",
+        # The following two targets have an issue when archiving them into
+        # the python zip, exclude them for now.
+        # "//tensorflow/models/image/mnist:convolutional",
+        # "//tensorflow/tensorboard",
    ],
+    srcs_version = "PY2AND3",
+    deps = ["//tensorflow:tensorflow_py"],
+)
+
+# Shell binary wrapping build_pip_package.sh. When "//tensorflow:windows"
+# matches, its only data dependency is :simple_console_for_windows; in the
+# default case the package files are listed directly as data.
+sh_binary(
+    name = "build_pip_package",
+    srcs = ["build_pip_package.sh"],
+    data = select({
+        "//tensorflow:windows": [":simple_console_for_windows"],
+        "//conditions:default": [
+            "MANIFEST.in",
+            "README",
+            "setup.py",
+            ":other_headers",
+            ":simple_console",
+            "//tensorflow:tensorflow_py",
+            "//tensorflow/contrib/ndlstm:all_files",
+            "//tensorflow/contrib/session_bundle:all_files",
+            "//tensorflow/contrib/slim:all_files",
+            "//tensorflow/contrib/slim/python/slim/data:all_files",
+            "//tensorflow/contrib/slim/python/slim/nets:all_files",
+            "//tensorflow/contrib/specs:all_files",
+            "//tensorflow/contrib/tensor_forest:all_files",
+            "//tensorflow/contrib/tensor_forest/hybrid:all_files",
+            "//tensorflow/core:framework_headers",
+            "//tensorflow/examples/tutorials/mnist:package",
+            "//tensorflow/models/embedding:package",
+            "//tensorflow/models/image/alexnet:all_files",
+            "//tensorflow/models/image/cifar10:all_files",
+            "//tensorflow/models/image/imagenet:all_files",
+            "//tensorflow/models/image/mnist:convolutional",
+            "//tensorflow/models/rnn:package",
+            "//tensorflow/models/rnn/ptb:package",
+            "//tensorflow/models/rnn/translate:package",
+            "//tensorflow/python:util_example_parser_configuration",
+            "//tensorflow/python/debug:all_files",
+            "//tensorflow/python/saved_model:all_files",
+            "//tensorflow/tensorboard",
+        ],
+    }),
 )
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@ -25,6 +25,16 @@ function cp_external() {
  done
 }

+PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"
+# Returns success (0) when the lowercased `uname -s` output stored in PLATFORM
+# starts with "msys_nt", and failure (1) otherwise.
+function is_windows() {
+  # On windows, the shell script is actually running in msys.
+  # Use a glob match here: with =~ the pattern "msys_nt*" is a regex meaning
+  # "msys_n" followed by zero or more "t", matched anywhere in the string.
+  [[ "${PLATFORM}" == msys_nt* ]]
+}
+
 function main() {
  if [ $# -lt 1 ] ; then
    echo "No destination dir provided"
@ -41,7 +51,23 @@ function main() {
    exit 1
  fi

-  if [ ! -d bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow ]; then
+  if is_windows; then
+    rm -rf ./bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip
+    mkdir -p ./bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip
+    echo "Unzipping simple_console_for_windows.zip to create runfiles tree..."
+    unzip -o -q ./bazel-bin/tensorflow/tools/pip_package/simple_console_for_windows.zip -d ./bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip
+    echo "Unzip finished."
+    # runfiles structure after unzip the python binary
+    cp -R \
+      bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles/org_tensorflow/tensorflow \
+      "${TMPDIR}"
+    mkdir "${TMPDIR}/external"
+    # Note: this makes an extra copy of org_tensorflow.
+    cp_external \
+      bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles \
+      "${TMPDIR}/external"
+    RUNFILES=bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles/org_tensorflow
+  elif [ ! -d bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow ]; then
    # Really old (0.2.1-) runfiles, without workspace name.
    cp -R \
      bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/tensorflow \
@ -78,9 +104,13 @@ function main() {
  # protobuf pip package doesn't ship with header files. Copy the headers
  # over so user defined ops can be compiled.
  mkdir -p ${TMPDIR}/google
-  rsync --include "*/" --include "*.h" --exclude "*" --prune-empty-dirs -a \
-    $RUNFILES/external/protobuf ${TMPDIR}/google
-  rsync -a $RUNFILES/third_party/eigen3 ${TMPDIR}/third_party
+  mkdir -p ${TMPDIR}/third_party
+  pushd ${RUNFILES%org_tensorflow}
+  for header in $(find protobuf -name \*.h); do
+    cp --parents "$header" ${TMPDIR}/google;
+  done
+  popd
+  cp -R $RUNFILES/third_party/eigen3 ${TMPDIR}/third_party

  cp tensorflow/tools/pip_package/MANIFEST.in ${TMPDIR}
  cp tensorflow/tools/pip_package/README ${TMPDIR}
@ -93,7 +123,7 @@ function main() {
  pushd ${TMPDIR}
  rm -f MANIFEST
  echo $(date) : "=== Building wheel"
-  ${PYTHON_BIN_PATH:-python} setup.py bdist_wheel >/dev/null
+  "${PYTHON_BIN_PATH:-python}" setup.py bdist_wheel >/dev/null
  mkdir -p ${DEST}
  cp dist/* ${DEST}
  popd
--- a/Show More
+++ b/Show More