Merge changes from github.

PiperOrigin-RevId: 176695926
Yifei Feng 2017-11-22 13:42:21 -08:00 committed by TensorFlower Gardener
parent e219aeb542
commit b1d8c59e9b
217 changed files with 6304 additions and 1432 deletions

View File

@ -42,7 +42,7 @@ The Code of Conduct also applies within project spaces and in public spaces when
Conflicts in an open source project can take many forms, from someone having a bad day and using harsh and hurtful language in the issue queue, to more serious instances such as sexist/racist statements or threats of violence, and everything in between.
If the behaviour is threatening or harassing, or for other reasons requires immediate escalation, please see below.
If the behavior is threatening or harassing, or for other reasons requires immediate escalation, please see below.
However, for the vast majority of issues, we aim to empower individuals to first resolve conflicts themselves, asking for help when needed, and only after that fails to escalate further. This approach gives people more control over the outcome of their dispute.
@ -55,14 +55,14 @@ If you are experiencing or witnessing conflict, we ask you to use the following
## Reporting Violations
Violations of the Code of Conduct can be reported to TensorFlow's Project Steward at conduct@tensorflow.org. The Project Steward will determine whether the Code of Conduct was violated, and will issue an appropriate sanction, possibly including a written warning or expulsion from the project, project sponsored spaces, or project forums. We ask that you make a good-faith effort to resolve your conflict via the conflict resolution policy before submitting a report.
Violations of the Code of Conduct can be reported to TensorFlow's Project Stewards, Edd Wilder-James (ewj@google.com) and Sarah Novotny (sarahnovotny@google.com). The Project Steward will determine whether the Code of Conduct was violated, and will issue an appropriate sanction, possibly including a written warning or expulsion from the project, project sponsored spaces, or project forums. We ask that you make a good-faith effort to resolve your conflict via the conflict resolution policy before submitting a report.
Violations of the Code of Conduct can occur in any setting, even those unrelated to the project. We will only consider complaints about conduct that has occurred within one year of the report.
## Enforcement
If the Project Steward receives a report alleging a violation of the Code of Conduct, the Project Steward will notify the accused of the report, and provide them an opportunity to discuss the report before a sanction is issued. The Project Steward will do their utmost to keep the reporter anonymous. If the act is ongoing (such as someone engaging in harassment), or involves a threat to anyone's safety (e.g. threats of violence), the Project Steward may issue sanctions without notice.
If the Project Stewards receive a report alleging a violation of the Code of Conduct, the Project Stewards will notify the accused of the report, and provide them an opportunity to discuss the report before a sanction is issued. The Project Stewards will do their utmost to keep the reporter anonymous. If the act is ongoing (such as someone engaging in harassment), or involves a threat to anyone's safety (e.g. threats of violence), the Project Stewards may issue sanctions without notice.
## Attribution

View File

@ -73,11 +73,11 @@ $ python
## For more information
* [TensorFlow website](https://www.tensorflow.org)
* [TensorFlow Website](https://www.tensorflow.org)
* [TensorFlow White Papers](https://www.tensorflow.org/about/bib)
* [TensorFlow Model Zoo](https://github.com/tensorflow/models)
* [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730)
* [TensorFlow course at Stanford](https://web.stanford.edu/class/cs20si)
* [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si)
Learn more about the TensorFlow community at the [community page of tensorflow.org](https://www.tensorflow.org/community) for a few ways to participate.

View File

@ -43,6 +43,7 @@ _DEFAULT_CUDA_PATH_WIN = ('C:/Program Files/NVIDIA GPU Computing '
'Toolkit/CUDA/v%s' % _DEFAULT_CUDA_VERSION)
_TF_OPENCL_VERSION = '1.2'
_DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp'
_DEFAULT_TRISYCL_INCLUDE_DIR = '/usr/local/triSYCL/include'
def is_windows():
@ -636,7 +637,7 @@ def set_tf_cuda_version(environ_cp):
write_action_env_to_bazelrc('TF_CUDA_VERSION', tf_cuda_version)
def set_tf_cunn_version(environ_cp):
def set_tf_cudnn_version(environ_cp):
"""Set CUDNN_INSTALL_PATH and TF_CUDNN_VERSION."""
ask_cudnn_version = (
'Please specify the cuDNN version you want to use. '
@ -883,6 +884,28 @@ def set_computecpp_toolkit_path(environ_cp):
computecpp_toolkit_path)
def set_trisycl_include_dir(environ_cp):
"""Set TRISYCL_INCLUDE_DIR."""
ask_trisycl_include_dir = ('Please specify the location of the triSYCL '
'include directory. (Use --config=sycl_trisycl '
'when building with Bazel) '
'[Default is %s]: ') % (
_DEFAULT_TRISYCL_INCLUDE_DIR)
while True:
trisycl_include_dir = get_from_env_or_user_or_default(
environ_cp, 'TRISYCL_INCLUDE_DIR', ask_trisycl_include_dir,
_DEFAULT_TRISYCL_INCLUDE_DIR)
if os.path.exists(trisycl_include_dir):
break
print('Invalid triSYCL include directory, %s cannot be found' %
(trisycl_include_dir))
# Set TRISYCL_INCLUDE_DIR
environ_cp['TRISYCL_INCLUDE_DIR'] = trisycl_include_dir
write_action_env_to_bazelrc('TRISYCL_INCLUDE_DIR', trisycl_include_dir)
def set_mpi_home(environ_cp):
"""Set MPI_HOME."""
default_mpi_home = which('mpirun') or which('mpiexec') or ''
@ -997,6 +1020,8 @@ def main():
environ_cp['TF_NEED_GCP'] = '0'
environ_cp['TF_NEED_HDFS'] = '0'
environ_cp['TF_NEED_JEMALLOC'] = '0'
environ_cp['TF_NEED_OPENCL_SYCL'] = '0'
environ_cp['TF_NEED_COMPUTECPP'] = '0'
environ_cp['TF_NEED_OPENCL'] = '0'
environ_cp['TF_CUDA_CLANG'] = '0'
@ -1018,17 +1043,21 @@ def main():
set_build_var(environ_cp, 'TF_NEED_VERBS', 'VERBS', 'with_verbs_support',
False, 'verbs')
set_action_env_var(environ_cp, 'TF_NEED_OPENCL', 'OpenCL', False)
if environ_cp.get('TF_NEED_OPENCL') == '1':
set_action_env_var(environ_cp, 'TF_NEED_OPENCL_SYCL', 'OpenCL SYCL', False)
if environ_cp.get('TF_NEED_OPENCL_SYCL') == '1':
set_host_cxx_compiler(environ_cp)
set_host_c_compiler(environ_cp)
set_computecpp_toolkit_path(environ_cp)
set_action_env_var(environ_cp, 'TF_NEED_COMPUTECPP', 'ComputeCPP', True)
if environ_cp.get('TF_NEED_COMPUTECPP') == '1':
set_computecpp_toolkit_path(environ_cp)
else:
set_trisycl_include_dir(environ_cp)
set_action_env_var(environ_cp, 'TF_NEED_CUDA', 'CUDA', False)
if (environ_cp.get('TF_NEED_CUDA') == '1' and
'TF_CUDA_CONFIG_REPO' not in environ_cp):
set_tf_cuda_version(environ_cp)
set_tf_cunn_version(environ_cp)
set_tf_cudnn_version(environ_cp)
set_tf_cuda_compute_capabilities(environ_cp)
set_tf_cuda_clang(environ_cp)
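In the new main() flow, TF_NEED_OPENCL_SYCL gates SYCL support as a whole, and TF_NEED_COMPUTECPP then selects between the ComputeCpp toolkit path and the triSYCL include directory (built with --config=sycl_trisycl). A minimal, self-contained sketch of that branch with the prompting and .bazelrc writing stripped out (illustrative only, not the actual configure.py code):

def sycl_backend(environ):
  """Which SYCL backend the new configure branch would set up, or None."""
  if environ.get('TF_NEED_OPENCL_SYCL') != '1':
    return None
  # In the real flow the host C/C++ compilers are configured first
  # (set_host_cxx_compiler, set_host_c_compiler); then the backend is chosen.
  if environ.get('TF_NEED_COMPUTECPP') == '1':
    return 'computecpp'  # set_computecpp_toolkit_path -> COMPUTECPP_TOOLKIT_PATH
  return 'trisycl'       # set_trisycl_include_dir -> TRISYCL_INCLUDE_DIR

assert sycl_backend({'TF_NEED_OPENCL_SYCL': '1', 'TF_NEED_COMPUTECPP': '1'}) == 'computecpp'
assert sycl_backend({'TF_NEED_OPENCL_SYCL': '1', 'TF_NEED_COMPUTECPP': '0'}) == 'trisycl'
assert sycl_backend({'TF_NEED_OPENCL_SYCL': '0'}) is None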

View File

@ -54,6 +54,15 @@ config_setting(
visibility = ["//visibility:public"],
)
config_setting(
name = "raspberry_pi_armeabi",
values = {
"crosstool_top": "@local_config_arm_compiler//:toolchain",
"cpu": "armeabi",
},
visibility = ["//visibility:public"],
)
config_setting(
name = "android_arm",
values = {
@ -760,6 +769,13 @@ tf_cc_shared_object(
],
)
exports_files(
[
"tf_version_script.lds",
"tf_exported_symbols.lds",
],
)
py_library(
name = "tensorflow_py",
srcs = ["__init__.py"],

View File

@ -119,7 +119,7 @@ def tf_library(name, graph, config,
out_nodes_file,
] + freeze_saver_srcs,
outs=[freeze_file],
cmd=("$(location //tensorflow/python/tools:freeze_graph)" +
cmd=("$(location @org_tensorflow//tensorflow/python/tools:freeze_graph)" +
freeze_args),
tools=["@org_tensorflow//tensorflow/python/tools:freeze_graph"],
tags=tags,
@ -130,6 +130,10 @@ def tf_library(name, graph, config,
header_file = name + ".h"
object_file = name + ".o"
ep = ("__" + PACKAGE_NAME + "__" + name).replace("/", "_")
if type(tfcompile_flags) == type(""):
flags = tfcompile_flags
else:
flags = " ".join(["'" + arg.replace("'", "'\\''") + "'" for arg in (tfcompile_flags or [])])
native.genrule(
name=("gen_" + name),
srcs=[
@ -148,7 +152,7 @@ def tf_library(name, graph, config,
" --target_triple=" + target_llvm_triple() +
" --out_header=$(@D)/" + header_file +
" --out_object=$(@D)/" + object_file +
" " + (tfcompile_flags or "")),
" " + flags),
tools=[tfcompile_tool],
visibility=visibility,
testonly=testonly,
@ -185,7 +189,7 @@ def tf_library(name, graph, config,
" --cpp_class=" + cpp_class +
" --target_triple=" + target_llvm_triple() +
" --out_session_module=$(@D)/" + session_module_pb +
" " + (tfcompile_flags or "")),
" " + flags),
tools=[tfcompile_tool],
visibility=visibility,
testonly=testonly,
@ -195,8 +199,7 @@ def tf_library(name, graph, config,
# The cc_library rule packaging up the header and object file, and needed
# kernel implementations.
need_xla_data_proto = (tfcompile_flags and
tfcompile_flags.find("--gen_program_shape") != -1)
need_xla_data_proto = (flags and flags.find("--gen_program_shape") != -1)
native.cc_library(
name=name,
srcs=[object_file],
@ -253,7 +256,7 @@ def tf_library(name, graph, config,
],
outs=[test_file],
cmd=("sed " + sed_replace +
" $(location //tensorflow/compiler/aot:test.cc) " +
" $(location @org_tensorflow//tensorflow/compiler/aot:test.cc) " +
"> $(OUTS)"),
tags=tags,
)
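With this change tf_library() accepts tfcompile_flags either as the old single string or as a list; list elements are single-quoted for the genrule's shell command, with embedded quotes escaped. A small plain-Python sketch of that normalization and the string it produces (the Starlark above uses type(tfcompile_flags) == type("") for the same test; the second flag below is made up just to exercise the escaping):

def quote_tfcompile_flags(tfcompile_flags):
  # A string passes through unchanged; a list is shell-quoted element by element.
  if isinstance(tfcompile_flags, str):
    return tfcompile_flags
  return " ".join("'" + arg.replace("'", "'\\''") + "'"
                  for arg in (tfcompile_flags or []))

print(quote_tfcompile_flags("--xla_cpu_multi_thread_eigen=false"))
# --xla_cpu_multi_thread_eigen=false
print(quote_tfcompile_flags(["--xla_cpu_multi_thread_eigen=false", "--prefix=it's"]))
# '--xla_cpu_multi_thread_eigen=false' '--prefix=it'\''s'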

View File

@ -672,7 +672,7 @@ tf_library(
cpp_class = "LSTMLayerInference",
graph = "lstm_layer_inference.pbtxt",
tags = ["manual"],
tfcompile_flags = "--xla_cpu_multi_thread_eigen=false",
tfcompile_flags = ["--xla_cpu_multi_thread_eigen=false"],
)
# -----------------------------------------------------------------------------

View File

@ -36,7 +36,7 @@ class FusedBatchNormTest(XLATestCase):
x_square = x * x
x_square_sum = np.sum(x_square, (0, 1, 2))
x_sum = np.sum(x, axis=(0, 1, 2))
element_count = np.size(x) / int(np.shape(x)[0])
element_count = np.size(x) / int(np.shape(x)[-1])
mean = x_sum / element_count
var = x_square_sum / element_count - mean * mean
normalized = (x - mean) / np.sqrt(var + epsilon)
@ -64,8 +64,9 @@ class FusedBatchNormTest(XLATestCase):
return grad_x, grad_scale, grad_offset
def testInference(self):
x_shape = [2, 2, 6, 2]
scale_shape = [2]
channel = 3
x_shape = [2, 2, 6, channel]
scale_shape = [channel]
x_val = np.random.random_sample(x_shape).astype(np.float32)
scale_val = np.random.random_sample(scale_shape).astype(np.float32)
@ -74,8 +75,9 @@ class FusedBatchNormTest(XLATestCase):
with self.test_session() as sess, self.test_scope():
# To avoid constant folding
t_val = array_ops.placeholder(np.float32, shape=x_shape, name="x")
scale = array_ops.placeholder(np.float32, shape=[2], name="scale")
offset = array_ops.placeholder(np.float32, shape=[2], name="offset")
scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale")
offset = array_ops.placeholder(
np.float32, shape=scale_shape, name="offset")
epsilon = 0.001
y_ref, mean_ref, var_ref = self._reference_training(
x_val, scale_val, offset_val, epsilon, data_format)
@ -97,8 +99,9 @@ class FusedBatchNormTest(XLATestCase):
self.assertAllClose(y_val, y_ref, atol=1e-3)
def _testLearning(self, use_gradient_checker):
x_shape = [2, 2, 6, 2]
scale_shape = [2]
channel = 3
x_shape = [2, 2, 6, channel]
scale_shape = [channel]
x_val = np.random.random_sample(x_shape).astype(np.float32)
scale_val = np.random.random_sample(scale_shape).astype(np.float32)
@ -109,8 +112,9 @@ class FusedBatchNormTest(XLATestCase):
with self.test_session() as sess, self.test_scope():
# To avoid constant folding
t_val = array_ops.placeholder(np.float32, shape=x_shape, name="x")
scale = array_ops.placeholder(np.float32, shape=[2], name="scale")
offset = array_ops.placeholder(np.float32, shape=[2], name="offset")
scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale")
offset = array_ops.placeholder(
np.float32, shape=scale_shape, name="offset")
epsilon = 0.001
y, mean, var = nn.fused_batch_norm(
t_val,
@ -154,8 +158,9 @@ class FusedBatchNormTest(XLATestCase):
def testGradient(self):
# TODO(b/64270657): Use gradient_checker here in addition to comparing with
# this reference implementation.
x_shape = [2, 2, 6, 2]
scale_shape = [2]
channel = 3
x_shape = [2, 2, 6, channel]
scale_shape = [channel]
grad_val = np.random.random_sample(x_shape).astype(np.float32)
x_val = np.random.random_sample(x_shape).astype(np.float32)
scale_val = np.random.random_sample(scale_shape).astype(np.float32)
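The reference statistics here are per channel of an NHWC tensor: sums are taken over the N, H and W axes, so the element count is np.size(x) divided by the channel dimension np.shape(x)[-1] rather than by the batch dimension the old code used, and the channel count is bumped from 2 to 3 so that a wrong axis no longer happens to yield the same count. A short standalone numpy sketch of the corrected per-channel mean/variance (helper name is illustrative, not from the test):

import numpy as np

def per_channel_stats(x, epsilon=0.001):
  # x is NHWC; each channel's statistics use size(x) / num_channels elements.
  x = np.asarray(x, dtype=np.float32)
  element_count = np.size(x) / int(np.shape(x)[-1])
  x_sum = np.sum(x, axis=(0, 1, 2))
  x_square_sum = np.sum(x * x, axis=(0, 1, 2))
  mean = x_sum / element_count
  var = x_square_sum / element_count - mean * mean
  normalized = (x - mean) / np.sqrt(var + epsilon)
  return mean, var, normalized

x = np.random.random_sample([2, 2, 6, 3]).astype(np.float32)  # channel = 3
mean, var, _ = per_channel_stats(x)
np.testing.assert_allclose(mean, x.reshape(-1, 3).mean(axis=0), rtol=1e-4)
np.testing.assert_allclose(var, x.reshape(-1, 3).var(axis=0), rtol=1e-3)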

View File

@ -222,7 +222,7 @@ class HloInstruction {
tensorflow::gtl::ArraySlice<int64> strides);
// Creates a slice instruction, where the first operand is sliced by
// start indices specified in the second operand, and by size specfied in
// start indices specified in the second operand, and by size specified in
// 'slice_sizes'.
static std::unique_ptr<HloInstruction> CreateDynamicSlice(
const Shape& shape, HloInstruction* operand,

View File

@ -792,8 +792,8 @@ TEST_F(HloInstructionTest, ComplexFusionOp) {
// sub = Sub(mul, clamp)
// tuple = Tuple({sub, sub, mul, C1})
//
// Notable complexities are repeated operands in a same instruction, different
// shapes, use of value in different expressions.
// Notable complexities are repeated operands in the same instruction,
// different shapes, use of value in different expressions.
auto c1 = builder.AddInstruction(
HloInstruction::CreateConstant(Literal::CreateR0<float>(1.1f)));
auto c2 = builder.AddInstruction(

View File

@ -37,7 +37,7 @@ set_target_properties(lib_tf PROPERTIES IMPORTED_LOCATION
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIS_SLIM_BUILD \
-std=c++11 -fno-rtti -fno-exceptions \
-O2 -Wno-narrowing -fomit-frame-pointer \
-mfpu=neon -mfloat-abi=softfp -fPIE \
-mfpu=neon -mfloat-abi=softfp -fPIE -fPIC \
-ftemplate-depth=900 \
-DGOOGLE_PROTOBUF_NO_RTTI \
-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER")

View File

@ -82,6 +82,7 @@ cc_library(
tf_cc_test(
name = "adaptive_shared_batch_scheduler_test",
srcs = ["adaptive_shared_batch_scheduler_test.cc"],
tags = ["manual"], # b/69013768
deps = [
":adaptive_shared_batch_scheduler",
"//tensorflow/contrib/batching/test_util:fake_clock_env",

View File

@ -461,7 +461,7 @@ class BatchResource : public ResourceBase {
return Status::OK();
}
// Looks up the batcher queue for 'queue_name'. If it did't previously exist,
// Looks up the batcher queue for 'queue_name'. If it didn't previously exist,
// creates it.
Status LookupOrCreateBatcherQueue(const string& queue_name,
BatcherQueue** queue) {

View File

@ -759,7 +759,7 @@ class CsiszarVIMCOTest(test.TestCase):
def _csiszar_vimco_helper_grad(self, logu, delta):
"""Finite difference approximation of `grad(csiszar_vimco_helper, logu)`."""
# This code actually estimates the sum of the Jacobiab because thats what
# This code actually estimates the sum of the Jacobiab because that's what
# TF's `gradients` does.
np_log_avg_u1, np_log_sooavg_u1 = self._csiszar_vimco_helper(
logu[..., None] + np.diag([delta]*len(logu)))

View File

@ -34,13 +34,41 @@ option(tensorflow_BUILD_SHARED_LIB "Build TensorFlow as a shared library" OFF)
option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for the native processor architecture (if available)" ON)
option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions")
option(tensorflow_ENABLE_SNAPPY_SUPPORT "Enable SNAPPY compression support" ON)
if(HAIKU)
option(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE "Enable PIE support" OFF)
else()
option(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE "Enable PIE support" ON)
endif()
if (NOT WIN32)
# Threads: defines CMAKE_THREAD_LIBS_INIT and adds -pthread compile option
# for targets that link ${CMAKE_THREAD_LIBS_INIT}.
find_package (Threads)
option(tensorflow_PATH_STATIC_LIB "Additional library search path for libcudnn_static.a, libnccl_static.a, libculibos.a" /usr/local/cuda/lib64/)
option(tensorflow_CUDNN_INCLUDE "cudnn.h header install path" /usr/include/)
if (NOT tensorflow_CUDNN_INCLUDE)
# option's default value is OFF. Fill it with real default values
set(tensorflow_CUDNN_INCLUDE /usr/include)
endif (NOT tensorflow_CUDNN_INCLUDE)
option(tensorflow_PATH_CUDNN_STATIC_LIB "Override PATH_STATIC_LIB for libcudnn_static.a" ${tensorflow_PATH_STATIC_LIB})
option(tensorflow_PATH_NCCL_STATIC_LIB "Override PATH_STATIC_LIB for libnccl_static.a" ${tensorflow_PATH_STATIC_LIB})
option(tensorflow_CUDA_LIBRARY_PATH "Designate the default CUDA library paths" /usr/local/cuda/lib64)
if (NOT tensorflow_CUDA_LIBRARY_PATH)
# option's default value is OFF. Fill it with real default values
set(tensorflow_CUDA_LIBRARY_PATH /usr/local/cuda/lib64)
endif (NOT tensorflow_CUDA_LIBRARY_PATH)
endif()
if (WIN32)
set(BOOL_WIN32 ON)
else (WIN32)
set(BOOL_WIN32 OFF)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
endif (WIN32)
# [CLEANUP] Remove when done
# For debugging
function(SHOW_VARIABLES)
@ -58,7 +86,12 @@ set (DOWNLOAD_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/downloads"
CACHE PATH "Location where external projects will be downloaded.")
mark_as_advanced(DOWNLOAD_LOCATION)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
if (tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
else()
set(CMAKE_POSITION_INDEPENDENT_CODE OFF)
endif()
add_definitions(-DEIGEN_AVOID_STL_ARRAY)
if(WIN32)
add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC)
@ -217,20 +250,35 @@ endif()
if(UNIX)
list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS})
endif()
if(HAIKU)
list(APPEND tensorflow_EXTERNAL_LIBRARIES network)
endif()
if (tensorflow_ENABLE_GPU)
if (NOT WIN32)
# Default install paths for cuda libraries in Linux
# In some Linux distros, find_package(CUDA) seems to require CMAKE_LIBRARY_PATH to include cuda-lib paths
list(APPEND CMAKE_LIBRARY_PATH "${tensorflow_CUDA_LIBRARY_PATH}")
list(APPEND CMAKE_LIBRARY_PATH "${tensorflow_CUDA_LIBRARY_PATH}/stubs")
endif (NOT WIN32)
find_package(CUDA 8.0 REQUIRED)
# by default we assume compute cabability 3.5 and 5.2. If you change this change it in
# CUDA_NVCC_FLAGS and cuda_config.h below
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_30,code=\"sm_30,compute_30\";-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-ftz=true) # Flush denormals to zero
set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include)
include_directories(${CUDA_INCLUDE})
if (WIN32)
find_package(CUDA 8.0 REQUIRED)
# by default we assume compute cabability 3.5 and 5.2. If you change this change it in
# CUDA_NVCC_FLAGS and cuda_config.h below
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_30,code=\"sm_30,compute_30\";-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-ftz=true) # Flush denormals to zero
set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include)
include_directories(${CUDA_INCLUDE})
add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.0,3.5,5.2)
else (WIN32)
# Without these double quotes, cmake in Linux makes it "-DTF_EXTRA_CUDA_CAPABILITIES=3.0, -D3.5, -D5.2" for cc, which incurs build breaks
add_definitions(-DGOOGLE_CUDA=1 -D"TF_EXTRA_CUDA_CAPABILITIES=3.0,3.5,5.2")
endif (WIN32)
if (WIN32)
# add cudnn
if(NOT CUDNN_HOME)
set(CUDNN_HOME ${CUDA_TOOLKIT_TARGET_DIR})
@ -238,18 +286,48 @@ if (tensorflow_ENABLE_GPU)
include_directories(${CUDNN_HOME})
set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_CUFFT_LIBRARIES}
${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${CUDNN_HOME}/lib/x64/cudnn.lib)
else (WIN32)
set(CUDNN_INCLUDE "${tensorflow_CUDNN_INCLUDE}")
# create cuda_config.h
FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
"#ifndef CUDA_CUDA_CONFIG_H_\n"
"#define CUDA_CUDA_CONFIG_H_\n"
"#define TF_CUDA_CAPABILITIES CudaVersion(\"3.0\"),CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
"#define TF_CUDA_VERSION \"64_80\"\n"
"#define TF_CUDNN_VERSION \"64_6\"\n"
"#define TF_CUDA_TOOLKIT_PATH \"${CUDA_TOOLKIT_ROOT_DIR}\"\n"
"#endif // CUDA_CUDA_CONFIG_H_\n"
)
find_library(nccl_STATIC_LIBRARY NAMES libnccl_static.a PATHS ${tensorflow_PATH_NCCL_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
if (NOT nccl_STATIC_LIBRARY)
message(FATAL_ERROR "NCCL is required for GPU-build")
else (NOT nccl_STATIC_LIBRARY)
message("nccl-static: ${nccl_STATIC_LIBRARY}")
# something like /usr/lib64/libnccl_static.a
endif (NOT nccl_STATIC_LIBRARY)
find_library(cudnn_STATIC_LIBRARY NAMES libcudnn_static.a PATHS ${tensorflow_PATH_CUDNN_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
if (NOT cudnn_STATIC_LIBRARY)
message(FATAL_ERROR "CUDNN is required for GPU-build")
else (NOT cudnn_STATIC_LIBRARY)
message("cudnn-static: ${cudnn_STATIC_LIBRARY}")
endif (NOT cudnn_STATIC_LIBRARY)
find_library(culibos_STATIC_LIBRARY NAMES libculibos.a PATHS ${tensorflow_PATH_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
if (NOT culibos_STATIC_LIBRARY)
message(FATAL_ERROR "CULIBOS is required for GPU-build")
else (NOT culibos_STATIC_LIBRARY)
message("culibos-static: ${culibos_STATIC_LIBRARY}")
endif (NOT culibos_STATIC_LIBRARY)
include_directories(${CUDNN_INCLUDE})
set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_CUFFT_LIBRARIES}
${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${cudnn_STATIC_LIBRARY} ${culibos_STATIC_LIBRARY} ${nccl_STATIC_LIBRARY})
endif (WIN32)
# create cuda_config.h
FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
"#ifndef CUDA_CUDA_CONFIG_H_\n"
"#define CUDA_CUDA_CONFIG_H_\n"
"#define TF_CUDA_CAPABILITIES CudaVersion(\"3.0\"),CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
"#define TF_CUDA_VERSION \"64_80\"\n"
"#define TF_CUDNN_VERSION \"64_6\"\n"
"#define TF_CUDA_TOOLKIT_PATH \"${CUDA_TOOLKIT_ROOT_DIR}\"\n"
"#endif // CUDA_CUDA_CONFIG_H_\n"
)
if (WIN32)
# tf assumes in various places header files to be in cuda/include. On windows the cuda sdk
# installs them under cuda/version/include and to avoid that we need to change tf we copy a
# few files to cuda/include
@ -261,12 +339,25 @@ if (tensorflow_ENABLE_GPU)
${CUDA_TOOLKIT_TARGET_DIR}/include/cusolverDn.h
DESTINATION ${tensorflow_source_dir}/third_party/gpus/cuda/include
)
include_directories(${tensorflow_source_dir}/third_party/gpus)
# add cuda libraries to tensorflow_EXTERNAL_LIBRARIES
list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
else(WIN32)
# Linux has slightly differnt install paths than Windows
FILE(COPY
${CUDA_TOOLKIT_TARGET_DIR}/include/cuda.h ${CUDA_TOOLKIT_TARGET_DIR}/include/cuComplex.h
${CUDA_TOOLKIT_TARGET_DIR}/include/cublas_v2.h ${CUDNN_INCLUDE}/cudnn.h
${CUDA_TOOLKIT_TARGET_DIR}/include/cufft.h ${CUDA_TOOLKIT_TARGET_DIR}/include/curand.h
${CUDA_TOOLKIT_TARGET_DIR}/include/cuda_runtime_api.h
${CUDA_TOOLKIT_TARGET_DIR}/include/cusolverDn.h
DESTINATION ${tensorflow_source_dir}/third_party/gpus/cuda/include
)
endif(WIN32)
# NOTE(mrry): Update these flags when the version of CUDA or cuDNN used
# in the default build is upgraded.
include_directories(${tensorflow_source_dir}/third_party/gpus)
# add cuda libraries to tensorflow_EXTERNAL_LIBRARIES
list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
# NOTE(mrry): Update these flags when the version of CUDA or cuDNN used
# in the default build is upgraded.
if(WIN32)
set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value
msvcp_dll_name=msvcp140.dll
cudart_dll_name=cudart64_80.dll
@ -275,7 +366,9 @@ if (tensorflow_ENABLE_GPU)
cudnn_dll_name=cudnn64_6.dll
cudnn_version_number=6)
else(WIN32)
message(FATAL_ERROR "CMake GPU build is currently only supported on Windows.")
set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value
cuda_version_number=8.0
cudnn_version_number=6)
endif(WIN32)
else(tensorflow_ENABLE_GPU)
set(tensorflow_BUILD_INFO_FLAGS --build_config cpu --key_value
@ -293,9 +386,7 @@ include(tf_core_framework.cmake)
# NOTE: Disabled until issue #3996 is fixed.
# include(tf_stream_executor.cmake)
if (tensorflow_ENABLE_GPU)
if (WIN32)
include(tf_stream_executor.cmake)
endif()
endif()
include(tf_core_cpu.cmake)

View File

@ -39,8 +39,12 @@ ExternalProject_Add(boringssl
# BUILD_IN_SOURCE 1
INSTALL_COMMAND ""
CMAKE_CACHE_ARGS
if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
else()
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
)

View File

@ -42,8 +42,12 @@ ExternalProject_Add(jsoncpp
BUILD_IN_SOURCE 1
INSTALL_COMMAND ""
CMAKE_CACHE_ARGS
if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
else()
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
)

View File

@ -29,10 +29,14 @@ ExternalProject_Add(lmdb
INSTALL_DIR ${lmdb_INSTALL}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
else()
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_INSTALL_PREFIX:STRING=${lmdb_INSTALL}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
)
if(WIN32)

View File

@ -41,10 +41,14 @@ ExternalProject_Add(png
INSTALL_DIR ${png_INSTALL}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
else()
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_INSTALL_PREFIX:STRING=${png_INSTALL}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DZLIB_ROOT:STRING=${ZLIB_INSTALL}
)

View File

@ -44,8 +44,12 @@ ExternalProject_Add(protobuf
${PROTOBUF_ADDITIONAL_CMAKE_OPTIONS}
INSTALL_COMMAND ""
CMAKE_CACHE_ARGS
if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
else()
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DZLIB_ROOT:STRING=${ZLIB_INSTALL}
)

View File

@ -38,7 +38,11 @@ ExternalProject_Add(re2
BUILD_IN_SOURCE 1
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
else()
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_INSTALL_PREFIX:STRING=${re2_INSTALL}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
)
)

View File

@ -40,11 +40,15 @@ ExternalProject_Add(snappy
LOG_CONFIGURE ON
LOG_BUILD ON
CMAKE_CACHE_ARGS
if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
else()
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DSNAPPY_BUILD_TESTS:BOOL=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
)
# actually enables snappy in the source code
add_definitions(-DTF_USE_SNAPPY)
add_definitions(-DTF_USE_SNAPPY)

View File

@ -53,9 +53,13 @@ else()
INSTALL_DIR ${sqlite_INSTALL}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
else()
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_INSTALL_PREFIX:STRING=${sqlite_INSTALL}
)

View File

@ -42,9 +42,13 @@ ExternalProject_Add(zlib
BUILD_IN_SOURCE 1
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
else()
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_INSTALL_PREFIX:STRING=${ZLIB_INSTALL}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
)
# put zlib includes in the directory where they are expected

View File

@ -148,7 +148,11 @@ list(REMOVE_ITEM tf_cc_srcs ${tf_cc_test_srcs})
add_library(tf_cc OBJECT ${tf_cc_srcs})
add_dependencies(tf_cc tf_cc_framework tf_cc_ops)
set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib")
if (WIN32)
set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib")
else (WIN32)
set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so")
endif (WIN32)
add_custom_target(tf_extension_ops)
function(AddUserOps)
@ -164,15 +168,13 @@ function(AddUserOps)
# create shared library from source and cuda obj
add_library(${_AT_TARGET} SHARED ${_AT_SOURCES} ${gpu_lib})
target_link_libraries(${_AT_TARGET} ${pywrap_tensorflow_lib})
if(WIN32)
if (tensorflow_ENABLE_GPU AND _AT_GPUSOURCES)
# some ops call out to cuda directly; need to link libs for the cuda dlls
target_link_libraries(${_AT_TARGET} ${CUDA_LIBRARIES})
endif()
if (_AT_DISTCOPY)
add_custom_command(TARGET ${_AT_TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${_AT_TARGET}> ${_AT_DISTCOPY}/)
endif()
if (tensorflow_ENABLE_GPU AND _AT_GPUSOURCES)
# some ops call out to cuda directly; need to link libs for the cuda dlls
target_link_libraries(${_AT_TARGET} ${CUDA_LIBRARIES})
endif()
if (_AT_DISTCOPY)
add_custom_command(TARGET ${_AT_TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${_AT_TARGET}> ${_AT_DISTCOPY}/)
endif()
if (_AT_DEPENDS)
add_dependencies(${_AT_TARGET} ${_AT_DEPENDS})
@ -180,9 +182,19 @@ function(AddUserOps)
# make sure TF_COMPILE_LIBRARY is not defined for this target
get_target_property(target_compile_flags ${_AT_TARGET} COMPILE_FLAGS)
if(target_compile_flags STREQUAL "target_compile_flags-NOTFOUND")
set(target_compile_flags "/UTF_COMPILE_LIBRARY")
if (WIN32)
set(target_compile_flags "/UTF_COMPILE_LIBRARY")
else (WIN32)
# gcc uses UTF as default
set(target_compile_flags "-finput-charset=UTF-8")
endif (WIN32)
else()
set(target_compile_flags "${target_compile_flags} /UTF_COMPILE_LIBRARY")
if (WIN32)
set(target_compile_flags "${target_compile_flags} /UTF_COMPILE_LIBRARY")
else (WIN32)
# gcc uses UTF as default
set(target_compile_flags "${target_compile_flags} -finput-charset=UTF-8")
endif (WIN32)
endif()
set_target_properties(${_AT_TARGET} PROPERTIES COMPILE_FLAGS ${target_compile_flags})
add_dependencies(tf_extension_ops ${_AT_TARGET})

View File

@ -179,6 +179,7 @@ file(GLOB_RECURSE tf_core_gpu_kernels_srcs
"${tensorflow_source_dir}/tensorflow/contrib/image/kernels/*.cu.cc"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/*.cu.cc"
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/*.cu.cc"
"${tensorflow_source_dir}/tensorflow/contrib/resampler/kernels/*.cu.cc"
)
if(WIN32 AND tensorflow_ENABLE_GPU)
@ -202,16 +203,16 @@ endif(WIN32 AND tensorflow_ENABLE_GPU)
add_library(tf_core_kernels OBJECT ${tf_core_kernels_srcs})
add_dependencies(tf_core_kernels tf_core_cpu)
if(WIN32)
if (WIN32)
target_compile_options(tf_core_kernels PRIVATE /MP)
if (tensorflow_ENABLE_GPU)
set_source_files_properties(${tf_core_gpu_kernels_srcs} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
set(tf_core_gpu_kernels_lib tf_core_gpu_kernels)
cuda_add_library(${tf_core_gpu_kernels_lib} ${tf_core_gpu_kernels_srcs})
set_target_properties(${tf_core_gpu_kernels_lib}
PROPERTIES DEBUG_POSTFIX ""
COMPILE_FLAGS "${TF_REGULAR_CXX_FLAGS}"
)
add_dependencies(${tf_core_gpu_kernels_lib} tf_core_cpu)
endif()
endif (WIN32)
if (tensorflow_ENABLE_GPU)
set_source_files_properties(${tf_core_gpu_kernels_srcs} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
set(tf_core_gpu_kernels_lib tf_core_gpu_kernels)
cuda_add_library(${tf_core_gpu_kernels_lib} ${tf_core_gpu_kernels_srcs})
set_target_properties(${tf_core_gpu_kernels_lib}
PROPERTIES DEBUG_POSTFIX ""
COMPILE_FLAGS "${TF_REGULAR_CXX_FLAGS}"
)
add_dependencies(${tf_core_gpu_kernels_lib} tf_core_cpu)
endif()

View File

@ -34,3 +34,8 @@ target_link_libraries(tf_label_image_example PUBLIC
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
install(TARGETS tf_label_image_example
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib)

View File

@ -715,6 +715,9 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
set(require_shape_fn 1)
endif()
get_filename_component(GENERATE_PYTHON_OP_LIB_MKDIRPATH ${GENERATE_PYTHON_OP_LIB_DESTINATION} PATH)
file(MAKE_DIRECTORY ${GENERATE_PYTHON_OP_LIB_MKDIRPATH})
# Create a C++ executable that links in the appropriate op
# registrations and generates Python wrapper code based on the
# registered ops.
@ -743,6 +746,7 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
${GENERATE_PYTHON_OP_LIB_DESTINATION} PARENT_SCOPE)
endfunction()
GENERATE_PYTHON_OP_LIB("audio_ops")
GENERATE_PYTHON_OP_LIB("array_ops")
GENERATE_PYTHON_OP_LIB("bitwise_ops")
GENERATE_PYTHON_OP_LIB("math_ops")
@ -987,7 +991,7 @@ add_library(pywrap_tensorflow_internal SHARED
$<TARGET_OBJECTS:tf_tools_transform_graph_lib>
$<$<BOOL:${tensorflow_ENABLE_GRPC_SUPPORT}>:$<TARGET_OBJECTS:tf_core_distributed_runtime>>
$<TARGET_OBJECTS:tf_core_kernels>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
${pywrap_tensorflow_deffile}
)
@ -1063,25 +1067,23 @@ if(WIN32)
DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rnn/python/ops/)
endif(WIN32)
if(WIN32)
# include contrib/seq2seq as .so
#
set(tf_beam_search_srcs
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h"
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/ops/beam_search_ops.cc"
)
# include contrib/seq2seq as .so
#
set(tf_beam_search_srcs
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h"
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/ops/beam_search_ops.cc"
)
set(tf_beam_search_gpu_srcs
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc"
)
set(tf_beam_search_gpu_srcs
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc"
)
AddUserOps(TARGET _beam_search_ops
SOURCES "${tf_beam_search_srcs}"
GPUSOURCES ${tf_beam_search_gpu_srcs}
DEPENDS pywrap_tensorflow_internal tf_python_ops
DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/seq2seq/python/ops/)
endif(WIN32)
AddUserOps(TARGET _beam_search_ops
SOURCES "${tf_beam_search_srcs}"
GPUSOURCES ${tf_beam_search_gpu_srcs}
DEPENDS pywrap_tensorflow_internal tf_python_ops
DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/seq2seq/python/ops/)
############################################################
# Build a PIP package containing the TensorFlow runtime.

View File

@ -73,7 +73,7 @@ add_library(tensorflow SHARED
$<TARGET_OBJECTS:tf_tools_transform_graph_lib>
$<$<BOOL:${tensorflow_ENABLE_GRPC_SUPPORT}>:$<TARGET_OBJECTS:tf_core_distributed_runtime>>
$<TARGET_OBJECTS:tf_core_kernels>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
${tensorflow_deffile}
)
@ -94,3 +94,46 @@ endif()
if(WIN32)
add_dependencies(tensorflow tensorflow_static)
endif(WIN32)
install(TARGETS tensorflow
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib)
# install necessary headers
# tensorflow headers
install(DIRECTORY ${tensorflow_source_dir}/tensorflow/cc/
DESTINATION include/tensorflow/cc
FILES_MATCHING PATTERN "*.h")
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tensorflow/cc/
DESTINATION include/tensorflow/cc
FILES_MATCHING PATTERN "*.h")
install(DIRECTORY ${tensorflow_source_dir}/tensorflow/core/
DESTINATION include/tensorflow/core
FILES_MATCHING PATTERN "*.h")
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tensorflow/core/
DESTINATION include/tensorflow/core
FILES_MATCHING PATTERN "*.h")
install(DIRECTORY ${tensorflow_source_dir}/tensorflow/stream_executor/
DESTINATION include/tensorflow/stream_executor
FILES_MATCHING PATTERN "*.h")
# google protobuf headers
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src/google/
DESTINATION include/google
FILES_MATCHING PATTERN "*.h")
# nsync headers
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/nsync/
DESTINATION include/external/nsync
FILES_MATCHING PATTERN "*.h")
# Eigen directory
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/Eigen/
DESTINATION include/Eigen)
# external directory
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive/
DESTINATION include/external/eigen_archive)
# third_party eigen directory
install(DIRECTORY ${tensorflow_source_dir}/third_party/eigen3/
DESTINATION include/third_party/eigen3)
# unsupported Eigen directory
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/unsupported/Eigen/
DESTINATION include/unsupported/Eigen)

View File

@ -74,6 +74,9 @@ endif()
#)
#list(REMOVE_ITEM tf_stream_executor_srcs ${tf_stream_executor_test_srcs})
if (NOT WIN32)
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lgomp")
endif (NOT WIN32)
add_library(tf_stream_executor OBJECT ${tf_stream_executor_srcs})
add_dependencies(tf_stream_executor

View File

@ -73,7 +73,7 @@ add_executable(${transform_graph}
$<TARGET_OBJECTS:tf_core_direct_session>
$<TARGET_OBJECTS:tf_tools_transform_graph_lib>
$<TARGET_OBJECTS:tf_core_kernels>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
@ -95,7 +95,7 @@ add_executable(${summarize_graph}
$<TARGET_OBJECTS:tf_core_direct_session>
$<TARGET_OBJECTS:tf_tools_transform_graph_lib>
$<TARGET_OBJECTS:tf_core_kernels>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
@ -117,7 +117,7 @@ add_executable(${compare_graphs}
$<TARGET_OBJECTS:tf_core_direct_session>
$<TARGET_OBJECTS:tf_tools_transform_graph_lib>
$<TARGET_OBJECTS:tf_core_kernels>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
@ -138,7 +138,7 @@ add_executable(${benchmark_model}
$<TARGET_OBJECTS:tf_core_ops>
$<TARGET_OBJECTS:tf_core_direct_session>
$<TARGET_OBJECTS:tf_core_kernels>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
@ -147,3 +147,8 @@ target_link_libraries(${benchmark_model} PUBLIC
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
install(TARGETS ${transform_graph} ${summarize_graph} ${compare_graphs} ${benchmark_model}
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib)

View File

@ -34,3 +34,8 @@ target_link_libraries(tf_tutorials_example_trainer PUBLIC
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
install(TARGETS tf_tutorials_example_trainer
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib)

View File

@ -395,8 +395,8 @@ class CrfDecodeForwardRnnCell(rnn_cell.RNNCell):
scope: Unused variable scope of this cell.
Returns:
backpointers: [batch_size, num_tags], containing backpointers.
new_state: [batch_size, num_tags], containing new score values.
backpointers: A [batch_size, num_tags] matrix of backpointers.
new_state: A [batch_size, num_tags] matrix of new score values.
"""
# For simplicity, in shape comments, denote:
# 'batch_size' by 'B', 'max_seq_len' by 'T' , 'num_tags' by 'O' (output).
@ -436,8 +436,9 @@ class CrfDecodeBackwardRnnCell(rnn_cell.RNNCell):
"""Build the CrfDecodeBackwardRnnCell.
Args:
inputs: [batch_size, num_tags], backpointer of next step (in time order).
state: [batch_size, 1], next position's tag index.
inputs: A [batch_size, num_tags] matrix of
backpointer of next step (in time order).
state: A [batch_size, 1] matrix of tag index of next step.
scope: Unused variable scope of this cell.
Returns:
@ -461,16 +462,16 @@ def crf_decode(potentials, transition_params, sequence_length):
This is a function for tensor.
Args:
potentials: A [batch_size, max_seq_len, num_tags] tensor, matrix of
potentials: A [batch_size, max_seq_len, num_tags] tensor of
unary potentials.
transition_params: A [num_tags, num_tags] tensor, matrix of
transition_params: A [num_tags, num_tags] matrix of
binary potentials.
sequence_length: A [batch_size] tensor, containing sequence lengths.
sequence_length: A [batch_size] vector of true sequence lengths.
Returns:
decode_tags: A [batch_size, max_seq_len] tensor, with dtype tf.int32.
decode_tags: A [batch_size, max_seq_len] matrix, with dtype `tf.int32`.
Contains the highest scoring tag indices.
best_score: A [batch_size] tensor, containing the score of decode_tags.
best_score: A [batch_size] vector, containing the score of `decode_tags`.
"""
# If max_seq_len is 1, we skip the algorithm and simply return the argmax tag
# and the max activation.
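The reworded docstrings pin the shapes down: potentials is a [batch_size, max_seq_len, num_tags] tensor of unary potentials, transition_params a [num_tags, num_tags] matrix of binary potentials, sequence_length a [batch_size] vector of true lengths, and crf_decode returns int32 decode_tags of shape [batch_size, max_seq_len] together with a [batch_size] best_score vector. A hedged usage sketch against those shapes (graph-mode TF 1.x of this era; the random potentials are purely illustrative):

import numpy as np
import tensorflow as tf

batch_size, max_seq_len, num_tags = 4, 7, 5
potentials = tf.constant(
    np.random.randn(batch_size, max_seq_len, num_tags).astype(np.float32))
transition_params = tf.constant(
    np.random.randn(num_tags, num_tags).astype(np.float32))
sequence_length = tf.constant([7, 5, 7, 3])  # true length of each sequence

# Viterbi decode: highest-scoring tag sequence per example plus its score.
decode_tags, best_score = tf.contrib.crf.crf_decode(
    potentials, transition_params, sequence_length)

with tf.Session() as sess:
  tags, scores = sess.run([decode_tags, best_score])
  print(tags.shape, tags.dtype)   # (4, 7) int32
  print(scores.shape)             # (4,)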

View File

@ -11,6 +11,7 @@ py_test(
size = "small",
srcs = ["batch_dataset_op_test.py"],
srcs_version = "PY2AND3",
tags = ["no_pip"],
deps = [
":dataset_serialization_test",
"//tensorflow/contrib/data/python/ops:dataset_ops",
@ -373,6 +374,7 @@ py_test(
size = "small",
srcs = ["sequence_dataset_op_test.py"],
srcs_version = "PY2AND3",
tags = ["no_pip"],
deps = [
":dataset_serialization_test",
"//tensorflow/contrib/data/python/ops:dataset_ops",
@ -450,6 +452,7 @@ py_test(
size = "small",
srcs = ["zip_dataset_op_test.py"],
srcs_version = "PY2AND3",
tags = ["no_pip"],
deps = [
":dataset_serialization_test",
"//tensorflow/contrib/data/python/ops:dataset_ops",
@ -466,7 +469,10 @@ py_test(
size = "small",
srcs = ["prefetching_ops_test.py"],
srcs_version = "PY2AND3",
tags = ["no_oss"], # b/68785503
tags = [
"manual",
"no_oss", # b/68785503
],
deps = [
"//tensorflow/contrib/data/python/ops:prefetching_py",
"//tensorflow/core:protos_all_py",

View File

@ -140,6 +140,23 @@ cuda_py_test(
],
)
cuda_py_test(
name = "cauchy_test",
size = "medium",
srcs = ["python/kernel_tests/cauchy_test.py"],
additional_deps = [
":distributions_py",
"//third_party/py/numpy",
"//tensorflow/python:array_ops",
"//tensorflow/python:client_testlib",
"//tensorflow/python:framework_for_generated_wrappers",
"//tensorflow/python:framework_test_lib",
"//tensorflow/python:gradients",
"//tensorflow/python:platform_test",
"//tensorflow/python:variables",
],
)
cuda_py_test(
name = "chi2_test",
srcs = ["python/kernel_tests/chi2_test.py"],

View File

@ -24,6 +24,7 @@ from __future__ import print_function
from tensorflow.contrib.distributions.python.ops import bijectors
from tensorflow.contrib.distributions.python.ops.binomial import *
from tensorflow.contrib.distributions.python.ops.cauchy import *
from tensorflow.contrib.distributions.python.ops.chi2 import *
from tensorflow.contrib.distributions.python.ops.conditional_distribution import *
from tensorflow.contrib.distributions.python.ops.conditional_transformed_distribution import *
@ -83,6 +84,7 @@ from tensorflow.python.util.all_util import remove_undocumented
_allowed_symbols = [
'bijectors',
'Cauchy',
'ConditionalDistribution',
'ConditionalTransformedDistribution',
'FULLY_REPARAMETERIZED',

View File

@ -0,0 +1,438 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Cauchy."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import importlib
import numpy as np
from tensorflow.contrib.distributions.python.ops import cauchy as cauchy_lib
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gradients_impl
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
from tensorflow.python.platform import tf_logging
def try_import(name): # pylint: disable=invalid-name
module = None
try:
module = importlib.import_module(name)
except ImportError as e:
tf_logging.warning("Could not import %s: %s" % (name, str(e)))
return module
stats = try_import("scipy.stats")
class CauchyTest(test.TestCase):
def setUp(self):
self._rng = np.random.RandomState(123)
def assertAllFinite(self, tensor):
is_finite = np.isfinite(tensor.eval())
all_true = np.ones_like(is_finite, dtype=np.bool)
self.assertAllEqual(all_true, is_finite)
def _testParamShapes(self, sample_shape, expected):
with self.test_session():
param_shapes = cauchy_lib.Cauchy.param_shapes(sample_shape)
loc_shape, scale_shape = param_shapes["loc"], param_shapes["scale"]
self.assertAllEqual(expected, loc_shape.eval())
self.assertAllEqual(expected, scale_shape.eval())
loc = array_ops.zeros(loc_shape)
scale = array_ops.ones(scale_shape)
self.assertAllEqual(expected,
array_ops.shape(
cauchy_lib.Cauchy(loc, scale).sample()).eval())
def _testParamStaticShapes(self, sample_shape, expected):
param_shapes = cauchy_lib.Cauchy.param_static_shapes(sample_shape)
loc_shape, scale_shape = param_shapes["loc"], param_shapes["scale"]
self.assertEqual(expected, loc_shape)
self.assertEqual(expected, scale_shape)
def testParamShapes(self):
sample_shape = [10, 3, 4]
self._testParamShapes(sample_shape, sample_shape)
self._testParamShapes(constant_op.constant(sample_shape), sample_shape)
def testParamStaticShapes(self):
sample_shape = [10, 3, 4]
self._testParamStaticShapes(sample_shape, sample_shape)
self._testParamStaticShapes(
tensor_shape.TensorShape(sample_shape), sample_shape)
def testCauchyLogPDF(self):
with self.test_session():
batch_size = 6
loc = constant_op.constant([3.0] * batch_size)
scale = constant_op.constant([np.sqrt(10.0)] * batch_size)
x = np.array([-2.5, 2.5, 4.0, 0.0, -1.0, 2.0], dtype=np.float32)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
log_pdf = cauchy.log_prob(x)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), log_pdf.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
log_pdf.eval().shape)
self.assertAllEqual(cauchy.batch_shape, log_pdf.shape)
self.assertAllEqual(cauchy.batch_shape, log_pdf.eval().shape)
pdf = cauchy.prob(x)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), pdf.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), pdf.eval().shape)
self.assertAllEqual(cauchy.batch_shape, pdf.shape)
self.assertAllEqual(cauchy.batch_shape, pdf.eval().shape)
if not stats:
return
expected_log_pdf = stats.cauchy(loc.eval(), scale.eval()).logpdf(x)
self.assertAllClose(expected_log_pdf, log_pdf.eval())
self.assertAllClose(np.exp(expected_log_pdf), pdf.eval())
def testCauchyLogPDFMultidimensional(self):
with self.test_session():
batch_size = 6
loc = constant_op.constant([[3.0, -3.0]] * batch_size)
scale = constant_op.constant(
[[np.sqrt(10.0), np.sqrt(15.0)]] * batch_size)
x = np.array([[-2.5, 2.5, 4.0, 0.0, -1.0, 2.0]], dtype=np.float32).T
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
log_pdf = cauchy.log_prob(x)
log_pdf_values = log_pdf.eval()
self.assertEqual(log_pdf.shape, (6, 2))
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), log_pdf.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
log_pdf.eval().shape)
self.assertAllEqual(cauchy.batch_shape, log_pdf.shape)
self.assertAllEqual(cauchy.batch_shape, log_pdf.eval().shape)
pdf = cauchy.prob(x)
pdf_values = pdf.eval()
self.assertEqual(pdf.shape, (6, 2))
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), pdf.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), pdf_values.shape)
self.assertAllEqual(cauchy.batch_shape, pdf.shape)
self.assertAllEqual(cauchy.batch_shape, pdf_values.shape)
if not stats:
return
expected_log_pdf = stats.cauchy(loc.eval(), scale.eval()).logpdf(x)
self.assertAllClose(expected_log_pdf, log_pdf_values)
self.assertAllClose(np.exp(expected_log_pdf), pdf_values)
def testCauchyCDF(self):
with self.test_session():
batch_size = 50
loc = self._rng.randn(batch_size)
scale = self._rng.rand(batch_size) + 1.0
x = np.linspace(-8.0, 8.0, batch_size).astype(np.float64)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
cdf = cauchy.cdf(x)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), cdf.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), cdf.eval().shape)
self.assertAllEqual(cauchy.batch_shape, cdf.shape)
self.assertAllEqual(cauchy.batch_shape, cdf.eval().shape)
if not stats:
return
expected_cdf = stats.cauchy(loc, scale).cdf(x)
self.assertAllClose(expected_cdf, cdf.eval(), atol=0)
def testCauchySurvivalFunction(self):
with self.test_session():
batch_size = 50
loc = self._rng.randn(batch_size)
scale = self._rng.rand(batch_size) + 1.0
x = np.linspace(-8.0, 8.0, batch_size).astype(np.float64)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
sf = cauchy.survival_function(x)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), sf.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), sf.eval().shape)
self.assertAllEqual(cauchy.batch_shape, sf.shape)
self.assertAllEqual(cauchy.batch_shape, sf.eval().shape)
if not stats:
return
expected_sf = stats.cauchy(loc, scale).sf(x)
self.assertAllClose(expected_sf, sf.eval(), atol=0)
def testCauchyLogCDF(self):
with self.test_session():
batch_size = 50
loc = self._rng.randn(batch_size)
scale = self._rng.rand(batch_size) + 1.0
x = np.linspace(-100.0, 10.0, batch_size).astype(np.float64)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
cdf = cauchy.log_cdf(x)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), cdf.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), cdf.eval().shape)
self.assertAllEqual(cauchy.batch_shape, cdf.shape)
self.assertAllEqual(cauchy.batch_shape, cdf.eval().shape)
if not stats:
return
expected_cdf = stats.cauchy(loc, scale).logcdf(x)
self.assertAllClose(expected_cdf, cdf.eval(), atol=0, rtol=1e-5)
def testFiniteGradientAtDifficultPoints(self):
for dtype in [np.float32, np.float64]:
g = ops.Graph()
with g.as_default():
loc = variables.Variable(dtype(0.0))
scale = variables.Variable(dtype(1.0))
dist = cauchy_lib.Cauchy(loc=loc, scale=scale)
x = np.array([-100., -20., -5., 0., 5., 20., 100.]).astype(dtype)
for func in [
dist.cdf, dist.log_cdf, dist.survival_function,
dist.log_survival_function, dist.log_prob, dist.prob
]:
value = func(x)
grads = gradients_impl.gradients(value, [loc, scale])
with self.test_session(graph=g):
variables.global_variables_initializer().run()
self.assertAllFinite(value)
self.assertAllFinite(grads[0])
self.assertAllFinite(grads[1])
def testCauchyLogSurvivalFunction(self):
with self.test_session():
batch_size = 50
loc = self._rng.randn(batch_size)
scale = self._rng.rand(batch_size) + 1.0
x = np.linspace(-10.0, 100.0, batch_size).astype(np.float64)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
sf = cauchy.log_survival_function(x)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), sf.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), sf.eval().shape)
self.assertAllEqual(cauchy.batch_shape, sf.shape)
self.assertAllEqual(cauchy.batch_shape, sf.eval().shape)
if not stats:
return
expected_sf = stats.cauchy(loc, scale).logsf(x)
self.assertAllClose(expected_sf, sf.eval(), atol=0, rtol=1e-5)
def testCauchyEntropy(self):
with self.test_session():
loc = np.array([1.0, 1.0, 1.0])
scale = np.array([[1.0, 2.0, 3.0]])
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
entropy = cauchy.entropy()
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), entropy.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
entropy.eval().shape)
self.assertAllEqual(cauchy.batch_shape, entropy.shape)
self.assertAllEqual(cauchy.batch_shape, entropy.eval().shape)
if not stats:
return
expected_entropy = stats.cauchy(loc, scale[0]).entropy().reshape((1, 3))
self.assertAllClose(expected_entropy, entropy.eval())
def testCauchyMode(self):
with self.test_session():
# Mu will be broadcast to [7, 7, 7].
loc = [7.]
scale = [11., 12., 13.]
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
self.assertAllEqual((3,), cauchy.mode().shape)
self.assertAllEqual([7., 7, 7], cauchy.mode().eval())
def testCauchyMean(self):
with self.test_session():
loc = [1., 2., 3.]
scale = [7.]
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
self.assertAllEqual((3,), cauchy.mean().shape)
self.assertAllEqual([np.nan] * 3, cauchy.mean().eval())
def testCauchyNanMean(self):
with self.test_session():
loc = [1., 2., 3.]
scale = [7.]
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale, allow_nan_stats=False)
with self.assertRaises(ValueError):
cauchy.mean().eval()
def testCauchyQuantile(self):
with self.test_session():
batch_size = 50
loc = self._rng.randn(batch_size)
scale = self._rng.rand(batch_size) + 1.0
p = np.linspace(0.000001, 0.999999, batch_size).astype(np.float64)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
x = cauchy.quantile(p)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), x.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), x.eval().shape)
self.assertAllEqual(cauchy.batch_shape, x.shape)
self.assertAllEqual(cauchy.batch_shape, x.eval().shape)
if not stats:
return
expected_x = stats.cauchy(loc, scale).ppf(p)
self.assertAllClose(expected_x, x.eval(), atol=0.)
def testCauchyVariance(self):
with self.test_session():
# scale will be broadcast to [7, 7, 7]
loc = [1., 2., 3.]
scale = [7.]
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
self.assertAllEqual((3,), cauchy.variance().shape)
self.assertAllEqual([np.nan] * 3, cauchy.variance().eval())
def testCauchyNanVariance(self):
with self.test_session():
# scale will be broadcast to [7, 7, 7]
loc = [1., 2., 3.]
scale = [7.]
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale, allow_nan_stats=False)
with self.assertRaises(ValueError):
cauchy.variance().eval()
def testCauchyStandardDeviation(self):
with self.test_session():
# scale will be broadcast to [7, 7, 7]
loc = [1., 2., 3.]
scale = [7.]
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
self.assertAllEqual((3,), cauchy.stddev().shape)
self.assertAllEqual([np.nan] * 3, cauchy.stddev().eval())
def testCauchyNanStandardDeviation(self):
with self.test_session():
# scale will be broadcast to [7, 7, 7]
loc = [1., 2., 3.]
scale = [7.]
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale, allow_nan_stats=False)
with self.assertRaises(ValueError):
cauchy.stddev().eval()
def testCauchySample(self):
with self.test_session():
loc = constant_op.constant(3.0)
scale = constant_op.constant(1.0)
loc_v = 3.0
n = constant_op.constant(100000)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
samples = cauchy.sample(n)
sample_values = samples.eval()
self.assertEqual(sample_values.shape, (100000,))
self.assertAllClose(np.median(sample_values), loc_v, atol=1e-1)
expected_shape = tensor_shape.TensorShape([n.eval()]).concatenate(
tensor_shape.TensorShape(cauchy.batch_shape_tensor().eval()))
self.assertAllEqual(expected_shape, samples.shape)
self.assertAllEqual(expected_shape, sample_values.shape)
expected_shape = (
tensor_shape.TensorShape([n.eval()]).concatenate(cauchy.batch_shape))
self.assertAllEqual(expected_shape, samples.shape)
self.assertAllEqual(expected_shape, sample_values.shape)
def testCauchySampleMultiDimensional(self):
with self.test_session():
batch_size = 2
loc = constant_op.constant([[3.0, -3.0]] * batch_size)
scale = constant_op.constant([[0.5, 1.0]] * batch_size)
loc_v = [3.0, -3.0]
n = constant_op.constant(100000)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
samples = cauchy.sample(n)
sample_values = samples.eval()
self.assertEqual(samples.shape, (100000, batch_size, 2))
self.assertAllClose(
np.median(sample_values[:, 0, 0]), loc_v[0], atol=1e-1)
self.assertAllClose(
np.median(sample_values[:, 0, 1]), loc_v[1], atol=1e-1)
expected_shape = tensor_shape.TensorShape([n.eval()]).concatenate(
tensor_shape.TensorShape(cauchy.batch_shape_tensor().eval()))
self.assertAllEqual(expected_shape, samples.shape)
self.assertAllEqual(expected_shape, sample_values.shape)
expected_shape = (
tensor_shape.TensorShape([n.eval()]).concatenate(cauchy.batch_shape))
self.assertAllEqual(expected_shape, samples.shape)
self.assertAllEqual(expected_shape, sample_values.shape)
def testCauchyNegativeScaleFails(self):
with self.test_session():
cauchy = cauchy_lib.Cauchy(loc=[1.], scale=[-5.], validate_args=True)
with self.assertRaisesOpError("Condition x > 0 did not hold"):
cauchy.mode().eval()
def testCauchyShape(self):
with self.test_session():
loc = constant_op.constant([-3.0] * 5)
scale = constant_op.constant(11.0)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
self.assertEqual(cauchy.batch_shape_tensor().eval(), [5])
self.assertEqual(cauchy.batch_shape, tensor_shape.TensorShape([5]))
self.assertAllEqual(cauchy.event_shape_tensor().eval(), [])
self.assertEqual(cauchy.event_shape, tensor_shape.TensorShape([]))
def testCauchyShapeWithPlaceholders(self):
loc = array_ops.placeholder(dtype=dtypes.float32)
scale = array_ops.placeholder(dtype=dtypes.float32)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
with self.test_session() as sess:
# batch_shape should be an unknown ("<unknown>") TensorShape.
self.assertEqual(cauchy.batch_shape, tensor_shape.TensorShape(None))
self.assertEqual(cauchy.event_shape, ())
self.assertAllEqual(cauchy.event_shape_tensor().eval(), [])
self.assertAllEqual(
sess.run(
cauchy.batch_shape_tensor(),
feed_dict={
loc: 5.0,
scale: [1.0, 2.0]
}), [2])
if __name__ == "__main__":
test.main()

View File

@ -0,0 +1,219 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""The Cauchy distribution class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.ops.distributions import distribution
__all__ = [
"Cauchy",
]
class Cauchy(distribution.Distribution):
"""The Cauchy distribution with location `loc` and scale `scale`.
#### Mathematical details
The probability density function (pdf) is,
```none
pdf(x; loc, scale) = 1 / (pi * scale * (1 + ((x - loc) / scale)**2))
```
where `loc` is the location, and `scale` is the scale.
The Cauchy distribution is a member of the [location-scale family](
https://en.wikipedia.org/wiki/Location-scale_family), i.e.
```none
X ~ Cauchy(loc=0, scale=1)
Y ~ Cauchy(loc=loc, scale=scale)
Y = loc + scale * X
```
#### Examples
Examples of initialization of one or a batch of distributions.
```python
# Define a single scalar Cauchy distribution.
dist = Cauchy(loc=0., scale=3.)
# Evaluate the cdf at 1, returning a scalar.
dist.cdf(1.)
# Define a batch of two scalar valued Cauchy distributions.
dist = Cauchy(loc=[1, 2.], scale=[11, 22.])
# Evaluate the pdf of the first distribution on 0, and the second on 1.5,
# returning a length two tensor.
dist.prob([0, 1.5])
# Get 3 samples, returning a 3 x 2 tensor.
dist.sample([3])
```
Arguments are broadcast when possible.
```python
# Define a batch of two scalar valued Cauchy distributions.
# Both have median 1, but different scales.
dist = tf.contrib.distributions.Cauchy(loc=1., scale=[11, 22.])
# Evaluate the pdf of both distributions on the same point, 3.0,
# returning a length 2 tensor.
dist.prob(3.0)
```
"""
def __init__(self,
loc,
scale,
validate_args=False,
allow_nan_stats=True,
name="Cauchy"):
"""Construct Cauchy distributions.
The parameters `loc` and `scale` must be shaped in a way that supports
broadcasting (e.g. `loc + scale` is a valid operation).
Args:
loc: Floating point tensor; the modes of the distribution(s).
scale: Floating point tensor; the scales (half-widths) of the distribution(s).
Must contain only positive values.
validate_args: Python `bool`, default `False`. When `True` distribution
parameters are checked for validity despite possibly degrading runtime
performance. When `False` invalid inputs may silently render incorrect
outputs.
allow_nan_stats: Python `bool`, default `True`. When `True`,
statistics (e.g., mean, mode, variance) use the value "`NaN`" to
indicate the result is undefined. When `False`, an exception is raised
if one or more of the statistic's batch members are undefined.
name: Python `str` name prefixed to Ops created by this class.
Raises:
TypeError: if `loc` and `scale` have different `dtype`.
"""
parameters = locals()
with ops.name_scope(name, values=[loc, scale]):
with ops.control_dependencies([check_ops.assert_positive(scale)]
if validate_args else []):
self._loc = array_ops.identity(loc, name="loc")
self._scale = array_ops.identity(scale, name="scale")
check_ops.assert_same_float_dtype([self._loc, self._scale])
super(Cauchy, self).__init__(
dtype=self._scale.dtype,
reparameterization_type=distribution.FULLY_REPARAMETERIZED,
validate_args=validate_args,
allow_nan_stats=allow_nan_stats,
parameters=parameters,
graph_parents=[self._loc, self._scale],
name=name)
@staticmethod
def _param_shapes(sample_shape):
return dict(
zip(("loc", "scale"),
([ops.convert_to_tensor(sample_shape, dtype=dtypes.int32)] * 2)))
@property
def loc(self):
"""Distribution parameter for the mean."""
return self._loc
@property
def scale(self):
"""Distribution parameter for standard deviation."""
return self._scale
def _batch_shape_tensor(self):
return array_ops.broadcast_dynamic_shape(
array_ops.shape(self.loc), array_ops.shape(self.scale))
def _batch_shape(self):
return array_ops.broadcast_static_shape(self.loc.shape, self.scale.shape)
def _event_shape_tensor(self):
return constant_op.constant([], dtype=dtypes.int32)
def _event_shape(self):
return tensor_shape.scalar()
def _sample_n(self, n, seed=None):
shape = array_ops.concat([[n], self.batch_shape_tensor()], 0)
probs = random_ops.random_uniform(
shape=shape, minval=0., maxval=1., dtype=self.dtype, seed=seed)
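# Inverse-CDF sampling: Uniform(0, 1) draws pushed through the closed-form
# Cauchy quantile function yield exact Cauchy(loc, scale) samples.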
return self._quantile(probs)
def _log_prob(self, x):
return self._log_unnormalized_prob(x) - self._log_normalization()
def _cdf(self, x):
return math_ops.atan(self._z(x)) / np.pi + 0.5
def _log_cdf(self, x):
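# log(cdf) written via log1p for numerical stability:
# log(atan(z)/pi + 0.5) = log1p(2 * atan(z) / pi) - log(2).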
return math_ops.log1p(2 / np.pi * math_ops.atan(self._z(x))) - np.log(2)
def _log_unnormalized_prob(self, x):
return -math_ops.log1p(math_ops.square(self._z(x)))
def _log_normalization(self):
return np.log(np.pi) + math_ops.log(self.scale)
def _entropy(self):
h = np.log(4 * np.pi) + math_ops.log(self.scale)
return h * array_ops.ones_like(self.loc)
def _quantile(self, p):
return self.loc + self.scale * math_ops.tan(np.pi * (p - 0.5))
def _mode(self):
return self.loc * array_ops.ones_like(self.scale)
def _z(self, x):
"""Standardize input `x`."""
with ops.name_scope("standardize", values=[x]):
return (x - self.loc) / self.scale
def _inv_z(self, z):
"""Reconstruct input `x` from a its normalized version."""
with ops.name_scope("reconstruct", values=[z]):
return z * self.scale + self.loc
def _mean(self):
if self.allow_nan_stats:
return array_ops.fill(self.batch_shape_tensor(),
self.dtype.as_numpy_dtype(np.nan))
else:
raise ValueError("`mean` is undefined for Cauchy distribution.")
def _stddev(self):
if self.allow_nan_stats:
return array_ops.fill(self.batch_shape_tensor(),
self.dtype.as_numpy_dtype(np.nan))
else:
raise ValueError("`stddev` is undefined for Cauchy distribution.")

View File

@ -429,7 +429,9 @@
"cpu_tensor = tf.random_normal([SIZE, SIZE])\n",
"\n",
"if is_gpu_available:\n",
" gpu_tensor = cpu_tensor.gpu()"
" gpu_tensor = cpu_tensor.gpu()\n",
"else:\n",
" print(\"GPU not available.\")"
]
},
{

View File

@ -383,7 +383,7 @@
"\n",
"`implicit_value_and_gradients()` returns a function that accepts the same inputs as the function passed in, and returns a tuple consisting of:\n",
"\n",
"1. the value returned by the function passed in (in this case, the loss calculated by `calculate_linear_model_loss()`), and\n",
"1. the value returned by the function passed in (in this case, the loss calculated by `loss_fn()`), and\n",
"1. a list of tuples consisting of:\n",
" 1. The value of the gradient (a `tf.Tensor`) with respect to a given variable\n",
" 1. The corresponding variable (`tf.Variable`)\n",
@ -698,7 +698,7 @@
"source": [
"## Other Ways to Compute Gradients\n",
"\n",
"Using our loss function as an example (`calculate_linear_model_loss()`), there are several other ways we could compute gradients:\n",
"Using our loss function as an example (`loss_fn()`), there are several other ways we could compute gradients:\n",
"\n",
"1. `tfe.implicit_gradients()`\n",
"1. `tfe.gradients_function()`\n",
@ -841,7 +841,7 @@
"# tfe.implicit_value_and_gradients() demo\n",
"value_gradients_fn = tfe.implicit_value_and_gradients(loss_fn)\n",
"\n",
"# Returns only gradients:\n",
"# Returns the value returned by the function passed in, gradients, and variables:\n",
"value_gradients_fn(inputs, labels, wb)"
]
}

View File

@ -9,7 +9,7 @@
"source": [
"# Eager Execution Tutorial: Importing Data\n",
"\n",
"This notebook demonstrates the use of the [`tf.contrib.data.Dataset` API](https://www.tensorflow.org/programmers_guide/datasets) to build pipelines to feed data to your program. It covers:\n",
"This notebook demonstrates the use of the [`tf.data.Dataset` API](https://www.tensorflow.org/programmers_guide/datasets) to build pipelines to feed data to your program. It covers:\n",
"\n",
"* Creating a `Dataset`.\n",
"* Iteration over a `Dataset` with eager execution enabled.\n",
@ -64,7 +64,7 @@
"source": [
"# Step 1: Create a source `Dataset`\n",
"\n",
"Create a _source_ dataset using one of the factory functions like [`Dataset.from_tensors`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#from_tensors), [`Dataset.from_tensor_slices`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#from_tensor_slices) or using objects that read from files like [`TextLineDataset`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/TextLineDataset) or [`TFRecordDataset`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/TFRecordDataset). See the [Programmer's Guide](https://www.google.com/url?sa=D\u0026q=https%3A%2F%2Fwww.tensorflow.org%2Fprogrammers_guide%2Fdatasets%23reading_input_data) for more information."
"Create a _source_ dataset using one of the factory functions like [`Dataset.from_tensors`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensors), [`Dataset.from_tensor_slices`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensor_slices) or using objects that read from files like [`TextLineDataset`](https://www.tensorflow.org/api_docs/python/tf/data/TextLineDataset) or [`TFRecordDataset`](https://www.tensorflow.org/api_docs/python/tf/data/TFRecordDataset). See the [Programmer's Guide](https://www.google.com/url?sa=D\u0026q=https%3A%2F%2Fwww.tensorflow.org%2Fprogrammers_guide%2Fdatasets%23reading_input_data) for more information."
]
},
{
@ -83,7 +83,7 @@
},
"outputs": [],
"source": [
"ds_tensors = tf.contrib.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6])\n",
"ds_tensors = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6])\n",
"\n",
"# Create a CSV file\n",
"import tempfile\n",
@ -93,7 +93,7 @@
"Line 2\n",
"Line 3\n",
" \"\"\")\n",
"ds_file = tf.contrib.data.TextLineDataset(filename)\n"
"ds_file = tf.data.TextLineDataset(filename)\n"
]
},
{
@ -105,7 +105,7 @@
"source": [
"# Step 2: Apply transformations\n",
"\n",
"Use the transformations functions like [`map`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#map), [`batch`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#batch), [`shuffle`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#shuffle) etc. to apply transformations to the records of the dataset. See the [API documentation for `tf.contrib.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset) for details."
"Use the transformations functions like [`map`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#map), [`batch`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#batch), [`shuffle`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#shuffle) etc. to apply transformations to the records of the dataset. See the [API documentation for `tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) for details."
]
},
{

View File

@ -286,7 +286,6 @@ def _fused_batch_norm(inputs,
ValueError: If the rank of `inputs` is neither 2 or 4.
ValueError: If rank or `C` dimension of `inputs` is undefined.
"""
# TODO(reedwm): Add support for fp16 inputs.
if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
raise ValueError('data_format has to be either NCHW or NHWC.')
with variable_scope.variable_scope(
@ -310,7 +309,6 @@ def _fused_batch_norm(inputs,
new_shape = [-1, channels, 1, 1]
inputs = array_ops.reshape(inputs, new_shape)
inputs_shape = inputs.get_shape()
dtype = inputs.dtype.base_dtype
if data_format == DATA_FORMAT_NHWC:
params_shape = inputs_shape[-1:]
else:
@ -320,9 +318,10 @@ def _fused_batch_norm(inputs,
(inputs.name, params_shape))
# Allocate parameters for the beta and gamma of the normalization.
trainable_beta = trainable and center
beta_collections = utils.get_variable_collections(variables_collections,
'beta')
# Float32 required to avoid precision-loss when using fp16 input/output
variable_dtype = dtypes.float32
if not param_initializers:
param_initializers = {}
if not param_regularizers:
@ -336,13 +335,13 @@ def _fused_batch_norm(inputs,
beta = variables.model_variable(
'beta',
shape=params_shape,
dtype=dtype,
dtype=variable_dtype,
initializer=beta_initializer,
regularizer=beta_regularizer,
collections=beta_collections,
trainable=trainable_beta)
trainable=trainable)
else:
beta = array_ops.constant(0.0, shape=params_shape)
beta = array_ops.constant(0.0, dtype=variable_dtype, shape=params_shape)
if scale:
gamma_collections = utils.get_variable_collections(
@ -352,13 +351,13 @@ def _fused_batch_norm(inputs,
gamma = variables.model_variable(
'gamma',
shape=params_shape,
dtype=dtype,
dtype=variable_dtype,
initializer=gamma_initializer,
regularizer=gamma_regularizer,
collections=gamma_collections,
trainable=trainable)
else:
gamma = array_ops.constant(1.0, shape=params_shape)
gamma = array_ops.constant(1.0, dtype=variable_dtype, shape=params_shape)
# Create moving_mean and moving_variance variables and add them to the
# appropriate collections. We disable variable partitioning while creating
@ -375,7 +374,7 @@ def _fused_batch_norm(inputs,
moving_mean = variables.model_variable(
'moving_mean',
shape=params_shape,
dtype=dtype,
dtype=variable_dtype,
initializer=moving_mean_initializer,
trainable=False,
collections=moving_mean_collections)
@ -386,7 +385,7 @@ def _fused_batch_norm(inputs,
moving_variance = variables.model_variable(
'moving_variance',
shape=params_shape,
dtype=dtype,
dtype=variable_dtype,
initializer=moving_variance_initializer,
trainable=False,
collections=moving_variance_collections)

View File

@ -1774,10 +1774,13 @@ class BatchNormTest(test.TestCase):
with self.assertRaisesRegexp(ValueError, 'undefined'):
_layers.batch_norm(inputs, data_format='NCHW')
def _testCreateOp(self, fused):
def _testCreateOp(self, fused, dtype=None):
if dtype is None:
dtype = dtypes.float32
height, width = 3, 3
with self.test_session():
images = np.random.uniform(size=(5, height, width, 3)).astype('f')
images = np.random.uniform(size=(5, height, width, 3)).astype(
dtype.as_numpy_dtype)
output = _layers.batch_norm(images, fused=fused)
expected_name = ('BatchNorm/FusedBatchNorm' if fused else
'BatchNorm/batchnorm')
@ -1792,6 +1795,9 @@ class BatchNormTest(test.TestCase):
def testCreateOpFused(self):
self._testCreateOp(True)
def testCreateOpFusedFloat16(self):
self._testCreateOp(True, dtypes.float16)
def _testCreateOpBetaRegularizer(self, fused=True):
height, width = 3, 3
with self.test_session():
@ -2659,11 +2665,64 @@ class BatchNormTest(test.TestCase):
def testBatchNormBeta(self):
# Test case for 11673
with self.test_session() as sess:
a = array_ops.placeholder(dtypes.float32, shape=(10, 10, 10, 10))
b = _layers.batch_norm(a, center=False, data_format='NCHW',
zero_debias_moving_mean=True)
a_32 = array_ops.placeholder(dtypes.float32, shape=(10, 10, 10, 10))
_layers.batch_norm(
a_32, center=False, data_format='NCHW', zero_debias_moving_mean=True)
a_16 = array_ops.placeholder(dtypes.float16, shape=(10, 10, 10, 10))
_layers.batch_norm(
a_16, center=False, data_format='NCHW', zero_debias_moving_mean=True)
sess.run(variables_lib.global_variables_initializer())
def testVariablesAreFloat32(self):
height, width = 3, 3
with self.test_session():
images = random_ops.random_uniform(
(5, height, width, 3), seed=1, dtype=dtypes.float16)
_layers.batch_norm(images, scale=True)
beta = variables.get_variables_by_name('beta')[0]
gamma = variables.get_variables_by_name('gamma')[0]
self.assertEqual(beta.dtype, dtypes.float32_ref)
self.assertEqual(gamma.dtype, dtypes.float32_ref)
moving_mean = variables.get_variables_by_name('moving_mean')[0]
moving_variance = variables.get_variables_by_name('moving_variance')[0]
self.assertEqual(moving_mean.dtype, dtypes.float32_ref)
self.assertEqual(moving_variance.dtype, dtypes.float32_ref)
def _runFusedBatchNorm(self, shape, dtype):
channels = shape[1]
images = np.arange(np.product(shape), dtype=dtype).reshape(shape)
beta = init_ops.constant_initializer(
np.arange(2, channels + 2, dtype=np.float32))
gamma = init_ops.constant_initializer(
np.arange(10, channels + 10, dtype=np.float32) * 2.0)
mean = init_ops.constant_initializer(
np.arange(3, channels + 3, dtype=np.float32) * 5.0)
variance = init_ops.constant_initializer(
np.arange(1, channels + 1, dtype=np.float32) * 4.0)
output = _layers.batch_norm(
images,
fused=True,
is_training=True,
scale=True,
epsilon=0.5,
param_initializers={
'beta': beta,
'gamma': gamma,
'moving_mean': mean,
'moving_variance': variance,
},
data_format='NCHW')
with self.test_session(use_gpu=True) as sess:
sess.run(variables_lib.global_variables_initializer())
return sess.run(output)
def testFusedBatchNormFloat16MatchesFloat32(self):
if test.is_gpu_available(cuda_only=True):
shape = [5, 4, 2, 3]
res_32 = self._runFusedBatchNorm(shape, np.float32)
res_16 = self._runFusedBatchNorm(shape, np.float16)
self.assertAllClose(res_32, res_16, rtol=1e-3)
def testAdjustmentCreated(self):
# Tests that the adjustment is appropriately passed to and used by the core
# BN layer.

View File

@ -119,7 +119,7 @@ class Head(object):
update_op = tf.contrib.layers.optimize_loss(optimizer=sync,
loss=model_fn_ops.loss, ...)
hooks = [sync.make_session_run_hook(is_chief)]
... upate train_op and hooks in ModelFnOps and return
... update train_op and hooks in ModelFnOps and return
```
"""
__metaclass__ = abc.ABCMeta

View File

@ -23,7 +23,6 @@ import collections
import six
from tensorflow.contrib import framework as contrib_framework
from tensorflow.contrib.framework import get_graph_from_inputs
from tensorflow.contrib.learn.python.learn.estimators import constants
from tensorflow.contrib.learn.python.learn.estimators import metric_key
@ -32,6 +31,7 @@ from tensorflow.python.estimator import model_fn as core_model_fn_lib
from tensorflow.python.estimator.export import export_output as core_export_lib
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
from tensorflow.python.platform import tf_logging as logging
@ -156,11 +156,11 @@ class ModelFnOps(
else:
if isinstance(predictions, dict):
predictions = {
k: contrib_framework.convert_to_tensor_or_sparse_tensor(v)
k: sparse_tensor.convert_to_tensor_or_sparse_tensor(v)
for k, v in six.iteritems(predictions)
}
else:
predictions = contrib_framework.convert_to_tensor_or_sparse_tensor(
predictions = sparse_tensor.convert_to_tensor_or_sparse_tensor(
predictions)
# Validate eval_metric_ops

View File

@ -28,13 +28,13 @@ import six
from six.moves import xrange # pylint: disable=redefined-builtin
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import tensor_util
from tensorflow.python.ops import array_ops
from tensorflow.python.platform import tf_logging as logging
# pylint: disable=g-multiple-import,g-bad-import-order
from .pandas_io import HAS_PANDAS, extract_pandas_data, extract_pandas_matrix, extract_pandas_labels
from .dask_io import HAS_DASK, extract_dask_data, extract_dask_labels
# pylint: enable=g-multiple-import,g-bad-import-order
@ -365,8 +365,14 @@ class DataFeeder(object):
self.random_state = np.random.RandomState(
42) if random_state is None else random_state
num_samples = list(self._x.values())[0].shape[
0] if x_is_dict else self._x.shape[0]
if x_is_dict:
num_samples = list(self._x.values())[0].shape[0]
elif tensor_util.is_tensor(self._x):
num_samples = self._x.shape[
0].value # shape will be a Dimension, extract an int
else:
num_samples = self._x.shape[0]
if self._shuffle:
self.indices = self.random_state.permutation(num_samples)
else:

View File

@ -238,10 +238,10 @@ class SdcaModel(object):
with name_scope('sdca/prediction'):
sparse_variables = self._convert_n_to_tensor(self._variables[
'sparse_features_weights'])
result = 0.0
result_sparse = 0.0
for sfc, sv in zip(examples['sparse_features'], sparse_variables):
# TODO(sibyl-Aix6ihai): following does not take care of missing features.
result += math_ops.segment_sum(
result_sparse += math_ops.segment_sum(
math_ops.multiply(
array_ops.gather(sv, sfc.feature_indices), sfc.feature_values),
sfc.example_indices)
@ -249,12 +249,14 @@ class SdcaModel(object):
dense_variables = self._convert_n_to_tensor(self._variables[
'dense_features_weights'])
result_dense = 0.0
for i in range(len(dense_variables)):
result += math_ops.matmul(dense_features[i],
array_ops.expand_dims(dense_variables[i], -1))
result_dense += math_ops.matmul(dense_features[i],
array_ops.expand_dims(
dense_variables[i], -1))
# Reshaping to allow shape inference at graph construction time.
return array_ops.reshape(result, [-1])
return array_ops.reshape(result_dense, [-1]) + result_sparse
def predictions(self, examples):
"""Add operations to compute predictions by the model.

View File

@ -23,6 +23,7 @@ py_library(
py_test(
name = "lite_test",
srcs = ["lite_test.py"],
srcs_version = "PY2AND3",
deps = [
":lite",
"//tensorflow/python:array_ops",

View File

@ -36,6 +36,11 @@ import traceback
import zipfile
import numpy as np
from six import StringIO
# TODO(aselle): Disable GPU for now
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# pylint: disable=g-import-not-at-top
import tensorflow as tf
from google.protobuf import text_format
# TODO(aselle): switch to TensorFlow's resource_loader
@ -379,12 +384,13 @@ def make_zip_of_tests(zip_path,
report["toco_log"] = ""
tf.reset_default_graph()
try:
inputs, outputs = make_graph(param_dict_real)
except (tf.errors.UnimplementedError, tf.errors.InvalidArgumentError,
ValueError):
report["tf_log"] += traceback.format_exc()
return None, report
with tf.device("/cpu:0"):
try:
inputs, outputs = make_graph(param_dict_real)
except (tf.errors.UnimplementedError, tf.errors.InvalidArgumentError,
ValueError):
report["tf_log"] += traceback.format_exc()
return None, report
sess = tf.Session()
try:

View File

@ -61,6 +61,7 @@ tf_py_test(
data = [
":toco_from_protos",
],
tags = ["no_pip"],
)
filegroup(

View File

@ -314,7 +314,8 @@ ifeq ($(TARGET),ANDROID)
-Wno-narrowing \
-fomit-frame-pointer \
$(MARCH_OPTION) \
-fPIE
-fPIE \
-fPIC
INCLUDES = \
-I$(NDK_ROOT)/sources/android/support/include \
-I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/include \

View File

@ -174,10 +174,26 @@ tensorflow/contrib/makefile/build_all_ios.sh
This process will take around twenty minutes on a modern MacBook Pro.
When it completes, you will have a library for a single architecture and the
benchmark program. Although successfully compiling the benchmark program is a
When it completes, you will have a unified library for all architectures
(i386sim, x86_64sim, armv7, armv7s and arm64) and the benchmark program.
Although successfully compiling the benchmark program is a
sign of success, the program is not a complete iOS app.
If you would like to build only one architecture to save time
(iOS 11+ supports only 64-bit, so you can get away with arm64):
```bash
tensorflow/contrib/makefile/build_all_ios.sh -a arm64
```
After the first build, if you would like to build just the TensorFlow
library, you can pass the -T flag to avoid a clean and rebuild. If you have
modified only one file, regenerating the library should take just a few seconds.
```bash
tensorflow/contrib/makefile/build_all_ios.sh -a arm64 -T
```
To see TensorFlow running on iOS, the example Xcode project in
[tensorflow/examples/ios](../../examples/ios/) shows how to use the static
library in a simple app.
@ -193,19 +209,18 @@ If you have not already, you will need to download dependencies:
tensorflow/contrib/makefile/download_dependencies.sh
```
Next, you will need to compile protobufs for iOS:
Next, you will need to compile protobufs for iOS (optionally takes the -a $ARCH flag):
```bash
tensorflow/contrib/makefile/compile_ios_protobuf.sh
tensorflow/contrib/makefile/compile_ios_protobuf.sh
```
Then, you will need to compile the nsync library for iOS:
Then, you will need to compile the nsync library for iOS (this script also optionally takes the -a $ARCH flag):
```bash
export HOST_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh`
export TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios`
```
Then, you can run the makefile specifying iOS as the target, along with the
architecture you want to build for:
@ -219,10 +234,6 @@ This creates a library in
`tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a` that you can link any
xcode project against.
At this point, you will have a library for a single architecture and the
benchmark program. Although successfully compiling the benchmark program is a
sign of success, the program is not a complete iOS app.
To see TensorFlow running on iOS, the example Xcode project in
[tensorflow/examples/ios](../../examples/ios/) shows how to use the static
library in a simple app.
@ -237,6 +248,14 @@ time follow it with:
compile_ios_tensorflow.sh
```
`compile_ios_tensorflow.sh` takes the -a flag to build only for one architecture.
If you run into issues with unresolved nsync symbols, you can also pass
-h ${HOST_NSYNC_LIB} and -n ${TARGET_NSYNC_LIB}, so the invocation would look like:
```bash
tensorflow/contrib/makefile/compile_ios_tensorflow.sh -f "-O3" -h tensorflow/contrib/makefile/downloads/nsync/builds/default.macos.c++11/nsync.a -n tensorflow/contrib/makefile/downloads/nsync/builds/lipo.ios.c++11/nsync.a -a arm64
```
In Xcode, you will need to use -force_load in the linker flags
section of the build settings to pull in the global constructors that are used
to register ops and kernels.
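For example, assuming the static library was produced at the default makefile output path, the "Other Linker Flags" entry would look something like this (the exact relative path depends on where your app project sits relative to the TensorFlow checkout):

```bash
-force_load $(SRCROOT)/../../tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a
```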
@ -249,7 +268,7 @@ debug mode. If you are concerned about performance or are working on a release
build, you would likely want a higher optimization setting, like so:
```bash
compile_ios_tensorflow.sh "-Os"
compile_ios_tensorflow.sh -f "-Os"
```
For other variations of valid optimization flags, see [clang optimization levels](http://stackoverflow.com/questions/15548023/clang-optimization-levels).

View File

@ -23,14 +23,29 @@ if [[ $(uname) != "Darwin" ]]; then
exit 1
fi
usage() {
echo "Usage: $(basename "$0") [-a:T]"
echo "-a [build_arch] build only for specified arch x86_64 [default=all]"
echo "-T only build tensorflow (dont download other deps etc)"
exit 1
}
while getopts "a:T" opt_name; do
case "$opt_name" in
a) BUILD_ARCH="${OPTARG}";;
T) ONLY_MAKE_TENSORFLOW="true";;
*) usage;;
esac
done
shift $((OPTIND - 1))
# Make sure we're in the correct directory, at the root of the source tree.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd ${SCRIPT_DIR}/../../../
# Remove any old files first.
make -f tensorflow/contrib/makefile/Makefile clean
rm -rf tensorflow/contrib/makefile/downloads
source "${SCRIPT_DIR}/build_helper.subr"
JOB_COUNT="${JOB_COUNT:-$(get_job_count)}"
# Setting a deployment target is required for building with bitcode,
# otherwise linking will fail with:
@ -41,20 +56,37 @@ if [[ -n MACOSX_DEPLOYMENT_TARGET ]]; then
export MACOSX_DEPLOYMENT_TARGET=$(sw_vers -productVersion)
fi
# Pull down the required versions of the frameworks we need.
tensorflow/contrib/makefile/download_dependencies.sh
if [[ "${ONLY_MAKE_TENSORFLOW}" != "true" ]]; then
# Remove any old files first.
make -f tensorflow/contrib/makefile/Makefile clean
rm -rf tensorflow/contrib/makefile/downloads
# Compile protobuf for the target iOS device architectures.
tensorflow/contrib/makefile/compile_ios_protobuf.sh
# Pull down the required versions of the frameworks we need.
tensorflow/contrib/makefile/download_dependencies.sh
# Compile protobuf for the target iOS device architectures.
tensorflow/contrib/makefile/compile_ios_protobuf.sh
fi
# Compile nsync for the target iOS device architectures.
# Don't use export var=`something` syntax; it swallows the exit status.
HOST_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh`
TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios`
if [[ -z "${BUILD_ARCH}" ]]; then
# No arch specified so build all architectures
TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios`
else
# arch specified so build just that
TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios -a ${BUILD_ARCH}`
fi
export HOST_NSYNC_LIB TARGET_NSYNC_LIB
# Build the iOS TensorFlow libraries.
tensorflow/contrib/makefile/compile_ios_tensorflow.sh "-O3"
if [[ -z "${BUILD_ARCH}" ]]; then
# build the ios tensorflow libraries.
tensorflow/contrib/makefile/compile_ios_tensorflow.sh -f "-O3" -h $HOST_NSYNC_LIB -n $TARGET_NSYNC_LIB
else
# arch specified so build just that
tensorflow/contrib/makefile/compile_ios_tensorflow.sh -f "-O3" -a "${BUILD_ARCH}" -h $HOST_NSYNC_LIB -n $TARGET_NSYNC_LIB
fi
# Creates a static universal library in
# tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a

View File

@ -21,10 +21,28 @@ if [[ -n MACOSX_DEPLOYMENT_TARGET ]]; then
export MACOSX_DEPLOYMENT_TARGET=$(sw_vers -productVersion)
fi
SCRIPT_DIR=$(dirname $0)
usage() {
echo "Usage: $(basename "$0") [-a]"
echo "-a [build_arch] build for specified arch comma separate for multiple archs (eg: x86_64,arm64)"
echo "default arch i386, x86_64, armv7, armv7s, arm64"
exit 1
}
BUILD_TARGET="i386 x86_64 armv7 armv7s arm64"
while getopts "a:" opt_name; do
case "$opt_name" in
a) BUILD_TARGET="${OPTARG}";;
*) usage;;
esac
done
shift $((OPTIND - 1))
IFS=' ' read -r -a build_targets <<< "${BUILD_TARGET}"
SCRIPT_DIR=$(cd `dirname $0` && pwd)
source "${SCRIPT_DIR}/build_helper.subr"
cd tensorflow/contrib/makefile
cd ${SCRIPT_DIR}
HOST_GENDIR="$(pwd)/gen/protobuf-host"
mkdir -p "${HOST_GENDIR}"
@ -64,6 +82,10 @@ else
echo "protoc found. Skip building host tools."
fi
# Remove old libs
rm -f ${LIBDIR}/libprotobuf.a
rm -f ${LIBDIR}/libprotobuf-lite.a
./autogen.sh
if [ $? -ne 0 ]
then
@ -71,157 +93,192 @@ then
exit 1
fi
make distclean
./configure \
--host=i386-apple-${OSX_VERSION} \
--disable-shared \
--enable-cross-compile \
--with-protoc="${PROTOC_PATH}" \
--prefix=${LIBDIR}/iossim_386 \
--exec-prefix=${LIBDIR}/iossim_386 \
"CFLAGS=${CFLAGS} \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
-arch i386 \
-fembed-bitcode \
-isysroot ${IPHONESIMULATOR_SYSROOT}" \
"CXX=${CXX}" \
"CXXFLAGS=${CXXFLAGS} \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
-arch i386 \
-fembed-bitcode \
-isysroot \
${IPHONESIMULATOR_SYSROOT}" \
LDFLAGS="-arch i386 \
-fembed-bitcode \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
${LDFLAGS} \
-L${IPHONESIMULATOR_SYSROOT}/usr/lib/ \
-L${IPHONESIMULATOR_SYSROOT}/usr/lib/system" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
package_pb_library() {
pb_libs="${LIBDIR}/${1}/lib/libprotobuf.a"
if [ -f "${LIBDIR}/libprotobuf.a" ]; then
pb_libs="$pb_libs ${LIBDIR}/libprotobuf.a"
fi
lipo \
$pb_libs \
-create \
-output ${LIBDIR}/libprotobuf.a
make distclean
./configure \
--host=x86_64-apple-${OSX_VERSION} \
--disable-shared \
--enable-cross-compile \
--with-protoc="${PROTOC_PATH}" \
--prefix=${LIBDIR}/iossim_x86_64 \
--exec-prefix=${LIBDIR}/iossim_x86_64 \
"CFLAGS=${CFLAGS} \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
-arch x86_64 \
-fembed-bitcode \
-isysroot ${IPHONESIMULATOR_SYSROOT}" \
"CXX=${CXX}" \
"CXXFLAGS=${CXXFLAGS} \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
-arch x86_64 \
-fembed-bitcode \
-isysroot \
${IPHONESIMULATOR_SYSROOT}" \
LDFLAGS="-arch x86_64 \
-fembed-bitcode \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
${LDFLAGS} \
-L${IPHONESIMULATOR_SYSROOT}/usr/lib/ \
-L${IPHONESIMULATOR_SYSROOT}/usr/lib/system" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
pblite_libs="${LIBDIR}/${1}/lib/libprotobuf-lite.a"
if [ -f "${LIBDIR}/libprotobuf-lite.a" ]; then
pblite_libs="$pblite_libs ${LIBDIR}/libprotobuf-lite.a"
fi
lipo \
$pblite_libs \
-create \
-output ${LIBDIR}/libprotobuf-lite.a
}
make distclean
./configure \
--host=armv7-apple-${OSX_VERSION} \
--with-protoc="${PROTOC_PATH}" \
--disable-shared \
--prefix=${LIBDIR}/ios_arm7 \
--exec-prefix=${LIBDIR}/ios_arm7 \
"CFLAGS=${CFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch armv7 \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
"CXX=${CXX}" \
"CXXFLAGS=${CXXFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch armv7 \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
LDFLAGS="-arch armv7 \
-fembed-bitcode \
-miphoneos-version-min=${MIN_SDK_VERSION} \
${LDFLAGS}" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
build_target() {
case "$1" in
i386) make distclean
./configure \
--host=i386-apple-${OSX_VERSION} \
--disable-shared \
--enable-cross-compile \
--with-protoc="${PROTOC_PATH}" \
--prefix=${LIBDIR}/iossim_386 \
--exec-prefix=${LIBDIR}/iossim_386 \
"CFLAGS=${CFLAGS} \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
-arch i386 \
-fembed-bitcode \
-isysroot ${IPHONESIMULATOR_SYSROOT}" \
"CXX=${CXX}" \
"CXXFLAGS=${CXXFLAGS} \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
-arch i386 \
-fembed-bitcode \
-isysroot \
${IPHONESIMULATOR_SYSROOT}" \
LDFLAGS="-arch i386 \
-fembed-bitcode \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
${LDFLAGS} \
-L${IPHONESIMULATOR_SYSROOT}/usr/lib/ \
-L${IPHONESIMULATOR_SYSROOT}/usr/lib/system" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
make distclean
./configure \
--host=armv7s-apple-${OSX_VERSION} \
--with-protoc="${PROTOC_PATH}" \
--disable-shared \
--prefix=${LIBDIR}/ios_arm7s \
--exec-prefix=${LIBDIR}/ios_arm7s \
"CFLAGS=${CFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch armv7s \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
"CXX=${CXX}" \
"CXXFLAGS=${CXXFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch armv7s \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
LDFLAGS="-arch armv7s \
-fembed-bitcode \
-miphoneos-version-min=${MIN_SDK_VERSION} \
${LDFLAGS}" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
package_pb_library "iossim_386"
;;
make distclean
./configure \
--host=arm \
--with-protoc="${PROTOC_PATH}" \
--disable-shared \
--prefix=${LIBDIR}/ios_arm64 \
--exec-prefix=${LIBDIR}/ios_arm64 \
"CFLAGS=${CFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch arm64 \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
"CXXFLAGS=${CXXFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch arm64 \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
LDFLAGS="-arch arm64 \
-fembed-bitcode \
-miphoneos-version-min=${MIN_SDK_VERSION} \
${LDFLAGS}" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
x86_64) make distclean
./configure \
--host=x86_64-apple-${OSX_VERSION} \
--disable-shared \
--enable-cross-compile \
--with-protoc="${PROTOC_PATH}" \
--prefix=${LIBDIR}/iossim_x86_64 \
--exec-prefix=${LIBDIR}/iossim_x86_64 \
"CFLAGS=${CFLAGS} \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
-arch x86_64 \
-fembed-bitcode \
-isysroot ${IPHONESIMULATOR_SYSROOT}" \
"CXX=${CXX}" \
"CXXFLAGS=${CXXFLAGS} \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
-arch x86_64 \
-fembed-bitcode \
-isysroot \
${IPHONESIMULATOR_SYSROOT}" \
LDFLAGS="-arch x86_64 \
-fembed-bitcode \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
${LDFLAGS} \
-L${IPHONESIMULATOR_SYSROOT}/usr/lib/ \
-L${IPHONESIMULATOR_SYSROOT}/usr/lib/system" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
lipo \
${LIBDIR}/iossim_386/lib/libprotobuf.a \
${LIBDIR}/iossim_x86_64/lib/libprotobuf.a \
${LIBDIR}/ios_arm7/lib/libprotobuf.a \
${LIBDIR}/ios_arm7s/lib/libprotobuf.a \
${LIBDIR}/ios_arm64/lib/libprotobuf.a \
-create \
-output ${LIBDIR}/libprotobuf.a
package_pb_library "iossim_x86_64"
;;
lipo \
${LIBDIR}/iossim_386/lib/libprotobuf-lite.a \
${LIBDIR}/iossim_x86_64/lib/libprotobuf-lite.a \
${LIBDIR}/ios_arm7/lib/libprotobuf-lite.a \
${LIBDIR}/ios_arm7s/lib/libprotobuf-lite.a \
${LIBDIR}/ios_arm64/lib/libprotobuf-lite.a \
-create \
-output ${LIBDIR}/libprotobuf-lite.a
armv7) make distclean
./configure \
--host=armv7-apple-${OSX_VERSION} \
--with-protoc="${PROTOC_PATH}" \
--disable-shared \
--prefix=${LIBDIR}/ios_arm7 \
--exec-prefix=${LIBDIR}/ios_arm7 \
"CFLAGS=${CFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch armv7 \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
"CXX=${CXX}" \
"CXXFLAGS=${CXXFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch armv7 \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
LDFLAGS="-arch armv7 \
-fembed-bitcode \
-miphoneos-version-min=${MIN_SDK_VERSION} \
${LDFLAGS}" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
package_pb_library "ios_arm7"
;;
armv7s) make distclean
./configure \
--host=armv7s-apple-${OSX_VERSION} \
--with-protoc="${PROTOC_PATH}" \
--disable-shared \
--prefix=${LIBDIR}/ios_arm7s \
--exec-prefix=${LIBDIR}/ios_arm7s \
"CFLAGS=${CFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch armv7s \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
"CXX=${CXX}" \
"CXXFLAGS=${CXXFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch armv7s \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
LDFLAGS="-arch armv7s \
-fembed-bitcode \
-miphoneos-version-min=${MIN_SDK_VERSION} \
${LDFLAGS}" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
package_pb_library "ios_arm7s"
;;
arm64) make distclean
./configure \
--host=arm \
--with-protoc="${PROTOC_PATH}" \
--disable-shared \
--prefix=${LIBDIR}/ios_arm64 \
--exec-prefix=${LIBDIR}/ios_arm64 \
"CFLAGS=${CFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch arm64 \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
"CXXFLAGS=${CXXFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch arm64 \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
LDFLAGS="-arch arm64 \
-fembed-bitcode \
-miphoneos-version-min=${MIN_SDK_VERSION} \
${LDFLAGS}" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
package_pb_library "ios_arm64"
;;
*)
echo "Unknown ARCH"
exit 1
;;
esac
}
for build_element in "${build_targets[@]}"
do
echo "$build_element"
build_target "$build_element"
done
file ${LIBDIR}/libprotobuf.a
file ${LIBDIR}/libprotobuf-lite.a
echo "Done building and packaging the libraries"

View File

@ -43,55 +43,124 @@ then
exit 1
fi
usage() {
echo "Usage: $(basename "$0") [-a]"
echo "-a [build_arch] build for specified arch comma separate for multiple archs (eg: x86_64,arm64)"
echo "default is [i386, x86_64, armv7, armv7s, arm64]"
exit 1
}
BUILD_TARGET="i386 x86_64 armv7 armv7s arm64"
while getopts "a:f:h:n:" opt_name; do
case "$opt_name" in
a) BUILD_TARGET="${OPTARG}";;
f) BUILD_OPT="${OPTARG}";;
h) NSYNC_HOST="${OPTARG}";;
n) NSYNC_TARGET="${OPTARG}";;
*) usage;;
esac
done
shift $((OPTIND - 1))
IFS=' ' read -r -a build_targets <<< "${BUILD_TARGET}"
SCRIPT_DIR=$(cd `dirname $0` && pwd)
source "${SCRIPT_DIR}/build_helper.subr"
GENDIR=tensorflow/contrib/makefile/gen/
LIBDIR=${GENDIR}lib
LIB_PREFIX=libtensorflow-core
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=ARMV7 LIB_NAME=${LIB_PREFIX}-armv7.a OPTFLAGS="$1"
if [ $? -ne 0 ]
then
echo "armv7 compilation failed."
exit 1
fi
#remove any old artifacts
rm -rf ${LIBDIR}/${LIB_PREFIX}.a
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=ARMV7S LIB_NAME=${LIB_PREFIX}-armv7s.a OPTFLAGS="$1"
if [ $? -ne 0 ]
then
echo "arm7vs compilation failed."
exit 1
fi
package_tf_library() {
CAP_DIR=`echo $1 | tr 'a-z' 'A-Z'`
tf_libs="${LIBDIR}/ios_${CAP_DIR}/${LIB_PREFIX}-${1}.a"
if [ -f "${LIBDIR}/${LIB_PREFIX}.a" ]; then
tf_libs="$tf_libs ${LIBDIR}/${LIB_PREFIX}.a"
fi
lipo \
$tf_libs \
-create \
-output ${LIBDIR}/${LIB_PREFIX}.a
}
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=ARM64 LIB_NAME=${LIB_PREFIX}-arm64.a OPTFLAGS="$1"
if [ $? -ne 0 ]
then
echo "arm64 compilation failed."
exit 1
fi
build_tf_target() {
case "$1" in
armv7)
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=ARMV7 LIB_NAME=${LIB_PREFIX}-armv7.a \
OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
TARGET_NSYNC_LIB="${NSYNC_TARGET}"
if [ $? -ne 0 ]
then
echo "armv7 compilation failed."
exit 1
fi
package_tf_library "armv7"
;;
armv7s)
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=ARMV7S LIB_NAME=${LIB_PREFIX}-armv7s.a \
OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
TARGET_NSYNC_LIB="${NSYNC_TARGET}"
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=I386 LIB_NAME=${LIB_PREFIX}-i386.a OPTFLAGS="$1"
if [ $? -ne 0 ]
then
echo "i386 compilation failed."
exit 1
fi
if [ $? -ne 0 ]
then
echo "arm7vs compilation failed."
exit 1
fi
package_tf_library "armv7s"
;;
arm64)
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=ARM64 LIB_NAME=${LIB_PREFIX}-arm64.a \
OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
TARGET_NSYNC_LIB="${NSYNC_TARGET}"
if [ $? -ne 0 ]
then
echo "arm64 compilation failed."
exit 1
fi
package_tf_library "arm64"
;;
i386)
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=I386 LIB_NAME=${LIB_PREFIX}-i386.a \
OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
TARGET_NSYNC_LIB="${NSYNC_TARGET}"
if [ $? -ne 0 ]
then
echo "i386 compilation failed."
exit 1
fi
package_tf_library "i386"
;;
x86_64)
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=X86_64 LIB_NAME=${LIB_PREFIX}-x86_64.a \
OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
TARGET_NSYNC_LIB="${NSYNC_TARGET}"
if [ $? -ne 0 ]
then
echo "x86_64 compilation failed."
exit 1
fi
package_tf_library "x86_64"
;;
*)
echo "Unknown ARCH"
exit 1
esac
}
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=X86_64 LIB_NAME=${LIB_PREFIX}-x86_64.a OPTFLAGS="$1"
if [ $? -ne 0 ]
then
echo "x86_64 compilation failed."
exit 1
fi
for build_tf_element in "${build_targets[@]}"
do
echo "$build_tf_element"
build_tf_target "$build_tf_element"
done
lipo \
${LIBDIR}/ios_ARMV7/${LIB_PREFIX}-armv7.a \
${LIBDIR}/ios_ARMV7S/${LIB_PREFIX}-armv7s.a \
${LIBDIR}/ios_ARM64/${LIB_PREFIX}-arm64.a \
${LIBDIR}/ios_I386/${LIB_PREFIX}-i386.a \
${LIBDIR}/ios_X86_64/${LIB_PREFIX}-x86_64.a \
-create \
-output ${LIBDIR}/${LIB_PREFIX}.a
echo "Done building and packaging TF"
file ${LIBDIR}/${LIB_PREFIX}.a

View File

@ -265,7 +265,7 @@ for arch in $archs; do
-I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/libs/'"$arch"'/include \
-I../../platform/c++11 -I../../platform/gcc \
-I../../platform/posix -pthread
PLATFORM_CFLAGS=-std=c++11 -Wno-narrowing '"$march_option"' -fPIE
PLATFORM_CFLAGS=-std=c++11 -Wno-narrowing '"$march_option"' -fPIE -fPIC
PLATFORM_LDFLAGS=-pthread
MKDEP=${CC} -M -std=c++11
PLATFORM_C=../../platform/c++11/src/nsync_semaphore_mutex.cc \
@ -301,6 +301,9 @@ done
case "$target_platform" in
ios) nsync_platform_dir="$nsync_builds_dir/lipo.$target_platform.c++11"
if [ -d "$nsync_platform_dir" ]; then
rm -rf "$nsync_platform_dir"
fi
mkdir "$nsync_platform_dir"
eval lipo $platform_libs -create -output '$nsync_platform_dir/nsync.a'
echo "$nsync_platform_dir/nsync.a"

View File

@ -15,6 +15,7 @@
"""Module for variants of ops in tf.nn.
@@alpha_dropout
@@conv1d_transpose
@@deprecated_flipped_softmax_cross_entropy_with_logits
@@deprecated_flipped_sparse_softmax_cross_entropy_with_logits
@@deprecated_flipped_sigmoid_cross_entropy_with_logits
@ -32,6 +33,7 @@ from tensorflow.contrib.nn.python.ops.alpha_dropout import *
from tensorflow.contrib.nn.python.ops.cross_entropy import *
from tensorflow.contrib.nn.python.ops.sampling_ops import *
from tensorflow.contrib.nn.python.ops.scaled_softplus import *
from tensorflow.python.ops.nn_ops import conv1d_transpose
from tensorflow.python.ops.nn_ops import nth_element
# pylint: enable=unused-import,wildcard-import

View File

@ -19,6 +19,7 @@ py_library(
"python/training/external_optimizer.py",
"python/training/lazy_adam_optimizer.py",
"python/training/moving_average_optimizer.py",
"python/training/multitask_optimizer_wrapper.py",
"python/training/nadam_optimizer.py",
"python/training/powersign.py",
"python/training/sign_decay.py",
@ -98,6 +99,23 @@ py_test(
],
)
py_test(
name = "multitask_optimizer_wrapper_test",
srcs = ["python/training/multitask_optimizer_wrapper_test.py"],
srcs_version = "PY2AND3",
deps = [
":opt_py",
"//tensorflow/python:client",
"//tensorflow/python:client_testlib",
"//tensorflow/python:constant_op",
"//tensorflow/python:dtypes",
"//tensorflow/python:training",
"//tensorflow/python:variables",
"//third_party/py/numpy",
"@six_archive//:six",
],
)
py_test(
name = "lazy_adam_optimizer_test",
srcs = ["python/training/lazy_adam_optimizer_test.py"],

View File

@ -24,7 +24,7 @@ from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import
from tensorflow.contrib.opt.python.training.external_optimizer import *
from tensorflow.contrib.opt.python.training.lazy_adam_optimizer import *
from tensorflow.contrib.opt.python.training.moving_average_optimizer import *
from tensorflow.contrib.opt.python.training.nadam_optimizer import *
from tensorflow.contrib.opt.python.training.multitask_optimizer_wrapper import *
from tensorflow.contrib.opt.python.training.nadam_optimizer import *
from tensorflow.contrib.opt.python.training.powersign import *
from tensorflow.contrib.opt.python.training.variable_clipping_optimizer import *
@ -34,11 +34,18 @@ from tensorflow.python.util.all_util import remove_undocumented
_allowed_symbols = [
'PowerSignOptimizer', 'AddSignOptimizer'
'PowerSignOptimizer',
'AddSignOptimizer',
'DelayCompensatedGradientDescentOptimizer',
'DropStaleGradientOptimizer', 'ExternalOptimizerInterface',
'LazyAdamOptimizer', 'NadamOptimizer', 'MovingAverageOptimizer',
'ScipyOptimizerInterface', 'VariableClippingOptimizer'
'DropStaleGradientOptimizer',
'ExternalOptimizerInterface',
'LazyAdamOptimizer',
'NadamOptimizer',
'MovingAverageOptimizer',
'ScipyOptimizerInterface',
'VariableClippingOptimizer',
'MultitaskOptimizerWrapper',
'clip_gradients_by_global_norm',
]
remove_undocumented(__name__, _allowed_symbols)

View File

@ -0,0 +1,140 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""An optimizer wrapper for stateful optimizers with multitask loss."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import types
import six
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.training import optimizer
__all__ = ['MultitaskOptimizerWrapper', 'clip_gradients_by_global_norm']
def _is_all_zeros(grad):
all_zeros = math_ops.equal(math_ops.count_nonzero(grad), 0)
return all_zeros
def _get_wrapper(fn, opt):
def wrapper(self, grad, *args, **kwargs): # pylint: disable=unused-argument
all_zeros = _is_all_zeros(grad)
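# If the gradient is entirely zero, skip the underlying update so the
# optimizer's slot variables (e.g. momentum accumulators) stay untouched.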
return control_flow_ops.cond(all_zeros, control_flow_ops.no_op,
lambda: fn(grad, *args, **kwargs))
wrapper = types.MethodType(wrapper, opt)
return wrapper
class MultitaskOptimizerWrapper(object):
"""Optimizer wrapper making all-zero gradients harmless.
This might be useful when a multi-task loss is used,
and some components of the loss might not be
present (e.g. masked out) in some training batches.
Technically their gradient would be zero,
which would normally affect the optimizer state
(e.g. push the running average to zero).
However, this is not the desired behavior,
since the missing loss component
should be treated as unknown rather than zero.
This wrapper filters out all-zero gradient tensors,
therefore preserving the optimizer state.
If gradient clipping by global norm is used,
the provided function clip_gradients_by_global_norm
should be used (and specified explicitly by the user).
Otherwise the global norm would be underestimated
because of all-zero tensors that should be ignored.
The gradient calculation and application
are delegated to an underlying optimizer.
The gradient application is altered only for all-zero tensors.
Example:
```python
momentum_optimizer = tf.train.MomentumOptimizer(
learning_rate, momentum=0.9)
multitask_momentum_optimizer = tf.contrib.opt.MultitaskOptimizerWrapper(
momentum_optimizer)
gradvars = multitask_momentum_optimizer.compute_gradients(
loss)
gradvars_clipped, _ = tf.contrib.opt.clip_gradients_by_global_norm(
gradvars, 15.0)
train_op = multitask_momentum_optimizer.apply_gradients(
gradvars_clipped, global_step=batch)
```
"""
def __init__(self, opt):
"""Constructor.
Args:
opt: an instance of a class that implements tf.train.Optimizer.
"""
if not isinstance(opt, optimizer.Optimizer):
raise TypeError(
'Supplied optimizer must be an instance of tf.train.Optimizer')
self._opt = opt
overridden_methods = ('_apply_dense', '_resource_apply_dense',
'_apply_sparse', '_resource_apply_sparse')
for name in overridden_methods:
fn = getattr(self._opt, name)
wrapper = _get_wrapper(fn, self._opt)
setattr(self._opt, name, wrapper)
def __getattr__(self, name):
return getattr(self._opt, name)
def clip_gradients_by_global_norm(gradients_variables, clip_norm=20.):
"""Clips gradients of a multitask loss by their global norm.
Ignores all-zero tensors when computing the global norm.
Args:
gradients_variables: a list of pairs (gradient, variable).
clip_norm: a float Tensor, the global norm to clip on. Default is 20.0.
Returns:
list: A list of pairs of the same type as gradients_variables.
fixed_global_norm: A 0-D (scalar) Tensor representing the global norm.
"""
gradients, variables = six.moves.zip(*gradients_variables)
def _replace_nonexisting_grad(grad):
if grad is None:
return grad
all_zeros = _is_all_zeros(grad)
return control_flow_ops.cond(
all_zeros,
lambda: array_ops.zeros([], dtype=dtypes.as_dtype(grad.dtype)),
lambda: grad)
nonzero_gradients = [_replace_nonexisting_grad(g) for g in gradients]
fixed_global_norm = clip_ops.global_norm(nonzero_gradients)
gradients, _ = clip_ops.clip_by_global_norm(
gradients, clip_norm, use_norm=fixed_global_norm)
return list(six.moves.zip(gradients, variables)), fixed_global_norm

View File

@ -0,0 +1,119 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for MultitaskOptimizerWrapper."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import six
from tensorflow.contrib.opt.python.training import multitask_optimizer_wrapper
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
from tensorflow.python.training import momentum
class MultitaskOptimizerWrapperTest(test.TestCase):
"""Tests for the multitask optimizer wrapper.
"""
def testWrapper(self):
with self.test_session():
var0 = variables.Variable([1.0, 2.0], dtype=dtypes.float32)
var1 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
grads0 = constant_op.constant([0.1, 0.1], dtype=dtypes.float32)
grads1 = constant_op.constant([0.01, 0.01], dtype=dtypes.float32)
grads_allzero = constant_op.constant([0.0, 0.0], dtype=dtypes.float32)
mom_opt_impl = momentum.MomentumOptimizer(learning_rate=2.0, momentum=0.9)
mom_opt = multitask_optimizer_wrapper.MultitaskOptimizerWrapper(
mom_opt_impl)
mom_update = mom_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
mom_update_partial = mom_opt.apply_gradients(
zip([grads_allzero, grads1], [var0, var1]))
mom_update_no_action = mom_opt.apply_gradients(
zip([grads_allzero, grads_allzero], [var0, var1]))
self.evaluate(variables.global_variables_initializer())
# Fetch params to validate initial values
self.assertAllClose([1.0, 2.0], self.evaluate(var0))
self.assertAllClose([3.0, 4.0], self.evaluate(var1))
self.assertEqual(["momentum"], mom_opt.get_slot_names())
slot0 = mom_opt.get_slot(var0, "momentum")
self.assertEquals(slot0.get_shape(), var0.get_shape())
slot1 = mom_opt.get_slot(var1, "momentum")
self.assertEquals(slot1.get_shape(), var1.get_shape())
# Step 1: normal momentum update.
self.evaluate(mom_update)
# Check that the momentum accumulators have been updated.
self.assertAllCloseAccordingToType(
np.array([0.1, 0.1]), self.evaluate(slot0))
self.assertAllCloseAccordingToType(
np.array([0.01, 0.01]), self.evaluate(slot1))
# Check that the parameters have been updated.
self.assertAllCloseAccordingToType(
np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), self.evaluate(var0))
self.assertAllCloseAccordingToType(
np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
self.evaluate(var1))
# Step 2: momentum update that changes only slot1 but not slot0.
self.evaluate(mom_update_partial)
# Check that only the relevant momentum accumulator has been updated.
self.assertAllCloseAccordingToType(
np.array([0.1, 0.1]), self.evaluate(slot0))
self.assertAllCloseAccordingToType(
np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
self.evaluate(slot1))
# Step 3: momentum update that does not change anything.
self.evaluate(mom_update_no_action)
# Check that the momentum accumulators have *NOT* been updated.
self.assertAllCloseAccordingToType(
np.array([0.1, 0.1]), self.evaluate(slot0))
self.assertAllCloseAccordingToType(
np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
self.evaluate(slot1))
def testGradientClipping(self):
with self.test_session():
var0 = variables.Variable([1.0, 2.0], dtype=dtypes.float32)
var1 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
var2 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
var3 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
grads0 = constant_op.constant([10.0, 15.0], dtype=dtypes.float32)
grads1 = constant_op.constant([0.0, 5.0], dtype=dtypes.float32)
grads2 = constant_op.constant([0.0, 0.0], dtype=dtypes.float32)
grads3 = None
varlist = [var0, var1, var2, var3]
gradients = [grads0, grads1, grads2, grads3]
clipped_gradvars, global_norm = (
multitask_optimizer_wrapper.clip_gradients_by_global_norm(
six.moves.zip(gradients, varlist), clip_norm=1.0))
clipped_grads = list(six.moves.zip(*clipped_gradvars))[0]
reference_global_norm = np.sqrt(np.sum(np.square([10.0, 15.0, 0.0, 5.0])))
self.assertAllCloseAccordingToType(
self.evaluate(global_norm), reference_global_norm)
self.assertAllCloseAccordingToType(
self.evaluate(clipped_grads[2]), np.array([0., 0.]))
self.assertEqual(clipped_grads[3], None)
if __name__ == "__main__":
test.main()

View File

@ -24,6 +24,7 @@ import numpy as np
from tensorflow.contrib import rnn as contrib_rnn
from tensorflow.contrib.rnn.python.ops import core_rnn_cell
from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
@ -358,6 +359,46 @@ class RNNCellTest(test.TestCase):
self.assertEquals(variables[2].op.name,
"root/lstm_cell/projection/kernel")
def testLSTMCellLayerNorm(self):
with self.test_session() as sess:
num_units = 2
num_proj = 3
batch_size = 1
input_size = 4
with variable_scope.variable_scope(
"root", initializer=init_ops.constant_initializer(0.5)):
x = array_ops.zeros([batch_size, input_size])
c = array_ops.zeros([batch_size, num_units])
h = array_ops.zeros([batch_size, num_proj])
state = rnn_cell_impl.LSTMStateTuple(c, h)
cell = contrib_rnn_cell.LayerNormLSTMCell(
num_units=num_units,
num_proj=num_proj,
forget_bias=1.0,
layer_norm=True,
norm_gain=1.0,
norm_shift=0.0)
g, out_m = cell(x, state)
sess.run([variables_lib.global_variables_initializer()])
res = sess.run(
[g, out_m], {
x.name: np.ones((batch_size, input_size)),
c.name: 0.1 * np.ones((batch_size, num_units)),
h.name: 0.1 * np.ones((batch_size, num_proj))
})
self.assertEqual(len(res), 2)
      # The numbers in results were not calculated by hand; this is mostly
      # just a smoke test.
self.assertEqual(res[0].shape, (batch_size, num_proj))
self.assertEqual(res[1][0].shape, (batch_size, num_units))
self.assertEqual(res[1][1].shape, (batch_size, num_proj))
# Different inputs so different outputs and states
for i in range(1, batch_size):
self.assertTrue(
float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) < 1e-6)
self.assertTrue(
float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) < 1e-6)
def testOutputProjectionWrapper(self):
with self.test_session() as sess:
with variable_scope.variable_scope(

View File

@ -37,6 +37,7 @@ from tensorflow.python.ops import math_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.ops import rnn
from tensorflow.python.ops import rnn_cell
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
@ -995,26 +996,19 @@ class RNNCellTest(test.TestCase):
output, state = cell(x, hidden)
sess.run([variables.global_variables_initializer()])
res = sess.run([output, state], {
hidden[0].name:
np.array([[[[[1.],[1.]],
[[1.],[1.]]],
[[[1.],[1.]],
[[1.],[1.]]]],
[[[[2.],[2.]],
[[2.],[2.]]],
[[[2.],[2.]],
[[2.],[2.]]]]]),
x.name:
np.array([[[[[1.],[1.]],
[[1.],[1.]]],
[[[1.],[1.]],
[[1.],[1.]]]],
[[[[2.],[2.]],
[[2.],[2.]]],
[[[2.],[2.]],
[[2.],[2.]]]]])
})
res = sess.run(
[output, state], {
hidden[0].name:
np.array([[[[[1.], [1.]], [[1.], [1.]]], [[[1.], [1.]], [[
1.
], [1.]]]], [[[[2.], [2.]], [[2.], [2.]]],
[[[2.], [2.]], [[2.], [2.]]]]]),
x.name:
np.array([[[[[1.], [1.]], [[1.], [1.]]], [[[1.], [1.]], [[
1.
], [1.]]]], [[[[2.], [2.]], [[2.], [2.]]], [[[2.], [2.]],
[[2.], [2.]]]]])
})
# This is a smoke test, making sure expected values are unchanged.
self.assertEqual(len(res), 2)
self.assertAllClose(res[0], res[1].h)
@ -1275,6 +1269,47 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
self.assertAllClose(res[2].c, expected_c1, 1e-5)
self.assertAllClose(res[2].h, expected_h1, 1e-5)
def testBasicLSTMCellWithStateTupleLayerNorm(self):
"""The results of LSTMCell and LayerNormBasicLSTMCell should be the same."""
with self.test_session() as sess:
with variable_scope.variable_scope(
"root", initializer=init_ops.constant_initializer(0.5)):
x = array_ops.zeros([1, 2])
c0 = array_ops.zeros([1, 2])
h0 = array_ops.zeros([1, 2])
state0 = rnn_cell_impl.LSTMStateTuple(c0, h0)
c1 = array_ops.zeros([1, 2])
h1 = array_ops.zeros([1, 2])
state1 = rnn_cell_impl.LSTMStateTuple(c1, h1)
cell = rnn_cell_impl.MultiRNNCell([
contrib_rnn_cell.LayerNormLSTMCell(
2, layer_norm=True, norm_gain=1.0, norm_shift=0.0)
for _ in range(2)
])
h, (s0, s1) = cell(x, (state0, state1))
sess.run([variables.global_variables_initializer()])
res = sess.run(
[h, s0, s1], {
x.name: np.array([[1., 1.]]),
c0.name: 0.1 * np.asarray([[0, 1]]),
h0.name: 0.1 * np.asarray([[2, 3]]),
c1.name: 0.1 * np.asarray([[4, 5]]),
h1.name: 0.1 * np.asarray([[6, 7]]),
})
expected_h = np.array([[-0.38079708, 0.38079708]])
expected_h0 = np.array([[-0.38079708, 0.38079708]])
expected_c0 = np.array([[-1.0, 1.0]])
expected_h1 = np.array([[-0.38079708, 0.38079708]])
expected_c1 = np.array([[-1.0, 1.0]])
self.assertEqual(len(res), 3)
self.assertAllClose(res[0], expected_h, 1e-5)
self.assertAllClose(res[1].c, expected_c0, 1e-5)
self.assertAllClose(res[1].h, expected_h0, 1e-5)
self.assertAllClose(res[2].c, expected_c1, 1e-5)
self.assertAllClose(res[2].h, expected_h1, 1e-5)
def testBasicLSTMCellWithDropout(self):
def _is_close(x, y, digits=4):

View File

@ -76,6 +76,18 @@ def _get_sharded_variable(name, shape, dtype, num_shards):
return shards
def _norm(g, b, inp, scope):
shape = inp.get_shape()[-1:]
gamma_init = init_ops.constant_initializer(g)
beta_init = init_ops.constant_initializer(b)
with vs.variable_scope(scope):
# Initialize beta and gamma for use by layer_norm.
vs.get_variable("gamma", shape=shape, initializer=gamma_init)
vs.get_variable("beta", shape=shape, initializer=beta_init)
normalized = layers.layer_norm(inp, reuse=True, scope=scope)
return normalized
class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
"""Long short-term memory unit (LSTM) recurrent network cell.
@ -102,13 +114,33 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
The class uses optional peep-hole connections, and an optional projection
layer.
Layer normalization implementation is based on:
https://arxiv.org/abs/1607.06450.
"Layer Normalization"
Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton
and is applied before the internal nonlinearities.
"""
def __init__(self, num_units, use_peepholes=False,
initializer=None, num_proj=None, proj_clip=None,
num_unit_shards=1, num_proj_shards=1,
forget_bias=1.0, state_is_tuple=True,
activation=math_ops.tanh, reuse=None):
def __init__(self,
num_units,
use_peepholes=False,
initializer=None,
num_proj=None,
proj_clip=None,
num_unit_shards=1,
num_proj_shards=1,
forget_bias=1.0,
state_is_tuple=True,
activation=math_ops.tanh,
reuse=None,
layer_norm=False,
norm_gain=1.0,
norm_shift=0.0):
"""Initialize the parameters for an LSTM cell.
Args:
@ -135,6 +167,11 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
reuse: (optional) Python boolean describing whether to reuse variables
in an existing scope. If not `True`, and the existing scope already has
the given variables, an error is raised.
layer_norm: If `True`, layer normalization will be applied.
norm_gain: float, The layer normalization gain initial value. If
`layer_norm` has been set to `False`, this argument will be ignored.
norm_shift: float, The layer normalization shift initial value. If
`layer_norm` has been set to `False`, this argument will be ignored.
"""
super(CoupledInputForgetGateLSTMCell, self).__init__(_reuse=reuse)
if not state_is_tuple:
@ -152,6 +189,9 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
self._state_is_tuple = state_is_tuple
self._activation = activation
self._reuse = reuse
self._layer_norm = layer_norm
self._norm_gain = norm_gain
self._norm_shift = norm_shift
if num_proj:
self._state_size = (rnn_cell_impl.LSTMStateTuple(num_units, num_proj)
@ -220,9 +260,20 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
# j = new_input, f = forget_gate, o = output_gate
cell_inputs = array_ops.concat([inputs, m_prev], 1)
lstm_matrix = nn_ops.bias_add(math_ops.matmul(cell_inputs, concat_w), b)
lstm_matrix = math_ops.matmul(cell_inputs, concat_w)
    # If layer normalization is applied, do not add bias
if not self._layer_norm:
lstm_matrix = nn_ops.bias_add(lstm_matrix, b)
j, f, o = array_ops.split(value=lstm_matrix, num_or_size_splits=3, axis=1)
# Apply layer normalization
if self._layer_norm:
j = _norm(self._norm_gain, self._norm_shift, j, "transform")
f = _norm(self._norm_gain, self._norm_shift, f, "forget")
o = _norm(self._norm_gain, self._norm_shift, o, "output")
# Diagonal connections
if self._use_peepholes:
w_f_diag = vs.get_variable(
@ -236,6 +287,10 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
f_act = sigmoid(f + self._forget_bias)
c = (f_act * c_prev + (1 - f_act) * self._activation(j))
# Apply layer normalization
if self._layer_norm:
c = _norm(self._norm_gain, self._norm_shift, c, "state")
if self._use_peepholes:
m = sigmoid(o + w_o_diag * c) * self._activation(c)
else:
@ -1301,8 +1356,8 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
self._keep_prob = dropout_keep_prob
self._seed = dropout_prob_seed
self._layer_norm = layer_norm
self._g = norm_gain
self._b = norm_shift
self._norm_gain = norm_gain
self._norm_shift = norm_shift
self._reuse = reuse
@property
@ -1313,24 +1368,25 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
def output_size(self):
return self._num_units
def _norm(self, inp, scope):
def _norm(self, inp, scope, dtype=dtypes.float32):
shape = inp.get_shape()[-1:]
gamma_init = init_ops.constant_initializer(self._g)
beta_init = init_ops.constant_initializer(self._b)
gamma_init = init_ops.constant_initializer(self._norm_gain)
beta_init = init_ops.constant_initializer(self._norm_shift)
with vs.variable_scope(scope):
# Initialize beta and gamma for use by layer_norm.
vs.get_variable("gamma", shape=shape, initializer=gamma_init)
vs.get_variable("beta", shape=shape, initializer=beta_init)
vs.get_variable("gamma", shape=shape, initializer=gamma_init, dtype=dtype)
vs.get_variable("beta", shape=shape, initializer=beta_init, dtype=dtype)
normalized = layers.layer_norm(inp, reuse=True, scope=scope)
return normalized
def _linear(self, args):
out_size = 4 * self._num_units
proj_size = args.get_shape()[-1]
weights = vs.get_variable("kernel", [proj_size, out_size])
dtype = args.dtype
weights = vs.get_variable("kernel", [proj_size, out_size], dtype=dtype)
out = math_ops.matmul(args, weights)
if not self._layer_norm:
bias = vs.get_variable("bias", [out_size])
bias = vs.get_variable("bias", [out_size], dtype=dtype)
out = nn_ops.bias_add(out, bias)
return out
@ -1339,13 +1395,14 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
c, h = state
args = array_ops.concat([inputs, h], 1)
concat = self._linear(args)
dtype = args.dtype
i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)
if self._layer_norm:
i = self._norm(i, "input")
j = self._norm(j, "transform")
f = self._norm(f, "forget")
o = self._norm(o, "output")
i = self._norm(i, "input", dtype=dtype)
j = self._norm(j, "transform", dtype=dtype)
f = self._norm(f, "forget", dtype=dtype)
o = self._norm(o, "output", dtype=dtype)
g = self._activation(j)
if (not isinstance(self._keep_prob, float)) or self._keep_prob < 1:
@ -1354,7 +1411,7 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
new_c = (c * math_ops.sigmoid(f + self._forget_bias)
+ math_ops.sigmoid(i) * g)
if self._layer_norm:
new_c = self._norm(new_c, "state")
new_c = self._norm(new_c, "state", dtype=dtype)
new_h = self._activation(new_c) * math_ops.sigmoid(o)
new_state = rnn_cell_impl.LSTMStateTuple(new_c, new_h)
@ -1998,8 +2055,8 @@ class ConvLSTMCell(rnn_cell_impl.RNNCell):
if self._skip_connection:
self._total_output_channels += self._input_shape[-1]
state_size = tensor_shape.TensorShape(self._input_shape[:-1]
+ [self._output_channels])
state_size = tensor_shape.TensorShape(
self._input_shape[:-1] + [self._output_channels])
self._state_size = rnn_cell_impl.LSTMStateTuple(state_size, state_size)
self._output_size = tensor_shape.TensorShape(self._input_shape[:-1]
+ [self._total_output_channels])
@ -2059,11 +2116,8 @@ class Conv3DLSTMCell(ConvLSTMCell):
"""Construct Conv3DLSTM. See `ConvLSTMCell` for more details."""
super(Conv3DLSTMCell, self).__init__(conv_ndims=3, **kwargs)
def _conv(args,
filter_size,
num_features,
bias,
bias_start=0.0):
def _conv(args, filter_size, num_features, bias, bias_start=0.0):
"""convolution:
Args:
args: a Tensor or a list of Tensors of dimension 3D, 4D or 5D,
@ -2306,3 +2360,273 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
new_state = rnn_cell_impl.LSTMStateTuple(c, m)
return m, new_state
class LayerNormLSTMCell(rnn_cell_impl.RNNCell):
"""Long short-term memory unit (LSTM) recurrent network cell.
The default non-peephole implementation is based on:
http://www.bioinf.jku.at/publications/older/2604.pdf
S. Hochreiter and J. Schmidhuber.
"Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997.
The peephole implementation is based on:
https://research.google.com/pubs/archive/43905.pdf
Hasim Sak, Andrew Senior, and Francoise Beaufays.
"Long short-term memory recurrent neural network architectures for
large scale acoustic modeling." INTERSPEECH, 2014.
The class uses optional peep-hole connections, optional cell clipping, and
an optional projection layer.
Layer normalization implementation is based on:
https://arxiv.org/abs/1607.06450.
"Layer Normalization"
Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton
and is applied before the internal nonlinearities.
"""
def __init__(self,
num_units,
use_peepholes=False,
cell_clip=None,
initializer=None,
num_proj=None,
proj_clip=None,
forget_bias=1.0,
activation=None,
layer_norm=False,
norm_gain=1.0,
norm_shift=0.0,
reuse=None):
"""Initialize the parameters for an LSTM cell.
Args:
num_units: int, The number of units in the LSTM cell
use_peepholes: bool, set True to enable diagonal/peephole connections.
cell_clip: (optional) A float value, if provided the cell state is clipped
by this value prior to the cell output activation.
initializer: (optional) The initializer to use for the weight and
projection matrices.
num_proj: (optional) int, The output dimensionality for the projection
matrices. If None, no projection is performed.
proj_clip: (optional) A float value. If `num_proj > 0` and `proj_clip` is
provided, then the projected values are clipped elementwise to within
`[-proj_clip, proj_clip]`.
forget_bias: Biases of the forget gate are initialized by default to 1
in order to reduce the scale of forgetting at the beginning of
the training. Must set it manually to `0.0` when restoring from
CudnnLSTM trained checkpoints.
activation: Activation function of the inner states. Default: `tanh`.
layer_norm: If `True`, layer normalization will be applied.
norm_gain: float, The layer normalization gain initial value. If
`layer_norm` has been set to `False`, this argument will be ignored.
norm_shift: float, The layer normalization shift initial value. If
`layer_norm` has been set to `False`, this argument will be ignored.
reuse: (optional) Python boolean describing whether to reuse variables
in an existing scope. If not `True`, and the existing scope already has
the given variables, an error is raised.
When restoring from CudnnLSTM-trained checkpoints, must use
CudnnCompatibleLSTMCell instead.
"""
super(LayerNormLSTMCell, self).__init__(_reuse=reuse)
self._num_units = num_units
self._use_peepholes = use_peepholes
self._cell_clip = cell_clip
self._initializer = initializer
self._num_proj = num_proj
self._proj_clip = proj_clip
self._forget_bias = forget_bias
self._activation = activation or math_ops.tanh
self._layer_norm = layer_norm
self._norm_gain = norm_gain
self._norm_shift = norm_shift
if num_proj:
self._state_size = (rnn_cell_impl.LSTMStateTuple(num_units, num_proj))
self._output_size = num_proj
else:
self._state_size = (rnn_cell_impl.LSTMStateTuple(num_units, num_units))
self._output_size = num_units
@property
def state_size(self):
return self._state_size
@property
def output_size(self):
return self._output_size
def _linear(self,
args,
output_size,
bias,
bias_initializer=None,
kernel_initializer=None,
layer_norm=False):
"""Linear map: sum_i(args[i] * W[i]), where W[i] is a Variable.
Args:
args: a 2D Tensor or a list of 2D, batch x n, Tensors.
output_size: int, second dimension of W[i].
bias: boolean, whether to add a bias term or not.
bias_initializer: starting value to initialize the bias
(default is all zeros).
kernel_initializer: starting value to initialize the weight.
layer_norm: boolean, whether to apply layer normalization.
Returns:
A 2D Tensor with shape [batch x output_size] taking value
sum_i(args[i] * W[i]), where each W[i] is a newly created Variable.
Raises:
      ValueError: if one of the arguments has an unspecified or wrong shape.
"""
if args is None or (nest.is_sequence(args) and not args):
raise ValueError("`args` must be specified")
if not nest.is_sequence(args):
args = [args]
# Calculate the total size of arguments on dimension 1.
total_arg_size = 0
shapes = [a.get_shape() for a in args]
for shape in shapes:
if shape.ndims != 2:
raise ValueError("linear is expecting 2D arguments: %s" % shapes)
if shape[1].value is None:
raise ValueError("linear expects shape[1] to be provided for shape %s, "
"but saw %s" % (shape, shape[1]))
else:
total_arg_size += shape[1].value
dtype = [a.dtype for a in args][0]
# Now the computation.
scope = vs.get_variable_scope()
with vs.variable_scope(scope) as outer_scope:
weights = vs.get_variable(
"kernel", [total_arg_size, output_size],
dtype=dtype,
initializer=kernel_initializer)
if len(args) == 1:
res = math_ops.matmul(args[0], weights)
else:
res = math_ops.matmul(array_ops.concat(args, 1), weights)
if not bias:
return res
with vs.variable_scope(outer_scope) as inner_scope:
inner_scope.set_partitioner(None)
if bias_initializer is None:
bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
biases = vs.get_variable(
"bias", [output_size], dtype=dtype, initializer=bias_initializer)
if not layer_norm:
res = nn_ops.bias_add(res, biases)
return res
def call(self, inputs, state):
"""Run one step of LSTM.
Args:
inputs: input Tensor, 2D, batch x num_units.
state: this must be a tuple of state Tensors,
both `2-D`, with column sizes `c_state` and
`m_state`.
Returns:
A tuple containing:
- A `2-D, [batch x output_dim]`, Tensor representing the output of the
LSTM after reading `inputs` when previous state was `state`.
Here output_dim is:
num_proj if num_proj was set,
num_units otherwise.
- Tensor(s) representing the new state of LSTM after reading `inputs` when
the previous state was `state`. Same type and shape(s) as `state`.
Raises:
ValueError: If input size cannot be inferred from inputs via
static shape inference.
"""
sigmoid = math_ops.sigmoid
(c_prev, m_prev) = state
dtype = inputs.dtype
input_size = inputs.get_shape().with_rank(2)[1]
if input_size.value is None:
raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
scope = vs.get_variable_scope()
with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
# i = input_gate, j = new_input, f = forget_gate, o = output_gate
lstm_matrix = self._linear(
[inputs, m_prev],
4 * self._num_units,
bias=True,
bias_initializer=None,
layer_norm=self._layer_norm)
i, j, f, o = array_ops.split(
value=lstm_matrix, num_or_size_splits=4, axis=1)
if self._layer_norm:
i = _norm(self._norm_gain, self._norm_shift, i, "input")
j = _norm(self._norm_gain, self._norm_shift, j, "transform")
f = _norm(self._norm_gain, self._norm_shift, f, "forget")
o = _norm(self._norm_gain, self._norm_shift, o, "output")
# Diagonal connections
if self._use_peepholes:
with vs.variable_scope(unit_scope):
w_f_diag = vs.get_variable(
"w_f_diag", shape=[self._num_units], dtype=dtype)
w_i_diag = vs.get_variable(
"w_i_diag", shape=[self._num_units], dtype=dtype)
w_o_diag = vs.get_variable(
"w_o_diag", shape=[self._num_units], dtype=dtype)
if self._use_peepholes:
c = (
sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
sigmoid(i + w_i_diag * c_prev) * self._activation(j))
else:
c = (
sigmoid(f + self._forget_bias) * c_prev +
sigmoid(i) * self._activation(j))
if self._layer_norm:
c = _norm(self._norm_gain, self._norm_shift, c, "state")
if self._cell_clip is not None:
# pylint: disable=invalid-unary-operand-type
c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
# pylint: enable=invalid-unary-operand-type
if self._use_peepholes:
m = sigmoid(o + w_o_diag * c) * self._activation(c)
else:
m = sigmoid(o) * self._activation(c)
if self._num_proj is not None:
with vs.variable_scope("projection"):
m = self._linear(m, self._num_proj, bias=False)
if self._proj_clip is not None:
# pylint: disable=invalid-unary-operand-type
m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
# pylint: enable=invalid-unary-operand-type
new_state = (rnn_cell_impl.LSTMStateTuple(c, m))
return m, new_state
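
# A minimal usage sketch for the LayerNormLSTMCell defined above (illustrative
# only; it assumes a surrounding graph and variable initialization, as in the
# unit tests earlier in this change):
#
#   cell = LayerNormLSTMCell(num_units=2, num_proj=3, layer_norm=True,
#                            norm_gain=1.0, norm_shift=0.0)
#   inputs = array_ops.zeros([1, 4])
#   state = cell.zero_state(batch_size=1, dtype=dtypes.float32)
#   outputs, new_state = cell(inputs, state)  # outputs shape: [1, num_proj]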

View File

@ -149,7 +149,7 @@ class _BaseAttentionMechanism(AttentionMechanism):
memory_sequence_length=None,
memory_layer=None,
check_inner_dims_defined=True,
score_mask_value=float("-inf"),
score_mask_value=None,
name=None):
"""Construct base AttentionMechanism class.
@ -187,9 +187,13 @@ class _BaseAttentionMechanism(AttentionMechanism):
"memory_layer is not a Layer: %s" % type(memory_layer).__name__)
self._query_layer = query_layer
self._memory_layer = memory_layer
self.dtype = memory_layer.dtype
if not callable(probability_fn):
raise TypeError("probability_fn must be callable, saw type: %s" %
type(probability_fn).__name__)
if score_mask_value is None:
score_mask_value = dtypes.as_dtype(
self._memory_layer.dtype).as_numpy_dtype(-np.inf)
self._probability_fn = lambda score, prev: ( # pylint:disable=g-long-lambda
probability_fn(
_maybe_mask_score(score, memory_sequence_length, score_mask_value),
@ -334,7 +338,8 @@ class LuongAttention(_BaseAttentionMechanism):
memory_sequence_length=None,
scale=False,
probability_fn=None,
score_mask_value=float("-inf"),
score_mask_value=None,
dtype=None,
name="LuongAttention"):
"""Construct the AttentionMechanism mechanism.
@ -353,17 +358,20 @@ class LuongAttention(_BaseAttentionMechanism):
score_mask_value: (optional) The mask value for score before passing into
`probability_fn`. The default is -inf. Only used if
`memory_sequence_length` is not None.
dtype: The data type for the memory layer of the attention mechanism.
name: Name to use when creating ops.
"""
# For LuongAttention, we only transform the memory layer; thus
# num_units **must** match expected the query depth.
if probability_fn is None:
probability_fn = nn_ops.softmax
if dtype is None:
dtype = dtypes.float32
wrapped_probability_fn = lambda score, _: probability_fn(score)
super(LuongAttention, self).__init__(
query_layer=None,
memory_layer=layers_core.Dense(
num_units, name="memory_layer", use_bias=False),
num_units, name="memory_layer", use_bias=False, dtype=dtype),
memory=memory,
probability_fn=wrapped_probability_fn,
memory_sequence_length=memory_sequence_length,
@ -475,7 +483,8 @@ class BahdanauAttention(_BaseAttentionMechanism):
memory_sequence_length=None,
normalize=False,
probability_fn=None,
score_mask_value=float("-inf"),
score_mask_value=None,
dtype=None,
name="BahdanauAttention"):
"""Construct the Attention mechanism.
@ -494,16 +503,20 @@ class BahdanauAttention(_BaseAttentionMechanism):
score_mask_value: (optional): The mask value for score before passing into
`probability_fn`. The default is -inf. Only used if
`memory_sequence_length` is not None.
dtype: The data type for the query and memory layers of the attention
mechanism.
name: Name to use when creating ops.
"""
if probability_fn is None:
probability_fn = nn_ops.softmax
if dtype is None:
dtype = dtypes.float32
wrapped_probability_fn = lambda score, _: probability_fn(score)
super(BahdanauAttention, self).__init__(
query_layer=layers_core.Dense(
num_units, name="query_layer", use_bias=False),
num_units, name="query_layer", use_bias=False, dtype=dtype),
memory_layer=layers_core.Dense(
num_units, name="memory_layer", use_bias=False),
num_units, name="memory_layer", use_bias=False, dtype=dtype),
memory=memory,
probability_fn=wrapped_probability_fn,
memory_sequence_length=memory_sequence_length,
@ -738,11 +751,12 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
memory,
memory_sequence_length=None,
normalize=False,
score_mask_value=float("-inf"),
score_mask_value=None,
sigmoid_noise=0.,
sigmoid_noise_seed=None,
score_bias_init=0.,
mode="parallel",
dtype=None,
name="BahdanauMonotonicAttention"):
"""Construct the Attention mechanism.
@ -766,17 +780,21 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
mode: How to compute the attention distribution. Must be one of
'recursive', 'parallel', or 'hard'. See the docstring for
`tf.contrib.seq2seq.monotonic_attention` for more information.
dtype: The data type for the query and memory layers of the attention
mechanism.
name: Name to use when creating ops.
"""
# Set up the monotonic probability fn with supplied parameters
if dtype is None:
dtype = dtypes.float32
wrapped_probability_fn = functools.partial(
_monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
seed=sigmoid_noise_seed)
super(BahdanauMonotonicAttention, self).__init__(
query_layer=layers_core.Dense(
num_units, name="query_layer", use_bias=False),
num_units, name="query_layer", use_bias=False, dtype=dtype),
memory_layer=layers_core.Dense(
num_units, name="memory_layer", use_bias=False),
num_units, name="memory_layer", use_bias=False, dtype=dtype),
memory=memory,
probability_fn=wrapped_probability_fn,
memory_sequence_length=memory_sequence_length,
@ -834,11 +852,12 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
memory,
memory_sequence_length=None,
scale=False,
score_mask_value=float("-inf"),
score_mask_value=None,
sigmoid_noise=0.,
sigmoid_noise_seed=None,
score_bias_init=0.,
mode="parallel",
dtype=None,
name="LuongMonotonicAttention"):
"""Construct the Attention mechanism.
@ -862,17 +881,21 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
mode: How to compute the attention distribution. Must be one of
'recursive', 'parallel', or 'hard'. See the docstring for
`tf.contrib.seq2seq.monotonic_attention` for more information.
dtype: The data type for the query and memory layers of the attention
mechanism.
name: Name to use when creating ops.
"""
# Set up the monotonic probability fn with supplied parameters
if dtype is None:
dtype = dtypes.float32
wrapped_probability_fn = functools.partial(
_monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
seed=sigmoid_noise_seed)
super(LuongMonotonicAttention, self).__init__(
query_layer=layers_core.Dense(
num_units, name="query_layer", use_bias=False),
num_units, name="query_layer", use_bias=False, dtype=dtype),
memory_layer=layers_core.Dense(
num_units, name="memory_layer", use_bias=False),
num_units, name="memory_layer", use_bias=False, dtype=dtype),
memory=memory,
probability_fn=wrapped_probability_fn,
memory_sequence_length=memory_sequence_length,
@ -1123,8 +1146,11 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
% (len(attention_layer_sizes), len(attention_mechanisms)))
self._attention_layers = tuple(
layers_core.Dense(
attention_layer_size, name="attention_layer", use_bias=False)
for attention_layer_size in attention_layer_sizes)
attention_layer_size,
name="attention_layer",
use_bias=False,
dtype=attention_mechanisms[i].dtype)
for i, attention_layer_size in enumerate(attention_layer_sizes))
self._attention_layer_size = sum(attention_layer_sizes)
else:
self._attention_layers = None

View File

@ -237,7 +237,7 @@ One way to reduce this code duplication would be via a `for` loop:
```python
net = ...
for i in range(3):
net = slim.conv2d(net, 256, [3, 3], scope='conv3_' % (i+1))
net = slim.conv2d(net, 256, [3, 3], scope='conv3_%d' % (i+1))
net = slim.max_pool2d(net, [2, 2], scope='pool2')
```

View File

@ -386,7 +386,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
inputs, None, is_training=False, global_pool=False)
sess.run(variables.global_variables_initializer())
self.assertAllClose(
output.eval(), expected.eval(), atol=1e-4, rtol=1e-4)
output.eval(), expected.eval(), atol=2e-4, rtol=1e-4)
def testUnknownBatchSize(self):
batch = 2

View File

@ -1,4 +1,4 @@
## How to compile and use RDMA-enabled TensorFlow
## How to compile, use and configure RDMA-enabled TensorFlow
1. Follow the regular TF compilation instructions. During the configure step, if you want ibverbs-based RDMA support, answer yes to this question:
```Do you wish to build TensorFlow with VERBS-RDMA support [y/N]```
@ -7,6 +7,18 @@
```server = tf.train.Server(cluster, job_name="local", task_index=0, protocol='grpc+verbs') # default protocol is 'grpc'```
3. RDMA configuration is done by setting the following environment variables (see the example after this list):
* **RDMA_DEVICE**: The RDMA device name to be used. If not defined by user, a default device with an active port will be set if exists.
* **RDMA_DEVICE_PORT**: The port within the selected device. Not relevant if RDMA_DEVICE is not defined. If not defined by user, a default active port will be set if exists.
   * **RDMA_GID_INDEX**: The GID index of the port. If not defined by user, a suitable default GID index will be set (RoCE v2 is favored as the default).
* **RDMA_QP_PKEY_INDEX**: The Pkey for the QP. If not defined by user, the default value is 0.
* **RDMA_QP_QUEUE_DEPTH**: TX/RX queue size for the QP. If not defined by user, the default value is 1024.
* **RDMA_QP_TIMEOUT**: The retransmission timeout for QPs. If not defined by user, the default value is 14.
* **RDMA_QP_RETRY_COUNT**: Number of retransmission for QPs. If not defined by user, the default value is 7.
* **RDMA_QP_SL**: Service level configuration for QOS and ECN, valid values are 0-7. If not defined by user, the default value is 0.
* **RDMA_QP_MTU**: MTU configuration for the QPs. If not defined by user, the default value is active MTU from query_port.
* **RDMA_TRAFFIC_CLASS**: Traffic class configuration for QP, in case of DSCP trust level QoS configuration. If not defined by user, the default value is 0. For more info see [HowTo Configure Trust state on Mellanox Adapters](https://community.mellanox.com/docs/DOC-2866).
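
For example, here is a minimal sketch of launching a verbs-enabled server with a few of these variables set (the device name, port, and GID index below are placeholders for your cluster):

```python
import os
import tensorflow as tf

# Placeholder values; substitute the RDMA device, port and GID index
# available on your machine.
os.environ["RDMA_DEVICE"] = "mlx5_0"
os.environ["RDMA_DEVICE_PORT"] = "1"
os.environ["RDMA_GID_INDEX"] = "3"

# The variables are read when the verbs server is created.
cluster = tf.train.ClusterSpec({"local": ["localhost:2222", "localhost:2223"]})
server = tf.train.Server(cluster, job_name="local", task_index=0,
                         protocol="grpc+verbs")
```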
## Overview
The design is based on TensorFlow r1.0. An RDMA path is added between servers for tensor transfer (weights, gradients, etc). The existing GRPC path remains and is responsible for "administrative" tasks, such as setting up the RDMA path, exchanging computation graphs, etc.

View File

@ -16,6 +16,7 @@ limitations under the License.
#ifdef TENSORFLOW_USE_VERBS
#include "tensorflow/contrib/verbs/rdma.h"
#include <fcntl.h>
#include <cstdlib>
#include "tensorflow/contrib/verbs/verbs_util.h"
#include "tensorflow/core/common_runtime/device_mgr.h"
@ -33,6 +34,8 @@ limitations under the License.
namespace tensorflow {
#define RoCE_V2 "RoCE v2"
namespace {
// hash name to 32-bit integer
uint32_t NameHash(const string& name) {
@ -66,18 +69,338 @@ string MessageTypeToString(RdmaMessageType rmt) {
}
} // namespace
ibv_context* open_default_device() {
ibv_device** dev_list;
ibv_device* ib_dev;
dev_list = ibv_get_device_list(NULL);
CHECK(dev_list) << "No InfiniBand device found";
ib_dev = dev_list[0];
CHECK(ib_dev) << "No InfiniBand device found";
ibv_context* context = ibv_open_device(ib_dev);
CHECK(context) << "Open context failed for " << ibv_get_device_name(ib_dev);
// Function to get environment variable
// Args:
// var_name - the name of the environment variable
// Returns:
// string with its value, or an empty string if not set
string get_env_var(char const* var_name) {
char const* var_temp = getenv(var_name);
return (var_temp == NULL) ? string() : string(var_temp);
}
// Function to open device
// Args:
// ibv_dev device to open
// Returns:
// context of the opened device
ibv_context* open_device(ibv_device* ibv_dev) {
ibv_context* context = ibv_open_device(ibv_dev);
CHECK(context) << "Open context failed for " << ibv_get_device_name(ibv_dev);
return context;
}
// Function to count the number of active ports for device
// Args:
// device - the device whose active ports to count
// Returns:
// number of active ports of the given device
int get_dev_active_port_count(ibv_device* device) {
ibv_device_attr device_att;
ibv_port_attr port_attr;
ibv_context* context = NULL;
int rc, port_index, active_ports = 0;
context = ibv_open_device(device);
CHECK(context) << "Open context failed for " << ibv_get_device_name(device);
rc = ibv_query_device(context, &device_att);
CHECK(!rc) << "Failed to query the device";
for (port_index = 1; port_index <= device_att.phys_port_cnt; port_index++) {
rc = ibv_query_port(context, port_index, &port_attr);
CHECK(!rc) << "Failed to query the port" << port_index;
if (port_attr.state == IBV_PORT_ACTIVE) {
active_ports++;
}
}
ibv_close_device(context);
return active_ports;
}
// Function to set the device. If RDMA_DEVICE is not set, searches for a
// device with an active port.
// Fails if more than one device with an active port is found.
// Returns:
// device to use
ibv_device* set_device() {
ibv_device** dev_list;
int dev_num, device_index, device_to_open = 0;
int num_devs_with_active_port = 0;
string env_p_rdma_device, str_port_num;
dev_list = ibv_get_device_list(&dev_num);
CHECK(dev_list) << "No InfiniBand device found";
env_p_rdma_device = get_env_var("RDMA_DEVICE");
if (!env_p_rdma_device.empty()) {
for (device_index = 0; device_index < dev_num; device_index++) {
if (!env_p_rdma_device.compare(
ibv_get_device_name(dev_list[device_index]))) {
CHECK(get_dev_active_port_count(dev_list[device_index]) != 0)
<< "Device " << ibv_get_device_name(dev_list[device_index])
<< " has no active ports";
return dev_list[device_index];
}
}
// check validity of input device
CHECK(false) << "The device " << env_p_rdma_device << " wasn't found";
} else {
// set default device
str_port_num = get_env_var("RDMA_DEVICE_PORT");
CHECK(str_port_num.empty())
<< "RDMA_DEVICE should be provided if RDMA_DEVICE_PORT is set by user";
for (device_index = 0; device_index < dev_num; device_index++) {
// get port_num
if (get_dev_active_port_count(dev_list[device_index]) > 0) {
num_devs_with_active_port++;
CHECK(num_devs_with_active_port <= 1) << ". More than one device with "
"active port in the system. "
"Please enter RDMA_DEVICE";
// found device with at least 1 active port
device_to_open = device_index;
}
}
CHECK(num_devs_with_active_port > 0)
<< "There is no active port in the system";
return dev_list[device_to_open];
}
CHECK(false) << "No device was set!";
return NULL; // never happens
}
// Function to set port for device.
// If RDMA_DEVICE_PORT not set, first active port of the device will be set.
// Args:
// context of the device
// Returns:
// port to use
uint8_t set_port(ibv_context* context) {
uint8_t port_num = 0; // 0 is illegal port number
string str_port_num;
ibv_device_attr device_att;
ibv_port_attr port_attr;
int rc, port_index;
rc = ibv_query_device(context, &device_att);
CHECK(!rc) << "Failed to query the device\n";
str_port_num = get_env_var("RDMA_DEVICE_PORT");
// user defined port
if (!str_port_num.empty()) {
port_num = stoi(str_port_num);
CHECK(port_num > 0) << "RDMA_DEVICE_PORT should be positive";
    CHECK(port_num <= device_att.phys_port_cnt)
        << "RDMA_DEVICE_PORT should be less than or equal to the number of "
           "available ports";
rc = ibv_query_port(context, port_num, &port_attr);
CHECK(!rc) << "Failed to query the port" << port_num;
    // check if the port is active
CHECK(port_attr.state == IBV_PORT_ACTIVE)
<< "Selected RDMA_DEVICE_PORT is not active";
} else { // set default port
for (port_index = 1; port_index <= device_att.phys_port_cnt; port_index++) {
rc = ibv_query_port(context, port_index, &port_attr);
CHECK(!rc) << "Failed to query the port" << port_index;
if (port_attr.state == IBV_PORT_ACTIVE) {
port_num = port_index;
break;
}
}
CHECK_GT(port_num, 0) << "No active ports";
}
return port_num;
}
// Function to read from a sysfs file
// Args:
// dir - directory
// file - file
// buff - buffer for the result
// size - buffer size
// Returns:
// number of bytes read, or -1 on failure
int read_sysfs_file(const char* dir, const char* file, char* buf, size_t size) {
char* path;
int fd;
int len;
if (asprintf(&path, "%s/%s", dir, file) < 0) return -1;
fd = open(path, O_RDONLY);
if (fd < 0) {
free(path);
return -1;
}
len = read(fd, buf, size);
close(fd);
free(path);
if (len > 0 && buf[len - 1] == '\n') buf[--len] = '\0';
return len;
}
// Function to check if a GID index supports RoCE V2
// Args:
// context - device context
// port_num - port number
// index - GID index
// Returns:
// true if the GID supports RoCE V2, false otherwise.
bool is_gid_type_roce_v2(ibv_context* context, uint8_t port_num,
uint8_t index) {
char name[32];
char buff[41];
snprintf(name, sizeof(name), "ports/%d/gid_attrs/types/%d", port_num, index);
if (read_sysfs_file(context->device->ibdev_path, name, buff, sizeof(buff)) <=
0) {
return false;
}
return !strcmp(buff, RoCE_V2);
}
// Function to set GID index.
// If the port link is IB, no GID index should be selected.
// If Ethernet and RDMA_GID_INDEX is not set, a GID index that supports
// RoCE V2 will be chosen (fails if more than one IP is configured).
// Args:
// context - device context
// port_num - port number
// Returns:
// GID index to use
uint8_t set_gid(uint8_t port_num, ibv_context* context) {
ibv_port_attr port_attr;
string gid_str;
int rc, i, gids_num = 0, v2_ip_num = 0;
union ibv_gid gid;
uint8_t gid_index = 0;
rc = ibv_query_port(context, port_num, &port_attr);
  CHECK(!rc) << "Failed to query the port " << port_num;
for (i = 0; i < port_attr.gid_tbl_len; i++) {
rc = ibv_query_gid(context, port_num, i, &gid);
CHECK(!rc) << "Failed to query gid to port " << (int)port_num << " index "
<< i;
if (gid.global.interface_id) {
gids_num++;
if (gid.global.subnet_prefix == 0 &&
is_gid_type_roce_v2(context, port_num, i)) {
if (v2_ip_num == 0) {
// can be overwritten by RDMA_GID_INDEX later
gid_index = i;
}
v2_ip_num++;
}
}
}
switch (port_attr.link_layer) {
case (IBV_LINK_LAYER_ETHERNET):
gid_str = get_env_var("RDMA_GID_INDEX");
if (!gid_str.empty()) {
gid_index = stoi(gid_str);
CHECK(gid_index < gids_num)
<< "RDMA_GID_INDEX should be less than GIDs amount" << gids_num;
} else {
CHECK(v2_ip_num <= 1)
<< "More than one IP is available, please specify GID_INDEX";
}
break;
case (IBV_LINK_LAYER_INFINIBAND): // no need in GID index
break;
default:
LOG(INFO) << "Unknown port link layer. Currently supporting Ethernet and "
"InfiniBand only. ";
}
if (!is_gid_type_roce_v2(context, port_num, gid_index)) {
LOG(INFO) << "RoCE v2 is not configured for GID_INDEX " << (int)gid_index;
}
return gid_index;
}
// Set the configuration parameter from the environment variable if it is
// defined, otherwise use the default value.
// Args:
// default_val - the default value for this parameter
// env_param - the name of the environment variable
// Returns:
// 32-bit value
uint32_t set_param(uint32_t default_val, const char* env_param) {
uint32_t val = default_val;
string val_s;
val_s = get_env_var(env_param);
if (!val_s.empty()) {
val = stoi(val_s);
}
return val;
}
enum ibv_mtu set_mtu(uint8_t port_num, ibv_context* context) {
ibv_port_attr port_attr;
enum ibv_mtu mtu;
string mtu_s;
int rc, mtu_i;
rc = ibv_query_port(context, port_num, &port_attr);
  CHECK(!rc) << "Failed to query the port " << port_num;
mtu_s = get_env_var("RDMA_MTU");
if (!mtu_s.empty()) {
mtu_i = stoi(mtu_s);
switch (mtu_i) {
case 256:
mtu = IBV_MTU_256;
break;
case 512:
mtu = IBV_MTU_512;
break;
case 1024:
mtu = IBV_MTU_1024;
break;
case 2048:
mtu = IBV_MTU_2048;
break;
case 4096:
mtu = IBV_MTU_4096;
break;
default:
CHECK(0) << "Error: MTU input value must be one of the following: 256, "
"512, 1024, 2048, 4096. MTU "
                 << mtu_i << " is invalid\n";
break;
}
CHECK(mtu < port_attr.active_mtu)
<< "MTU configuration for the QPs is larger than active MTU";
} else {
mtu = port_attr.active_mtu;
}
return mtu;
}
RdmaParams params_init(ibv_context* context) {
RdmaParams params;
params.port_num = set_port(context);
params.sgid_index = set_gid(params.port_num, context);
params.pkey_index = (uint8_t)set_param(PKEY_DEFAULT, "RDMA_PKEY");
params.queue_depth = set_param(QUEUE_DEPTH_DEFAULT, "RDMA_QUEUE_DEPTH");
params.timeout = (uint8_t)set_param(TIMEOUT_DEFAULT, "RDMA_TIMEOUT");
params.retry_cnt = (uint8_t)set_param(RETRY_CNT_DEFAULT, "RDMA_RETRY_CNT");
params.sl = (uint8_t)set_param(SL_DEFAULT, "RDMA_SL");
CHECK(params.sl <= 7) << "SL value is " << (int)params.sl
<< ". Valid values are 0-7.";
params.mtu = set_mtu(params.port_num, context);
params.traffic_class = set_param(TRAFFIC_CLASS, "RDMA_TRAFFIC_CLASS");
return params;
}
ibv_pd* alloc_protection_domain(ibv_context* context) {
ibv_pd* pd = ibv_alloc_pd(context);
CHECK(pd) << "Failed to allocate protection domain";
@ -85,7 +408,8 @@ ibv_pd* alloc_protection_domain(ibv_context* context) {
}
RdmaAdapter::RdmaAdapter(const WorkerEnv* worker_env)
: context_(open_default_device()),
: context_(open_device(set_device())),
params_(params_init(context_)),
pd_(alloc_protection_domain(context_)),
worker_env_(worker_env) {
event_channel_ = ibv_create_comp_channel(context_);
@ -242,8 +566,8 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
memset(&attr, 0, sizeof(ibv_qp_init_attr));
attr.send_cq = adapter_->cq_;
attr.recv_cq = adapter_->cq_;
attr.cap.max_send_wr = RdmaAdapter::MAX_CONCURRENT_WRITES;
attr.cap.max_recv_wr = RdmaAdapter::MAX_CONCURRENT_WRITES;
attr.cap.max_send_wr = adapter_->params_.queue_depth;
attr.cap.max_recv_wr = adapter_->params_.queue_depth;
attr.cap.max_send_sge = 1;
attr.cap.max_recv_sge = 1;
attr.qp_type = IBV_QPT_RC;
@ -257,8 +581,8 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
struct ibv_qp_attr attr;
memset(&attr, 0, sizeof(ibv_qp_attr));
attr.qp_state = IBV_QPS_INIT;
attr.pkey_index = 0;
attr.port_num = 1;
attr.pkey_index = adapter_->params_.pkey_index;
attr.port_num = adapter_->params_.port_num;
attr.qp_access_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
int mask =
@ -269,13 +593,15 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
// Local address
{
struct ibv_port_attr attr;
CHECK(!ibv_query_port(adapter_->context_, (uint8_t)1, &attr))
CHECK(
!ibv_query_port(adapter_->context_, adapter_->params_.port_num, &attr))
<< "Query port";
self_.lid = attr.lid;
self_.qpn = qp_->qp_num;
self_.psn = static_cast<uint32_t>(random::New64()) & 0xffffff;
union ibv_gid gid;
CHECK(!ibv_query_gid(adapter_->context_, (uint8_t)1, 0, &gid))
CHECK(!ibv_query_gid(adapter_->context_, adapter_->params_.port_num,
adapter_->params_.sgid_index, &gid))
<< "Query gid";
self_.snp = gid.global.subnet_prefix;
self_.iid = gid.global.interface_id;
@ -479,11 +805,9 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) {
struct ibv_qp_attr attr;
memset(&attr, 0, sizeof(ibv_qp_attr));
attr.qp_state = IBV_QPS_RTR;
struct ibv_port_attr port_attr;
CHECK(!ibv_query_port(adapter_->context_, (uint8_t)1, &port_attr))
<< "Query port failed";
// This assumes both QP's ports are configured with the same MTU
attr.path_mtu = port_attr.active_mtu;
attr.path_mtu = adapter_->params_.mtu;
attr.dest_qp_num = remoteAddr.qpn;
attr.rq_psn = remoteAddr.psn;
attr.max_dest_rd_atomic = 1;
@ -494,9 +818,11 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) {
attr.ah_attr.grh.flow_label = 0;
attr.ah_attr.grh.hop_limit = 255;
attr.ah_attr.dlid = remoteAddr.lid;
attr.ah_attr.sl = 0;
attr.ah_attr.sl = adapter_->params_.sl;
attr.ah_attr.src_path_bits = 0;
attr.ah_attr.port_num = 1;
attr.ah_attr.port_num = adapter_->params_.port_num;
attr.ah_attr.grh.sgid_index = adapter_->params_.sgid_index;
attr.ah_attr.grh.traffic_class = adapter_->params_.traffic_class;
int r;
CHECK(!(r = ibv_modify_qp(qp_, &attr,
@ -509,8 +835,8 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) {
memset(&attr, 0, sizeof(ibv_qp_attr));
attr.qp_state = IBV_QPS_RTS;
attr.sq_psn = self_.psn;
attr.timeout = 14;
attr.retry_cnt = 7;
attr.timeout = adapter_->params_.timeout;
attr.retry_cnt = adapter_->params_.retry_cnt;
attr.rnr_retry = 7; /* infinite */
attr.max_rd_atomic = 1;

View File

@ -36,7 +36,24 @@ limitations under the License.
#include "tensorflow/core/platform/mutex.h"
namespace tensorflow {
#define PKEY_DEFAULT 0
#define QUEUE_DEPTH_DEFAULT 1024
#define TIMEOUT_DEFAULT 14
#define RETRY_CNT_DEFAULT 7
#define SL_DEFAULT 0
#define TRAFFIC_CLASS 0
struct RdmaParams {
uint8_t port_num;
uint8_t sgid_index;
uint8_t pkey_index;
uint32_t queue_depth;
uint8_t timeout;
uint8_t retry_cnt;
uint8_t sl;
enum ibv_mtu mtu;
uint8_t traffic_class;
};
// structure to save the address of remote channels.
struct RdmaAddress {
uint32_t lid;
@ -84,6 +101,8 @@ class RdmaAdapter {
protected:
static const int MAX_CONCURRENT_WRITES = 1000;
ibv_context* context_;
// RDMA configuration parameters
RdmaParams params_;
// ibverbs protection domain
ibv_pd* pd_;
// Completion event channel, to wait for work completions

View File

@ -2710,6 +2710,7 @@ tf_cc_test_mkl(
srcs = [
"graph/mkl_layout_pass_test.cc",
"graph/mkl_tfconversion_pass_test.cc",
"util/mkl_util_test.cc",
],
linkstatic = 1,
deps = [

View File

@ -0,0 +1,47 @@
op {
graph_op_name: "UniqueV2"
in_arg {
name: "x"
description: <<END
A `Tensor`.
END
}
in_arg {
name: "axis"
description: <<END
A `Tensor` of type `int64` (default: 0). The axis of the Tensor along
which to find the unique elements.
END
}
out_arg {
name: "y"
description: <<END
A `Tensor`. Unique elements along the `axis` of `Tensor` x.
END
}
out_arg {
name: "idx"
description: <<END
A 1-D Tensor that contains the index of each value of x in the
output y.
END
}
summary: "Finds unique elements in a 1-D tensor."
description: <<END
This operation returns a tensor `y` containing all of the unique elements of `x`
sorted in the same order that they occur in `x`. This operation also returns a
tensor `idx` the same size as `x` that contains the index of each value of `x`
in the unique output `y`. In other words:
`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
For example:
```
# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
y, idx = unique(x)
y ==> [1, 2, 4, 7, 8]
idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
```
END
}

View File

@ -26,6 +26,8 @@ need not be sorted and need not cover all values in the full
range of valid values.
If the sum is empty for a given segment ID `i`, `output[i] = 0`.
If the given segment ID `i` is negative, the value is dropped and will not be
added to the sum of the segment.
`num_segments` should equal the number of distinct segment IDs.
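
For example, an illustrative sketch (assuming the Python API name
`tf.unsorted_segment_sum`):

```
# 'data' is [1, 2, 3, 4]
# 'segment_ids' is [0, -1, 0, 1]; the value at the negative ID is dropped.
tf.unsorted_segment_sum(data, segment_ids, num_segments=2)
  ==> [4, 4]  # output[0] = 1 + 3, output[1] = 4
```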

View File

@ -81,7 +81,7 @@ class MklCPUAllocator : public Allocator {
}
#if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
if (user_val > max_mem_bytes) {
LOG(WARNING) << "The user specifed a memory limit " << kMaxLimitStr
LOG(WARNING) << "The user specified a memory limit " << kMaxLimitStr
<< "=" << user_val
<< " greater than available physical memory: "
<< max_mem_bytes

View File

@ -46,8 +46,8 @@ class GSYCLInterface {
if (!found_device) {
// Currently Intel GPU is not supported
LOG(WARNING) << "No OpenCL GPU found that is supported by ComputeCpp, "
"trying OpenCL CPU";
LOG(WARNING) << "No OpenCL GPU found that is supported by "
<< "ComputeCpp/triSYCL, trying OpenCL CPU";
}
for (const auto& device : device_list) {
@ -58,10 +58,24 @@ class GSYCLInterface {
}
}
if (!found_device) {
LOG(WARNING) << "No OpenCL CPU found that is supported by "
<< "ComputeCpp/triSYCL, checking for host sycl device";
}
for (const auto& device : device_list) {
// triSYCL only supports the host device for now
if (device.is_host()) {
LOG(WARNING) << "Found SYCL host device";
AddDevice(device);
found_device = true;
}
}
if (!found_device) {
// Currently Intel GPU is not supported
LOG(FATAL)
<< "No OpenCL GPU nor CPU found that is supported by ComputeCpp";
LOG(FATAL) << "No SYCL host and no OpenCL GPU nor CPU"
<< " supported by ComputeCPP/triSYCL was found";
} else {
LOG(INFO) << "Found following OpenCL devices:";
for (int i = 0; i < device_list.size(); i++) {

View File

@ -453,6 +453,21 @@ const Edge* Graph::AddControlEdge(Node* source, Node* dest,
return AddEdge(source, kControlSlot, dest, kControlSlot);
}
void Graph::RemoveControlEdge(const Edge* e) {
if (!e->src_->IsSource() && !e->dst_->IsSink()) {
e->dst_->MaybeCopyOnWrite();
std::string e_src_name = strings::StrCat("^", e->src_->name());
auto* inputs = e->dst_->props_->node_def.mutable_input();
for (auto it = inputs->begin(); it != inputs->end(); ++it) {
if (*it == e_src_name) {
inputs->erase(it);
break;
}
}
}
RemoveEdge(e);
}
Status Graph::UpdateEdge(Node* new_src, int new_src_index, Node* dst,
int dst_index) {
TF_RETURN_IF_ERROR(IsValidOutputTensor(new_src, new_src_index));

View File

@ -451,6 +451,11 @@ class Graph {
// REQUIRES: The edge must exist.
void RemoveEdge(const Edge* edge);
// Removes control edge `edge` from the graph. Note that this also updates
// the corresponding NodeDef to reflect the change.
// REQUIRES: The control edge must exist.
void RemoveControlEdge(const Edge* e);
// Updates the input to a node. The existing edge to `dst` is removed and an
// edge from `new_src` to `dst` is created. The NodeDef associated with `dst`
// is also updated.

View File

@ -117,7 +117,7 @@ DataType EdgeType(const Edge* e) {
}
}
// Return true iff we need to add a same device send/recv for 'edge'.
// Return true iff we need to add the same device send/recv for 'edge'.
bool NeedSameDeviceSendRecv(const Edge* edge, const GraphInfo& info) {
if (edge->IsControlEdge()) {
return false;
@ -1116,7 +1116,7 @@ Status Partition(const PartitionOptions& opts, Graph* g,
// before the data is available.
AddInput(real_recv, send->name(), Graph::kControlSlot);
} else if (control_flow_edge != nullptr) {
// Redirect control edge to the real recv since this is not a same
// Redirect control edge to the real recv since this is not the same
// device send/recv.
--num_control_flow_edges;
AddInput(real_recv, control_flow_edge->src()->name(),

View File

@ -118,6 +118,23 @@ class GraphTest : public ::testing::Test {
LOG(FATAL) << name;
}
bool ControlEdgeExistsInGraphOrNodeDef(const Node* src, const Node* dst) {
for (const Edge* e : dst->in_edges()) {
if (e->IsControlEdge() && e->src() == src &&
e->src_output() == Graph::kControlSlot &&
e->dst_input() == Graph::kControlSlot) {
return true;
}
}
std::string control_edge_name = strings::StrCat("^", src->name());
for (int i = 0; i < dst->def().input_size(); ++i) {
if (dst->def().input(i) == control_edge_name) {
return true;
}
}
return false;
}
Graph graph_;
private:
@ -458,8 +475,8 @@ TEST_F(GraphTest, AddControlEdge) {
EXPECT_TRUE(edge == nullptr);
EXPECT_EQ(b->def().input_size(), 2);
// Can add redundant control edge with create_duplicate.
edge = graph_.AddControlEdge(a, b, /*create_duplicate=*/true);
// Can add redundant control edge with allow_duplicates.
edge = graph_.AddControlEdge(a, b, /*allow_duplicates=*/true);
EXPECT_TRUE(edge != nullptr);
// create_duplicate causes the NodeDef not to be updated.
ASSERT_EQ(b->def().input_size(), 2);
@ -477,6 +494,47 @@ TEST_F(GraphTest, AddControlEdge) {
EXPECT_EQ(b->def().input_size(), 2);
}
TEST_F(GraphTest, RemoveControlEdge) {
FromGraphDef(
"node { name: 'A' op: 'OneOutput' }"
"node { name: 'B' op: 'OneInputTwoOutputs' input: [ 'A:0' ] }"
"node { name: 'C' op: 'NoOp' } ");
Node* a = FindNode("A");
Node* b = FindNode("B");
Node* c = FindNode("C");
// Add a control edge.
const Edge* edge_1 = graph_.AddControlEdge(c, a);
const Edge* edge_2 = graph_.AddControlEdge(a, b);
ASSERT_TRUE(edge_1 != nullptr);
ASSERT_TRUE(edge_2 != nullptr);
ASSERT_TRUE(ControlEdgeExistsInGraphOrNodeDef(c, a));
ASSERT_TRUE(ControlEdgeExistsInGraphOrNodeDef(a, b));
graph_.RemoveControlEdge(edge_1);
ASSERT_TRUE(!ControlEdgeExistsInGraphOrNodeDef(c, a));
ASSERT_TRUE(ControlEdgeExistsInGraphOrNodeDef(a, b));
graph_.RemoveControlEdge(edge_2);
ASSERT_TRUE(!ControlEdgeExistsInGraphOrNodeDef(c, a));
ASSERT_TRUE(!ControlEdgeExistsInGraphOrNodeDef(a, b));
// Test removing a duplicate control edge.
// Note that unless allow_duplicates is true, the duplicate edge
// will not be added. That's why we expect edge_4 to be a null
// pointer. We are not testing with allow_duplicates set to true,
// as that is a highly unlikely use case that does not make much
// sense.
const Edge* edge_3 = graph_.AddControlEdge(c, a);
const Edge* edge_4 = graph_.AddControlEdge(c, a);
ASSERT_TRUE(edge_3 != nullptr);
ASSERT_TRUE(edge_4 == nullptr);
graph_.RemoveControlEdge(edge_3);
ASSERT_TRUE(!ControlEdgeExistsInGraphOrNodeDef(c, a));
}
TEST_F(GraphTest, UpdateEdge) {
// Build a little graph
Node* a = FromNodeDef("A", "OneOutput", 0);

View File

@ -68,7 +68,7 @@ namespace tensorflow {
// take place before we hit the op. For this, we add a new op before each
// element-wise MKL op to deal with the inputs, called _MklInputConversion.
// This pass has been enhanced to add this capability.
//
//
// The _MklInputConversion op will check the inputs to the elementwise op and
// make sure that either both are in MKL format or both are in TF format,
// depending on their initial state and whether broadcast is needed or not.

View File

@ -58,6 +58,12 @@ class GraphProperties {
const std::vector<OpInfo::TensorProperties>& GetOutputProperties(
const string& node_name) const;
static void FillTensorPropertiesFromContext(
const shape_inference::ShapeHandle&, const DataType&,
shape_inference::InferenceContext*,
std::unordered_map<const shape_inference::Dimension*, int>* dim_ids,
OpInfo::TensorProperties*);
private:
// Inputs
GrapplerItem item_;

View File

@ -62,7 +62,7 @@ const std::set<NodeDef*>& NodeMap::GetOutputs(const string& node_name) const {
void NodeMap::AddNode(const string& name, NodeDef* node) {
auto ret = nodes_.insert(std::make_pair(name, node));
CHECK(ret.second) << "Pair (" << name << "," << node
<< ") is not inserted because a same key already exists.";
<< ") is not inserted because the same key already exists.";
}
void NodeMap::AddOutput(const string& node_name, const string& output_name) {

View File

@ -929,6 +929,25 @@ tf_cc_test(
],
)
tf_cuda_cc_test(
name = "bincount_op_test",
size = "small",
srcs = ["bincount_op_test.cc"],
deps = [
":bincount_op",
":ops_testutil",
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:math_ops_op_lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core:testlib",
],
)
tf_cuda_cc_test(
name = "constant_op_test",
size = "small",
@ -1617,7 +1636,10 @@ DYNAMIC_DEPS = [
tf_kernel_library(
name = "dynamic_partition_op",
prefix = "dynamic_partition_op",
deps = DYNAMIC_DEPS,
deps = DYNAMIC_DEPS + [
":fill_functor",
":gather_functor",
] + if_cuda(["@cub_archive//:cub"]),
)
tf_kernel_library(
@ -1687,7 +1709,7 @@ tf_kernel_library(
],
)
tf_cc_tests(
tf_cuda_cc_tests(
name = "dynamic_op_test",
size = "small",
srcs = [
@ -1698,6 +1720,7 @@ tf_cc_tests(
":data_flow",
":ops_testutil",
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
@ -2572,8 +2595,9 @@ tf_kernel_library(
tf_kernel_library(
name = "bucketize_op",
gpu_srcs = ["cuda_device_array.h"],
prefix = "bucketize_op",
deps = MATH_DEPS,
deps = ARRAY_DEPS,
)
tf_kernel_library(
@ -3174,7 +3198,7 @@ tf_kernel_library(
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//third_party/eigen3",
],
] + if_cuda(["@cub_archive//:cub"]),
)
tf_kernel_library(

View File

@ -153,7 +153,8 @@ class AvgPoolingOp<GPUDevice, T> : public UnaryOp<T> {
if (data_format_ == FORMAT_NCHW) {
DnnPoolingOp<T>::Compute(
context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
stride_, padding_, data_format_, tensor_in, output_shape);
stride_, padding_, data_format_, tensor_in, output_shape,
/*propagate_nans=*/false);
} else {
Tensor* output = nullptr;
OP_REQUIRES_OK(context,
@ -408,7 +409,7 @@ class AvgPoolingGradOp<GPUDevice, T> : public OpKernel {
DnnPoolingGradOp<T>::Compute(
context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
stride_, padding_, data_format_, nullptr, nullptr, out_backprop,
output_shape);
output_shape, /*propagate_nans=*/false);
}
private:
@ -532,7 +533,7 @@ class AvgPoolingGradOpCustomGPUKernel : public OpKernel {
DnnPoolingGradOp<T>::Compute(
context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
stride_, padding_, data_format_, nullptr, nullptr, out_backprop,
output_shape);
output_shape, /*propagate_nans=*/false);
}
}


@ -17,6 +17,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#include "tensorflow/core/kernels/bincount_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/types.h"
@ -27,46 +28,37 @@ namespace tensorflow {
using thread::ThreadPool;
template <typename T>
class BincountOp : public OpKernel {
public:
explicit BincountOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
void Compute(OpKernelContext* ctx) override {
const Tensor& arr_t = ctx->input(0);
const Tensor& size_tensor = ctx->input(1);
const Tensor& weights_t = ctx->input(2);
int32 size = size_tensor.scalar<int32>()();
OP_REQUIRES(
ctx, size >= 0,
errors::InvalidArgument("size (", size, ") must be non-negative"));
const bool has_weights = weights_t.NumElements() > 0;
OP_REQUIRES(ctx, !(has_weights && arr_t.shape() != weights_t.shape()),
errors::InvalidArgument(
"If weights are passed, they must have the same shape (" +
weights_t.shape().DebugString() + ") as arr (" +
arr_t.shape().DebugString() + ")"));
const auto arr = arr_t.flat<int32>();
const auto weights = weights_t.flat<T>();
namespace functor {
template <typename T>
struct BincountFunctor<CPUDevice, T> {
static Status Compute(OpKernelContext* context,
const typename TTypes<int32, 1>::ConstTensor& arr,
const typename TTypes<T, 1>::ConstTensor& weights,
typename TTypes<T, 1>::Tensor& output) {
int size = output.size();
Tensor all_nonneg_t;
OP_REQUIRES_OK(ctx,
ctx->allocate_temp(DT_BOOL, TensorShape({}), &all_nonneg_t,
AllocatorAttributes()));
all_nonneg_t.scalar<bool>().device(ctx->eigen_cpu_device()) =
TF_RETURN_IF_ERROR(context->allocate_temp(
DT_BOOL, TensorShape({}), &all_nonneg_t, AllocatorAttributes()));
all_nonneg_t.scalar<bool>().device(context->eigen_cpu_device()) =
(arr >= 0).all();
OP_REQUIRES(ctx, all_nonneg_t.scalar<bool>()(),
errors::InvalidArgument("Input arr must be non-negative!"));
if (!all_nonneg_t.scalar<bool>()()) {
return errors::InvalidArgument("Input arr must be non-negative!");
}
// Allocate partial output bin sums for each worker thread. Worker ids in
// ParallelForWithWorkerId range from 0 to NumThreads() inclusive.
ThreadPool* thread_pool =
ctx->device()->tensorflow_cpu_worker_threads()->workers;
context->device()->tensorflow_cpu_worker_threads()->workers;
const int64 num_threads = thread_pool->NumThreads() + 1;
Tensor partial_bins_t;
OP_REQUIRES_OK(ctx, ctx->allocate_temp(weights_t.dtype(),
TensorShape({num_threads, size}),
&partial_bins_t));
TF_RETURN_IF_ERROR(context->allocate_temp(DataTypeToEnum<T>::value,
TensorShape({num_threads, size}),
&partial_bins_t));
auto partial_bins = partial_bins_t.matrix<T>();
partial_bins.setZero();
thread_pool->ParallelForWithWorkerId(
@ -75,7 +67,7 @@ class BincountOp : public OpKernel {
for (int64 i = start_ind; i < limit_ind; i++) {
int32 value = arr(i);
if (value < size) {
if (has_weights) {
if (weights.size()) {
partial_bins(worker_id, value) += weights(i);
} else {
// Complex numbers don't support "++".
@ -84,25 +76,63 @@ class BincountOp : public OpKernel {
}
}
});
TensorShape output_shape({size});
Tensor* output_t;
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_t));
// Sum the partial bins along the 0th axis.
Eigen::array<int, 1> reduce_dims({0});
output_t->flat<T>().device(ctx->eigen_cpu_device()) =
partial_bins.sum(reduce_dims);
output.device(context->eigen_cpu_device()) = partial_bins.sum(reduce_dims);
return Status::OK();
}
};
#define REGISTER(TYPE) \
} // namespace functor
template <typename Device, typename T>
class BincountOp : public OpKernel {
public:
explicit BincountOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
void Compute(OpKernelContext* ctx) override {
const Tensor& arr_t = ctx->input(0);
const Tensor& size_tensor = ctx->input(1);
const Tensor& weights_t = ctx->input(2);
int32 size = size_tensor.scalar<int32>()();
OP_REQUIRES(
ctx, size >= 0,
errors::InvalidArgument("size (", size, ") must be non-negative"));
const auto arr = arr_t.flat<int32>();
const auto weights = weights_t.flat<T>();
Tensor* output_t;
OP_REQUIRES_OK(ctx,
ctx->allocate_output(0, TensorShape({size}), &output_t));
auto output = output_t->flat<T>();
OP_REQUIRES_OK(ctx, functor::BincountFunctor<Device, T>::Compute(
ctx, arr, weights, output));
}
};
#define REGISTER_KERNELS(type) \
REGISTER_KERNEL_BUILDER( \
Name("Bincount").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
BincountOp<TYPE>)
Name("Bincount").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
BincountOp<CPUDevice, type>)
TF_CALL_NUMBER_TYPES(REGISTER);
TF_CALL_NUMBER_TYPES(REGISTER_KERNELS);
#undef REGISTER_KERNELS
// TODO(ringwalt): Add a GPU implementation. We probably want to take a
// different approach, e.g. threads in a warp each taking a pass over the same
// data, and each thread summing a single bin.
#if GOOGLE_CUDA
#define REGISTER_KERNELS(type) \
REGISTER_KERNEL_BUILDER(Name("Bincount") \
.Device(DEVICE_GPU) \
.HostMemory("size") \
.TypeConstraint<type>("T"), \
BincountOp<GPUDevice, type>)
TF_CALL_int32(REGISTER_KERNELS);
TF_CALL_float(REGISTER_KERNELS);
#undef REGISTER_KERNELS
#endif // GOOGLE_CUDA
} // end namespace tensorflow
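The CPU functor above is a small map-reduce: every worker thread fills its own row of a (num_threads, size) partial_bins matrix, and the rows are then summed along axis 0 into the output. A minimal standalone C++ sketch of the same idea, using plain std::thread instead of the TensorFlow thread pool (names are illustrative only; assumes num_workers >= 1):

#include <algorithm>
#include <cstdint>
#include <thread>
#include <vector>

// Unweighted bincount of `arr` into `nbins` bins: one partial histogram per
// worker, then a reduction of the partial histograms into the final counts.
std::vector<int64_t> Bincount(const std::vector<int32_t>& arr, int nbins,
                              int num_workers) {
  std::vector<std::vector<int64_t>> partial(
      num_workers, std::vector<int64_t>(nbins, 0));
  std::vector<std::thread> threads;
  const size_t chunk = (arr.size() + num_workers - 1) / num_workers;
  for (int w = 0; w < num_workers; ++w) {
    threads.emplace_back([&, w] {
      const size_t begin = w * chunk;
      const size_t end = std::min(arr.size(), begin + chunk);
      for (size_t i = begin; i < end; ++i) {
        const int32_t value = arr[i];
        if (value >= 0 && value < nbins) ++partial[w][value];
      }
    });
  }
  for (auto& t : threads) t.join();
  std::vector<int64_t> out(nbins, 0);
  for (int w = 0; w < num_workers; ++w)  // sum partial bins along axis 0
    for (int b = 0; b < nbins; ++b) out[b] += partial[w][b];
  return out;
}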


@ -0,0 +1,41 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_BINCOUNT_OP_H_
#define TENSORFLOW_BINCOUNT_OP_H_
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/errors.h"
namespace tensorflow {
namespace functor {
template <typename Device, typename T>
struct BincountFunctor {
static Status Compute(OpKernelContext* context,
const typename TTypes<int32, 1>::ConstTensor& arr,
const typename TTypes<T, 1>::ConstTensor& weights,
typename TTypes<T, 1>::Tensor& output);
};
} // end namespace functor
} // end namespace tensorflow
#endif // TENSORFLOW_BINCOUNT_OP_H_


@ -0,0 +1,114 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "external/cub_archive/cub/device/device_histogram.cuh"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/bincount_op.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
namespace tensorflow {
typedef Eigen::GpuDevice GPUDevice;
namespace functor {
template <typename T>
struct BincountFunctor<GPUDevice, T> {
static Status Compute(OpKernelContext* context,
const typename TTypes<int32, 1>::ConstTensor& arr,
const typename TTypes<T, 1>::ConstTensor& weights,
typename TTypes<T, 1>::Tensor& output) {
if (weights.size() != 0) {
return errors::InvalidArgument(
"Weights should not be passed as it should be "
"handled by unsorted_segment_sum");
}
if (output.size() == 0) {
return Status::OK();
}
// In case weights.size() == 0, use CUB.
size_t temp_storage_bytes = 0;
const int32* d_samples = arr.data();
T* d_histogram = output.data();
int num_levels = output.size() + 1;
int32 lower_level = 0;
int32 upper_level = output.size();
int num_samples = arr.size();
const cudaStream_t& stream = GetCudaStream(context);
// The first HistogramEven call, with d_temp_storage = NULL, only obtains
// the temp storage size required.
auto err = cub::DeviceHistogram::HistogramEven(
/* d_temp_storage */ NULL,
/* temp_storage_bytes */ temp_storage_bytes,
/* d_samples */ d_samples,
/* d_histogram */ d_histogram,
/* num_levels */ num_levels,
/* lower_level */ lower_level,
/* upper_level */ upper_level,
/* num_samples */ num_samples,
/* stream */ stream);
if (err != cudaSuccess) {
return errors::Internal(
"Could not launch HistogramEven to get temp storage: ",
cudaGetErrorString(err), ".");
}
Tensor temp_storage;
TF_RETURN_IF_ERROR(context->allocate_temp(
DataTypeToEnum<int8>::value,
TensorShape({static_cast<int64>(temp_storage_bytes)}), &temp_storage));
void* d_temp_storage = temp_storage.flat<int8>().data();
// The second HistogramEven call does the actual run, with d_temp_storage
// allocated to temp_storage_bytes.
err = cub::DeviceHistogram::HistogramEven(
/* d_temp_storage */ d_temp_storage,
/* temp_storage_bytes */ temp_storage_bytes,
/* d_samples */ d_samples,
/* d_histogram */ d_histogram,
/* num_levels */ num_levels,
/* lower_level */ lower_level,
/* upper_level */ upper_level,
/* num_samples */ num_samples,
/* stream */ stream);
if (err != cudaSuccess) {
return errors::Internal(
"Could not launch HistogramEven: ", cudaGetErrorString(err), ".");
}
return Status::OK();
}
};
} // end namespace functor
#define REGISTER_GPU_SPEC(type) \
template struct functor::BincountFunctor<GPUDevice, type>;
TF_CALL_int32(REGISTER_GPU_SPEC);
TF_CALL_float(REGISTER_GPU_SPEC);
#undef REGISTER_GPU_SPEC
} // namespace tensorflow
#endif // GOOGLE_CUDA
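The two HistogramEven calls above follow CUB's standard device-wide calling convention: the first call, with a null temp-storage pointer, only reports how many scratch bytes the algorithm needs; the caller allocates that much device memory and then repeats the call with identical arguments to do the real work. A minimal sketch of the same pattern outside TensorFlow (raw cudaMalloc instead of allocate_temp, error handling elided, names illustrative):

#include <cuda_runtime.h>
#include <cub/device/device_histogram.cuh>

// d_samples: device array of num_samples int32 values in [0, nbins).
// d_histogram: device array of nbins int32 counters (the output).
void BincountWithCub(const int* d_samples, int num_samples, int* d_histogram,
                     int nbins, cudaStream_t stream) {
  size_t temp_bytes = 0;
  // First call: d_temp_storage == nullptr, so CUB only computes temp_bytes.
  cub::DeviceHistogram::HistogramEven(nullptr, temp_bytes, d_samples,
                                      d_histogram, /*num_levels=*/nbins + 1,
                                      /*lower_level=*/0, /*upper_level=*/nbins,
                                      num_samples, stream);
  void* d_temp = nullptr;
  cudaMalloc(&d_temp, temp_bytes);
  // Second call: same arguments plus scratch space; the histogram actually runs.
  cub::DeviceHistogram::HistogramEven(d_temp, temp_bytes, d_samples,
                                      d_histogram, nbins + 1, 0, nbins,
                                      num_samples, stream);
  cudaFree(d_temp);
}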


@ -0,0 +1,75 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/framework/fake_input.h"
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
namespace tensorflow {
static Graph* Bincount(int arr_size, int nbins) {
Graph* g = new Graph(OpRegistry::Global());
Tensor arr(DT_INT32, TensorShape({arr_size}));
arr.flat<int32>() = arr.flat<int32>().setRandom().abs();
Tensor size(DT_INT32, TensorShape({static_cast<int32>(1)}));
size.flat<int32>()(0) = static_cast<int32>(nbins);
Tensor weights(DT_INT32, TensorShape({0}));
Node* node;
TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Bincount")
.Input(test::graph::Constant(g, arr))
.Input(test::graph::Constant(g, size))
.Input(test::graph::Constant(g, weights))
.Attr("T", DT_INT32)
.Finalize(g, &node));
return g;
}
#define BM_BincountDev(K, NBINS, type) \
static void BM_Bincount##_##type##_##K##_##NBINS(int iters) { \
testing::ItemsProcessed(static_cast<int64>(iters) * K * 1024); \
test::Benchmark(#type, Bincount(K * 1024, NBINS)).Run(iters); \
} \
BENCHMARK(BM_Bincount##_##type##_##K##_##NBINS);
BM_BincountDev(32, 1000, cpu);
BM_BincountDev(32, 2000, cpu);
BM_BincountDev(32, 5000, cpu);
BM_BincountDev(64, 1000, cpu);
BM_BincountDev(64, 2000, cpu);
BM_BincountDev(64, 5000, cpu);
BM_BincountDev(128, 1000, cpu);
BM_BincountDev(128, 2000, cpu);
BM_BincountDev(128, 5000, cpu);
BM_BincountDev(32, 1000, gpu);
BM_BincountDev(32, 2000, gpu);
BM_BincountDev(32, 5000, gpu);
BM_BincountDev(64, 1000, gpu);
BM_BincountDev(64, 2000, gpu);
BM_BincountDev(64, 5000, gpu);
BM_BincountDev(128, 1000, gpu);
BM_BincountDev(128, 2000, gpu);
BM_BincountDev(128, 5000, gpu);
} // end namespace tensorflow


@ -15,15 +15,43 @@ limitations under the License.
// See docs in ../ops/math_ops.cc.
#include <algorithm>
#include <vector>
#include "tensorflow/core/kernels/bucketize_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
using thread::ThreadPool;
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
namespace functor {
template <typename T>
struct BucketizeFunctor<CPUDevice, T> {
// PRECONDITION: boundaries_vector must be sorted.
static Status Compute(OpKernelContext* context,
const typename TTypes<T, 1>::ConstTensor& input,
const std::vector<float>& boundaries_vector,
typename TTypes<int32, 1>::Tensor& output) {
const int N = input.size();
for (int i = 0; i < N; i++) {
auto first_bigger_it = std::upper_bound(
boundaries_vector.begin(), boundaries_vector.end(), input(i));
output(i) = first_bigger_it - boundaries_vector.begin();
}
return Status::OK();
}
};
} // namespace functor
template <typename Device, typename T>
class BucketizeOp : public OpKernel {
public:
explicit BucketizeOp(OpKernelConstruction* context) : OpKernel(context) {
@ -34,31 +62,24 @@ class BucketizeOp : public OpKernel {
void Compute(OpKernelContext* context) override {
const Tensor& input_tensor = context->input(0);
auto input = input_tensor.flat<T>();
const auto input = input_tensor.flat<T>();
Tensor* output_tensor = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
&output_tensor));
auto output = output_tensor->template flat<int32>();
const int N = input.size();
for (int i = 0; i < N; i++) {
output(i) = CalculateBucketIndex(input(i));
}
OP_REQUIRES_OK(context, functor::BucketizeFunctor<Device, T>::Compute(
context, input, boundaries_, output));
}
private:
int32 CalculateBucketIndex(const T value) {
auto first_bigger_it =
std::upper_bound(boundaries_.begin(), boundaries_.end(), value);
return first_bigger_it - boundaries_.begin();
}
std::vector<float> boundaries_;
};
#define REGISTER_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("Bucketize").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
BucketizeOp<T>);
BucketizeOp<CPUDevice, T>);
REGISTER_KERNEL(int32);
REGISTER_KERNEL(int64);
@ -66,4 +87,17 @@ REGISTER_KERNEL(float);
REGISTER_KERNEL(double);
#undef REGISTER_KERNEL
#if GOOGLE_CUDA
#define REGISTER_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("Bucketize").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
BucketizeOp<GPUDevice, T>);
REGISTER_KERNEL(int32);
REGISTER_KERNEL(int64);
REGISTER_KERNEL(float);
REGISTER_KERNEL(double);
#undef REGISTER_KERNEL
#endif // GOOGLE_CUDA
} // namespace tensorflow
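The CPU path above delegates the per-element work to std::upper_bound over the sorted boundaries, so an element's bucket index is simply the number of boundaries less than or equal to it. A self-contained illustration (names are mine, not the kernel's):

#include <algorithm>
#include <vector>

// Bucket index for `value` given sorted `boundaries`: std::upper_bound finds
// the first boundary strictly greater than value, and its offset from the
// start of the vector is the bucket index.
int BucketIndex(float value, const std::vector<float>& boundaries) {
  auto it = std::upper_bound(boundaries.begin(), boundaries.end(), value);
  return static_cast<int>(it - boundaries.begin());
}

// Example: with boundaries {0, 10, 100}, value 5 lands in bucket 1 and
// value 150 lands in bucket 3 (past the last boundary).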


@ -0,0 +1,41 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_BUCKETIZE_OP_H_
#define TENSORFLOW_BUCKETIZE_OP_H_
#include <vector>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/errors.h"
namespace tensorflow {
namespace functor {
template <typename Device, typename T>
struct BucketizeFunctor {
static Status Compute(OpKernelContext* context,
const typename TTypes<T, 1>::ConstTensor& input,
const std::vector<float>& boundaries_vector,
typename TTypes<int32, 1>::Tensor& output);
};
} // namespace functor
} // namespace tensorflow
#endif // TENSORFLOW_BUCKETIZE_OP_H_


@ -0,0 +1,101 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/bucketize_op.h"
#include "tensorflow/core/kernels/cuda_device_array.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
namespace tensorflow {
typedef Eigen::GpuDevice GPUDevice;
template <typename T>
__global__ void BucketizeCustomKernel(
const int32 size_in, const T* in, const int32 size_boundaries,
CudaDeviceArrayStruct<float> boundaries_array, int32* out) {
const float* boundaries = GetCudaDeviceArrayOnDevice(&boundaries_array);
CUDA_1D_KERNEL_LOOP(i, size_in) {
T value = in[i];
int32 bucket = 0;
int32 count = size_boundaries;
while (count > 0) {
int32 l = bucket;
int32 step = count / 2;
l += step;
if (!(value < static_cast<T>(boundaries[l]))) {
bucket = ++l;
count -= step + 1;
} else {
count = step;
}
}
out[i] = bucket;
}
}
namespace functor {
template <typename T>
struct BucketizeFunctor<GPUDevice, T> {
// PRECONDITION: boundaries_vector must be sorted.
static Status Compute(OpKernelContext* context,
const typename TTypes<T, 1>::ConstTensor& input,
const std::vector<float>& boundaries_vector,
typename TTypes<int32, 1>::Tensor& output) {
const GPUDevice& d = context->eigen_device<GPUDevice>();
CudaDeviceArrayOnHost<float> boundaries_array(context,
boundaries_vector.size());
TF_RETURN_IF_ERROR(boundaries_array.Init());
for (int i = 0; i < boundaries_vector.size(); ++i) {
boundaries_array.Set(i, boundaries_vector[i]);
}
TF_RETURN_IF_ERROR(boundaries_array.Finalize());
CudaLaunchConfig config = GetCudaLaunchConfig(input.size(), d);
BucketizeCustomKernel<T>
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
input.size(), input.data(), boundaries_vector.size(),
boundaries_array.data(), output.data());
return Status::OK();
}
};
} // namespace functor
#define REGISTER_GPU_SPEC(type) \
template struct functor::BucketizeFunctor<GPUDevice, type>;
REGISTER_GPU_SPEC(int32);
REGISTER_GPU_SPEC(int64);
REGISTER_GPU_SPEC(float);
REGISTER_GPU_SPEC(double);
#undef REGISTER_GPU_SPEC
} // namespace tensorflow
#endif // GOOGLE_CUDA
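The CUDA kernel above performs the same upper-bound search as the CPU functor, but as an explicit binary-search loop over the boundaries array that was copied to the device via CudaDeviceArrayOnHost. A host-side C++ rendering of that loop (handy for checking it against std::upper_bound) might look like:

// Equivalent of std::upper_bound(boundaries, boundaries + n, value) -
// boundaries: halve the remaining range each step, moving right whenever
// value >= boundaries[l].
int UpperBoundIndex(const float* boundaries, int n, float value) {
  int bucket = 0;
  int count = n;
  while (count > 0) {
    const int step = count / 2;
    const int l = bucket + step;
    if (!(value < boundaries[l])) {
      bucket = l + 1;
      count -= step + 1;
    } else {
      count = step;
    }
  }
  return bucket;
}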


@ -236,6 +236,7 @@ class Conv3DBackpropInputOp : public OpKernel {
REGISTER_KERNEL_BUILDER( \
Name("Conv3DBackpropInputV2").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
Conv3DBackpropInputOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
@ -383,6 +384,7 @@ class Conv3DBackpropFilterOp : public OpKernel {
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T"), \
Conv3DBackpropFilterOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
@ -409,6 +411,7 @@ namespace functor {
const std::array<int, 3>& padding_right, \
typename TTypes<T, 5, int>::Tensor out, TensorFormat format);
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(float);
#undef DECLARE_GPU_SPEC
} // namespace functor
@ -1098,22 +1101,27 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
bool cudnn_use_autotune_;
};
REGISTER_KERNEL_BUILDER(
Name("Conv3DBackpropInput").Device(DEVICE_GPU).TypeConstraint<float>("T"),
Conv3DBackpropInputOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropInputV2")
.Device(DEVICE_GPU)
.TypeConstraint<float>("T")
.HostMemory("input_sizes"),
Conv3DBackpropInputOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("Conv3DBackpropFilter").Device(DEVICE_GPU).TypeConstraint<float>("T"),
Conv3DBackpropFilterOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropFilterV2")
.Device(DEVICE_GPU)
.TypeConstraint<float>("T")
.HostMemory("filter_sizes"),
Conv3DBackpropFilterOp<GPUDevice, float>);
#define REGISTER_GPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("Conv3DBackpropInput").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
Conv3DBackpropInputOp<GPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropInputV2") \
.Device(DEVICE_GPU) \
.TypeConstraint<T>("T") \
.HostMemory("input_sizes"), \
Conv3DBackpropInputOp<GPUDevice, T>); \
REGISTER_KERNEL_BUILDER( \
Name("Conv3DBackpropFilter").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
Conv3DBackpropFilterOp<GPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropFilterV2") \
.Device(DEVICE_GPU) \
.TypeConstraint<T>("T") \
.HostMemory("filter_sizes"), \
Conv3DBackpropFilterOp<GPUDevice, T>);
TF_CALL_half(REGISTER_GPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL
#endif // GOOGLE_CUDA
} // namespace tensorflow


@ -145,6 +145,7 @@ class Conv3DOp : public BinaryOp<T> {
REGISTER_KERNEL_BUILDER( \
Name("Conv3D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
Conv3DOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
@ -482,12 +483,16 @@ namespace functor {
const std::array<int, 3>& padding_right, \
typename TTypes<T, 5, int>::Tensor out, TensorFormat format);
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(float);
#undef DECLARE_GPU_SPEC
} // namespace functor
// Registration of the GPU implementations.
REGISTER_KERNEL_BUILDER(
Name("Conv3D").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
Conv3DOp<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(
Name("Conv3D").Device(DEVICE_GPU).TypeConstraint<float>("T"),
Conv3DOp<GPUDevice, float>);


@ -20,16 +20,8 @@ namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double,
complex64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Acosh") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::acosh<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double);
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA


@ -20,17 +20,9 @@ namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double,
complex64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Asinh") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::asinh<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYC
#ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double);
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER2(UnaryOp, GPU, "Asinh", functor::asinh, float, double);


@ -20,17 +20,9 @@ namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double,
complex64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Atanh") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::atanh<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYC
#ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double);
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER2(UnaryOp, GPU, "Atanh", functor::atanh, float, double);


@ -49,7 +49,11 @@ template <typename T>
struct scalar_asinh_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
#if EIGEN_HAS_CXX11_MATH
return numext::asinh(a);
#else
return std::asinh(a);
#endif // EIGEN_HAS_CXX11_MATH
}
};
template <typename T>
@ -61,7 +65,11 @@ template <typename T>
struct scalar_acosh_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
#if EIGEN_HAS_CXX11_MATH
return numext::acosh(a);
#else
return std::acosh(a);
#endif // EIGEN_HAS_CXX11_MATH
}
};
template <typename T>
@ -73,7 +81,11 @@ template <typename T>
struct scalar_atanh_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
#if EIGEN_HAS_CXX11_MATH
return numext::atanh(a);
#else
return std::atanh(a);
#endif // EIGEN_HAS_CXX11_MATH
}
};
template <typename T>


@ -231,7 +231,8 @@ static void CopyOutputBackpropRegion(const DepthwiseArgs& args,
}
// Pad to vector-register width (if needed).
for (int64 d = 0; d < pad_size; ++d) {
buffer[buf_base + vectorized_size + scalar_size + d] = 0;
buffer[buf_base + vectorized_size + scalar_size + d] =
static_cast<T>(0);
}
}
}
@ -297,7 +298,7 @@ static void ComputeBackpropInput(const DepthwiseArgs& args,
for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
// Reset accumulator.
auto vaccum = Eigen::internal::pset1<Packet>(0);
auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
for (int j = 0; j < filter_spatial_size; ++j) {
// Calculate index.
const int64 index = i + j * padded_filter_inner_dim_size;
@ -318,7 +319,7 @@ static void ComputeBackpropInput(const DepthwiseArgs& args,
}
if (output_scalar_size > 0) {
auto vaccum = Eigen::internal::pset1<Packet>(0);
auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
for (int j = 0; j < filter_spatial_size; ++j) {
const int64 index =
output_vectorized_size + j * padded_filter_inner_dim_size;
@ -346,7 +347,7 @@ static void ComputeBackpropInput(const DepthwiseArgs& args,
if (depth_multiplier > 1) {
for (int64 d = 0; d < in_depth; ++d) {
const int64 index = d * args.depth_multiplier;
T accum = 0;
T accum = static_cast<T>(0);
for (int64 dm = 0; dm < dm_vectorized_size; dm += kPacketSize) {
const auto v = Eigen::internal::ploadu<Packet>(out_buffer + index + dm);
accum += Eigen::internal::predux(v);
@ -510,6 +511,8 @@ static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args,
#if GOOGLE_CUDA
extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice,
Eigen::half>;
extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;
@ -884,6 +887,8 @@ static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args,
#if GOOGLE_CUDA
extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice,
Eigen::half>;
extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;


@ -94,7 +94,7 @@ struct DepthwiseConv2DKernel {
for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
// Reset accumulator.
auto vaccum = Eigen::internal::pset1<Packet>(0);
auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
for (int j = 0; j < filter_spatial_size; ++j) {
// Calculate index.
const int64 index = i + j * padded_filter_inner_dim_size;
@ -115,7 +115,7 @@ struct DepthwiseConv2DKernel {
}
if (output_scalar_size > 0) {
auto vaccum = Eigen::internal::pset1<Packet>(0);
auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
for (int j = 0; j < filter_spatial_size; ++j) {
const int64 index =
output_vectorized_size + j * padded_filter_inner_dim_size;
@ -246,6 +246,7 @@ extern template class LaunchConv2DOp<CPUDevice, float>;
#if GOOGLE_CUDA
// Extern template instantiated in depthwise_conv_op_gpu.cc.
extern template struct LaunchDepthwiseConvOp<GPUDevice, Eigen::half>;
extern template struct LaunchDepthwiseConvOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvOp<GPUDevice, double>;
@ -419,12 +420,18 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
Name("DepthwiseConv2dNative").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
DepthwiseConv2dNativeOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
#if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
TF_CALL_double(REGISTER_CPU_KERNEL);
#endif
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")
.Device(DEVICE_GPU)
.TypeConstraint<Eigen::half>("T"),
DepthwiseConv2dNativeOp<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(
Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<float>("T"),
DepthwiseConv2dNativeOp<GPUDevice, float>);


@ -158,7 +158,8 @@ struct DepthwiseFilterPadOp {
}
// Pad the remainder of output to vector-register boundary.
for (int64 j = 0; j < pad_size; ++j) {
padded_filter[output_base + vectorized_size + scalar_size + j] = 0;
padded_filter[output_base + vectorized_size + scalar_size + j] =
static_cast<T>(0);
}
}
}
@ -266,7 +267,7 @@ struct DepthwiseInputCopyOp {
// Pad the remainder of the output to vector register boundary.
for (int64 d = 0; d < output_pad_size; ++d) {
in_buf[d] = 0;
in_buf[d] = static_cast<T>(0);
}
in_buf += output_pad_size;


@ -105,7 +105,7 @@ __global__ void __launch_bounds__(1024, 2)
const int input_row_end = input_row_start + filter_rows;
const int input_col_end = input_col_start + filter_cols;
T sum = 0;
T sum = static_cast<T>(0);
const int input_offset_temp = in_rows * OB;
if (input_row_start >= 0 && input_col_start >= 0 &&
@ -258,8 +258,8 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
__syncthreads();
if (depth_in_range) {
T sum1 = 0;
T sum2 = 0;
T sum1 = static_cast<T>(0);
T sum2 = static_cast<T>(0);
int shared_offset = data_idx;
const T* filter_ptr = filter_read_offset + shared_data;
UNROLL for (int r = 0; r < filter_rows; ++r) {
@ -369,7 +369,7 @@ __global__ void __launch_bounds__(1024, 2)
const int input_row_end = input_row_start + filter_rows;
const int input_col_end = input_col_start + filter_cols;
T sum = 0;
T sum = static_cast<T>(0);
if (input_row_start >= 0 && input_col_start >= 0 &&
input_row_end < in_rows && input_col_end < in_cols) {
// Loop that doesn't need to check for boundary conditions.
@ -529,8 +529,8 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
__syncthreads();
if (slice_in_range) {
T sum1 = 0;
T sum2 = 0;
T sum1 = static_cast<T>(0);
T sum2 = static_cast<T>(0);
int shared_offset = data_idx;
const T* filter_ptr = filter_read_offset + shared_data;
UNROLL for (int r = 0; r < filter_rows; ++r) {
@ -710,6 +710,7 @@ void LaunchDepthwiseConvOp<GPUDevice, T>::operator()(OpKernelContext* ctx,
"Launch of gpu kernel for DepthwiseConv2dGPULaunch failed"));
}
template struct LaunchDepthwiseConvOp<GPUDevice, Eigen::half>;
template struct LaunchDepthwiseConvOp<GPUDevice, float>;
template struct LaunchDepthwiseConvOp<GPUDevice, double>;
@ -744,7 +745,7 @@ __global__ void __launch_bounds__(640, 2)
const int in_r = (thread_id / in_depth / in_cols) % in_rows;
const int b = thread_id / in_depth / in_cols / in_rows;
T sum = 0;
T sum = static_cast<T>(0);
const int out_r_start =
tf_max<int>(0, (in_r - filter_rows + pad_rows + stride) / stride);
@ -810,7 +811,7 @@ __global__ void __launch_bounds__(640, 2)
const int in_d = (thread_id / in_cols / in_rows) % in_depth;
const int b = thread_id / in_depth / in_cols / in_rows;
T sum = 0;
T sum = static_cast<T>(0);
const int out_d_start = in_d * depth_multiplier;
const int out_d_end = out_d_start + depth_multiplier;
@ -919,6 +920,7 @@ void LaunchDepthwiseConvBackpropInputOp<GPUDevice, T>::operator()(
"utGPULaunch failed"));
}
template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, Eigen::half>;
template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;
@ -1631,6 +1633,7 @@ void LaunchDepthwiseConvBackpropFilterOp<GPUDevice, T>::operator()(
"terGPULaunch failed"));
}
template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, Eigen::half>;
template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;
} // namespace tensorflow
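The recurring change in the depthwise-convolution hunks above is replacing bare 0 initializers with static_cast<T>(0) (and pset1<Packet>(static_cast<T>(0))). This appears to accompany the new Eigen::half registrations: unlike float or double, Eigen::half cannot always be initialized from a plain integer literal inside these templates, so the explicit cast keeps the code compiling for every instantiated T. A hedged, self-contained illustration of the pattern (not TensorFlow code):

// In a template over the element type T, `T sum = 0;` relies on an implicit
// int -> T conversion that types such as Eigen::half may not provide in this
// context; `static_cast<T>(0)` works for any T constructible from an
// arithmetic type.
template <typename T>
T SumOfSquares(const T* data, int n) {
  T sum = static_cast<T>(0);  // works for float, double, Eigen::half, ...
  for (int i = 0; i < n; ++i) sum += data[i] * data[i];
  return sum;
}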

Some files were not shown because too many files have changed in this diff.