Merge commit for internal changes

Author: Akshay Modi
Date: 2018-06-12 11:47:21 -07:00
Commit: e33538d114
100 changed files with 3632 additions and 1332 deletions

View File

@ -142,8 +142,10 @@ void TestRemoteExecute(bool async) {
TFE_ContextOptions* opts = TFE_NewContextOptions();
TFE_ContextOptionsSetServerDef(opts, serialized.data(), serialized.size(),
status);
TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(1));
EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(1));
TFE_ContextOptionsSetDevicePlacementPolicy(opts,
TFE_DEVICE_PLACEMENT_EXPLICIT);
TFE_Context* ctx = TFE_NewContext(opts, status);
EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TFE_DeleteContextOptions(opts);
@ -205,6 +207,83 @@ void TestRemoteExecute(bool async) {
TEST(CAPI, RemoteExecute) { TestRemoteExecute(false); }
TEST(CAPI, RemoteExecuteAsync) { TestRemoteExecute(true); }
void TestRemoteExecuteSilentCopies(bool async) {
tensorflow::ServerDef server_def = GetServerDef(2);
// This server def has the task index set to 0.
string serialized = server_def.SerializeAsString();
server_def.set_task_index(1);
std::unique_ptr<tensorflow::eager::EagerGrpcServer> worker_server;
ASSERT_TRUE(
tensorflow::eager::EagerGrpcServer::Create(server_def, &worker_server)
.ok());
ASSERT_TRUE(worker_server->Start().ok());
TF_Status* status = TF_NewStatus();
TFE_ContextOptions* opts = TFE_NewContextOptions();
TFE_ContextOptionsSetServerDef(opts, serialized.data(), serialized.size(),
status);
EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(1));
TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT);
TFE_Context* ctx = TFE_NewContext(opts, status);
EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TFE_DeleteContextOptions(opts);
TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle();
TFE_TensorHandle* h1_task0 = TestMatrixTensorHandle();
const char remote_device_name[] =
"/job:localhost/replica:0/task:1/device:CPU:0";
// Handles are on task0, but op is on remote (task1).
TFE_Op* matmul = MatMulOp(ctx, h0_task0, h1_task0);
TFE_OpSetDevice(matmul, remote_device_name, status);
EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TFE_TensorHandle* retvals[1];
int num_retvals = 1;
TFE_Execute(matmul, &retvals[0], &num_retvals, status);
EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
auto* retval_task0 = TFE_TensorHandleCopyToDevice(
retvals[0], ctx, "/job:localhost/replica:0/task:0/device:CPU:0", status);
ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TF_Tensor* t = TFE_TensorHandleResolve(retval_task0, status);
ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TFE_DeleteTensorHandle(retval_task0);
float product[4] = {0};
EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t));
TF_DeleteTensor(t);
EXPECT_EQ(7, product[0]);
EXPECT_EQ(10, product[1]);
EXPECT_EQ(15, product[2]);
EXPECT_EQ(22, product[3]);
TFE_DeleteTensorHandle(h0_task0);
TFE_DeleteTensorHandle(h1_task0);
TFE_DeleteTensorHandle(retvals[0]);
TFE_DeleteOp(matmul);
TFE_ContextAsyncWait(ctx, status);
TFE_DeleteContext(ctx, status);
EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TF_DeleteStatus(status);
// TODO(nareshmodi): Figure out how to correctly shut the server down.
worker_server.release();
}
TEST(CAPI, RemoteExecuteSilentCopies) { TestRemoteExecuteSilentCopies(false); }
TEST(CAPI, RemoteExecuteSilentCopiesAsync) {
TestRemoteExecuteSilentCopies(true);
}
TEST(CAPI, TensorHandle) {
TFE_TensorHandle* h = TestMatrixTensorHandle();
EXPECT_EQ(TF_FLOAT, TFE_TensorHandleDataType(h));
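The expected products 7, 10, 15, and 22 checked above come from squaring a 2x2 matrix [[1, 2], [3, 4]], which is presumably the value behind TestMatrixTensorHandle; a minimal NumPy sketch of that arithmetic, for reference only:

    import numpy as np

    # Assumed contents of the test handle; the EXPECT_EQ values above only
    # hold if the handle really is [[1, 2], [3, 4]].
    m = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
    print(np.matmul(m, m))  # [[ 7. 10.] [15. 22.]]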

View File

@ -42,7 +42,7 @@ tf_cc_binary(
"//tensorflow/compiler/xla/service:cpu_plugin",
"//tensorflow/core:framework_internal",
"//tensorflow/core:lib",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)
@ -61,7 +61,7 @@ tf_cc_test(
"//tensorflow/core:lib",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)
@ -74,6 +74,6 @@ cc_library(
"//tensorflow/compiler/xla/service",
"//tensorflow/compiler/xla/service:platform_util",
"//tensorflow/core/distributed_runtime/rpc:grpc_util",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)

View File

@ -2379,7 +2379,6 @@ cc_library(
":hlo_graph_dumper",
":hlo_pass",
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:util",
"//tensorflow/core:lib",
],
)
@ -2574,6 +2573,7 @@ cc_library(
hdrs = ["hlo_graph_dumper.h"],
deps = [
":hlo",
":hlo_casting_utils",
":hlo_execution_profile",
":hlo_tfgraph_builder",
"//tensorflow/compiler/xla:literal_util",

View File

@ -47,12 +47,16 @@ bool GpuMultiOutputFusion::ShapesCompatibleForFusion(HloInstruction* instr1,
element_instr = fused_expression_root;
}
}
// Special handling of kReduce instructions -- the fusion
// applies to the first operand.
if (element_instr->opcode() == HloOpcode::kReduce) {
return element_instr->operand(0)->shape();
}
return element_instr->shape();
};
// The elementwise output shapes must be the same (including layout)
return ShapeUtil::ShapeUtil::Equal(get_element_shape(instr1),
get_element_shape(instr2));
return ShapeUtil::Equal(get_element_shape(instr1), get_element_shape(instr2));
}
bool GpuMultiOutputFusion::IsProfitableOperand(HloInstruction* instr) {
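A rough Python sketch of the rule in ShapesCompatibleForFusion (helper names are illustrative, not XLA APIs): for a kReduce root the comparison uses the shape of the reduce's first operand, and layout is part of the equality, so shapes are modeled here as strings that include layout:

    def element_shape(op_kind, own_shape, operand0_shape=None):
        # Special handling of reduce roots: compare the reduced operand's shape.
        if op_kind == "reduce":
            return operand0_shape
        return own_shape

    def shapes_compatible(a, b):
        # Must match exactly, layout included.
        return element_shape(*a) == element_shape(*b)

    r1 = ("reduce", "f32[]", "f32[6400]{0}")
    r2 = ("reduce", "f32[]", "f32[64,100]{0,1}")
    print(shapes_compatible(r1, r2))  # False -- mirrors the new test below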

View File

@ -36,6 +36,11 @@ const char kModulePrefix[] = R"(
scalar_lhs = f32[] parameter(0)
scalar_rhs = f32[] parameter(1)
ROOT add = f32[] add(scalar_lhs, scalar_rhs)
}
scalar_mul_computation {
scalar_lhs = f32[] parameter(0)
scalar_rhs = f32[] parameter(1)
ROOT mul = f32[] multiply(scalar_lhs, scalar_rhs)
})";
TEST_F(InstructionFusionTest, MultiOutputFusionSiblingReduceAndReduceFusion) {
@ -67,6 +72,34 @@ TEST_F(InstructionFusionTest, MultiOutputFusionSiblingReduceAndReduceFusion) {
op::Tuple(op::Reduce(), op::Reduce()));
}
TEST_F(InstructionFusionTest, MultiOutputFusionDifferentReduceInputShapes) {
auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
fused_computation_1 {
p1.1 = f32[6400]{0} parameter(1)
mul = f32[6400]{0} multiply(p1.1, p1.1)
const.1 = f32[] parameter(0)
ROOT reduce.1 = f32[] reduce(p1.1, const.1), dimensions={0}, to_apply=scalar_add_computation
}
fused_computation_2 {
p1.2 = f32[6400]{0} parameter(1)
r1 = f32[64,100]{0,1} reshape(p1.2)
const.2 = f32[] parameter(0)
ROOT reduce.2 = f32[] reduce(r1, const.2), dimensions={1,0}, to_apply=scalar_mul_computation
}
ENTRY entry {
p0 = f32[] parameter(0)
p1 = f32[6400]{0} parameter(1)
const.2 = f32[] constant(1)
fusion.1 = f32[] fusion(p0, p1), kind=kInput, calls=fused_computation_1
fusion.2 = f32[] fusion(p0, p1), kind=kInput, calls=fused_computation_2
ROOT root = (f32[], f32[]) tuple(fusion.1, fusion.2)
})"))
.ValueOrDie();
ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
}
TEST_F(InstructionFusionTest, MultiOutputFusionSiblingReduceFusions) {
// Two sibling fusions with reduce instruction roots sharing the same input
// param.

View File

@ -357,7 +357,6 @@ std::list<HloInstruction*> HloComputation::MakeInstructionPostOrder() const {
std::list<HloInstruction*> post_order;
std::list<HloInstruction*> trace_instructions;
tensorflow::gtl::FlatSet<HloInstruction*> added_instructions;
std::vector<HloInstruction> dfs_stack;
for (auto& instruction : instructions_) {
if (instruction->opcode() == HloOpcode::kTrace) {
// Trace instructions aren't handled by the DFS visitor. Add trace

View File

@ -28,6 +28,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
#include "tensorflow/compiler/xla/shape_util.h"
@ -723,17 +725,14 @@ string HloDotDumper::DumpRootTag() {
to_id, node_body, node_shape, NodeColorAttributes(color));
}
static const HloInstruction* TryGetFusionParameterConstant(
static const HloConstantInstruction* TryGetFusionParameterConstant(
const HloInstruction* instr) {
if (instr->opcode() != HloOpcode::kParameter || !instr->IsFused()) {
return nullptr;
}
const HloInstruction* fusion = instr->parent()->FusionInstruction();
const HloInstruction* operand = fusion->operand(instr->parameter_number());
if (operand->opcode() == HloOpcode::kConstant) {
return operand;
}
return nullptr;
return DynCast<HloConstantInstruction>(operand);
}
bool HloDotDumper::ShouldMergeIntoUsers(const HloInstruction* instr) const {
@ -826,7 +825,7 @@ string HloDotDumper::DumpInstruction(const HloInstruction* instr) {
string HloDotDumper::GetInstructionNodeInlinedOperands(
const HloInstruction* instr) {
auto stringify_constant = [](const HloInstruction* constant) {
auto stringify_constant = [](const HloConstantInstruction* constant) {
const auto& shape = constant->shape();
// If the shape has a dimension of size zero, print it as e.g.
@ -845,7 +844,7 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
*elem_count *= dim;
}
}
if (elem_count.has_value() && *elem_count <= 8 && constant->HasLiteral()) {
if (elem_count.has_value() && *elem_count <= 8) {
return Printf("%s (%s)", constant->literal().ToString(),
ShapeUtil::HumanString(constant->shape()));
}
@ -864,9 +863,10 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
std::vector<string> lines;
for (int64 i = 0; i < instr->operand_count(); ++i) {
const HloInstruction* operand = instr->operand(i);
const auto* constant_operand = DynCast<HloConstantInstruction>(operand);
optional<string> operand_str;
if (operand->opcode() == HloOpcode::kConstant) {
operand_str = stringify_constant(operand);
if (constant_operand != nullptr) {
operand_str = stringify_constant(constant_operand);
} else if (ShouldMergeIntoUsers(operand)) {
// Special case: If the operand is a parameter to a fusion node and it
// always has a constant value, display it like a regular constant.
@ -874,7 +874,7 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
// For other parameters, use the parameter number rather than the proper
// name, because that's generally how people think of the node.
if (operand->opcode() == HloOpcode::kParameter) {
if (const HloInstruction* constant =
if (const HloConstantInstruction* constant =
TryGetFusionParameterConstant(operand)) {
operand_str = stringify_constant(constant);
} else {

View File

@ -178,6 +178,23 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
slice_limits, slice_strides);
break;
}
case HloOpcode::kConstant: {
CHECK(proto.has_literal());
TF_ASSIGN_OR_RETURN(auto literal,
Literal::CreateFromProto(proto.literal()));
instruction = CreateConstant(std::move(literal));
break;
}
case HloOpcode::kTrace: {
TF_RET_CHECK(proto.operand_ids_size() == 1)
<< "Trace instruction should have 1 operand but sees "
<< proto.operand_ids_size();
CHECK(proto.has_literal());
TF_ASSIGN_OR_RETURN(auto literal,
Literal::CreateFromProto(proto.literal()));
instruction = CreateTrace(literal->GetR1U8AsString(), operands(0));
break;
}
default: {
instruction = WrapUnique(new HloInstruction(opcode, proto.shape()));
for (const int64 operand_id : proto.operand_ids()) {
@ -223,22 +240,11 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
instruction->called_computations_.push_back(fused_computation);
}
if (instruction->opcode() == HloOpcode::kTrace) {
TF_RET_CHECK(instruction->operands().size() == 1)
<< "Trace instruction should have 1 operand but sees "
<< instruction->operands().size();
instruction->mutable_operand(0)->set_tracing(instruction.get());
}
TF_RET_CHECK(!proto.name().empty());
instruction->SetAndSanitizeName(proto.name());
instruction->metadata_ = proto.metadata();
instruction->backend_config_ = proto.backend_config();
if (proto.has_literal()) {
TF_ASSIGN_OR_RETURN(instruction->literal_,
Literal::CreateFromProto(proto.literal()));
}
instruction->parameter_number_ = proto.parameter_number();
instruction->tuple_index_ = proto.tuple_index();
@ -301,20 +307,12 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateTrace(
const string& tag, HloInstruction* operand) {
auto instruction =
WrapUnique(new HloInstruction(HloOpcode::kTrace, ShapeUtil::MakeNil()));
instruction->operands_.push_back(operand);
instruction->literal_ = Literal::CreateR1U8(tag);
operand->set_tracing(instruction.get());
return instruction;
return MakeUnique<HloTraceInstruction>(tag, operand);
}
/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateConstant(
std::unique_ptr<Literal> literal) {
auto instruction =
WrapUnique(new HloInstruction(HloOpcode::kConstant, literal->shape()));
instruction->literal_ = std::move(literal);
return instruction;
return MakeUnique<HloConstantInstruction>(std::move(literal));
}
/* static */ std::unique_ptr<HloInstruction>
@ -1321,6 +1319,8 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
case HloOpcode::kBroadcast:
case HloOpcode::kMap:
case HloOpcode::kSlice:
case HloOpcode::kConstant:
case HloOpcode::kTrace:
clone = CloneWithNewOperandsImpl(shape, new_operands, context);
break;
// Unary ops.
@ -1470,9 +1470,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
clone =
CreateWhile(shape, while_condition(), while_body(), new_operands[0]);
break;
case HloOpcode::kConstant:
clone = CreateConstant(literal_->CloneToUnique());
break;
case HloOpcode::kFusion: {
HloModule* module = context != nullptr ? context->module() : GetModule();
HloComputation* new_fused_computation = nullptr;
@ -1520,8 +1517,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
case HloOpcode::kGenerateToken:
clone = CreateGenerateToken(new_operands);
break;
case HloOpcode::kTrace:
LOG(FATAL) << "Not yet implemented, clone: " << HloOpcodeString(opcode_);
}
SetupDerivedInstruction(clone.get());
clone->set_parent(parent_);
@ -1602,13 +1597,6 @@ const HloInstruction* HloInstruction::LatestNonGteAncestor() const {
return hlo;
}
const Literal& HloInstruction::literal() const {
CHECK_EQ(HloOpcode::kConstant, opcode_);
return *literal_;
}
bool HloInstruction::HasLiteral() const { return literal_ != nullptr; }
int64 HloInstruction::tuple_index() const {
CHECK_EQ(HloOpcode::kGetTupleElement, opcode_);
return tuple_index_;
@ -1702,10 +1690,6 @@ void HloInstruction::AddUser(HloInstruction* user) {
}
}
bool HloInstruction::IsConstant() const {
return opcode_ == HloOpcode::kConstant;
}
bool HloInstruction::HasConstantOperand() const {
for (const HloInstruction* operand : operands_) {
if (operand->IsConstant()) {
@ -1782,7 +1766,6 @@ bool HloInstruction::IdenticalSlowPath(
// These opcodes have complex or special behavior so just return false.
case HloOpcode::kDomain:
case HloOpcode::kRng:
case HloOpcode::kTrace:
case HloOpcode::kWhile:
case HloOpcode::kGenerateToken:
return false;
@ -1790,10 +1773,6 @@ bool HloInstruction::IdenticalSlowPath(
case HloOpcode::kParameter:
return parameter_number() == other.parameter_number();
// A constant is defined by the value in the literal.
case HloOpcode::kConstant:
return literal() == other.literal();
// A reduce-precision operation is determined by the bit sizes.
case HloOpcode::kReducePrecision:
return exponent_bits() == other.exponent_bits() &&
@ -1878,6 +1857,8 @@ bool HloInstruction::IdenticalSlowPath(
case HloOpcode::kBroadcast:
case HloOpcode::kMap:
case HloOpcode::kSlice:
case HloOpcode::kConstant:
case HloOpcode::kTrace:
LOG(FATAL) << "Base class impl called for opcode with subclass: "
<< opcode();
}
@ -2172,34 +2153,7 @@ string HloInstruction::OperandsToStringWithCanonicalNameMap(
const HloPrintOptions& options,
CanonicalNameMap* canonical_name_map) const {
string operands;
if (opcode() == HloOpcode::kConstant) {
// For constants, show the actual value in place of an empty operand list.
//
// In HloInstruction, sometimes a constant literal is not constructed due
// to its size. Skip the printing in this case.
if (HasLiteral() && ((!ShapeUtil::IsTuple(shape()) &&
ShapeUtil::ElementsIn(shape()) <= 10) ||
options.print_large_constants())) {
// Literal::ToString emits multidimensional arrays over multiple
// lines. Compact this into one line by stripping out white space.
string tmp = literal().ToString();
std::replace(tmp.begin(), tmp.end(), '\n', ' ');
std::vector<string> v = tensorflow::str_util::Split(tmp, ' ');
bool first = true;
// Concatenate elements in "v" with spaces separating them, but ignoring
// empty entries.
for (const auto& s : v) {
if (s.empty()) {
continue;
}
StrAppend(&operands, (first ? "" : " "), s);
first = false;
}
} else {
// Do not show large constants or tuples.
operands = "{...}";
}
} else if (opcode() == HloOpcode::kParameter) {
if (opcode() == HloOpcode::kParameter) {
StrAppend(&operands, parameter_number_);
} else {
tensorflow::gtl::ArraySlice<HloInstruction*> slice(operands_);
@ -2410,9 +2364,6 @@ HloInstructionProto HloInstruction::ToProto() const {
*proto.mutable_metadata() = metadata_;
proto.set_backend_config(backend_config_);
if (literal_ != nullptr) {
*proto.mutable_literal() = literal_->ToProto();
}
proto.set_parameter_number(parameter_number_);
if (opcode() == HloOpcode::kFusion) {
proto.set_fusion_kind(xla::ToString(fusion_kind()));
@ -2518,12 +2469,6 @@ void HloInstruction::set_tracing(HloInstruction* trace_instruction) {
trace_instruction_ = trace_instruction;
}
string HloInstruction::TracingTag() const {
CHECK_EQ(HloOpcode::kTrace, opcode());
CHECK(literal_ != nullptr);
return literal_->GetR1U8AsString();
}
bool HloInstruction::IsFused() const { return parent_->IsFusionComputation(); }
bool HloInstruction::IsFusable() const {
@ -3035,10 +2980,6 @@ bool HloInstruction::IsElementwiseBinary() const {
bool HloInstruction::IsElementwise() const {
switch (opcode_) {
// Nullary elementwise operations.
case HloOpcode::kConstant:
return true;
// Unary elementwise operations.
case HloOpcode::kAbs:
case HloOpcode::kRoundNearestAfz:
@ -3500,23 +3441,6 @@ void HloInstruction::set_outer_dimension_partitions(
outer_dimension_partitions_ = outer_dimension_partitions;
}
void HloInstruction::RelayoutConstant(const Layout& new_layout,
const ShapeIndex& shape_index) {
CHECK_EQ(opcode(), HloOpcode::kConstant);
Shape* mutable_array_subshape =
ShapeUtil::GetMutableSubshape(mutable_shape(), shape_index);
CHECK(ShapeUtil::IsArray(*mutable_array_subshape));
// Normally array_subshape will always have a layout, but this invariant is
// temporarily broken in LayoutAssignment::AssignLayouts.
if (!mutable_array_subshape->has_layout() ||
!LayoutUtil::Equal(mutable_array_subshape->layout(), new_layout)) {
literal_ = literal_->Relayout(new_layout, shape_index);
*mutable_array_subshape->mutable_layout() = new_layout;
}
}
// TODO(b/80131774): Remove these temporary methods after transition.
int64 HloInstruction::feature_index() const {
return Cast<HloBatchNormInstruction>(this)->feature_index();
@ -3574,4 +3498,21 @@ const std::vector<int64>& HloInstruction::slice_strides() const {
bool HloInstruction::IsInPlaceSlice() const {
return Cast<HloSliceInstruction>(this)->IsInPlaceSlice();
}
const Literal& HloInstruction::literal() const {
return Cast<HloConstantInstruction>(this)->literal();
}
bool HloInstruction::IsConstant() const {
return DynCast<HloConstantInstruction>(this) != nullptr;
}
void HloInstruction::RelayoutConstant(const Layout& new_layout,
const ShapeIndex& shape_index) {
Cast<HloConstantInstruction>(this)->RelayoutConstant(new_layout, shape_index);
}
string HloInstruction::TracingTag() const {
return Cast<HloTraceInstruction>(this)->TracingTag();
}
} // namespace xla
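A loose Python analogy for the delegation pattern introduced here (names simplified, not the actual classes): the base class keeps thin literal() and IsConstant() wrappers that forward to, or type-check against, the constant subclass that now owns the literal:

    class Instruction(object):
        def literal(self):
            # Analogue of Cast<HloConstantInstruction>(this)->literal(): fail
            # loudly if this instruction is not a constant.
            assert isinstance(self, ConstantInstruction)
            return self._literal

        def is_constant(self):
            # Analogue of DynCast<HloConstantInstruction>(this) != nullptr.
            return isinstance(self, ConstantInstruction)

    class ConstantInstruction(Instruction):
        def __init__(self, literal):
            self._literal = literal

    c = ConstantInstruction([1.0, 2.0])
    print(c.is_constant(), c.literal())  # True [1.0, 2.0]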

View File

@ -875,14 +875,6 @@ class HloInstruction {
template <typename HloInstructionPtr>
Status Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor);
// Returns the literal associated with this instruction.
//
// Note: only constant and parameter opcodes have an associated literal.
const Literal& literal() const;
// Returns whether there is literal associated with this instruction.
bool HasLiteral() const;
// Returns the parameter number associated with this instruction.
//
// Note: only parameter opcodes have an associated parameter number.
@ -1014,14 +1006,6 @@ class HloInstruction {
string infeed_config() const { return infeed_config_; }
void set_infeed_config(const string& config) { infeed_config_ = config; }
// Returns a tag to be used in tracing.
//
// Precondition: opcode() == HloOpcode::kTrace
string TracingTag() const;
// Returns whether the instruction is a constant.
bool IsConstant() const;
// Returns true if this instruction is fused, ie contained within a fusion
// instruction.
bool IsFused() const;
@ -1452,12 +1436,6 @@ class HloInstruction {
void set_outer_dimension_partitions(
const std::vector<int64>& outer_dimension_partitions);
// Change the layout for a Constant Hlo instruction to match new_layout. For
// tuple shaped constants shape_index is the path to the internal array
// subshape whose layout needs to be changed.
void RelayoutConstant(const Layout& new_layout,
const ShapeIndex& shape_index = {});
// Old methods kept for smooth subclassing transition BEGIN.
// TODO(b/80131774): Remove this code.
@ -1504,6 +1482,19 @@ class HloInstruction {
// Delegates to HloSliceInstruction::IsInPlaceSlice.
bool IsInPlaceSlice() const;
// Returns the literal associated with this instruction.
const Literal& literal() const;
// Returns whether the instruction is a constant.
bool IsConstant() const;
// Delegate to HloConstantInstruction::RelayoutConstant.
void RelayoutConstant(const Layout& new_layout,
const ShapeIndex& shape_index = {});
// Delegates to HloTraceInstruction::TracingTag.
string TracingTag() const;
// Old methods kept for smooth subclassing transition END.
protected:
@ -1544,7 +1535,7 @@ class HloInstruction {
CanonicalNameMap* canonical_name_map) const;
// Prints an operand to a string.
string OperandsToStringWithCanonicalNameMap(
virtual string OperandsToStringWithCanonicalNameMap(
const HloPrintOptions& options,
CanonicalNameMap* canonical_name_map) const;
@ -1639,9 +1630,6 @@ class HloInstruction {
// Result shape of this instruction.
Shape shape_;
// Literal, only present for kConstant.
std::unique_ptr<Literal> literal_;
// Constant index, only present for kGetTupleElement.
int64 tuple_index_ = -1;

View File

@ -20,6 +20,7 @@ limitations under the License.
namespace xla {
using ::tensorflow::str_util::Join;
using ::tensorflow::strings::StrAppend;
using ::tensorflow::strings::StrCat;
HloBatchNormInstruction::HloBatchNormInstruction(
@ -586,4 +587,105 @@ std::unique_ptr<HloInstruction> HloSliceInstruction::CloneWithNewOperandsImpl(
return MakeUnique<HloSliceInstruction>(shape, new_operands[0], slice_starts_,
slice_limits_, slice_strides_);
}
HloConstantInstruction::HloConstantInstruction(std::unique_ptr<Literal> literal)
: HloInstruction(HloOpcode::kConstant, CHECK_NOTNULL(literal)->shape()),
literal_(std::move(literal)) {}
HloInstructionProto HloConstantInstruction::ToProto() const {
HloInstructionProto proto = HloInstruction::ToProto();
*proto.mutable_literal() = literal_->ToProto();
return proto;
}
bool HloConstantInstruction::IsElementwise() const { return true; }
void HloConstantInstruction::RelayoutConstant(const Layout& new_layout,
const ShapeIndex& shape_index) {
Shape* mutable_array_subshape =
ShapeUtil::GetMutableSubshape(mutable_shape(), shape_index);
CHECK(ShapeUtil::IsArray(*mutable_array_subshape));
// Normally array_subshape will always have a layout, but this invariant is
// temporarily broken in LayoutAssignment::AssignLayouts.
if (!mutable_array_subshape->has_layout() ||
!LayoutUtil::Equal(mutable_array_subshape->layout(), new_layout)) {
literal_ = literal_->Relayout(new_layout, shape_index);
*mutable_array_subshape->mutable_layout() = new_layout;
}
}
bool HloConstantInstruction::IdenticalSlowPath(
const HloInstruction& other,
const std::function<bool(const HloComputation*, const HloComputation*)>&
eq_computations) const {
const auto& other_constant = static_cast<const HloConstantInstruction&>(other);
return literal() == other_constant.literal();
}
std::unique_ptr<HloInstruction>
HloConstantInstruction::CloneWithNewOperandsImpl(
const Shape& shape,
tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
HloCloneContext* context) const {
return MakeUnique<HloConstantInstruction>(literal_->CloneToUnique());
}
string HloConstantInstruction::OperandsToStringWithCanonicalNameMap(
const HloPrintOptions& options,
CanonicalNameMap* canonical_name_map) const {
string operands;
// For constants, show the actual value in place of an empty operand list.
if ((!ShapeUtil::IsTuple(shape()) && ShapeUtil::ElementsIn(shape()) <= 10) ||
options.print_large_constants()) {
// Literal::ToString emits multidimensional arrays over multiple
// lines. Compact this into one line by stripping out white space.
string tmp = literal().ToString();
std::replace(tmp.begin(), tmp.end(), '\n', ' ');
std::vector<string> v = tensorflow::str_util::Split(tmp, ' ');
bool first = true;
// Concatenate elements in "v" with spaces separating them, but ignoring
// empty entries.
for (const auto& s : v) {
if (s.empty()) {
continue;
}
StrAppend(&operands, (first ? "" : " "), s);
first = false;
}
} else {
// Do not show large constants or tuples.
operands = "{...}";
}
return operands;
}
HloTraceInstruction::HloTraceInstruction(const string& tag,
HloInstruction* operand)
: HloInstruction(HloOpcode::kTrace, ShapeUtil::MakeNil()),
literal_(Literal::CreateR1U8(tag)) {
AppendOperand(operand);
operand->set_tracing(this);
}
HloInstructionProto HloTraceInstruction::ToProto() const {
HloInstructionProto proto = HloInstruction::ToProto();
*proto.mutable_literal() = literal_->ToProto();
return proto;
}
bool HloTraceInstruction::IdenticalSlowPath(
const HloInstruction& other,
const std::function<bool(const HloComputation*, const HloComputation*)>&
eq_computations) const {
return false;
}
std::unique_ptr<HloInstruction> HloTraceInstruction::CloneWithNewOperandsImpl(
const Shape& shape,
tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
HloCloneContext* context) const {
LOG(FATAL) << "Not yet implemented, clone: " << HloOpcodeString(opcode());
}
} // namespace xla
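A small Python sketch of the literal-compaction step in OperandsToStringWithCanonicalNameMap; the sample string stands in for a multi-line Literal::ToString() output (its exact format is an assumption):

    literal_str = "f32[2,2] {\n  { 1, 2 },\n  { 3, 4 }\n}"
    # Strip newlines, split on spaces, and re-join the non-empty tokens.
    compact = " ".join(t for t in literal_str.replace("\n", " ").split(" ") if t)
    print(compact)  # f32[2,2] { { 1, 2 }, { 3, 4 } }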

View File

@ -433,6 +433,62 @@ class HloSliceInstruction : public HloInstruction {
// Describes whether the slice can be lowered to an offset into the operand.
bool is_in_place_slice_ = false;
};
class HloConstantInstruction : public HloInstruction {
public:
explicit HloConstantInstruction(std::unique_ptr<Literal> literal);
// Returns the literal associated with this instruction.
const Literal& literal() const { return *literal_; }
// Returns a serialized representation of this instruction.
HloInstructionProto ToProto() const override;
// Returns true if this instruction is elementwise on all its operands.
bool IsElementwise() const override;
// Change the layout for a Constant Hlo instruction to match new_layout. For
// tuple shaped constants shape_index is the path to the internal array
// subshape whose layout needs to be changed.
void RelayoutConstant(const Layout& new_layout,
const ShapeIndex& shape_index = {});
private:
bool IdenticalSlowPath(
const HloInstruction& other,
const std::function<bool(const HloComputation*, const HloComputation*)>&
eq_computations) const override;
string OperandsToStringWithCanonicalNameMap(
const HloPrintOptions& options,
CanonicalNameMap* canonical_name_map) const override;
// Implementation for non-common logic of CloneWithNewOperands.
std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
const Shape& shape,
tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
HloCloneContext* context) const override;
// TODO(b/36360764): Remove unique_ptr wrapping.
std::unique_ptr<Literal> literal_;
};
class HloTraceInstruction : public HloInstruction {
public:
explicit HloTraceInstruction(const string& tag, HloInstruction* operand);
// Returns a tag to be used in tracing.
string TracingTag() const { return literal_->GetR1U8AsString(); }
// Returns a serialized representation of this instruction.
HloInstructionProto ToProto() const override;
private:
bool IdenticalSlowPath(
const HloInstruction& other,
const std::function<bool(const HloComputation*, const HloComputation*)>&
eq_computations) const override;
// Implementation for non-common logic of CloneWithNewOperands.
std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
const Shape& shape,
tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
HloCloneContext* context) const override;
// TODO(b/36360764): Remove unique_ptr wrapping.
std::unique_ptr<Literal> literal_;
};
} // namespace xla
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTIONS_H_

View File

@ -127,9 +127,14 @@ Status HloModuleGroupMetadata::VerifyCompanionSets() const {
for (HloInstruction* instruction : *companions) {
// Go through all the communicating instructions (send, recv) of the given
// companion, and record their device.
auto it = tracked_instructions_comms_.find(instruction);
if (it == tracked_instructions_comms_.end()) {
// Companions can be added even if they have no communicating
// instructions, if they are the parent of other companions.
continue;
}
std::unordered_set<int64> comm_devices;
for (HloInstruction* comm_instruction :
tracked_instructions_comms_.at(instruction)) {
for (HloInstruction* comm_instruction : it->second) {
auto device = GetInstructionDevice(*comm_instruction);
TF_RET_CHECK(device) << "Instruction " << comm_instruction->ToString()
<< " does not have a device";
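The fix above replaces an unconditional .at() lookup with a find() that may miss, so companions without communicating instructions are skipped instead of crashing; a tiny Python analogue of the same pattern (map contents invented for illustration):

    tracked_instructions_comms = {"while_1": ["send_0", "recv_0"]}

    for instruction in ("while_1", "parent_only_companion"):
        comms = tracked_instructions_comms.get(instruction)
        if comms is None:
            # Legal case: a companion with no communicating instructions.
            # Indexing directly (the old .at() behavior) would have raised.
            continue
        print(instruction, comms)  # only prints for "while_1"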

View File

@ -232,7 +232,13 @@ def _dnn_tree_combined_model_fn(features,
return update_op
if predict_with_tree_only:
tree_train_logits = tree_logits
if mode == model_fn.ModeKeys.TRAIN or mode == model_fn.ModeKeys.PREDICT:
tree_train_logits = tree_logits
else:
tree_train_logits = control_flow_ops.cond(
global_step > dnn_steps_to_train,
lambda: tree_logits,
lambda: dnn_logits)
else:
tree_train_logits = dnn_logits + tree_logits
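A plain-Python sketch of the new logits selection, with string mode keys standing in for model_fn.ModeKeys and scalars standing in for the logit tensors (the real code uses control_flow_ops.cond for the eval case):

    def pick_logits(predict_with_tree_only, mode, global_step,
                    dnn_steps_to_train, dnn_logits, tree_logits):
        if predict_with_tree_only:
            if mode in ("train", "predict"):
                return tree_logits
            # Eval: report DNN logits until the DNN-only phase ends, then tree.
            return tree_logits if global_step > dnn_steps_to_train else dnn_logits
        return dnn_logits + tree_logits

    print(pick_logits(True, "eval", global_step=10, dnn_steps_to_train=100,
                      dnn_logits=0.2, tree_logits=0.7))  # 0.2 (still DNN phase)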

View File

@ -36,6 +36,7 @@ except ImportError:
_GKE_ENV_VARIABLE = 'KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'
_ENDPOINTS_SEPARATOR = ','
_DEFAULT_ENV_VARIABLE = 'TPU_NAME'
_DISCOVERY_SERVICE_URL_ENV_VARIABLE = 'TPU_API_DISCOVERY_URL'
@ -69,8 +70,8 @@ class TPUClusterResolver(ClusterResolver):
return _GKE_ENV_VARIABLE in os.environ
@staticmethod
def _gkeMaster():
return os.environ[_GKE_ENV_VARIABLE].split(',')[0]
def _gkeEndpoints():
return os.environ[_GKE_ENV_VARIABLE]
@staticmethod
def _envVarFallback():
@ -143,7 +144,7 @@ class TPUClusterResolver(ClusterResolver):
# When using GKE with Cloud TPUs, the env variable will be set.
if tpu is None:
if in_gke:
tpu = self._gkeMaster()
tpu = self._gkeEndpoints()
else:
tpu = self._envVarFallback()
@ -214,7 +215,7 @@ class TPUClusterResolver(ClusterResolver):
ValueError: If none of the TPUs specified exists.
"""
if not self._shouldResolve():
return self._tpu
return self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR))[0]
job_tasks = self.cluster_spec().job_tasks(self._job_name)
if not job_tasks:
@ -280,8 +281,12 @@ class TPUClusterResolver(ClusterResolver):
# Case 3.
return None
# Case 2.
cluster_spec = {self._job_name: [self._tpu[len(
compat.as_bytes('grpc://')):]]}
cluster_spec = {
self._job_name: [
x[len(compat.as_bytes('grpc://')):]
for x in self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR))
]
}
if self._coordinator_address:
# {1, 2}.a
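A plain-Python sketch of how the resolver now treats the GKE variable as a comma-separated endpoint list, matching the master() and cluster_spec() behavior exercised in the tests below (string parsing only, no TPUClusterResolver involved):

    _ENDPOINTS_SEPARATOR = ','
    gke_endpoints = ('grpc://10.120.27.5:8470,grpc://10.120.27.6:8470,'
                     'grpc://10.120.27.7:8470,grpc://10.120.27.8:8470')

    endpoints = gke_endpoints.split(_ENDPOINTS_SEPARATOR)
    master = endpoints[0]  # master() returns only the first endpoint
    cluster_spec = {'worker': [e[len('grpc://'):] for e in endpoints]}
    print(master)        # grpc://10.120.27.5:8470
    print(cluster_spec)  # {'worker': ['10.120.27.5:8470', ..., '10.120.27.8:8470']}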

View File

@ -402,13 +402,61 @@ class TPUClusterResolverTest(test.TestCase):
compat.as_bytes('/bns/foo/bar'), tpu_cluster_resolver.master())
self.assertEqual(None, tpu_cluster_resolver.cluster_spec())
def testGkeEnvironment(self):
def testGkeEnvironmentForDonut(self):
os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = 'grpc://10.120.27.5:8470'
self.assertTrue('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS' in os.environ)
self.assertIn('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS', os.environ)
self.assertTrue(TPUClusterResolver._inGke())
self.assertEqual(
compat.as_bytes('grpc://10.120.27.5:8470'),
compat.as_bytes(TPUClusterResolver._gkeMaster()))
compat.as_bytes(TPUClusterResolver._gkeEndpoints()))
tpu_cluster_resolver = TPUClusterResolver()
self.assertEqual(
compat.as_bytes('grpc://10.120.27.5:8470'),
compat.as_bytes(tpu_cluster_resolver.master()))
actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
expected_proto = """
job {
name: 'worker'
tasks { key: 0 value: '10.120.27.5:8470' }
}
"""
self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
del os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS']
def testGkeEnvironmentForPod(self):
os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = ('grpc://10.120.27.5:8470,'
'grpc://10.120.27.6:8470,'
'grpc://10.120.27.7:8470,'
'grpc://10.120.27.8:8470')
self.assertIn('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS', os.environ)
self.assertTrue(TPUClusterResolver._inGke())
self.assertEqual(
compat.as_bytes('grpc://10.120.27.5:8470,'
'grpc://10.120.27.6:8470,'
'grpc://10.120.27.7:8470,'
'grpc://10.120.27.8:8470'),
compat.as_bytes(TPUClusterResolver._gkeEndpoints()))
tpu_cluster_resolver = TPUClusterResolver()
self.assertEqual(
compat.as_bytes('grpc://10.120.27.5:8470'),
compat.as_bytes(tpu_cluster_resolver.master()))
actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
expected_proto = """
job {
name: 'worker'
tasks { key: 0 value: '10.120.27.5:8470' }
tasks { key: 1 value: '10.120.27.6:8470' }
tasks { key: 2 value: '10.120.27.7:8470' }
tasks { key: 3 value: '10.120.27.8:8470' }
}
"""
self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
del os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS']
def testDiscoveryUrl(self):

View File

@ -18,7 +18,16 @@ cmake_policy(SET CMP0022 NEW)
# Options
option(tensorflow_VERBOSE "Enable for verbose output" OFF)
if(WIN32)
# BoringSSL is disabled for windows as it currently doesn't build with
# MSBuild. (Ninja is required.)
option(tensorflow_ENABLE_SSL_SUPPORT "Enable boringssl support" OFF)
else()
# BoringSSL is enabled for gRPC.
option(tensorflow_ENABLE_SSL_SUPPORT "Enable boringssl support" ON)
endif()
option(tensorflow_ENABLE_GRPC_SUPPORT "Enable gRPC support" ON)
option(tensorflow_ENABLE_HDFS_SUPPORT "Enable HDFS support" OFF)
option(tensorflow_ENABLE_JEMALLOC_SUPPORT "Enable jemalloc support" OFF)

View File

@ -20,6 +20,10 @@ set(GRPC_BUILD ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc)
set(GRPC_TAG d184fa229d75d336aedea0041bd59cb93e7e267f)
if(WIN32)
# We use unsecure gRPC because boringssl does not build on windows
set(grpc_TARGET grpc++_unsecure)
set(grpc_DEPENDS protobuf zlib)
set(grpc_SSL_PROVIDER NONE)
if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
set(grpc_STATIC_LIBRARIES
${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/grpc++_unsecure.lib
@ -32,9 +36,12 @@ if(WIN32)
${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/gpr.lib)
endif()
else()
set(grpc_TARGET grpc++)
set(grpc_DEPENDS boringssl protobuf zlib)
set(grpc_SSL_PROVIDER module)
set(grpc_STATIC_LIBRARIES
${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++_unsecure.a
${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc_unsecure.a
${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++.a
${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc.a
${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libaddress_sorting.a
${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/third_party/cares/cares/lib/libcares.a
${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a)
@ -44,13 +51,13 @@ add_definitions(-DGRPC_ARES=0)
ExternalProject_Add(grpc
PREFIX grpc
DEPENDS protobuf zlib
DEPENDS ${grpc_DEPENDS}
GIT_REPOSITORY ${GRPC_URL}
GIT_TAG ${GRPC_TAG}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
BUILD_IN_SOURCE 1
BUILD_BYPRODUCTS ${grpc_STATIC_LIBRARIES}
BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release --target grpc++_unsecure
BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release --target ${grpc_TARGET}
COMMAND ${CMAKE_COMMAND} --build . --config Release --target grpc_cpp_plugin
INSTALL_COMMAND ""
CMAKE_CACHE_ARGS
@ -59,7 +66,7 @@ ExternalProject_Add(grpc
-DPROTOBUF_INCLUDE_DIRS:STRING=${PROTOBUF_INCLUDE_DIRS}
-DPROTOBUF_LIBRARIES:STRING=${protobuf_STATIC_LIBRARIES}
-DZLIB_ROOT:STRING=${ZLIB_INSTALL}
-DgRPC_SSL_PROVIDER:STRING=NONE
-DgRPC_SSL_PROVIDER:STRING=${grpc_SSL_PROVIDER}
)
# grpc/src/core/ext/census/tracing.c depends on the existence of openssl/rand.h.

View File

@ -77,6 +77,7 @@ py_library(
"//tensorflow/python:device_util",
"//tensorflow/python:distribute",
"//tensorflow/python:framework_ops",
"//tensorflow/python:math_ops",
"//tensorflow/python:pywrap_tensorflow",
"//tensorflow/python:training",
"//tensorflow/python:variable_scope",
@ -590,3 +591,22 @@ cuda_py_test(
"notsan",
],
)
cuda_py_test(
name = "metrics_v1_test",
srcs = ["metrics_v1_test.py"],
additional_deps = [
":combinations",
"@absl_py//absl/testing:parameterized",
"//tensorflow/contrib/data/python/ops:batching",
"//tensorflow/python:math_ops",
"//tensorflow/python:metrics",
"//tensorflow/python:variables",
"//tensorflow/python/data/ops:dataset_ops",
"//tensorflow/python/eager:test",
],
tags = [
"multi_and_single_gpu",
"no_pip",
],
)

View File

@ -0,0 +1,438 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for V1 metrics."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl.testing import parameterized
from tensorflow.contrib.data.python.ops import batching
from tensorflow.contrib.distribute.python import combinations
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.eager import test
from tensorflow.python.framework import ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import metrics
from tensorflow.python.ops import variables
def _labeled_dataset_fn():
# First four batches of x: labels, predictions -> (labels == predictions)
# 0: 0, 0 -> True; 1: 1, 1 -> True; 2: 2, 2 -> True; 3: 3, 0 -> False
# 4: 4, 1 -> False; 5: 0, 2 -> False; 6: 1, 0 -> False; 7: 2, 1 -> False
# 8: 3, 2 -> False; 9: 4, 0 -> False; 10: 0, 1 -> False; 11: 1, 2 -> False
# 12: 2, 0 -> False; 13: 3, 1 -> False; 14: 4, 2 -> False; 15: 0, 0 -> True
return dataset_ops.Dataset.range(1000).map(
lambda x: {"labels": x % 5, "predictions": x % 3}).batch(4)
def _boolean_dataset_fn():
# First four batches of labels, predictions: {TP, FP, TN, FN}
# with a threshold of 0.5:
# T, T -> TP; F, T -> FP; T, F -> FN
# F, F -> TN; T, T -> TP; F, T -> FP
# T, F -> FN; F, F -> TN; T, T -> TP
# F, T -> FP; T, F -> FN; F, F -> TN
return dataset_ops.Dataset.from_tensor_slices({
"labels": [True, False, True, False],
"predictions": [True, True, False, False]}).repeat().batch(3)
def _threshold_dataset_fn():
# First four batches of labels, predictions: {TP, FP, TN, FN}
# with a threshold of 0.5:
# True, 1.0 -> TP; False, .75 -> FP; True, .25 -> FN
# False, 0.0 -> TN; True, 1.0 -> TP; False, .75 -> FP
# True, .25 -> FN; False, 0.0 -> TN; True, 1.0 -> TP
# False, .75 -> FP; True, .25 -> FN; False, 0.0 -> TN
return dataset_ops.Dataset.from_tensor_slices({
"labels": [True, False, True, False],
"predictions": [1.0, 0.75, 0.25, 0.]}).repeat().batch(3)
def _regression_dataset_fn():
return dataset_ops.Dataset.from_tensor_slices({
"labels": [1., .5, 1., 0.],
"predictions": [1., .75, .25, 0.]}).repeat()
def all_combinations():
return combinations.combine(
distribution=[combinations.default_strategy,
combinations.one_device_strategy,
combinations.mirrored_strategy_with_gpu_and_cpu,
combinations.mirrored_strategy_with_two_gpus],
mode=["graph"])
# TODO(josh11b): Test metrics.recall_at_top_k, metrics.average_precision_at_k,
# metrics.precision_at_k
class MetricsV1Test(test.TestCase, parameterized.TestCase):
def _test_metric(self, distribution, dataset_fn, metric_fn, expected_fn):
with ops.Graph().as_default(), distribution.scope():
iterator = distribution.distribute_dataset(
dataset_fn).make_one_shot_iterator()
value, update = distribution.call_for_each_tower(
metric_fn, iterator.get_next())
update = distribution.group(update)
self.evaluate(variables.local_variables_initializer())
# TODO(josh11b): Once we switch to using a global batch size for input,
# replace "distribution.num_towers" with "1".
batches_per_update = distribution.num_towers
# Update variables using the first `num_towers` batches.
self.evaluate(update)
self.assertAllClose(expected_fn(batches_per_update), self.evaluate(value),
0.001, msg="After first update")
# Update variables using the second `num_towers` batches.
self.evaluate(update)
self.assertAllClose(expected_fn(2 * batches_per_update),
self.evaluate(value),
0.001,
msg="After second update")
if batches_per_update == 1: # Consume 4 input batches
self.evaluate(update)
self.assertAllClose(expected_fn(3 * batches_per_update),
self.evaluate(value),
0.001,
msg="After third update")
self.evaluate(update)
self.assertAllClose(expected_fn(4 * batches_per_update),
self.evaluate(value),
0.001,
msg="After fourth update")
@combinations.generate(all_combinations())
def testMean(self, distribution):
def _dataset_fn():
return dataset_ops.Dataset.range(1000).map(math_ops.to_float).batch(4)
def _expected_fn(num_batches):
# Mean(0..3) = 1.5, Mean(0..7) = 3.5, Mean(0..11) = 5.5, etc.
return num_batches * 2 - 0.5
self._test_metric(distribution, _dataset_fn, metrics.mean, _expected_fn)
@combinations.generate(all_combinations())
def testAccuracy(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.accuracy(labels, predictions)
def _expected_fn(num_batches):
return [3./4, 3./8, 3./12, 4./16][num_batches - 1]
self._test_metric(
distribution, _labeled_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testMeanPerClassAccuracy(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.mean_per_class_accuracy(
labels, predictions, num_classes=5)
def _expected_fn(num_batches):
mean = lambda x: sum(x) / len(x)
return [mean([1., 1., 1., 0., 0.]),
mean([0.5, 0.5, 0.5, 0., 0.]),
mean([1./3, 1./3, 0.5, 0., 0.]),
mean([0.5, 1./3, 1./3, 0., 0.])][num_batches - 1]
self._test_metric(
distribution, _labeled_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testMeanIOU(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.mean_iou(
labels, predictions, num_classes=5)
def _expected_fn(num_batches):
mean = lambda x: sum(x) / len(x)
return [mean([1./2, 1./1, 1./1, 0.]), # no class 4 in first batch
mean([1./4, 1./4, 1./3, 0., 0.]),
mean([1./6, 1./6, 1./5, 0., 0.]),
mean([2./8, 1./7, 1./7, 0., 0.])][num_batches - 1]
self._test_metric(
distribution, _labeled_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testMeanTensor(self, distribution):
def _dataset_fn():
dataset = dataset_ops.Dataset.range(1000).map(math_ops.to_float)
# Want to produce a fixed, known shape, so drop remainder when batching.
dataset = dataset.apply(batching.batch_and_drop_remainder(4))
return dataset
def _expected_fn(num_batches):
# Mean(0, 4, ..., 4 * num_batches - 4) == 2 * num_batches - 2
# Mean(1, 5, ..., 4 * num_batches - 3) == 2 * num_batches - 1
# Mean(2, 6, ..., 4 * num_batches - 2) == 2 * num_batches
# Mean(3, 7, ..., 4 * num_batches - 1) == 2 * num_batches + 1
first = 2. * num_batches - 2.
return [first, first + 1., first + 2., first + 3.]
self._test_metric(
distribution, _dataset_fn, metrics.mean_tensor, _expected_fn)
@combinations.generate(all_combinations())
def testAUCROC(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.auc(labels, predictions, num_thresholds=8, curve="ROC",
summation_method="careful_interpolation")
def _expected_fn(num_batches):
return [0.5, 7./9, 0.8, 0.75][num_batches - 1]
self._test_metric(
distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testAUCPR(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.auc(labels, predictions, num_thresholds=8, curve="PR",
summation_method="careful_interpolation")
def _expected_fn(num_batches):
return [0.797267, 0.851238, 0.865411, 0.797267][num_batches - 1]
self._test_metric(
distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testFalseNegatives(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.false_negatives(labels, predictions)
def _expected_fn(num_batches):
return [1., 1., 2., 3.][num_batches - 1]
self._test_metric(
distribution, _boolean_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testFalseNegativesAtThresholds(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.false_negatives_at_thresholds(labels, predictions, [.5])
def _expected_fn(num_batches):
return [[1.], [1.], [2.], [3.]][num_batches - 1]
self._test_metric(
distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testTrueNegatives(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.true_negatives(labels, predictions)
def _expected_fn(num_batches):
return [0., 1., 2., 3.][num_batches - 1]
self._test_metric(
distribution, _boolean_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testTrueNegativesAtThresholds(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.true_negatives_at_thresholds(labels, predictions, [.5])
def _expected_fn(num_batches):
return [[0.], [1.], [2.], [3.]][num_batches - 1]
self._test_metric(
distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testFalsePositives(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.false_positives(labels, predictions)
def _expected_fn(num_batches):
return [1., 2., 2., 3.][num_batches - 1]
self._test_metric(
distribution, _boolean_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testFalsePositivesAtThresholds(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.false_positives_at_thresholds(labels, predictions, [.5])
def _expected_fn(num_batches):
return [[1.], [2.], [2.], [3.]][num_batches - 1]
self._test_metric(
distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testTruePositives(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.true_positives(labels, predictions)
def _expected_fn(num_batches):
return [1., 2., 3., 3.][num_batches - 1]
self._test_metric(
distribution, _boolean_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testTruePositivesAtThresholds(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.true_positives_at_thresholds(labels, predictions, [.5])
def _expected_fn(num_batches):
return [[1.], [2.], [3.], [3.]][num_batches - 1]
self._test_metric(
distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testPrecision(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.precision(labels, predictions)
def _expected_fn(num_batches):
return [0.5, 0.5, 0.6, 0.5][num_batches - 1]
self._test_metric(
distribution, _boolean_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testPrecisionAtThreshold(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.precision_at_thresholds(labels, predictions, [0.5])
def _expected_fn(num_batches):
return [[0.5], [0.5], [0.6], [0.5]][num_batches - 1]
self._test_metric(
distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testRecall(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.recall(labels, predictions)
def _expected_fn(num_batches):
return [0.5, 2./3, 0.6, 0.5][num_batches - 1]
self._test_metric(
distribution, _boolean_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testRecallAtThreshold(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.recall_at_thresholds(labels, predictions, [0.5])
def _expected_fn(num_batches):
return [[0.5], [2./3], [0.6], [0.5]][num_batches - 1]
self._test_metric(
distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testMeanSquaredError(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.mean_squared_error(labels, predictions)
def _expected_fn(num_batches):
return [0., 1./32, 0.208333, 0.15625][num_batches - 1]
self._test_metric(
distribution, _regression_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testRootMeanSquaredError(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.root_mean_squared_error(labels, predictions)
def _expected_fn(num_batches):
return [0., 0.176777, 0.456435, 0.395285][num_batches - 1]
self._test_metric(
distribution, _regression_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testSensitivityAtSpecificity(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.sensitivity_at_specificity(labels, predictions, 0.8)
def _expected_fn(num_batches):
return [0.5, 2./3, 0.6, 0.5][num_batches - 1]
self._test_metric(
distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
@combinations.generate(all_combinations())
def testSpecificityAtSensitivity(self, distribution):
def _metric_fn(x):
labels = x["labels"]
predictions = x["predictions"]
return metrics.specificity_at_sensitivity(labels, predictions, 0.95)
def _expected_fn(num_batches):
return [0., 1./3, 0.5, 0.5][num_batches - 1]
self._test_metric(
distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
if __name__ == "__main__":
test.main()
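A standalone check of the expectations encoded in _labeled_dataset_fn and testAccuracy: labels are x % 5, predictions are x % 3, batches hold 4 elements, so the running accuracy after 1..4 batches should be 3/4, 3/8, 3/12, 4/16:

    def accuracy_after(num_batches):
        n = 4 * num_batches
        matches = sum(1 for x in range(n) if x % 5 == x % 3)
        return matches / float(n)

    print([accuracy_after(k) for k in (1, 2, 3, 4)])  # [0.75, 0.375, 0.25, 0.25]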

View File

@ -31,6 +31,7 @@ from tensorflow.python.eager import tape
from tensorflow.python.framework import device as tf_device
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.training import coordinator
from tensorflow.python.training import device_util
@ -343,6 +344,13 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
**values.select_device_mirrored(d, kwargs))
return values.regroup(updates, values.Mirrored)
def read_var(self, tower_local_var):
"""Read the aggregate value of a tower-local variable."""
if isinstance(tower_local_var, values.TowerLocalVariable):
return math_ops.add_n(self.unwrap(tower_local_var))
assert isinstance(tower_local_var, values.Mirrored)
return array_ops.identity(tower_local_var.get())
def _fetch(self, val, destination, fn):
"""Return a copy of `val` or `fn(val)` on `destination`."""
if isinstance(val, values.TowerLocalVariable):

View File

@ -102,6 +102,10 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
with ops.device(self._device), distribute_lib.UpdateContext(self._device):
return fn(*args, **kwargs)
def read_var(self, tower_local_var):
"""Read the aggregate value of a tower-local variable."""
return array_ops.identity(tower_local_var)
def _fetch(self, val, destination, fn):
"""Return a copy of `val` or `fn(val)` on `destination`."""
with ops.device(self._device):

View File

@ -29,7 +29,9 @@ from tensorflow.contrib.distributions.python.ops import mvn_diag
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import tensor_shape
from tensorflow.python.framework import test_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.ops.distributions import categorical
from tensorflow.python.ops.distributions import normal
from tensorflow.python.ops.linalg import linear_operator_diag
@ -540,5 +542,51 @@ class PadDynamicTest(_PadTest, test.TestCase):
return False
class TestMoveDimension(test.TestCase):
@test_util.run_in_graph_and_eager_modes()
def test_move_dimension_static_shape(self):
x = random_ops.random_normal(shape=[200, 30, 4, 1, 6])
x_perm = distribution_util.move_dimension(x, 1, 1)
self.assertAllEqual(x_perm.shape.as_list(), [200, 30, 4, 1, 6])
x_perm = distribution_util.move_dimension(x, 0, 3)
self.assertAllEqual(x_perm.shape.as_list(), [30, 4, 1, 200, 6])
x_perm = distribution_util.move_dimension(x, 0, -2)
self.assertAllEqual(x_perm.shape.as_list(), [30, 4, 1, 200, 6])
x_perm = distribution_util.move_dimension(x, 4, 2)
self.assertAllEqual(x_perm.shape.as_list(), [200, 30, 6, 4, 1])
@test_util.run_in_graph_and_eager_modes()
def test_move_dimension_dynamic_shape(self):
x_ = random_ops.random_normal(shape=[200, 30, 4, 1, 6])
x = array_ops.placeholder_with_default(input=x_, shape=None)
x_perm = distribution_util.move_dimension(x, 1, 1)
self.assertAllEqual(self.evaluate(array_ops.shape(x_perm)),
[200, 30, 4, 1, 6])
x_perm = distribution_util.move_dimension(x, 0, 3)
self.assertAllEqual(self.evaluate(array_ops.shape(x_perm)),
[30, 4, 1, 200, 6])
x_perm = distribution_util.move_dimension(x, 0, -2)
self.assertAllEqual(self.evaluate(array_ops.shape(x_perm)),
[30, 4, 1, 200, 6])
x_perm = distribution_util.move_dimension(x, 4, 2)
self.assertAllEqual(self.evaluate(array_ops.shape(x_perm)),
[200, 30, 6, 4, 1])
x_perm = distribution_util.move_dimension(x, -1, 2)
self.assertAllEqual(self.evaluate(array_ops.shape(x_perm)),
[200, 30, 6, 4, 1])
if __name__ == "__main__":
test.main()

View File

@ -21,12 +21,19 @@ from __future__ import print_function
from tensorflow.contrib import linalg
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import smart_cond
from tensorflow.python.framework import tensor_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops.distributions import distribution as distribution_lib
# The following two lines are redundant, in a sense. The first enables
# good coding practice *within* this file (`util.prefer_static_value`
# rather than `prefer_static_value`). The second ensures that users
# also get the core utils when they import this file.
from tensorflow.python.ops.distributions import util
from tensorflow.python.ops.distributions.util import * # pylint: disable=wildcard-import
@ -484,3 +491,75 @@ def pad_mixture_dimensions(x, mixture_distribution, categorical_distribution,
def static_value(x):
"""Returns the static value of a `Tensor` or `None`."""
return tensor_util.constant_value(ops.convert_to_tensor(x))
def move_dimension(x, source_idx, dest_idx):
"""Move a single tensor dimension within its shape.
This is a special case of `tf.transpose()`, which applies
arbitrary permutations to tensor dimensions.
Args:
x: Tensor of rank `ndims`.
source_idx: Integer index into `x.shape` (negative indexing is
supported).
dest_idx: Integer index into `x.shape` (negative indexing is
supported).
Returns:
x_perm: Tensor of rank `ndims`, in which the dimension at original
index `source_idx` has been moved to new index `dest_idx`, with
all other dimensions retained in their original order.
Example:
```python
x = tf.placeholder(tf.float32, shape=[200, 30, 4, 1, 6])
x_perm = move_dimension(x, 1, 1)  # no-op
x_perm = move_dimension(x, 0, 3)  # result shape [30, 4, 1, 200, 6]
x_perm = move_dimension(x, 0, -2)  # equivalent to previous
x_perm = move_dimension(x, 4, 2)  # result shape [200, 30, 6, 4, 1]
```
"""
ndims = util.prefer_static_rank(x)
if isinstance(source_idx, int):
dtype = dtypes.int32
else:
dtype = dtypes.as_dtype(source_idx.dtype)
# Handle negative indexing. Since ndims might be dynamic, this makes
# source_idx and dest_idx also possibly dynamic.
if source_idx < 0:
source_idx = ndims + source_idx
if dest_idx < 0:
dest_idx = ndims + dest_idx
# Construct the appropriate permutation of dimensions, depending
# on whether the source is before or after the destination.
def move_left_permutation():
return util.prefer_static_value(
array_ops.concat([
math_ops.range(0, dest_idx, dtype=dtype),
[source_idx],
math_ops.range(dest_idx, source_idx, dtype=dtype),
math_ops.range(source_idx+1, ndims, dtype=dtype)], axis=0))
def move_right_permutation():
return util.prefer_static_value(
array_ops.concat([
math_ops.range(0, source_idx, dtype=dtype),
math_ops.range(source_idx+1, dest_idx+1, dtype=dtype),
[source_idx],
math_ops.range(dest_idx+1, ndims, dtype=dtype)], axis=0))
def x_permuted():
return array_ops.transpose(
x, perm=smart_cond.smart_cond(source_idx < dest_idx,
move_right_permutation,
move_left_permutation))
# One final conditional to handle the special case where source
# and destination indices are equal.
return smart_cond.smart_cond(math_ops.equal(source_idx, dest_idx),
lambda: x,
x_permuted)
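For a quick sanity check of the permutation semantics, here is a minimal sketch using NumPy's `np.moveaxis` as an assumed stand-in for the TensorFlow utility above:

```python
import numpy as np

# np.moveaxis mirrors the "move one dimension" behavior that move_dimension
# builds via tf.transpose.
x = np.zeros([200, 30, 4, 1, 6])
print(np.moveaxis(x, 0, 3).shape)   # (30, 4, 1, 200, 6)
print(np.moveaxis(x, 0, -2).shape)  # (30, 4, 1, 200, 6), same as above
print(np.moveaxis(x, 4, 2).shape)   # (200, 30, 6, 4, 1)
```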

View File

@ -529,6 +529,7 @@ def multi_label_head(n_classes,
applications, the shape is `[batch_size, n_classes]`.
Labels can be:
* A multi-hot tensor of shape `[D0, D1, ... DN, n_classes]`
* An integer `SparseTensor` of class indices. The `dense_shape` must be
`[D0, D1, ... DN, ?]` and the values within `[0, n_classes)`.
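For intuition, the two accepted label encodings could look like this minimal sketch (the batch of two examples with `n_classes=4` is an assumption for illustration):

```python
import tensorflow as tf

# Multi-hot dense labels: example 0 has classes {0, 2}, example 1 has {3}.
dense_labels = tf.constant([[1, 0, 1, 0],
                            [0, 0, 0, 1]])
# The same labels as an integer SparseTensor of class indices; dense_shape is
# [batch_size, ?] and every value lies in [0, n_classes).
sparse_labels = tf.SparseTensor(indices=[[0, 0], [0, 1], [1, 0]],
                                values=[0, 2, 3],
                                dense_shape=[2, 2])
```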

View File

@ -28,7 +28,6 @@ from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import functional_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import tensor_array_ops
@ -279,13 +278,27 @@ def _assert_increasing(t):
return ops.control_dependencies([assert_increasing])
def _check_input_types(t, y0):
def _check_input_types(y0, t, dt=None):
if not (y0.dtype.is_floating or y0.dtype.is_complex):
raise TypeError('`y0` must have a floating point or complex floating '
'point dtype')
if not t.dtype.is_floating:
raise TypeError('`t` must have a floating point dtype')
if dt is not None and not dt.dtype.is_floating:
raise TypeError('`dt` must have a floating point dtype')
def _check_input_sizes(t, dt):
if len(t.get_shape().as_list()) > 1:
raise ValueError('t must be a 1D tensor')
if len(dt.get_shape().as_list()) > 1:
raise ValueError('dt must be a 1D tensor')
if t.get_shape()[0] != dt.get_shape()[0] + 1:
raise ValueError('t and dt have incompatible lengths, must be N and N-1')
def _dopri5(func,
y0,
@ -510,7 +523,7 @@ def odeint(func,
# avoiding the need to pack/unpack in user functions.
y0 = ops.convert_to_tensor(y0, name='y0')
t = ops.convert_to_tensor(t, preferred_dtype=dtypes.float64, name='t')
_check_input_types(t, y0)
_check_input_types(y0, t)
error_dtype = abs(y0).dtype
rtol = ops.convert_to_tensor(rtol, dtype=error_dtype, name='rtol')
@ -530,24 +543,74 @@ def odeint(func,
class _FixedGridIntegrator(six.with_metaclass(abc.ABCMeta)):
"""Base class for fixed-grid ODE integrators."""
def integrate(self, evol_func, y0, time_grid):
time_delta_grid = time_grid[1:] - time_grid[:-1]
def integrate(self, evol_func, y0, time_grid, dt_grid, steps_on_intervals):
"""Returns integrated values of differential equation on the `time grid`.
scan_func = self._make_scan_func(evol_func)
Numerically integrates the differential equation defined via the time
derivative evaluator `evol_func`, using fixed time steps specified in `dt_grid`.
y_grid = functional_ops.scan(scan_func, (time_grid[:-1], time_delta_grid),
y0)
return array_ops.concat([[y0], y_grid], axis=0)
Args:
evol_func: Callable, evaluates time derivative of y at a given time.
y0: N-D Tensor holding the initial values of the solution.
time_grid: 1-D Tensor holding the time points at which the solution
will be recorded, must have a floating dtype.
dt_grid: 1-D Tensor holding the fixed time steps to be used on the
time_grid intervals. Must have a floating dtype and one less element
than time_grid.
steps_on_intervals: 1-D Tensor of integer dtype, must have the same size
as dt_grid. Specifies the number of steps needed for every interval;
assumes steps_on_intervals * dt_grid == time intervals.
def _make_scan_func(self, evol_func):
Returns:
(N+1)-D tensor, where the first dimension corresponds to different
time points. Contains the solved value of y for each desired time point in
`time_grid`, with the initial value `y0` being the first element along the
first dimension.
"""
def scan_func(y, t_and_dt):
t, dt = t_and_dt
iteration_func = self._make_iteration_func(evol_func, dt_grid)
integrate_interval = self._make_interval_integrator(iteration_func,
steps_on_intervals)
num_times = array_ops.size(time_grid)
current_time = time_grid[0]
solution_array = tensor_array_ops.TensorArray(y0.dtype, num_times)
solution_array = solution_array.write(0, y0)
solution_array, _, _, _ = control_flow_ops.while_loop(
lambda _, __, ___, i: i < num_times,
integrate_interval,
(solution_array, y0, current_time, 1)
)
solution_array = solution_array.stack()
solution_array.set_shape(time_grid.get_shape().concatenate(y0.get_shape()))
return solution_array
def _make_iteration_func(self, evol_func, dt_grid):
"""Returns a function that builds operations of a single time step."""
def iteration_func(y, t, dt_step, interval_step):
"""Performs a single time step advance."""
dt = dt_grid[interval_step - 1]
dy = self._step_func(evol_func, t, dt, y)
dy = math_ops.cast(dy, dtype=y.dtype)
return y + dy
return y + dy, t + dt, dt_step + 1, interval_step
return scan_func
return iteration_func
def _make_interval_integrator(self, iteration_func, interval_sizes):
"""Returns a function that builds operations for interval integration."""
def integrate_interval(solution_array, y, t, interval_num):
"""Integrates y with fixed time step on interval `interval_num`."""
y, t, _, _ = control_flow_ops.while_loop(
lambda _, __, j, interval_num: j < interval_sizes[interval_num - 1],
iteration_func,
(y, t, 0, interval_num)
)
return solution_array.write(interval_num, y), y, t, interval_num + 1
return integrate_interval
@abc.abstractmethod
def _step_func(self, evol_func, t, dt, y):
@ -555,6 +618,7 @@ class _FixedGridIntegrator(six.with_metaclass(abc.ABCMeta)):
class _MidpointFixedGridIntegrator(_FixedGridIntegrator):
"""Fixed grid integrator implementing midpoint scheme."""
def _step_func(self, evol_func, t, dt, y):
dt_cast = math_ops.cast(dt, y.dtype)
@ -563,6 +627,7 @@ class _MidpointFixedGridIntegrator(_FixedGridIntegrator):
class _RK4FixedGridIntegrator(_FixedGridIntegrator):
"""Fixed grid integrator implementing RK4 scheme."""
def _step_func(self, evol_func, t, dt, y):
k1 = evol_func(y, t)
@ -575,7 +640,7 @@ class _RK4FixedGridIntegrator(_FixedGridIntegrator):
return math_ops.add_n([k1, 2 * k2, 2 * k3, k4]) * (dt_cast / 6)
def odeint_fixed(func, y0, t, method='rk4', name=None):
def odeint_fixed(func, y0, t, dt=None, method='rk4', name=None):
"""ODE integration on a fixed grid (with no step size control).
Useful in certain scenarios to avoid the overhead of adaptive step size
@ -590,6 +655,14 @@ def odeint_fixed(func, y0, t, method='rk4', name=None):
`y`. The initial time point should be the first element of this sequence,
and each time must be larger than the previous time. May have any floating
point dtype.
dt: 0-D or 1-D Tensor providing a suggested time step to use on the
integration intervals in `t`. A 1-D Tensor must provide one value per
interval, i.e. have one less element than `t`. A 0-D Tensor is
interpreted as the same suggested step for all intervals. If None, the
time step defaults to t[1:] - t[:-1]. The actual step size is obtained
by ensuring an integer number of steps per interval, which may reduce
the suggested step.
method: One of 'midpoint' or 'rk4'.
name: Optional name for the resulting operation.
@ -602,16 +675,29 @@ def odeint_fixed(func, y0, t, method='rk4', name=None):
Raises:
ValueError: Upon caller errors.
"""
with ops.name_scope(name, 'odeint_fixed', [y0, t]):
with ops.name_scope(name, 'odeint_fixed', [y0, t, dt]):
t = ops.convert_to_tensor(t, preferred_dtype=dtypes.float64, name='t')
y0 = ops.convert_to_tensor(y0, name='y0')
_check_input_types(t, y0)
intervals = t[1:] - t[:-1]
if dt is None:
dt = intervals
dt = ops.convert_to_tensor(dt, preferred_dtype=dtypes.float64, name='dt')
steps_on_intervals = math_ops.ceil(intervals / dt)
dt = intervals / steps_on_intervals
steps_on_intervals = math_ops.cast(steps_on_intervals, dtype=dtypes.int32)
_check_input_types(y0, t, dt)
_check_input_sizes(t, dt)
with _assert_increasing(t):
with ops.name_scope(method):
if method == 'midpoint':
return _MidpointFixedGridIntegrator().integrate(func, y0, t)
return _MidpointFixedGridIntegrator().integrate(func, y0, t, dt,
steps_on_intervals)
elif method == 'rk4':
return _RK4FixedGridIntegrator().integrate(func, y0, t)
return _RK4FixedGridIntegrator().integrate(func, y0, t, dt,
steps_on_intervals)
else:
raise ValueError('method not supported: {!s}'.format(method))
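A minimal usage sketch of the new `dt` argument (the grid, step value, and toy ODE below are assumptions chosen only for illustration):

```python
import numpy as np
import tensorflow as tf

t = np.array([0.0, 0.4, 4.7], dtype=np.float64)  # requested output times
dt = 0.25                                        # suggested step size
# Internally: intervals = [0.4, 4.3]; steps_on_intervals = ceil(intervals / dt)
# = [2, 18]; the actual steps become [0.2, 0.2388...], never exceeding dt.
y = tf.contrib.integrate.odeint_fixed(lambda y, t: -y, y0=[1.0], t=t, dt=dt,
                                      method='rk4')
with tf.Session() as sess:
  print(sess.run(y))  # approximately exp(-t): [[1.], [0.67...], [0.009...]]
```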

View File

@ -242,40 +242,56 @@ class InterpolationTest(test.TestCase):
class OdeIntFixedTest(test.TestCase):
def _test_integrate_sine(self, method):
def _test_integrate_sine(self, method, t, dt=None):
def evol_func(y, t):
del t
return array_ops.stack([y[1], -y[0]])
y0 = [0., 1.]
time_grid = np.linspace(0., 10., 200)
y_grid = odes.odeint_fixed(evol_func, y0, time_grid, method=method)
y_grid = odes.odeint_fixed(evol_func, y0, t, dt, method=method)
with self.test_session() as sess:
y_grid_array = sess.run(y_grid)
np.testing.assert_allclose(
y_grid_array[:, 0], np.sin(time_grid), rtol=1e-2, atol=1e-2)
y_grid_array[:, 0], np.sin(t), rtol=1e-2, atol=1e-2)
def _test_integrate_gaussian(self, method):
def _test_integrate_gaussian(self, method, t, dt=None):
def evol_func(y, t):
return -math_ops.cast(t, dtype=y.dtype) * y[0]
y0 = [1.]
time_grid = np.linspace(0., 2., 100)
y_grid = odes.odeint_fixed(evol_func, y0, time_grid, method=method)
y_grid = odes.odeint_fixed(evol_func, y0, t, dt, method=method)
with self.test_session() as sess:
y_grid_array = sess.run(y_grid)
np.testing.assert_allclose(
y_grid_array[:, 0], np.exp(-time_grid**2 / 2), rtol=1e-2, atol=1e-2)
y_grid_array[:, 0], np.exp(-t**2 / 2), rtol=1e-2, atol=1e-2)
def _test_integrate_sine_all(self, method):
uniform_time_grid = np.linspace(0., 10., 200)
non_uniform_time_grid = np.asarray([0.0, 0.4, 4.7, 5.2, 7.0])
uniform_dt = 0.02
non_uniform_dt = np.asarray([0.01, 0.001, 0.05, 0.03])
self._test_integrate_sine(method, uniform_time_grid)
self._test_integrate_sine(method, non_uniform_time_grid, uniform_dt)
self._test_integrate_sine(method, non_uniform_time_grid, non_uniform_dt)
def _test_integrate_gaussian_all(self, method):
uniform_time_grid = np.linspace(0., 2., 100)
non_uniform_time_grid = np.asarray([0.0, 0.1, 0.7, 1.2, 2.0])
uniform_dt = 0.01
non_uniform_dt = np.asarray([0.01, 0.001, 0.1, 0.03])
self._test_integrate_gaussian(method, uniform_time_grid)
self._test_integrate_gaussian(method, non_uniform_time_grid, uniform_dt)
self._test_integrate_gaussian(method, non_uniform_time_grid, non_uniform_dt)
def _test_everything(self, method):
self._test_integrate_sine(method)
self._test_integrate_gaussian(method)
self._test_integrate_sine_all(method)
self._test_integrate_gaussian_all(method)
def test_midpoint(self):
self._test_everything('midpoint')
@ -283,6 +299,21 @@ class OdeIntFixedTest(test.TestCase):
def test_rk4(self):
self._test_everything('rk4')
def test_dt_size_exceptions(self):
times = np.linspace(0., 2., 100)
dt = np.ones(99) * 0.01
dt_wrong_length = np.asarray([0.01, 0.001, 0.1, 0.03])
dt_wrong_dim = np.expand_dims(np.linspace(0., 2., 99), axis=0)
times_wrong_dim = np.expand_dims(np.linspace(0., 2., 100), axis=0)
with self.assertRaises(ValueError):
self._test_integrate_gaussian('midpoint', times, dt_wrong_length)
with self.assertRaises(ValueError):
self._test_integrate_gaussian('midpoint', times, dt_wrong_dim)
with self.assertRaises(ValueError):
self._test_integrate_gaussian('midpoint', times_wrong_dim, dt)
if __name__ == '__main__':
test.main()

View File

@ -17,7 +17,7 @@ limitations under the License.
#define TENSORFLOW_CONTRIB_LITE_BUILTIN_OPS_H_
// DO NOT EDIT MANUALLY: This file is automatically generated by
// `schema_builtin_ops_header_generator.py`.
// `schema/builtin_ops_header/generator.cc`.
#ifdef __cplusplus
extern "C" {

View File

@ -474,8 +474,9 @@ cc_test(
)
cc_test(
name = "resize_bilinear_float_test",
srcs = ["resize_bilinear_float_test.cc"],
name = "resize_bilinear_test",
srcs = ["resize_bilinear_test.cc"],
tags = ["tflite_not_portable"],
deps = [
":optimized_base",
":reference_base",

View File

@ -5722,6 +5722,46 @@ inline void ResizeBilinearGeneric(const float* input_data,
}
}
template <typename T>
inline void ResizeBilinearGenericSmallChannel(
const T* input_data, const Dims<4>& input_dims, T* output_data,
const Dims<4>& output_dims, int32 batches, int32 input_height,
int32 input_width, int32 depth, int32 output_height, int32 output_width,
float height_scale, float width_scale) {
memset(output_data, 0,
batches * output_height * output_width * depth * sizeof(T));
T* output_ptr = &output_data[0];
for (int b = 0; b < batches; ++b) {
for (int y = 0; y < output_height; ++y) {
float input_y = y * height_scale;
int32 y0 = static_cast<int32>(std::floor(input_y));
int32 y1 = std::min(y0 + 1, input_height - 1);
for (int x = 0; x < output_width; ++x) {
float input_x = x * width_scale;
int32 x0 = static_cast<int32>(input_x);
int32 x1 = std::min(x0 + 1, input_width - 1);
int32 input_offset[4] = {
Offset(input_dims, 0, x0, y0, b), Offset(input_dims, 0, x1, y0, b),
Offset(input_dims, 0, x0, y1, b), Offset(input_dims, 0, x1, y1, b)};
float scale[4] = {(1 - (input_y - y0)) * (1 - (input_x - x0)),
(1 - (input_y - y0)) * (input_x - x0),
(input_y - y0) * (1 - (input_x - x0)),
(input_y - y0) * (input_x - x0)};
for (int d = 0; d < depth; d++) {
const T* input_ptr = &input_data[d];
*output_ptr++ = static_cast<T>(input_ptr[input_offset[0]] * scale[0] +
input_ptr[input_offset[1]] * scale[1] +
input_ptr[input_offset[2]] * scale[2] +
input_ptr[input_offset[3]] * scale[3]);
}
}
}
}
}
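The four interpolation weights in `scale` above always sum to one; a tiny NumPy sketch with assumed fractional source coordinates makes the weighting explicit:

```python
import numpy as np

# input_y/input_x are the fractional source coordinates of one output pixel;
# y0/x0 are their integer floors, as in the loop above.
input_y, y0 = 1.3, 1
input_x, x0 = 2.75, 2
wy, wx = input_y - y0, input_x - x0
weights = np.array([(1 - wy) * (1 - wx), (1 - wy) * wx,
                    wy * (1 - wx), wy * wx])
print(weights, weights.sum())  # [0.175 0.525 0.075 0.225] 1.0
```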
inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
const int32* output_size_data,
const Dims<4>& output_size_dims, float* output_data,
@ -5762,6 +5802,41 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
}
}
// TODO(prabhumk): This is not a real quantized bilinear. It does not use int8
// or int16 arithmetic.
inline void ResizeBilinear(const uint8* input_data, const Dims<4>& input_dims,
const int32* output_size_data,
const Dims<4>& output_size_dims, uint8* output_data,
const Dims<4>& output_dims, bool align_corners) {
gemmlowp::ScopedProfilingLabel label("ResizeBilinear");
int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
int32 input_height = ArraySize(input_dims, 2);
int32 input_width = ArraySize(input_dims, 1);
int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0);
TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 3), 1);
TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 2), 1);
TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 1), 1);
TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 0), 2);
int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)];
int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
float height_scale =
(align_corners && output_height > 1)
? (static_cast<float>(input_height - 1) / (output_height - 1))
: (static_cast<float>(input_height) / output_height);
float width_scale =
(align_corners && output_width > 1)
? (static_cast<float>(input_width - 1) / (output_width - 1))
: (static_cast<float>(input_width) / output_width);
ResizeBilinearGenericSmallChannel<uint8>(
input_data, input_dims, output_data, output_dims, batches, input_height,
input_width, depth, output_height, output_width, height_scale,
width_scale);
}
// legacy, for compatibility with old checked-in code
inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
const int32* output_size_data,
@ -5771,6 +5846,15 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
output_data, output_dims, /*align_corners=*/false);
}
// legacy, for compatibility with old checked-in code
inline void ResizeBilinear(const uint8* input_data, const Dims<4>& input_dims,
const int32* output_size_data,
const Dims<4>& output_size_dims, uint8* output_data,
const Dims<4>& output_dims) {
ResizeBilinear(input_data, input_dims, output_size_data, output_size_dims,
output_data, output_dims, /*align_corners=*/false);
}
template <typename T>
inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
const int32* block_shape_data,

View File

@ -3202,9 +3202,10 @@ inline void Gather(const T* input_data, const Dims<4>& input_dims,
}
}
inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
template <typename T>
inline void ResizeBilinear(const T* input_data, const Dims<4>& input_dims,
const int32* output_size_data,
const Dims<4>& output_size_dims, float* output_data,
const Dims<4>& output_size_dims, T* output_data,
const Dims<4>& output_dims, bool align_corners) {
int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
int32 input_height = ArraySize(input_dims, 2);
@ -3236,15 +3237,15 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
int32 x0 = static_cast<int32>(std::floor(input_x));
int32 x1 = std::min(x0 + 1, input_width - 1);
for (int c = 0; c < depth; ++c) {
float interpolation = input_data[Offset(input_dims, c, x0, y0, b)] *
(1 - (input_y - y0)) *
(1 - (input_x - x0)) +
input_data[Offset(input_dims, c, x0, y1, b)] *
(input_y - y0) * (1 - (input_x - x0)) +
input_data[Offset(input_dims, c, x1, y0, b)] *
(1 - (input_y - y0)) * (input_x - x0) +
input_data[Offset(input_dims, c, x1, y1, b)] *
(input_y - y0) * (input_x - x0);
T interpolation =
static_cast<T>(input_data[Offset(input_dims, c, x0, y0, b)] *
(1 - (input_y - y0)) * (1 - (input_x - x0)) +
input_data[Offset(input_dims, c, x0, y1, b)] *
(input_y - y0) * (1 - (input_x - x0)) +
input_data[Offset(input_dims, c, x1, y0, b)] *
(1 - (input_y - y0)) * (input_x - x0) +
input_data[Offset(input_dims, c, x1, y1, b)] *
(input_y - y0) * (input_x - x0));
output_data[Offset(output_dims, c, x, y, b)] = interpolation;
}
}
@ -3257,8 +3258,18 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
const int32* output_size_data,
const Dims<4>& output_size_dims, float* output_data,
const Dims<4>& output_dims) {
ResizeBilinear(input_data, input_dims, output_size_data, output_size_dims,
output_data, output_dims, /*align_corners=*/false);
ResizeBilinear<float>(input_data, input_dims, output_size_data,
output_size_dims, output_data, output_dims,
/*align_corners=*/false);
}
inline void ResizeBilinear(const uint8* input_data, const Dims<4>& input_dims,
const int32* output_size_data,
const Dims<4>& output_size_dims, uint8* output_data,
const Dims<4>& output_dims) {
ResizeBilinear<uint8>(input_data, input_dims, output_size_data,
output_size_dims, output_data, output_dims,
/*align_corners=*/false);
}
template <typename T>

View File

@ -24,9 +24,10 @@ limitations under the License.
namespace tflite {
namespace {
template <typename T>
void TestOneResizeBilinear(int batch, int depth, int input_width,
int input_height, int output_width,
int output_height) {
int output_height, float error_threshold) {
Dims<4> input_dims_inference =
MakeDimsForInference(depth, input_width, input_height, batch);
Dims<4> output_dims_inference =
@ -36,14 +37,15 @@ void TestOneResizeBilinear(int batch, int depth, int input_width,
const int output_buffer_size =
RequiredBufferSizeForDims(output_dims_inference);
std::vector<float> input_data(input_buffer_size, 0);
std::vector<float> reference_output_data(output_buffer_size, 0);
std::vector<T> input_data(input_buffer_size, 0);
std::vector<T> reference_output_data(output_buffer_size, 0);
// Initialize the output data with something other than zero, so we can catch
// issues with kernels failing to initialize the output.
std::vector<float> output_data(output_buffer_size, 3.1415);
std::vector<T> output_data(output_buffer_size, 3);
const float input_amplitude = 1.f;
FillRandom(&input_data, -input_amplitude, input_amplitude);
const T min_amplitude = static_cast<T>(0);
const T max_amplitude = static_cast<T>(255);
FillRandom(&input_data, min_amplitude, max_amplitude);
Dims<4> output_size_dims = MakeDimsForInference(2, 1, 1, 1);
std::vector<int32> output_size_data = {output_height, output_width};
@ -58,14 +60,46 @@ void TestOneResizeBilinear(int batch, int depth, int input_width,
double sum_diff = 0;
float max_abs_val = 0;
for (int i = 0; i < output_buffer_size; i++) {
sum_diff += std::abs(output_data[i] - reference_output_data[i]);
max_abs_val = std::max(max_abs_val, std::abs(reference_output_data[i]));
sum_diff += std::abs(static_cast<float>(output_data[i]) -
static_cast<float>(reference_output_data[i]));
max_abs_val = std::max(
max_abs_val, std::abs(static_cast<float>(reference_output_data[i])));
}
if (sum_diff != 0.f) {
const float mean_diff = static_cast<float>(sum_diff / output_buffer_size);
const float relative_error = std::abs(mean_diff) / max_abs_val;
ASSERT_LT(relative_error, 1e-5f);
ASSERT_LT(relative_error, error_threshold);
}
}
TEST(ResizeBilinear, TestResizeBilinear8Bit) {
const int kTestsToRun = 100 * 1000;
for (int i = 0; i < kTestsToRun; i++) {
const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
const int depth = ExponentialRandomPositiveInt(0.9f, 6, 50);
const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
const int output_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
const int output_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
TestOneResizeBilinear<uint8>(batch, depth, input_width, input_height,
output_width, output_height, 0.025);
}
}
TEST(ResizeBilinear2x2, TestResizeBilinear8Bit) {
const int kTestsToRun = 100 * 1000;
for (int i = 0; i < kTestsToRun; i++) {
const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
const int depth = ExponentialRandomPositiveInt(0.9f, 6, 50);
const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
const int output_width = input_width * 2;
const int output_height = input_height * 2;
TestOneResizeBilinear<uint8>(batch, depth, input_width, input_height,
output_width, output_height, 1e-5);
}
}
@ -79,8 +113,8 @@ TEST(ResizeBilinear, TestResizeBilinear) {
const int output_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
const int output_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
TestOneResizeBilinear(batch, depth, input_width, input_height, output_width,
output_height);
TestOneResizeBilinear<float>(batch, depth, input_width, input_height,
output_width, output_height, 1e-5);
}
}
@ -94,8 +128,8 @@ TEST(ResizeBilinear2x2, TestResizeBilinear) {
const int output_width = input_width * 2;
const int output_height = input_height * 2;
TestOneResizeBilinear(batch, depth, input_width, input_height, output_width,
output_height);
TestOneResizeBilinear<float>(batch, depth, input_width, input_height,
output_width, output_height, 1e-5);
}
}
} // namespace

View File

@ -121,6 +121,10 @@ class RuntimeShape {
}
}
inline void BuildFrom(const std::initializer_list<int> init_list) {
BuildFrom<const std::initializer_list<int>>(init_list);
}
// Returns the total count of elements, that is the size when flattened into a
// vector.
inline int FlatSize() const {

View File

@ -61,12 +61,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
TF_LITE_ENSURE_EQ(context, NumDimensions(size), 1);
// TODO(ahentz): Our current implementations only support float32.
TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
TF_LITE_ENSURE_EQ(context, size->type, kTfLiteInt32);
// ResizeBilinear creates a float tensor even when the input is made of
// integers.
output->type = kTfLiteFloat32;
output->type = input->type;
if (!IsConstantTensor(size)) {
SetTensorToDynamic(output);
@ -90,17 +88,24 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
}
if (output->type == kTfLiteFloat32) {
#define TF_LITE_RESIZE_BILINEAR(type) \
type::ResizeBilinear(GetTensorData<float>(input), GetTensorDims(input), \
GetTensorData<int32>(size), GetTensorDims(size), \
GetTensorData<float>(output), GetTensorDims(output), \
#define TF_LITE_RESIZE_BILINEAR(type, datatype) \
type::ResizeBilinear(GetTensorData<datatype>(input), GetTensorDims(input), \
GetTensorData<int32>(size), GetTensorDims(size), \
GetTensorData<datatype>(output), GetTensorDims(output), \
params->align_corners)
if (kernel_type == kReference) {
TF_LITE_RESIZE_BILINEAR(reference_ops);
TF_LITE_RESIZE_BILINEAR(reference_ops, float);
}
if (kernel_type == kGenericOptimized || kernel_type == kNeonOptimized) {
TF_LITE_RESIZE_BILINEAR(optimized_ops);
TF_LITE_RESIZE_BILINEAR(optimized_ops, float);
}
} else if (output->type == kTfLiteUInt8) {
if (kernel_type == kReference) {
TF_LITE_RESIZE_BILINEAR(reference_ops, uint8_t);
}
if (kernel_type == kGenericOptimized || kernel_type == kNeonOptimized) {
TF_LITE_RESIZE_BILINEAR(optimized_ops, uint8_t);
}
#undef TF_LITE_RESIZE_BILINEAR
} else {

View File

@ -22,6 +22,7 @@ namespace tflite {
namespace {
using ::testing::ElementsAreArray;
using uint8 = std::uint8_t;
class ResizeBilinearOpModel : public SingleOpModel {
public:
@ -34,7 +35,7 @@ class ResizeBilinearOpModel : public SingleOpModel {
} else {
size_ = AddInput({TensorType_INT32, {2}});
}
output_ = AddOutput(TensorType_FLOAT32); // Always float.
output_ = AddOutput(input.type);
SetBuiltinOp(BuiltinOperator_RESIZE_BILINEAR,
BuiltinOptions_ResizeBilinearOptions,
CreateResizeBilinearOptions(builder_).Union());
@ -45,12 +46,16 @@ class ResizeBilinearOpModel : public SingleOpModel {
}
}
void SetInput(std::initializer_list<float> data) {
template <typename T>
void SetInput(std::initializer_list<T> data) {
PopulateTensor(input_, data);
}
void SetSize(std::initializer_list<int> data) { PopulateTensor(size_, data); }
std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
template <typename T>
std::vector<T> GetOutput() {
return ExtractVector<T>(output_);
}
private:
int input_;
@ -60,60 +65,121 @@ class ResizeBilinearOpModel : public SingleOpModel {
TEST(ResizeBilinearOpTest, HorizontalResize) {
ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 1, 2, 1}});
m.SetInput({3, 6});
m.SetInput<float>({3, 6});
m.SetSize({1, 3});
m.Invoke();
EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 5, 6})));
EXPECT_THAT(m.GetOutput<float>(),
ElementsAreArray(ArrayFloatNear({3, 5, 6})));
ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 1, 2, 1}}, {1, 3});
const_m.SetInput({3, 6});
const_m.SetInput<float>({3, 6});
const_m.Invoke();
EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 5, 6})));
EXPECT_THAT(const_m.GetOutput<float>(),
ElementsAreArray(ArrayFloatNear({3, 5, 6})));
}
TEST(ResizeBilinearOpTest, HorizontalResize8Bit) {
ResizeBilinearOpModel m({TensorType_UINT8, {1, 1, 2, 1}});
m.SetInput<uint8>({3, 6});
m.SetSize({1, 3});
m.Invoke();
EXPECT_THAT(m.GetOutput<uint8>(),
ElementsAreArray(ArrayFloatNear({3, 5, 6})));
ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 1, 2, 1}}, {1, 3});
const_m.SetInput<uint8>({3, 6});
const_m.Invoke();
EXPECT_THAT(const_m.GetOutput<uint8>(),
ElementsAreArray(ArrayFloatNear({3, 5, 6})));
}
TEST(ResizeBilinearOpTest, VerticalResize) {
ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}});
m.SetInput({3, 9});
m.SetInput<float>({3, 9});
m.SetSize({3, 1});
m.Invoke();
EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 7, 9})));
EXPECT_THAT(m.GetOutput<float>(),
ElementsAreArray(ArrayFloatNear({3, 7, 9})));
ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 1, 1}}, {3, 1});
const_m.SetInput({3, 9});
const_m.SetInput<float>({3, 9});
const_m.Invoke();
EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 7, 9})));
EXPECT_THAT(const_m.GetOutput<float>(),
ElementsAreArray(ArrayFloatNear({3, 7, 9})));
}
TEST(ResizeBilinearOpTest, VerticalResize8Bit) {
ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 1, 1}});
m.SetInput<uint8>({3, 9});
m.SetSize({3, 1});
m.Invoke();
EXPECT_THAT(m.GetOutput<uint8>(),
ElementsAreArray(ArrayFloatNear({3, 7, 9})));
ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 2, 1, 1}}, {3, 1});
const_m.SetInput<uint8>({3, 9});
const_m.Invoke();
EXPECT_THAT(const_m.GetOutput<uint8>(),
ElementsAreArray(ArrayFloatNear({3, 7, 9})));
}
TEST(ResizeBilinearOpTest, TwoDimensionalResize) {
ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}});
m.SetInput({
m.SetInput<float>({
3, 6, //
9, 12 //
});
m.SetSize({3, 3});
m.Invoke();
EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
3, 5, 6, //
7, 9, 10, //
9, 11, 12, //
})));
EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
3, 5, 6, //
7, 9, 10, //
9, 11, 12, //
})));
ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 1}}, {3, 3});
const_m.SetInput({
const_m.SetInput<float>({
3, 6, //
9, 12 //
});
const_m.Invoke();
EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({
3, 5, 6, //
7, 9, 10, //
9, 11, 12, //
})));
EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
3, 5, 6, //
7, 9, 10, //
9, 11, 12, //
})));
}
TEST(ResizeBilinearOpTest, TwoDimensionalResize8Bit) {
ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 1}});
m.SetInput<uint8>({
3, 6, //
9, 12 //
});
m.SetSize({3, 3});
m.Invoke();
EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
3, 5, 6, //
7, 9, 10, //
9, 11, 12, //
})));
ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 2, 2, 1}}, {3, 3});
const_m.SetInput<uint8>({
3, 6, //
9, 12 //
});
const_m.Invoke();
EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
3, 5, 6, //
7, 9, 10, //
9, 11, 12, //
})));
}
TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) {
ResizeBilinearOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}});
m.SetInput({
m.SetInput<float>({
3, 6, //
9, 12, //
4, 10, //
@ -121,60 +187,123 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) {
});
m.SetSize({3, 3});
m.Invoke();
EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
3, 5, 6, //
7, 9, 10, //
9, 11, 12, //
4, 8, 10, //
8, 12, 14, //
10, 14, 16, //
})));
EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
3, 5, 6, //
7, 9, 10, //
9, 11, 12, //
4, 8, 10, //
8, 12, 14, //
10, 14, 16, //
})));
ResizeBilinearOpModel const_m({TensorType_FLOAT32, {2, 2, 2, 1}}, {3, 3});
const_m.SetInput({
const_m.SetInput<float>({
3, 6, //
9, 12, //
4, 10, //
10, 16 //
});
const_m.Invoke();
EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({
3, 5, 6, //
7, 9, 10, //
9, 11, 12, //
4, 8, 10, //
8, 12, 14, //
10, 14, 16, //
})));
EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
3, 5, 6, //
7, 9, 10, //
9, 11, 12, //
4, 8, 10, //
8, 12, 14, //
10, 14, 16, //
})));
}
TEST(ResizeBilinearOpTest, ThreeDimensionalResize) {
ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}});
m.SetInput({
m.SetInput<float>({
3, 4, 6, 10, //
9, 10, 12, 16, //
});
m.SetSize({3, 3});
m.Invoke();
EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
3, 4, 5, 8, 6, 10, //
7, 8, 9, 12, 10, 14, //
9, 10, 11, 14, 12, 16, //
})));
EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
3, 4, 5, 8, 6, 10, //
7, 8, 9, 12, 10, 14, //
9, 10, 11, 14, 12, 16, //
})));
ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 2}}, {3, 3});
const_m.SetInput({
const_m.SetInput<float>({
3, 4, 6, 10, //
9, 10, 12, 16, //
});
const_m.Invoke();
EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({
3, 4, 5, 8, 6, 10, //
7, 8, 9, 12, 10, 14, //
9, 10, 11, 14, 12, 16, //
})));
EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
3, 4, 5, 8, 6, 10, //
7, 8, 9, 12, 10, 14, //
9, 10, 11, 14, 12, 16, //
})));
}
TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
ResizeBilinearOpModel m({TensorType_UINT8, {2, 2, 2, 1}});
m.SetInput<uint8>({
3, 6, //
9, 12, //
4, 10, //
10, 16 //
});
m.SetSize({3, 3});
m.Invoke();
EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
3, 5, 6, //
7, 9, 10, //
9, 11, 12, //
4, 8, 10, //
8, 12, 14, //
10, 13, 16, //
})));
ResizeBilinearOpModel const_m({TensorType_UINT8, {2, 2, 2, 1}}, {3, 3});
const_m.SetInput<uint8>({
3, 6, //
9, 12, //
4, 10, //
10, 16 //
});
const_m.Invoke();
EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
3, 5, 6, //
7, 9, 10, //
9, 11, 12, //
4, 8, 10, //
8, 12, 14, //
10, 13, 16, //
})));
}
TEST(ResizeBilinearOpTest, ThreeDimensionalResize8Bit) {
ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 2}});
m.SetInput<uint8>({
3, 4, 6, 10, //
9, 10, 12, 16, //
});
m.SetSize({3, 3});
m.Invoke();
EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
3, 4, 5, 8, 6, 10, //
7, 8, 9, 12, 10, 14, //
9, 10, 11, 13, 12, 16, //
})));
ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 2, 2, 2}}, {3, 3});
const_m.SetInput<uint8>({
3, 4, 6, 10, //
9, 10, 12, 16, //
});
const_m.Invoke();
EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
3, 4, 5, 8, 6, 10, //
7, 8, 9, 12, 10, 14, //
9, 10, 11, 13, 12, 16, //
})));
}
} // namespace
} // namespace tflite

View File

@ -322,12 +322,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
*builtin_data = nullptr;
switch (op_type) {
case BuiltinOperator_CALL:
// TODO(aselle): Implement call in BuiltinOptions, but nullptrs are
// ok for now, since there is no call implementation either.
break;
case BuiltinOperator_CUSTOM:
break;
case BuiltinOperator_CONV_2D: {
TfLiteConvParams* params = MallocPOD<TfLiteConvParams>();
if (auto* conv_params = op->builtin_options_as_Conv2DOptions()) {
@ -343,22 +337,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
*builtin_data = reinterpret_cast<void*>(params);
break;
}
case BuiltinOperator_TANH:
case BuiltinOperator_LOGISTIC:
case BuiltinOperator_RELU:
case BuiltinOperator_RELU_N1_TO_1:
case BuiltinOperator_RELU6:
case BuiltinOperator_CONCAT_EMBEDDINGS:
case BuiltinOperator_EXP:
case BuiltinOperator_TOPK_V2:
case BuiltinOperator_LOG_SOFTMAX:
case BuiltinOperator_DEQUANTIZE:
case BuiltinOperator_PRELU:
case BuiltinOperator_FLOOR:
case BuiltinOperator_NEG:
case BuiltinOperator_SIN:
case BuiltinOperator_LOG:
break;
case BuiltinOperator_CAST: {
TfLiteCastParams* params = MallocPOD<TfLiteCastParams>();
if (auto* schema_params = op->builtin_options_as_CastOptions()) {
@ -446,9 +424,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
*builtin_data = reinterpret_cast<void*>(params);
break;
}
case BuiltinOperator_EMBEDDING_LOOKUP:
// no-op.
break;
case BuiltinOperator_EMBEDDING_LOOKUP_SPARSE: {
TfLiteEmbeddingLookupSparseParams* params =
MallocPOD<TfLiteEmbeddingLookupSparseParams>();
@ -580,12 +555,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
*builtin_data = reinterpret_cast<void*>(params);
break;
}
case BuiltinOperator_PAD: {
break;
}
case BuiltinOperator_PADV2: {
break;
}
case BuiltinOperator_RESHAPE: {
auto* params = MallocPOD<TfLiteReshapeParams>();
if (auto* schema_params = op->builtin_options_as_ReshapeOptions()) {
@ -625,15 +594,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
*builtin_data = reinterpret_cast<void*>(params);
break;
}
case BuiltinOperator_SPACE_TO_BATCH_ND: {
break;
}
case BuiltinOperator_BATCH_TO_SPACE_ND: {
break;
}
case BuiltinOperator_TRANSPOSE: {
break;
}
case BuiltinOperator_MEAN: {
auto* params = MallocPOD<TfLiteMeanParams>();
if (auto* schema_params = op->builtin_options_as_MeanOptions()) {
@ -673,10 +633,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
*builtin_data = reinterpret_cast<void*>(params);
break;
}
case BuiltinOperator_MAXIMUM:
case BuiltinOperator_MINIMUM: {
break;
}
case BuiltinOperator_ARG_MAX: {
auto* params = MallocPOD<TfLiteArgMaxParams>();
if (auto* schema_params = op->builtin_options_as_ArgMaxOptions()) {
@ -686,18 +642,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
*builtin_data = reinterpret_cast<void*>(params);
break;
}
case BuiltinOperator_GREATER:
case BuiltinOperator_GREATER_EQUAL:
case BuiltinOperator_LESS:
case BuiltinOperator_LESS_EQUAL:
case BuiltinOperator_EQUAL:
case BuiltinOperator_NOT_EQUAL:
case BuiltinOperator_SELECT: {
break;
}
case BuiltinOperator_SLICE: {
break;
}
case BuiltinOperator_TRANSPOSE_CONV: {
TfLiteTransposeConvParams* params =
MallocPOD<TfLiteTransposeConvParams>();
@ -725,10 +669,46 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
error_reporter->Report("DELEGATE op shouldn't exist in model.");
return kTfLiteError;
}
// Below are the ops with no builtin_data structure.
case BuiltinOperator_BATCH_TO_SPACE_ND:
// TODO(aselle): Implement call in BuiltinOptions, but nullptrs are
// ok for now, since there is no call implementation either.
case BuiltinOperator_CALL:
case BuiltinOperator_CONCAT_EMBEDDINGS:
case BuiltinOperator_CUSTOM:
case BuiltinOperator_DEQUANTIZE:
case BuiltinOperator_EMBEDDING_LOOKUP:
case BuiltinOperator_EQUAL:
case BuiltinOperator_EXP:
case BuiltinOperator_EXPAND_DIMS:
case BuiltinOperator_TILE: {
case BuiltinOperator_FLOOR:
case BuiltinOperator_GREATER:
case BuiltinOperator_GREATER_EQUAL:
case BuiltinOperator_LESS:
case BuiltinOperator_LESS_EQUAL:
case BuiltinOperator_LOG:
case BuiltinOperator_LOGISTIC:
case BuiltinOperator_LOG_SOFTMAX:
case BuiltinOperator_MAXIMUM:
case BuiltinOperator_MINIMUM:
case BuiltinOperator_NEG:
case BuiltinOperator_NOT_EQUAL:
case BuiltinOperator_PAD:
case BuiltinOperator_PADV2:
case BuiltinOperator_PRELU:
case BuiltinOperator_RELU:
case BuiltinOperator_RELU6:
case BuiltinOperator_RELU_N1_TO_1:
case BuiltinOperator_SELECT:
case BuiltinOperator_SIN:
case BuiltinOperator_SLICE:
case BuiltinOperator_SPACE_TO_BATCH_ND:
case BuiltinOperator_TANH:
case BuiltinOperator_TILE:
case BuiltinOperator_TOPK_V2:
case BuiltinOperator_TRANSPOSE:
break;
}
}
return kTfLiteOk;
}

View File

@ -234,7 +234,10 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
next_id++;
};
auto add_add_params = [&add_scalar_int32]() { add_scalar_int32(0); };
auto add_add_params = [&add_scalar_int32](void* data) {
auto* builtin = reinterpret_cast<TfLiteAddParams*>(data);
add_scalar_int32(builtin->activation);
};
auto add_pooling_params = [&add_scalar_int32](void* data) {
auto builtin = reinterpret_cast<TfLitePoolParams*>(data);
@ -345,11 +348,11 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
switch (builtin) {
case tflite::BuiltinOperator_ADD:
nn_op_type = ANEURALNETWORKS_ADD;
add_add_params();
add_add_params(node.builtin_data);
break;
case tflite::BuiltinOperator_MUL:
nn_op_type = ANEURALNETWORKS_MUL;
add_add_params();
add_add_params(node.builtin_data);
break;
case tflite::BuiltinOperator_AVERAGE_POOL_2D:
add_pooling_params(node.builtin_data);

View File

@ -2,9 +2,11 @@ package(default_visibility = ["//visibility:public"])
licenses(["notice"]) # Apache 2.0
load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
common_copts = [
"-Wall",
]
] + tflite_copts()
cc_library(
name = "profiler",
@ -36,12 +38,14 @@ cc_library(
name = "time",
srcs = ["time.cc"],
hdrs = ["time.h"],
copts = common_copts,
)
cc_library(
name = "profile_summarizer",
srcs = ["profile_summarizer.cc"],
hdrs = ["profile_summarizer.h"],
copts = common_copts,
deps = [
":profiler",
"//tensorflow/contrib/lite:framework",
@ -53,6 +57,7 @@ cc_library(
cc_test(
name = "profile_summarizer_test",
srcs = ["profile_summarizer_test.cc"],
copts = common_copts,
deps = [
":profile_summarizer",
"//tensorflow/contrib/lite:framework",

View File

@ -111,37 +111,35 @@ def tensor_name(x):
return x.name.split(":")[0]
def toco_convert(input_data,
input_tensors,
output_tensors,
inference_type=lite_constants.FLOAT,
inference_input_type=None,
input_format=lite_constants.TENSORFLOW_GRAPHDEF,
output_format=lite_constants.TFLITE,
quantized_input_stats=None,
default_ranges_stats=None,
drop_control_dependency=True,
reorder_across_fake_quant=False,
allow_custom_ops=False,
change_concat_input_ranges=False,
quantize_weights=False,
dump_graphviz_dir=None,
dump_graphviz_video=False):
"""Convert a model using TOCO from `input_format` to `output_format`.
def build_toco_convert_protos(input_tensors,
output_tensors,
inference_type=lite_constants.FLOAT,
inference_input_type=None,
input_format=lite_constants.TENSORFLOW_GRAPHDEF,
output_format=lite_constants.TFLITE,
quantized_input_stats=None,
default_ranges_stats=None,
drop_control_dependency=True,
reorder_across_fake_quant=False,
allow_custom_ops=False,
change_concat_input_ranges=False,
quantize_weights=False,
dump_graphviz_dir=None,
dump_graphviz_video=False):
"""Builds protocol buffers describing a conversion of a model using TOCO.
Typically this is to convert from TensorFlow GraphDef to TFLite, in which
case the default `input_format` and `output_format` are sufficient.
Args:
input_data: Input data (i.e. often `sess.graph_def`).
input_tensors: List of input tensors. Type and shape are computed using
`foo.get_shape()` and `foo.dtype`.
output_tensors: List of output tensors (only .name is used from this).
inference_type: Target data type of arrays in the output file. Currently
must be `{FLOAT, QUANTIZED_UINT8}`. (default FLOAT)
must be `{FLOAT, QUANTIZED_UINT8, STRING}`. (default FLOAT)
inference_input_type: Target data type of input arrays. Allows for a
different type for input arrays in the case of quantization. Currently
must be `{FLOAT, QUANTIZED_UINT8}`. (default `inference_type`)
must be `{FLOAT, QUANTIZED_UINT8, STRING}`. (default `inference_type`)
input_format: Type of data to read Currently must be
`{TENSORFLOW_GRAPHDEF}`. (default TENSORFLOW_GRAPHDEF)
output_format: Output file format. Currently must be `{TFLITE,
@ -180,8 +178,8 @@ def toco_convert(input_data,
every graph transformation. (default False)
Returns:
The converted data. For example if TFLite was the destination, then
this will be a tflite flatbuffer in a bytes array.
model_flags, toco_flags: two protocol buffers describing the conversion
process.
Raises:
ValueError: If the input tensor type is unknown
@ -204,7 +202,6 @@ def toco_convert(input_data,
if dump_graphviz_dir:
toco.dump_graphviz_dir = dump_graphviz_dir
toco.dump_graphviz_include_video = dump_graphviz_video
model = _model_flags_pb2.ModelFlags()
model.change_concat_input_ranges = change_concat_input_ranges
for idx, input_tensor in enumerate(input_tensors):
@ -216,7 +213,8 @@ def toco_convert(input_data,
tflite_input_type = lite_constants.INT64
elif input_tensor.dtype == _dtypes.uint8:
tflite_input_type = lite_constants.QUANTIZED_UINT8
# TODO(aselle): Insert strings when they are available
elif input_tensor.dtype == _dtypes.string:
tflite_input_type = lite_constants.STRING
else:
raise ValueError("Tensors %s not known type %r" % (input_tensor.name,
input_tensor.dtype))
@ -233,10 +231,35 @@ def toco_convert(input_data,
for output_tensor in output_tensors:
model.output_arrays.append(tensor_name(output_tensor))
return model, toco
# TODO(aselle): Consider handling the case of allowing quantized
# inputs to be converted to float (via the toco.inference_input_type field).
data = toco_convert_protos(model.SerializeToString(),
toco.SerializeToString(),
def toco_convert(input_data, input_tensors, output_tensors, *args, **kwargs):
""""Convert a model using TOCO.
Typically this function is used to convert from TensorFlow GraphDef to TFLite.
Conversion can be customized by providing arguments that are forwarded to
`build_toco_convert_protos` (see documentation for details).
Args:
input_data: Input data (i.e. often `sess.graph_def`).
input_tensors: List of input tensors. Type and shape are computed using
`foo.get_shape()` and `foo.dtype`.
output_tensors: List of output tensors (only .name is used from this).
*args: See `build_toco_convert_protos`.
**kwargs: See `build_toco_convert_protos`.
Returns:
The converted data. For example if TFLite was the destination, then
this will be a tflite flatbuffer in a bytes array.
Raises:
Defined in `build_toco_convert_protos`.
"""
model_flags, toco_flags = build_toco_convert_protos(input_tensors,
output_tensors,
*args, **kwargs)
data = toco_convert_protos(model_flags.SerializeToString(),
toco_flags.SerializeToString(),
input_data.SerializeToString())
return data
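A minimal sketch of the refactored API (the toy identity graph and tensor names are assumptions; `toco_convert` remains the one-shot path, while `build_toco_convert_protos` exposes the protos for callers that drive TOCO themselves):

```python
import tensorflow as tf
from tensorflow.contrib.lite.python import convert

img = tf.placeholder(tf.float32, [1, 4, 4, 3], name="img")
out = tf.identity(img, name="out")
with tf.Session() as sess:
  # One-shot conversion, as before:
  tflite_model = convert.toco_convert(sess.graph_def, [img], [out])
  # Or build the protos first and hand them to toco_convert_protos yourself:
  model_flags, toco_flags = convert.build_toco_convert_protos([img], [out])
```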

View File

@ -25,6 +25,7 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice.
@@FLOAT
@@QUANTIZED_UINT8
@@STRING
@@TFLITE
@@GRAPHVIZ_DOT
@ -38,6 +39,7 @@ from six import PY3
from google.protobuf import text_format as _text_format
from google.protobuf.message import DecodeError
from tensorflow.contrib.lite.python import lite_constants as constants
from tensorflow.contrib.lite.python.convert import build_toco_convert_protos # pylint: disable=unused-import
from tensorflow.contrib.lite.python.convert import tensor_name
from tensorflow.contrib.lite.python.convert import toco_convert
from tensorflow.contrib.lite.python.convert import toco_convert_protos # pylint: disable=unused-import
@ -65,10 +67,10 @@ class TocoConverter(object):
Attributes:
inference_type: Target data type of arrays in the output file. Currently
must be `{FLOAT, QUANTIZED_UINT8}`. (default FLOAT)
must be `{FLOAT, QUANTIZED_UINT8, STRING}`. (default FLOAT)
inference_input_type: Target data type of input arrays. Allows for a
different type for input arrays in the case of quantization. Currently
must be `{FLOAT, QUANTIZED_UINT8}`. (default `inference_type`)
must be `{FLOAT, QUANTIZED_UINT8, STRING}`. (default `inference_type`)
output_format: Output file format. Currently must be `{TFLITE,
GRAPHVIZ_DOT}`. (default TFLITE)
quantized_input_stats: Dict of strings representing input tensor names

View File

@ -116,7 +116,8 @@ def _convert_model(flags):
"tensors in order to map between names and "
"values.".format(",".join(input_arrays)))
converter.quantized_input_stats = dict(zip(input_arrays, quant_stats))
if flags.default_ranges_min and flags.default_ranges_max:
if (flags.default_ranges_min is not None) and (flags.default_ranges_max is
not None):
converter.default_ranges_stats = (flags.default_ranges_min,
flags.default_ranges_max)
@ -195,7 +196,7 @@ def _check_flags(flags, unparsed):
raise ValueError("--std_dev_values, --mean_values must have the same "
"number of items")
if bool(flags.default_ranges_min) != bool(flags.default_ranges_max):
if (flags.default_ranges_min is None) != (flags.default_ranges_max is None):
raise ValueError("--default_ranges_min and --default_ranges_max must be "
"used together")
@ -233,12 +234,12 @@ def run_main(_):
parser.add_argument(
"--inference_type",
type=str.upper,
choices=["FLOAT", "QUANTIZED_UINT8"],
choices=["FLOAT", "QUANTIZED_UINT8", "STRING"],
help="Target data type of arrays in the output file.")
parser.add_argument(
"--inference_input_type",
type=str.upper,
choices=["FLOAT", "QUANTIZED_UINT8"],
choices=["FLOAT", "QUANTIZED_UINT8", "STRING"],
help=("Target data type of input arrays. Allows for a different type for "
"input arrays in the case of quantization."))

View File

@ -39,7 +39,7 @@ limitations under the License.
#define TENSORFLOW_CONTRIB_LITE_BUILTIN_OPS_H_
// DO NOT EDIT MANUALLY: This file is automatically generated by
// `schema_builtin_ops_header_generator.py`.
// `schema/builtin_ops_header/generator.cc`.
#ifdef __cplusplus
extern "C" {

View File

@ -362,6 +362,8 @@ bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
changed = HardcodeMinMaxForAverageOrMaxPool(model, op);
break;
case OperatorType::kResizeBilinear:
case OperatorType::kSlice:
case OperatorType::kStridedSlice:
case OperatorType::kSqueeze:
case OperatorType::kTensorFlowReshape:

View File

@ -45,12 +45,14 @@ bool SupportsQuantization(const Operator& op) {
type == OperatorType::kTensorFlowMinimum ||
type == OperatorType::kTensorFlowMaximum ||
type == OperatorType::kLogistic || type == OperatorType::kSoftmax ||
type == OperatorType::kLogSoftmax ||
type == OperatorType::kLogSoftmax || type == OperatorType::kSlice ||
type == OperatorType::kResizeBilinear ||
type == OperatorType::kTensorFlowSplit || type == OperatorType::kSub ||
type == OperatorType::kSqueeze || type == OperatorType::kPad ||
type == OperatorType::kPadV2 ||
type == OperatorType::kTensorFlowReshape ||
type == OperatorType::kTanh || type == OperatorType::kMul ||
type == OperatorType::kSpaceToBatchND ||
type == OperatorType::kSpaceToDepth ||
type == OperatorType::kStridedSlice ||
type == OperatorType::kDepthToSpace ||

View File

@ -920,7 +920,7 @@ void CheckEachArray(const Model& model) {
CHECK(array->buffer->type == array->data_type);
// The presence of a fixed buffer should imply the presence of a fixed
// shape.
CHECK(array->has_shape());
CHECK(array->has_shape()) << "Invalid array: " << array_entry.first;
// A constant buffer should have a valid shape.
for (int d : array->shape().dims()) {
CHECK_GE(d, 1);

View File

@ -8,7 +8,7 @@ load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite"
load("//tensorflow/contrib/lite:build_def.bzl", "tflite_linkopts")
load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
common_copts = ["-Wall"]
common_copts = ["-Wall"] + tflite_copts()
cc_binary(
name = "benchmark_model",
@ -16,14 +16,11 @@ cc_binary(
"benchmark_main.cc",
"logging.h",
],
copts = tflite_copts() + common_copts,
linkopts = select({
copts = common_copts,
linkopts = tflite_linkopts() + select({
"//tensorflow:android": [
"-pie",
"-landroid",
"-lm",
"-z defs",
"-Wl,--exclude-libs,ALL", # Exclude syms in all libs from auto export
"-pie", # Android 5.0 and later supports only PIE
"-lm", # some builtin ops, e.g., tanh, need -lm
],
"//conditions:default": [],
}),

View File

@ -53,7 +53,7 @@ tf_cc_binary(
"//tensorflow/core:lib",
"//tensorflow/core/distributed_runtime/rpc:grpc_util",
"//tensorflow/core/platform/cloud:gcs_file_system",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)

View File

@ -58,7 +58,7 @@ cc_library(
"//tensorflow/core/distributed_runtime/rpc:async_service_interface",
"//tensorflow/core/distributed_runtime/rpc:grpc_call",
"//tensorflow/core/distributed_runtime/rpc:grpc_util",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
alwayslink = 1,
)
@ -69,7 +69,7 @@ cc_library(
hdrs = ["grpc_verbs_service_impl.h"],
deps = [
":verbs_service_proto_cc",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)

View File

@ -879,6 +879,7 @@ cc_library(
hdrs = [
"util/stats_calculator.h",
],
copts = tf_copts(),
)
cc_library(

View File

@ -0,0 +1,10 @@
op {
graph_op_name: "BesselI0e"
summary: "Computes the Bessel i0e function of `x` element-wise."
description: <<END
Exponentially scaled modified Bessel function of order 0 defined as
`bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`.
This function is faster and numerically stabler than `bessel_i0(x)`.
END
}

View File

@ -0,0 +1,10 @@
op {
graph_op_name: "BesselI1e"
summary: "Computes the Bessel i1e function of `x` element-wise."
description: <<END
Exponentially scaled modified Bessel function of order 1 defined as
`bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
This function is faster and numerically stabler than `bessel_i1(x)`.
END
}
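A small numeric sanity check of the definitions above (SciPy is an assumption used only for illustration):

```python
import numpy as np
from scipy import special

x = np.array([-1.0, 0.5, 2.0])
print(special.i0e(x))                      # exponentially scaled I0
print(np.exp(-np.abs(x)) * special.i0(x))  # matches the bessel_i0e definition
print(special.i1e(x))                      # exponentially scaled I1
print(np.exp(-np.abs(x)) * special.i1(x))  # matches the bessel_i1e definition
```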

View File

@ -0,0 +1,4 @@
op {
graph_op_name: "BesselI0e"
visibility: HIDDEN
}

View File

@ -0,0 +1,4 @@
op {
graph_op_name: "BesselI1e"
visibility: HIDDEN
}

View File

@ -66,6 +66,88 @@ int StepStatsDeviceIndex(StepStats* step_stats, EagerContext* ctx,
return 0;
}
// This function expects *handle to point to an existing tensor handle. It
// may update *handle to point to the newly copied tensor handle.
//
// The passed-in *handle will be Unreffed if it is replaced.
Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
const Device* expected_device,
RunMetadata* run_metadata,
TensorHandle** handle) {
EagerContext* ctx = op->EagerContext();
Device* handle_device = nullptr;
TF_RETURN_IF_ERROR((*handle)->Device(&handle_device));
const Device* actual_device =
handle_device == nullptr ? ctx->HostCPU() : handle_device;
if (expected_device != actual_device) {
switch (ctx->GetDevicePlacementPolicy()) {
case DEVICE_PLACEMENT_SILENT_FOR_INT32:
// TODO(xpan): See if we could bubble python related error up
// to python level.
if ((*handle)->dtype == DT_INT32) {
// Note: enabling silent copies of int32 tensors to match behavior
// of graph mode.
break;
}
TF_FALLTHROUGH_INTENDED;
case DEVICE_PLACEMENT_EXPLICIT:
return errors::InvalidArgument(
"Tensors on conflicting devices:"
" cannot compute ",
op->Name(), " as input #", i, " was expected to be on ",
expected_device->name(), " but is actually on ",
actual_device->name(), " (operation running on ",
op->Device()->name(), ")",
" Tensors can be copied explicitly using .gpu() or .cpu() "
"methods,"
" or transparently copied by using tf.enable_eager_execution("
"device_policy=tfe.DEVICE_PLACEMENT_SILENT). Copying tensors "
"between devices"
" may slow down your model");
case DEVICE_PLACEMENT_WARN:
LOG(WARNING) << "before computing " << op->Name() << " input #" << i
<< " was expected to be on " << expected_device->name()
<< " but is actually on " << actual_device->name()
<< " (operation running on " << op->Device()->name()
<< "). This triggers a copy which can be a performance "
"bottleneck.";
break;
case DEVICE_PLACEMENT_SILENT: // Do nothing.
break;
}
// We are only here if the policy is warn or silent copies, so we should
// trigger a copy.
auto pre_time = Env::Default()->NowMicros();
TensorHandle* result_handle;
Status status = EagerCopyToDevice(
*handle, ctx, expected_device->name().c_str(), &result_handle);
if (run_metadata != nullptr) {
auto* step_stats = run_metadata->mutable_step_stats();
MaybeInitializeStepStats(step_stats, ctx);
// Record the sending on the source device for now.
int device_idx = StepStatsDeviceIndex(step_stats, ctx, handle_device);
auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
auto* node_stats = dev_stats->add_node_stats();
node_stats->set_node_name("_Send");
node_stats->set_all_start_micros(pre_time);
node_stats->set_op_end_rel_micros(Env::Default()->NowMicros() - pre_time);
}
if (!status.ok()) {
if (result_handle != nullptr) result_handle->Unref();
return errors::Internal("Failed copying input tensor from ",
actual_device->name(), " to ",
expected_device->name(), " in order to run ",
op->Name(), ": ", status.error_message());
}
(*handle)->Unref();
*handle = result_handle;
}
return Status::OK();
}
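For reference, the placement policy consulted here is the one a user selects when enabling eager execution from Python; a hedged sketch of choosing the silent-copy behavior:

```python
import tensorflow as tf
import tensorflow.contrib.eager as tfe

# With SILENT, mismatched inputs are copied transparently (the copy path
# above); with EXPLICIT, the InvalidArgument error is raised instead.
tf.enable_eager_execution(device_policy=tfe.DEVICE_PLACEMENT_SILENT)
```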
Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device,
EagerOperation* op, const OpKernel* kernel,
RunMetadata* run_metadata) {
@ -78,76 +160,9 @@ Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device,
for (int i = 0; i < op->Inputs().size(); ++i) {
const Device* expected_device =
memtypes[i] == HOST_MEMORY ? host_device : op_device;
TensorHandle* handle = op->Inputs()[i];
Device* handle_device = nullptr;
TF_RETURN_IF_ERROR(handle->Device(&handle_device));
const Device* actual_device =
handle_device == nullptr ? host_device : handle_device;
if (expected_device != actual_device) {
switch (ctx->GetDevicePlacementPolicy()) {
case DEVICE_PLACEMENT_SILENT_FOR_INT32:
// TODO(xpan): See if we could bubble python related error up
// to python level.
if (handle->dtype == DT_INT32) {
// Note: enabling silent copies of int32 tensors to match behavior
// of graph mode.
break;
}
TF_FALLTHROUGH_INTENDED;
case DEVICE_PLACEMENT_EXPLICIT:
return errors::InvalidArgument(
"Tensors on conflicting devices:"
" cannot compute ",
op->Name(), " as input #", i, " was expected to be on ",
expected_device->name(), " but is actually on ",
actual_device->name(), " (operation running on ",
op_device->name(), ")",
" Tensors can be copied explicitly using .gpu() or .cpu() "
"methods,"
" or transparently copied by using tf.enable_eager_execution("
"device_policy=tfe.DEVICE_PLACEMENT_SILENT). Copying tensors "
"between devices"
" may slow down your model");
case DEVICE_PLACEMENT_WARN:
LOG(WARNING) << "before computing " << op->Name() << " input #" << i
<< " was expected to be on " << expected_device->name()
<< " but is actually on " << actual_device->name()
<< " (operation running on " << op_device->name()
<< "). This triggers a copy which can be a performance "
"bottleneck.";
break;
case DEVICE_PLACEMENT_SILENT: // Do nothing.
break;
}
// We are only here if the policy is warn or silent copies, so we should
// trigger a copy.
auto pre_time = Env::Default()->NowMicros();
TensorHandle* copied_tensor = nullptr;
Status status = EagerCopyToDevice(
handle, ctx, expected_device->name().c_str(), &copied_tensor);
if (run_metadata != nullptr) {
auto* step_stats = run_metadata->mutable_step_stats();
MaybeInitializeStepStats(step_stats, ctx);
// Record the sending on the source device for now.
int device_idx = StepStatsDeviceIndex(step_stats, ctx, handle_device);
auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
auto* node_stats = dev_stats->add_node_stats();
node_stats->set_node_name("_Send");
node_stats->set_all_start_micros(pre_time);
node_stats->set_op_end_rel_micros(Env::Default()->NowMicros() -
pre_time);
}
if (!status.ok()) {
if (copied_tensor != nullptr) copied_tensor->Unref();
return errors::Internal("Failed copying input tensor from ",
actual_device->name(), " to ",
expected_device->name(), " in order to run ",
op->Name(), ": ", status.error_message());
}
handle->Unref();
handle = copied_tensor;
(*op->MutableInputs())[i] = copied_tensor;
}
TF_RETURN_IF_ERROR(MaybeCopyInputToExpectedDevice(
op, i, expected_device, run_metadata, &((*op->MutableInputs())[i])));
tensorflow::TensorHandle* handle = op->Inputs()[i];
if (handle->dtype != kernel->input_type(i)) {
return errors::InvalidArgument(
"cannot compute ", op->Name(), " as input #", i,
@ -192,8 +207,8 @@ Status SelectDevice(const NodeDef& ndef, EagerContext* ctx, Device** device) {
// Resource4> as the input params to the synthesized function.
//
// It populates `const_input_types`, `arg_input_types` and
// `op_input_to_func_input` based on the reordering results, that the caller can
// use them to build an XlaLaunch. On error, it returns NULL, and sets
// `op_input_to_func_input` based on the reordering results, that the caller
// can use them to build an XlaLaunch. On error, it returns NULL, and sets
// `status` accordingly.
const FunctionDef* OpToFunction(TFE_Op* op,
std::vector<TF_DataType>* const_input_types,
@ -221,8 +236,8 @@ const FunctionDef* OpToFunction(TFE_Op* op,
const std::unordered_set<string> const_inputs(
*XlaOpRegistry::CompileTimeConstantInputs(op->operation.Name()));
// First add place holders for the input args, so that we can refer to them by
// position in the next loop. Also tally up the resource inputs.
// First add placeholders for the input args, so that we can refer to them
// by position in the next loop. Also tally up the resource inputs.
int num_resource_inputs = 0;
for (int i = 0; i < op_def.input_arg_size(); ++i) {
if (op_def.input_arg(i).type() == DT_RESOURCE) {
@ -336,8 +351,9 @@ std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
&op_input_to_func_input, status);
if (!status.ok()) return nullptr;
} else {
// TODO(hongm): XlaOpRegistry::CompileTimeConstantInputs() does not work for
// functions, so we need to find another way to handle constant inputs.
// TODO(hongm): XlaOpRegistry::CompileTimeConstantInputs() does not work
// for functions, so we need to find another way to handle constant
// inputs.
for (int i = const_input_types.size();
i < fdef->signature().input_arg_size(); ++i) {
VLOG(1) << "Adding Targs from input arg " << i;
@ -348,8 +364,9 @@ std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
DCHECK(fdef != nullptr);
// Copy inputs and their devices.
// Since input param reordering may have occurred between `op` and `launch_op`
// via `op_input_to_func_input`, adjust the actual inputs accordingly.
// Since input param reordering may have occurred between `op` and
// `launch_op` via `op_input_to_func_input`, adjust the actual inputs
// accordingly.
*launch_op->operation.MutableInputs() = op->operation.Inputs();
for (TensorHandle* h : launch_op->operation.Inputs()) {
h->Ref();
@ -545,24 +562,24 @@ Status EagerLocalExecute(EagerOperation* op,
Status EagerRemoteExecute(EagerOperation* op, eager::EagerClient* eager_client,
uint64 context_id, TensorHandle** retvals,
int* num_retvals) {
// All tensors must be on the same device.
// TODO(nareshmodi): handle silent copies
eager::EnqueueRequest request;
eager::EnqueueResponse response;
auto* remote_op = request.add_queue()->mutable_operation();
for (auto* input : op->Inputs()) {
for (int i = 0; i < op->Inputs().size(); i++) {
tensorflow::Device* input_device;
TF_RETURN_IF_ERROR(input->Device(&input_device));
TF_RETURN_IF_ERROR(op->Inputs()[i]->Device(&input_device));
if (op->Device() != input_device) {
return tensorflow::errors::InvalidArgument(
"Ops and inputs are not on the same device. Use "
"TFE_TensorHandleCopyToDevice to get ops on the same "
"device. Expected device: ",
op->Device()->name(), ", Actual device: ", input_device->name());
// TODO(b/110044833): It's possible the same tensor gets copied to the
// remote device repeatedly.
TF_RETURN_IF_ERROR(MaybeCopyInputToExpectedDevice(
op, i, op->Device(), /* run_metadata= */ nullptr,
&(*op->MutableInputs())[i]));
}
tensorflow::TensorHandle* input = op->Inputs()[i];
tensorflow::uint64 op_id;
int32 output_num;
TF_RETURN_IF_ERROR(input->RemoteAddress(&op_id, &output_num));

View File

@ -42,7 +42,7 @@ load(
# Check that tensorflow/core:tensorflow does not depend on grpc.
check_deps(
name = "core_tensorflow_check_deps",
disallowed_deps = ["@grpc//:grpc++_unsecure"],
disallowed_deps = ["@grpc//:grpc++"],
deps = ["//tensorflow/core:tensorflow"],
)
@ -150,7 +150,7 @@ tf_cuda_library(
"//tensorflow/core:lib_internal",
"//tensorflow/core:proto_text",
"//tensorflow/core:protos_all_cc",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
alwayslink = 1,
)
@ -170,7 +170,7 @@ tf_cuda_library(
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:protos_all_cc",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
alwayslink = 1,
)

View File

@ -649,7 +649,7 @@ tf_cuda_cc_test(
"//tensorflow/core/kernels:dense_update_ops",
"//tensorflow/core/kernels:identity_op",
"//tensorflow/core/kernels:variable_ops",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)
@ -682,7 +682,7 @@ tf_cuda_cc_test(
"//tensorflow/core/distributed_runtime/rpc:grpc_testlib",
"//tensorflow/core/distributed_runtime/rpc:grpc_util",
"//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)

View File

@ -65,8 +65,8 @@ cc_library(
"//tensorflow/core/distributed_runtime:worker_env",
"//tensorflow/core/distributed_runtime/eager:remote_tensor_handle",
"//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc_unsecure",
"@grpc",
"@grpc//:grpc++",
],
)

View File

@ -41,8 +41,8 @@ cc_library(
srcs = ["grpc_util.cc"],
hdrs = ["grpc_util.h"],
deps = [
"@grpc//:grpc_unsecure",
"@grpc//:grpc++_unsecure",
"@grpc",
"@grpc//:grpc++",
"//tensorflow/core:lib",
# Required to be able to overload TensorResponse parsing.
"//tensorflow/core/distributed_runtime:tensor_coding",
@ -56,7 +56,7 @@ cc_library(
deps = [
":grpc_util",
"//tensorflow/core:lib",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)
@ -70,7 +70,7 @@ cc_library(
"//tensorflow/core:lib",
"//tensorflow/core/distributed_runtime:call_options",
"//tensorflow/core/distributed_runtime:tensor_coding",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)
@ -90,7 +90,7 @@ cc_library(
"//tensorflow/core/distributed_runtime:tensor_coding",
"//tensorflow/core/distributed_runtime:worker_cache_logger",
"//tensorflow/core/distributed_runtime:worker_interface",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)
@ -103,7 +103,7 @@ cc_library(
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)
@ -118,7 +118,7 @@ cc_library(
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:worker_proto_cc",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)
@ -129,7 +129,7 @@ cc_library(
deps = [
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)
@ -180,7 +180,7 @@ tf_cuda_library(
"//tensorflow/core/distributed_runtime:worker_cache",
"//tensorflow/core/distributed_runtime:worker_env",
"//tensorflow/core/distributed_runtime:worker_session",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)
@ -192,7 +192,7 @@ cc_library(
":grpc_util",
"//tensorflow/core:worker_proto_cc",
"//tensorflow/core/distributed_runtime:tensor_coding",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)
@ -225,7 +225,7 @@ cc_library(
"//tensorflow/core:lib_internal",
"//tensorflow/core:master_proto_cc",
"//tensorflow/core/distributed_runtime:master",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
alwayslink = 1,
)
@ -236,7 +236,7 @@ cc_library(
hdrs = ["grpc_master_service_impl.h"],
deps = [
"//tensorflow/core:master_proto_cc",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)
@ -285,8 +285,8 @@ cc_library(
"//tensorflow/core/distributed_runtime:server_lib",
"//tensorflow/core/distributed_runtime:session_mgr",
"//tensorflow/core/distributed_runtime:worker_env",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc_unsecure",
"@grpc",
"@grpc//:grpc++",
],
alwayslink = 1,
)
@ -313,7 +313,7 @@ tf_cc_binary(
"//tensorflow/core:protos_all_cc",
"//tensorflow/core/distributed_runtime:server_lib",
"//tensorflow/core/kernels:data_flow",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)
@ -338,7 +338,7 @@ tf_cc_binary(
"//tensorflow/core/kernels:matmul_op",
"//tensorflow/core/kernels:reduction_ops",
"//tensorflow/core/kernels:variable_ops",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)
@ -432,7 +432,7 @@ tf_cc_test(
"//tensorflow/core:test_main",
"//tensorflow/core:testlib",
"//tensorflow/core:worker_proto_cc",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)
@ -445,8 +445,8 @@ tf_cc_test(
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core:worker_proto_cc",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc_unsecure",
"@grpc",
"@grpc//:grpc++",
],
)

View File

@ -12,7 +12,7 @@ cc_library(
hdrs = ["grpc_eager_service.h"],
deps = [
"//tensorflow/core:eager_service_proto_cc",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)
@ -29,7 +29,7 @@ cc_library(
"//tensorflow/core/distributed_runtime/rpc:grpc_state",
"//tensorflow/core/distributed_runtime/rpc:grpc_util",
"//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_service",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)
@ -48,7 +48,7 @@ cc_library(
"//tensorflow/core/distributed_runtime/rpc:grpc_util",
"//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache",
"//tensorflow/core/distributed_runtime/rpc:grpc_worker_service",
"@grpc//:grpc++_unsecure",
"@grpc//:grpc++",
],
)

View File

@ -0,0 +1,29 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "BesselI0e", functor::bessel_i0e, Eigen::half, float,
double);
REGISTER3(UnaryOp, CPU, "BesselI1e", functor::bessel_i1e, Eigen::half, float,
double);
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "BesselI0e", functor::bessel_i0e, Eigen::half, float,
double);
REGISTER3(UnaryOp, GPU, "BesselI1e", functor::bessel_i1e, Eigen::half, float,
double);
#endif
} // namespace tensorflow
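These kernels implement the exponentially scaled modified Bessel functions of the first kind, i0e(x) = exp(-|x|) * I0(x) and i1e(x) = exp(-|x|) * I1(x). A minimal Python check against SciPy, mirroring the unit tests added later in this change (SciPy is an optional dependency; the internal math_ops symbols are used because the public export name is an assumption):
# Hedged sketch: exercising the new BesselI0e/BesselI1e kernels from Python.
import numpy as np
from scipy import special  # optional dependency, as in the tests below
from tensorflow.python.client import session
from tensorflow.python.ops import math_ops

x = np.linspace(-5.0, 5.0, 11).astype(np.float64)
with session.Session() as sess:
  i0e, i1e = sess.run([math_ops.bessel_i0e(x), math_ops.bessel_i1e(x)])
np.testing.assert_allclose(i0e, special.i0e(x), rtol=1e-6)
np.testing.assert_allclose(i1e, special.i1e(x), rtol=1e-6)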

View File

@ -0,0 +1,27 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
namespace tensorflow {
namespace functor {
DEFINE_UNARY3(bessel_i0e, Eigen::half, float, double);
DEFINE_UNARY3(bessel_i1e, Eigen::half, float, double);
} // namespace functor
} // namespace tensorflow
#endif // GOOGLE_CUDA

View File

@ -616,6 +616,12 @@ struct acos : base<T, Eigen::internal::scalar_acos_op<T>> {};
template <typename T>
struct atan : base<T, Eigen::internal::scalar_atan_op<T>> {};
template <typename T>
struct bessel_i0e : base<T, Eigen::internal::scalar_i0e_op<T>> {};
template <typename T>
struct bessel_i1e : base<T, Eigen::internal::scalar_i1e_op<T>> {};
struct logical_not : base<bool, Eigen::internal::scalar_boolean_not_op<bool>> {
};

View File

@ -207,12 +207,6 @@ class IteratorResource : public ResourceBase {
return Status::OK();
}
std::shared_ptr<StatsAggregator> stats_aggregator() {
tf_shared_lock l(mu_);
return stats_aggregator_;
}
string DebugString() override { return "Iterator resource"; }
const DataTypeVector& output_dtypes() const { return output_dtypes_; }
@ -231,7 +225,6 @@ class IteratorResource : public ResourceBase {
FunctionLibraryRuntime* lib_ = nullptr; // not owned.
std::shared_ptr<IteratorBase> iterator_;
mutex mu_;
std::shared_ptr<StatsAggregator> stats_aggregator_ GUARDED_BY(mu_);
std::shared_ptr<const FunctionLibraryDefinition> lib_def_ GUARDED_BY(mu_);
const DataTypeVector output_dtypes_;
const std::vector<PartialTensorShape> output_shapes_;
@ -944,9 +937,6 @@ class IteratorGetNextOp : public AsyncOpKernel {
IteratorContext::Params params;
params.env = ctx->env();
params.stats_aggregator_getter = [iterator]() {
return iterator->stats_aggregator();
};
params.runner = *(ctx->runner());
params.function_library = iterator->function_library();
DeviceBase* device = ctx->function_library()->device();
@ -995,9 +985,6 @@ class IteratorGetNextSyncOp : public OpKernel {
IteratorContext::Params params;
params.env = ctx->env();
params.stats_aggregator_getter = [iterator]() {
return iterator->stats_aggregator();
};
params.runner = *(ctx->runner());
params.function_library = iterator->function_library();
DeviceBase* device = ctx->function_library()->device();

View File

@ -10085,6 +10085,52 @@ op {
}
}
}
op {
name: "BesselI0e"
input_arg {
name: "x"
type_attr: "T"
}
output_arg {
name: "y"
type_attr: "T"
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_BFLOAT16
type: DT_HALF
type: DT_FLOAT
type: DT_DOUBLE
}
}
}
}
op {
name: "BesselI1e"
input_arg {
name: "x"
type_attr: "T"
}
output_arg {
name: "y"
type_attr: "T"
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_BFLOAT16
type: DT_HALF
type: DT_FLOAT
type: DT_DOUBLE
}
}
}
}
op {
name: "Betainc"
input_arg {
@ -25468,6 +25514,44 @@ op {
type: "func"
}
}
op {
name: "If"
input_arg {
name: "cond"
type_attr: "Tcond"
}
input_arg {
name: "input"
type_list_attr: "Tin"
}
output_arg {
name: "output"
type_list_attr: "Tout"
}
attr {
name: "Tcond"
type: "type"
}
attr {
name: "Tin"
type: "list(type)"
has_minimum: true
}
attr {
name: "Tout"
type: "list(type)"
has_minimum: true
minimum: 1
}
attr {
name: "then_branch"
type: "func"
}
attr {
name: "else_branch"
type: "func"
}
}
op {
name: "Igamma"
input_arg {

View File

@ -239,6 +239,10 @@ REGISTER_OP("Acos").UNARY();
REGISTER_OP("Atan").UNARY();
REGISTER_OP("BesselI0e").UNARY_REAL();
REGISTER_OP("BesselI1e").UNARY_REAL();
#undef UNARY
#undef UNARY_REAL
#undef UNARY_COMPLEX

View File

@ -3860,6 +3860,52 @@ op {
}
}
}
op {
name: "BesselI0e"
input_arg {
name: "x"
type_attr: "T"
}
output_arg {
name: "y"
type_attr: "T"
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_BFLOAT16
type: DT_HALF
type: DT_FLOAT
type: DT_DOUBLE
}
}
}
}
op {
name: "BesselI1e"
input_arg {
name: "x"
type_attr: "T"
}
output_arg {
name: "y"
type_attr: "T"
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_BFLOAT16
type: DT_HALF
type: DT_FLOAT
type: DT_DOUBLE
}
}
}
}
op {
name: "Betainc"
input_arg {
@ -12358,7 +12404,6 @@ op {
name: "Tin"
type: "list(type)"
has_minimum: true
minimum: 1
}
attr {
name: "Tout"

View File

@ -4210,69 +4210,6 @@ func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
return op.Output(0)
}
// Shuffle dimensions of x according to a permutation.
//
// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
// `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
if scope.Err() != nil {
return
}
opspec := tf.OpSpec{
Type: "Transpose",
Input: []tf.Input{
x, perm,
},
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// MinAttr is an optional argument to Min.
type MinAttr func(optionalAttr)
// MinKeepDims sets the optional keep_dims attribute to value.
//
// value: If true, retain reduced dimensions with length 1.
// If not specified, defaults to false
func MinKeepDims(value bool) MinAttr {
return func(m optionalAttr) {
m["keep_dims"] = value
}
}
// Computes the minimum of elements across dimensions of a tensor.
//
// Reduces `input` along the dimensions given in `axis`. Unless
// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
// `axis`. If `keep_dims` is true, the reduced dimensions are
// retained with length 1.
//
// Arguments:
// input: The tensor to reduce.
// axis: The dimensions to reduce. Must be in the range
// `[-rank(input), rank(input))`.
//
// Returns The reduced tensor.
func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
if scope.Err() != nil {
return
}
attrs := map[string]interface{}{}
for _, a := range optional {
a(attrs)
}
opspec := tf.OpSpec{
Type: "Min",
Input: []tf.Input{
input, axis,
},
Attrs: attrs,
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
type Conv2DBackpropFilterAttr func(optionalAttr)
@ -6181,6 +6118,77 @@ func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
return op.Output(0)
}
// Computes offsets of concat inputs within its output.
//
// For example:
//
// ```
// # 'x' is [2, 2, 7]
// # 'y' is [2, 3, 7]
// # 'z' is [2, 5, 7]
// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
// ```
//
// This is typically used by gradient computations for a concat operation.
//
// Arguments:
// concat_dim: The dimension along which to concatenate.
// shape: The `N` int32 vectors representing shape of tensors being concatenated.
//
// Returns The `N` int32 vectors representing the starting offset
// of input tensors within the concatenated output.
func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
if scope.Err() != nil {
return
}
opspec := tf.OpSpec{
Type: "ConcatOffset",
Input: []tf.Input{
concat_dim, tf.OutputList(shape),
},
}
op := scope.AddOperation(opspec)
if scope.Err() != nil {
return
}
var idx int
var err error
if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
scope.UpdateErr("ConcatOffset", err)
return
}
return offset
}
// Compute the lower regularized incomplete Gamma function `P(a, x)`.
//
// The lower regularized incomplete Gamma function is defined as:
//
//
// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
//
// where
//
// \\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
//
// is the lower incomplete Gamma function.
//
// Note, above `Q(a, x)` (`Igammac`) is the upper regularized incomplete
// Gamma function.
func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
if scope.Err() != nil {
return
}
opspec := tf.OpSpec{
Type: "Igamma",
Input: []tf.Input{
a, x,
},
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// DepthToSpaceAttr is an optional argument to DepthToSpace.
type DepthToSpaceAttr func(optionalAttr)
@ -7000,6 +7008,69 @@ func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output)
return op.Output(0)
}
// Shuffle dimensions of x according to a permutation.
//
// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
// `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
if scope.Err() != nil {
return
}
opspec := tf.OpSpec{
Type: "Transpose",
Input: []tf.Input{
x, perm,
},
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// MinAttr is an optional argument to Min.
type MinAttr func(optionalAttr)
// MinKeepDims sets the optional keep_dims attribute to value.
//
// value: If true, retain reduced dimensions with length 1.
// If not specified, defaults to false
func MinKeepDims(value bool) MinAttr {
return func(m optionalAttr) {
m["keep_dims"] = value
}
}
// Computes the minimum of elements across dimensions of a tensor.
//
// Reduces `input` along the dimensions given in `axis`. Unless
// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
// `axis`. If `keep_dims` is true, the reduced dimensions are
// retained with length 1.
//
// Arguments:
// input: The tensor to reduce.
// axis: The dimensions to reduce. Must be in the range
// `[-rank(input), rank(input))`.
//
// Returns The reduced tensor.
func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
if scope.Err() != nil {
return
}
attrs := map[string]interface{}{}
for _, a := range optional {
a(attrs)
}
opspec := tf.OpSpec{
Type: "Min",
Input: []tf.Input{
input, axis,
},
Attrs: attrs,
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// Transforms a Tensor into a serialized TensorProto proto.
//
// Arguments:
@ -11592,60 +11663,6 @@ func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output
return op.Output(0)
}
// ResizeAreaAttr is an optional argument to ResizeArea.
type ResizeAreaAttr func(optionalAttr)
// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
//
// value: If true, the centers of the 4 corner pixels of the input and output tensors are
// aligned, preserving the values at the corner pixels. Defaults to false.
// If not specified, defaults to false
func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
return func(m optionalAttr) {
m["align_corners"] = value
}
}
// Resize `images` to `size` using area interpolation.
//
// Input images can be of different types but output images are always float.
//
// The range of pixel values for the output image might be slightly different
// from the range for the input image because of limited numerical precision.
// To guarantee an output range, for example `[0.0, 1.0]`, apply
// `tf.clip_by_value` to the output.
//
// Each output pixel is computed by first transforming the pixel's footprint into
// the input tensor and then averaging the pixels that intersect the footprint. An
// input pixel's contribution to the average is weighted by the fraction of its
// area that intersects the footprint. This is the same as OpenCV's INTER_AREA.
//
// Arguments:
// images: 4-D with shape `[batch, height, width, channels]`.
// size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The
// new size for the images.
//
// Returns 4-D with shape
// `[batch, new_height, new_width, channels]`.
func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
if scope.Err() != nil {
return
}
attrs := map[string]interface{}{}
for _, a := range optional {
a(attrs)
}
opspec := tf.OpSpec{
Type: "ResizeArea",
Input: []tf.Input{
images, size,
},
Attrs: attrs,
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// 2D real-valued fast Fourier transform.
//
// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
@ -13635,170 +13652,6 @@ func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values
return op.Output(0), op.Output(1)
}
// ComplexAttr is an optional argument to Complex.
type ComplexAttr func(optionalAttr)
// ComplexTout sets the optional Tout attribute to value.
// If not specified, defaults to DT_COMPLEX64
func ComplexTout(value tf.DataType) ComplexAttr {
return func(m optionalAttr) {
m["Tout"] = value
}
}
// Converts two real numbers to a complex number.
//
// Given a tensor `real` representing the real part of a complex number, and a
// tensor `imag` representing the imaginary part of a complex number, this
// operation returns complex numbers elementwise of the form \\(a + bj\\), where
// *a* represents the `real` part and *b* represents the `imag` part.
//
// The input tensors `real` and `imag` must have the same shape.
//
// For example:
//
// ```
// # tensor 'real' is [2.25, 3.25]
// # tensor `imag` is [4.75, 5.75]
// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
// ```
func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
if scope.Err() != nil {
return
}
attrs := map[string]interface{}{}
for _, a := range optional {
a(attrs)
}
opspec := tf.OpSpec{
Type: "Complex",
Input: []tf.Input{
real, imag,
},
Attrs: attrs,
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// ImagAttr is an optional argument to Imag.
type ImagAttr func(optionalAttr)
// ImagTout sets the optional Tout attribute to value.
// If not specified, defaults to DT_FLOAT
func ImagTout(value tf.DataType) ImagAttr {
return func(m optionalAttr) {
m["Tout"] = value
}
}
// Returns the imaginary part of a complex number.
//
// Given a tensor `input` of complex numbers, this operation returns a tensor of
// type `float` that is the imaginary part of each element in `input`. All
// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
// is the real part and *b* is the imaginary part returned by this operation.
//
// For example:
//
// ```
// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
// tf.imag(input) ==> [4.75, 5.75]
// ```
func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
if scope.Err() != nil {
return
}
attrs := map[string]interface{}{}
for _, a := range optional {
a(attrs)
}
opspec := tf.OpSpec{
Type: "Imag",
Input: []tf.Input{
input,
},
Attrs: attrs,
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// Computes the maximum along segments of a tensor.
//
// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
// segments.
//
// Computes a tensor such that
// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
// that `segment_ids[j] == i`.
//
// If the max is empty for a given segment ID `i`, `output[i] = 0`.
//
// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
// </div>
//
// Arguments:
//
// segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
// first dimension. Values should be sorted and can be repeated.
//
// Returns Has same shape as data, except for dimension 0 which
// has size `k`, the number of segments.
func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
if scope.Err() != nil {
return
}
opspec := tf.OpSpec{
Type: "SegmentMax",
Input: []tf.Input{
data, segment_ids,
},
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// Computes hyperbolic tangent of `x` element-wise.
func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
if scope.Err() != nil {
return
}
opspec := tf.OpSpec{
Type: "Tanh",
Input: []tf.Input{
x,
},
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// Creates a dataset that skips `count` elements from the `input_dataset`.
//
// Arguments:
//
// count: A scalar representing the number of elements from the `input_dataset`
// that should be skipped. If count is -1, skips everything.
//
//
func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
if scope.Err() != nil {
return
}
attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
opspec := tf.OpSpec{
Type: "SkipDataset",
Input: []tf.Input{
input_dataset, count,
},
Attrs: attrs,
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
//
// The Hurwitz zeta function is defined as:
@ -14064,49 +13917,6 @@ func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
return op.Output(0)
}
// RealAttr is an optional argument to Real.
type RealAttr func(optionalAttr)
// RealTout sets the optional Tout attribute to value.
// If not specified, defaults to DT_FLOAT
func RealTout(value tf.DataType) RealAttr {
return func(m optionalAttr) {
m["Tout"] = value
}
}
// Returns the real part of a complex number.
//
// Given a tensor `input` of complex numbers, this operation returns a tensor of
// type `float` that is the real part of each element in `input`. All elements in
// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
// part returned by this operation and *b* is the imaginary part.
//
// For example:
//
// ```
// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
// tf.real(input) ==> [-2.25, 3.25]
// ```
func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
if scope.Err() != nil {
return
}
attrs := map[string]interface{}{}
for _, a := range optional {
a(attrs)
}
opspec := tf.OpSpec{
Type: "Real",
Input: []tf.Input{
input,
},
Attrs: attrs,
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// AudioSummaryAttr is an optional argument to AudioSummary.
type AudioSummaryAttr func(optionalAttr)
@ -19704,6 +19514,267 @@ func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...Or
return op.Output(0)
}
// ComplexAttr is an optional argument to Complex.
type ComplexAttr func(optionalAttr)
// ComplexTout sets the optional Tout attribute to value.
// If not specified, defaults to DT_COMPLEX64
func ComplexTout(value tf.DataType) ComplexAttr {
return func(m optionalAttr) {
m["Tout"] = value
}
}
// Converts two real numbers to a complex number.
//
// Given a tensor `real` representing the real part of a complex number, and a
// tensor `imag` representing the imaginary part of a complex number, this
// operation returns complex numbers elementwise of the form \\(a + bj\\), where
// *a* represents the `real` part and *b* represents the `imag` part.
//
// The input tensors `real` and `imag` must have the same shape.
//
// For example:
//
// ```
// # tensor 'real' is [2.25, 3.25]
// # tensor `imag` is [4.75, 5.75]
// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
// ```
func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
if scope.Err() != nil {
return
}
attrs := map[string]interface{}{}
for _, a := range optional {
a(attrs)
}
opspec := tf.OpSpec{
Type: "Complex",
Input: []tf.Input{
real, imag,
},
Attrs: attrs,
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// ImagAttr is an optional argument to Imag.
type ImagAttr func(optionalAttr)
// ImagTout sets the optional Tout attribute to value.
// If not specified, defaults to DT_FLOAT
func ImagTout(value tf.DataType) ImagAttr {
return func(m optionalAttr) {
m["Tout"] = value
}
}
// Returns the imaginary part of a complex number.
//
// Given a tensor `input` of complex numbers, this operation returns a tensor of
// type `float` that is the imaginary part of each element in `input`. All
// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
// is the real part and *b* is the imaginary part returned by this operation.
//
// For example:
//
// ```
// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
// tf.imag(input) ==> [4.75, 5.75]
// ```
func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
if scope.Err() != nil {
return
}
attrs := map[string]interface{}{}
for _, a := range optional {
a(attrs)
}
opspec := tf.OpSpec{
Type: "Imag",
Input: []tf.Input{
input,
},
Attrs: attrs,
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// Computes the maximum along segments of a tensor.
//
// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
// segments.
//
// Computes a tensor such that
// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
// that `segment_ids[j] == i`.
//
// If the max is empty for a given segment ID `i`, `output[i] = 0`.
//
// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
// </div>
//
// Arguments:
//
// segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
// first dimension. Values should be sorted and can be repeated.
//
// Returns Has same shape as data, except for dimension 0 which
// has size `k`, the number of segments.
func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
if scope.Err() != nil {
return
}
opspec := tf.OpSpec{
Type: "SegmentMax",
Input: []tf.Input{
data, segment_ids,
},
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// Computes hyperbolic tangent of `x` element-wise.
func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
if scope.Err() != nil {
return
}
opspec := tf.OpSpec{
Type: "Tanh",
Input: []tf.Input{
x,
},
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// Creates a dataset that skips `count` elements from the `input_dataset`.
//
// Arguments:
//
// count: A scalar representing the number of elements from the `input_dataset`
// that should be skipped. If count is -1, skips everything.
//
//
func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
if scope.Err() != nil {
return
}
attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
opspec := tf.OpSpec{
Type: "SkipDataset",
Input: []tf.Input{
input_dataset, count,
},
Attrs: attrs,
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// RealAttr is an optional argument to Real.
type RealAttr func(optionalAttr)
// RealTout sets the optional Tout attribute to value.
// If not specified, defaults to DT_FLOAT
func RealTout(value tf.DataType) RealAttr {
return func(m optionalAttr) {
m["Tout"] = value
}
}
// Returns the real part of a complex number.
//
// Given a tensor `input` of complex numbers, this operation returns a tensor of
// type `float` that is the real part of each element in `input`. All elements in
// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
// part returned by this operation and *b* is the imaginary part.
//
// For example:
//
// ```
// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
// tf.real(input) ==> [-2.25, 3.25]
// ```
func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
if scope.Err() != nil {
return
}
attrs := map[string]interface{}{}
for _, a := range optional {
a(attrs)
}
opspec := tf.OpSpec{
Type: "Real",
Input: []tf.Input{
input,
},
Attrs: attrs,
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// ResizeAreaAttr is an optional argument to ResizeArea.
type ResizeAreaAttr func(optionalAttr)
// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
//
// value: If true, the centers of the 4 corner pixels of the input and output tensors are
// aligned, preserving the values at the corner pixels. Defaults to false.
// If not specified, defaults to false
func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
return func(m optionalAttr) {
m["align_corners"] = value
}
}
// Resize `images` to `size` using area interpolation.
//
// Input images can be of different types but output images are always float.
//
// The range of pixel values for the output image might be slightly different
// from the range for the input image because of limited numerical precision.
// To guarantee an output range, for example `[0.0, 1.0]`, apply
// `tf.clip_by_value` to the output.
//
// Each output pixel is computed by first transforming the pixel's footprint into
// the input tensor and then averaging the pixels that intersect the footprint. An
// input pixel's contribution to the average is weighted by the fraction of its
// area that intersects the footprint. This is the same as OpenCV's INTER_AREA.
//
// Arguments:
// images: 4-D with shape `[batch, height, width, channels]`.
// size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The
// new size for the images.
//
// Returns 4-D with shape
// `[batch, new_height, new_width, channels]`.
func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
if scope.Err() != nil {
return
}
attrs := map[string]interface{}{}
for _, a := range optional {
a(attrs)
}
opspec := tf.OpSpec{
Type: "ResizeArea",
Input: []tf.Input{
images, size,
},
Attrs: attrs,
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// VarHandleOpAttr is an optional argument to VarHandleOp.
type VarHandleOpAttr func(optionalAttr)
@ -30639,74 +30710,3 @@ func UnravelIndex(scope *Scope, indices tf.Output, dims tf.Output) (output tf.Ou
op := scope.AddOperation(opspec)
return op.Output(0)
}
// Compute the lower regularized incomplete Gamma function `P(a, x)`.
//
// The lower regularized incomplete Gamma function is defined as:
//
//
// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
//
// where
//
// \\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
//
// is the lower incomplete Gamma function.
//
// Note, above `Q(a, x)` (`Igammac`) is the upper regularized incomplete
// Gamma function.
func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
if scope.Err() != nil {
return
}
opspec := tf.OpSpec{
Type: "Igamma",
Input: []tf.Input{
a, x,
},
}
op := scope.AddOperation(opspec)
return op.Output(0)
}
// Computes offsets of concat inputs within its output.
//
// For example:
//
// ```
// # 'x' is [2, 2, 7]
// # 'y' is [2, 3, 7]
// # 'z' is [2, 5, 7]
// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
// ```
//
// This is typically used by gradient computations for a concat operation.
//
// Arguments:
// concat_dim: The dimension along which to concatenate.
// shape: The `N` int32 vectors representing shape of tensors being concatenated.
//
// Returns The `N` int32 vectors representing the starting offset
// of input tensors within the concatenated output.
func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
if scope.Err() != nil {
return
}
opspec := tf.OpSpec{
Type: "ConcatOffset",
Input: []tf.Input{
concat_dim, tf.OutputList(shape),
},
}
op := scope.AddOperation(opspec)
if scope.Err() != nil {
return
}
var idx int
var err error
if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
scope.UpdateErr("ConcatOffset", err)
return
}
return offset
}

View File

@ -2530,6 +2530,7 @@ py_library(
":check_ops",
":confusion_matrix",
":control_flow_ops",
":distribute",
":framework",
":framework_for_generated_wrappers",
":math_ops",

View File

@ -259,9 +259,7 @@ class DatasetConstructorTest(test.TestCase):
sess.run(init_op)
self.assertAllEqual([1, 2, 3], sess.run(get_next))
self.assertAllEqual([4, 5, 6], sess.run(get_next))
# NOTE(mrry): Type name in message differs between Python 2 (`long`) and
# 3 (`int`).
with self.assertRaisesOpError(r"invalid literal for"):
with self.assertRaisesOpError("The expected type was int64"):
sess.run(get_next)
self.assertAllEqual([7, 8, 9], sess.run(get_next))
with self.assertRaises(errors.OutOfRangeError):
@ -290,6 +288,34 @@ class DatasetConstructorTest(test.TestCase):
with self.assertRaises(errors.OutOfRangeError):
sess.run(get_next)
def testFromGeneratorStructureError(self):
def generator():
yield 1, 2
yield 3, 4
yield 5
yield 6, 7, 8
yield 9, 10
iterator = (dataset_ops.Dataset.from_generator(
generator, output_types=(dtypes.int64, dtypes.int64))
.make_initializable_iterator())
init_op = iterator.initializer
get_next = iterator.get_next()
with self.test_session() as sess:
sess.run(init_op)
self.assertEqual((1, 2), sess.run(get_next))
self.assertEqual((3, 4), sess.run(get_next))
with self.assertRaisesOpError(
r"The expected structure was \(tf\.int64, tf\.int64\)"):
sess.run(get_next)
with self.assertRaisesOpError(
r"The expected structure was \(tf\.int64, tf\.int64\)"):
sess.run(get_next)
self.assertEqual((9, 10), sess.run(get_next))
with self.assertRaises(errors.OutOfRangeError):
sess.run(get_next)
def testFromGeneratorHeterogeneous(self):
def generator():
yield 1

View File

@ -223,6 +223,13 @@ class Dataset(object):
def from_tensors(tensors):
"""Creates a `Dataset` with a single element, comprising the given tensors.
Note that if `tensors` contains a NumPy array, and eager execution is not
enabled, the values will be embedded in the graph as one or more
@{tf.constant} operations. For large datasets (> 1 GB), this can waste
memory and run into byte limits of graph serialization. If tensors contains
one or more large NumPy arrays, consider the alternative described in
@{$programmers_guide/datasets#consuming_numpy_arrays$this guide}.
Args:
tensors: A nested structure of tensors.
@ -235,6 +242,13 @@ class Dataset(object):
def from_tensor_slices(tensors):
"""Creates a `Dataset` whose elements are slices of the given tensors.
Note that if `tensors` contains a NumPy array, and eager execution is not
enabled, the values will be embedded in the graph as one or more
@{tf.constant} operations. For large datasets (> 1 GB), this can waste
memory and run into byte limits of graph serialization. If tensors contains
one or more large NumPy arrays, consider the alternative described in
@{$programmers_guide/datasets#consuming_numpy_arrays$this guide}.
Args:
tensors: A nested structure of tensors, each having the same size in the
0th dimension.
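A minimal sketch of the alternative that both docstrings point to: feed the large NumPy array through a placeholder when initializing the iterator, instead of embedding it in the graph. This follows the pattern in the datasets guide; the array contents and batch size below are illustrative:
# Hedged sketch: avoid embedding a large NumPy array as a tf.constant by
# feeding it at iterator-initialization time.
import numpy as np
import tensorflow as tf

features = np.random.rand(1000000, 10).astype(np.float32)  # illustrative data
features_ph = tf.placeholder(features.dtype, features.shape)
dataset = tf.data.Dataset.from_tensor_slices(features_ph).batch(128)
iterator = dataset.make_initializable_iterator()
next_batch = iterator.get_next()

with tf.Session() as sess:
  sess.run(iterator.initializer, feed_dict={features_ph: features})
  first = sess.run(next_batch)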
@ -409,13 +423,23 @@ class Dataset(object):
# Use the same _convert function from the py_func() implementation to
# convert the returned values to arrays early, so that we can inspect
# their values.
# pylint: disable=protected-access
ret_arrays = [
script_ops.FuncRegistry._convert(ret, dtype=dtype.as_numpy_dtype)
for ret, dtype in zip(
nest.flatten_up_to(output_types, values), flattened_types)
]
# pylint: enable=protected-access
try:
flattened_values = nest.flatten_up_to(output_types, values)
except (TypeError, ValueError):
raise TypeError(
"`generator` yielded an element that did not match the expected "
"structure. The expected structure was %s, but the yielded "
"element was %s." % (output_types, values))
ret_arrays = []
for ret, dtype in zip(flattened_values, flattened_types):
try:
ret_arrays.append(script_ops.FuncRegistry._convert( # pylint: disable=protected-access
ret, dtype=dtype.as_numpy_dtype))
except (TypeError, ValueError):
raise TypeError(
"`generator` yielded an element that could not be converted to "
"the expected type. The expected type was %s, but the yielded "
"element was %s." % (dtype.name, ret))
# Additional type and shape checking to ensure that the components
# of the generated element match the `output_types` and `output_shapes`

View File

@ -451,42 +451,48 @@ def get_error_intro(tf_error):
sample commands for debugging.
"""
op_name = tf_error.op.name
if hasattr(tf_error, "op") and hasattr(tf_error.op, "name"):
op_name = tf_error.op.name
else:
op_name = None
intro_lines = [
"--------------------------------------",
RL("!!! An error occurred during the run !!!", "blink"),
"",
"You may use the following commands to debug:",
]
out = debugger_cli_common.rich_text_lines_from_rich_line_list(intro_lines)
out.extend(
_recommend_command("ni -a -d -t %s" % op_name,
"Inspect information about the failing op.",
create_link=True))
out.extend(
_recommend_command("li -r %s" % op_name,
"List inputs to the failing op, recursively.",
create_link=True))
if op_name is not None:
out.extend(debugger_cli_common.RichTextLines(
["You may use the following commands to debug:"]))
out.extend(
_recommend_command("ni -a -d -t %s" % op_name,
"Inspect information about the failing op.",
create_link=True))
out.extend(
_recommend_command("li -r %s" % op_name,
"List inputs to the failing op, recursively.",
create_link=True))
out.extend(
_recommend_command(
"lt",
"List all tensors dumped during the failing run() call.",
create_link=True))
out.extend(
_recommend_command(
"lt",
"List all tensors dumped during the failing run() call.",
create_link=True))
else:
out.extend(debugger_cli_common.RichTextLines([
"WARNING: Cannot determine the name of the op that caused the error."]))
more_lines = [
"",
"Op name: " + op_name,
"Op name: %s" % op_name,
"Error type: " + str(type(tf_error)),
"",
"Details:",
str(tf_error),
"",
"WARNING: Using client GraphDef due to the error, instead of "
"executor GraphDefs.",
"--------------------------------------",
"",
]

View File

@ -372,6 +372,11 @@ class GetErrorIntroTest(test_util.TensorFlowTestCase):
self.assertEqual("Details:", error_intro.lines[14])
self.assertStartsWith(error_intro.lines[15], "foo description")
def testGetErrorIntroForNoOpName(self):
tf_error = errors.OpError(None, None, "Fake OpError", -1)
error_intro = cli_shared.get_error_intro(tf_error)
self.assertIn("Cannot determine the name of the op", error_intro.lines[3])
if __name__ == "__main__":
googletest.main()

View File

@ -69,6 +69,12 @@ run
exit
EOF
cat << EOF | ${DEBUG_ERRORS_BIN} --error=uninitialized_variable --debug --ui_type=readline
run
ni -a -d -t v/read
exit
EOF
cat << EOF | ${DEBUG_MNIST_BIN} --debug --max_steps=1 --fake_data --ui_type=readline
run -t 1
run --node_name_filter hidden --op_type_filter MatMul

View File

@ -748,7 +748,7 @@ class DebugDumpDir(object):
return sum(len(self._dump_tensor_data[device_name])
for device_name in self._dump_tensor_data)
def _load_partition_graphs(self, partition_graphs, validate):
def _load_partition_graphs(self, client_partition_graphs, validate):
"""Load and process partition graphs.
Load the graphs; parse the input and control input structure; obtain the
@ -757,8 +757,10 @@ class DebugDumpDir(object):
tensor dumps.
Args:
partition_graphs: A repeated field of GraphDefs representing the
partition graphs executed by the TensorFlow runtime.
client_partition_graphs: A repeated field of GraphDefs representing the
partition graphs executed by the TensorFlow runtime, from the Python
client. These partition graphs are used only if partition graphs
cannot be loaded from the dump directory on the file system.
validate: (`bool`) Whether the dump files are to be validated against the
partition graphs.
@ -769,24 +771,23 @@ class DebugDumpDir(object):
self._debug_graphs = {}
self._node_devices = {}
if partition_graphs:
partition_graphs_and_device_names = [
(partition_graph, None) for partition_graph in partition_graphs]
else:
partition_graphs_and_device_names = []
for device_name in self._device_names:
partition_graph = None
if device_name in self._dump_graph_file_paths:
partition_graph = _load_graph_def_from_event_file(
self._dump_graph_file_paths[device_name])
else:
partition_graph = self._find_partition_graph(partition_graphs,
device_name)
if partition_graph:
partition_graphs_and_device_names.append((partition_graph,
device_name))
else:
logging.warn("Failed to load partition graphs from disk.")
partition_graphs_and_device_names = []
for device_name in self._device_names:
partition_graph = None
if device_name in self._dump_graph_file_paths:
partition_graph = _load_graph_def_from_event_file(
self._dump_graph_file_paths[device_name])
else:
logging.warn(
"Failed to load partition graphs for device %s from disk. "
"As a fallback, the client graphs will be used. This "
"may cause mismatches in device names." % device_name)
partition_graph = self._find_partition_graph(client_partition_graphs,
device_name)
if partition_graph:
partition_graphs_and_device_names.append((partition_graph,
device_name))
for partition_graph, maybe_device_name in partition_graphs_and_device_names:
debug_graph = debug_graphs.DebugGraph(partition_graph,

View File

@ -1873,6 +1873,8 @@ PyObject* RecordGradient(PyObject* op_name, PyObject* inputs, PyObject* attrs,
delete backward_function;
});
Py_DECREF(num_inputs);
Py_RETURN_NONE;
}
@ -1931,8 +1933,10 @@ bool ReadVariableOp(const FastPathOpExecInfo& parent_op_exec_info,
Py_INCREF(output->get()); // stay alive after since tuple steals.
PyTuple_SET_ITEM(outputs.get(), 0, output->get());
if (!RecordGradient(GetPythonObjectFromString("ReadVariableOp"),
inputs.get(), Py_None, outputs.get(), Py_None)) {
tensorflow::Safe_PyObjectPtr op_string(
GetPythonObjectFromString("ReadVariableOp"));
if (!RecordGradient(op_string.get(), inputs.get(), Py_None, outputs.get(),
Py_None)) {
return false;
}
}

View File

@ -1242,11 +1242,11 @@ class TensorFlowTestCase(googletest.TestCase):
b,
rtol=rtol,
atol=atol,
msg="Mismatched value: a%s is different from b%s." % (path_str,
path_str))
msg=("Mismatched value: a%s is different from b%s. %s" %
(path_str, path_str, msg)))
except TypeError as e:
msg = "Error: a%s has %s, but b%s has %s" % (path_str, type(a),
path_str, type(b))
msg = ("Error: a%s has %s, but b%s has %s. %s" %
(path_str, type(a), path_str, type(b), msg))
e.args = ((e.args[0] + " : " + msg,) + e.args[1:])
raise

View File

@ -118,6 +118,7 @@ class LocallyConnectedLayersTest(test.TestCase):
},
input_shape=(num_samples, num_row, num_col, stack_size))
@tf_test_util.run_in_graph_and_eager_modes()
def test_locallyconnected_2d_channels_first(self):
num_samples = 8
filters = 3
@ -125,15 +126,14 @@ class LocallyConnectedLayersTest(test.TestCase):
num_row = 6
num_col = 10
with self.test_session():
testing_utils.layer_test(
keras.layers.LocallyConnected2D,
kwargs={
'filters': filters,
'kernel_size': 3,
'data_format': 'channels_first'
},
input_shape=(num_samples, num_row, num_col, stack_size))
testing_utils.layer_test(
keras.layers.LocallyConnected2D,
kwargs={
'filters': filters,
'kernel_size': 3,
'data_format': 'channels_first'
},
input_shape=(num_samples, num_row, num_col, stack_size))
def test_locallyconnected_2d_regularization(self):
num_samples = 8

View File

@ -241,6 +241,12 @@ class UnaryOpTest(test.TestCase):
math_ops.lgamma)
self._compareBoth(x, np.vectorize(math.erf), math_ops.erf)
self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc)
try:
from scipy import special # pylint: disable=g-import-not-at-top
self._compareBoth(x, special.i0e, math_ops.bessel_i0e)
self._compareBoth(x, special.i1e, math_ops.bessel_i1e)
except ImportError as e:
tf_logging.warn("Cannot test special functions: %s" % str(e))
self._compareBothSparse(x, np.abs, math_ops.abs)
self._compareBothSparse(x, np.negative, math_ops.negative)
@ -286,6 +292,12 @@ class UnaryOpTest(test.TestCase):
self._compareBoth(x, np.arcsin, math_ops.asin)
self._compareBoth(x, np.arccos, math_ops.acos)
self._compareBoth(x, np.arctan, math_ops.atan)
try:
from scipy import special # pylint: disable=g-import-not-at-top
self._compareBoth(x, special.i0e, math_ops.bessel_i0e)
self._compareBoth(x, special.i1e, math_ops.bessel_i1e)
except ImportError as e:
tf_logging.warn("Cannot test special functions: %s" % str(e))
self._compareBothSparse(x, np.abs, math_ops.abs)
self._compareBothSparse(x, np.negative, math_ops.negative)
@ -334,6 +346,12 @@ class UnaryOpTest(test.TestCase):
self._compareBoth(k, np.arcsin, math_ops.asin)
self._compareBoth(k, np.arccos, math_ops.acos)
self._compareBoth(k, np.tan, math_ops.tan)
try:
from scipy import special # pylint: disable=g-import-not-at-top
self._compareBoth(x, special.i0e, math_ops.bessel_i0e)
self._compareBoth(x, special.i1e, math_ops.bessel_i1e)
except ImportError as e:
tf_logging.warn("Cannot test special functions: %s" % str(e))
self._compareBothSparse(x, np.abs, math_ops.abs)
self._compareBothSparse(x, np.negative, math_ops.negative)
@ -370,6 +388,12 @@ class UnaryOpTest(test.TestCase):
math_ops.lgamma)
self._compareBoth(x, np.vectorize(math.erf), math_ops.erf)
self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc)
try:
from scipy import special # pylint: disable=g-import-not-at-top
self._compareBoth(x, special.i0e, math_ops.bessel_i0e)
self._compareBoth(x, special.i1e, math_ops.bessel_i1e)
except ImportError as e:
tf_logging.warn("Cannot test special functions: %s" % str(e))
self._compareBothSparse(x, np.abs, math_ops.abs)
self._compareBothSparse(x, np.negative, math_ops.negative)

View File

@ -939,7 +939,8 @@ class ResizeMethod(object):
def resize_images(images,
size,
method=ResizeMethod.BILINEAR,
align_corners=False):
align_corners=False,
preserve_aspect_ratio=False):
"""Resize `images` to `size` using the specified `method`.
Resized images will be distorted if their original aspect ratio is not
@ -971,6 +972,10 @@ def resize_images(images,
align_corners: bool. If True, the centers of the 4 corner pixels of the
input and output tensors are aligned, preserving the values at the
corner pixels. Defaults to `False`.
preserve_aspect_ratio: Whether to preserve the aspect ratio. If this is set,
then `images` will be resized to a size that fits in `size` while
preserving the aspect ratio of the original image. Scales up the image if
`size` is bigger than the current size of the `image`. Defaults to False.
Raises:
ValueError: if the shape of `images` is incompatible with the
@ -1009,6 +1014,28 @@ def resize_images(images,
new_height_const = size_const_as_shape[0].value
new_width_const = size_const_as_shape[1].value
if preserve_aspect_ratio:
# Get the current shapes of the image, even if dynamic.
_, current_height, current_width, _ = _ImageDimensions(images, rank=4)
# do the computation to find the right scale and height/width.
scale_factor_height = (math_ops.to_float(new_height_const) /
math_ops.to_float(current_height))
scale_factor_width = (math_ops.to_float(new_width_const) /
math_ops.to_float(current_width))
scale_factor = math_ops.minimum(scale_factor_height, scale_factor_width)
scaled_height_const = math_ops.to_int32(scale_factor *
math_ops.to_float(current_height))
scaled_width_const = math_ops.to_int32(scale_factor *
math_ops.to_float(current_width))
# NOTE: Reset the size and other constants used later.
size = ops.convert_to_tensor([scaled_height_const, scaled_width_const],
dtypes.int32, name='size')
size_const_as_shape = tensor_util.constant_value_as_shape(size)
new_height_const = size_const_as_shape[0].value
new_width_const = size_const_as_shape[1].value
# If we can determine that the height and width will be unmodified by this
# transformation, we avoid performing the resize.
if all(x is not None
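A small usage sketch of the preserve_aspect_ratio behavior implemented above (graph mode; the shapes are illustrative and the exact output width assumes the truncating int cast used here):
# Hedged sketch: a 100x50 image asked to fit within [75, 75] is scaled by the
# smaller factor (0.75), giving roughly a 75x37 result.
import tensorflow as tf

images = tf.zeros([1, 100, 50, 3], dtype=tf.float32)  # batch, height, width, channels
resized = tf.image.resize_images(images, [75, 75], preserve_aspect_ratio=True)

with tf.Session() as sess:
  print(sess.run(resized).shape)  # expected: (1, 75, 37, 3)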
@ -1469,6 +1496,75 @@ def adjust_hue(image, delta, name=None):
return convert_image_dtype(rgb_altered, orig_dtype)
# pylint: disable=invalid-name
@tf_export('image.random_jpeg_quality')
def random_jpeg_quality(image, min_jpeg_quality, max_jpeg_quality, seed=None):
"""Randomly changes jpeg encoding quality for inducing jpeg noise.
`min_jpeg_quality` must be in the interval `[0, 100]` and less than
`max_jpeg_quality`.
`max_jpeg_quality` must be in the interval `[0, 100]`.
Args:
image: RGB image or images. Size of the last dimension must be 3.
min_jpeg_quality: Minimum jpeg encoding quality to use.
max_jpeg_quality: Maximum jpeg encoding quality to use.
seed: An operation-specific seed. It will be used in conjunction
with the graph-level seed to determine the real seeds that will be
used in this operation. Please see the documentation of
set_random_seed for its interaction with the graph-level random seed.
Returns:
Adjusted image(s), same shape and DType as `image`.
Raises:
ValueError: if `min_jpeg_quality` or `max_jpeg_quality` is invalid.
"""
if (min_jpeg_quality < 0 or max_jpeg_quality < 0 or
min_jpeg_quality > 100 or max_jpeg_quality > 100):
raise ValueError('jpeg encoding range must be between 0 and 100.')
if min_jpeg_quality >= max_jpeg_quality:
raise ValueError('`min_jpeg_quality` must be less than `max_jpeg_quality`.')
np.random.seed(seed)
jpeg_quality = np.random.randint(min_jpeg_quality, max_jpeg_quality)
return adjust_jpeg_quality(image, jpeg_quality)
@tf_export('image.adjust_jpeg_quality')
def adjust_jpeg_quality(image, jpeg_quality, name=None):
"""Adjust jpeg encoding quality of an RGB image.
This is a convenience method that adjusts jpeg encoding quality of an
RGB image.
`image` is an RGB image. The image's encoding quality is adjusted
to `jpeg_quality`.
`jpeg_quality` must be in the interval `[0, 100]`.
Args:
image: RGB image or images. Size of the last dimension must be 3.
jpeg_quality: int. jpeg encoding quality.
name: A name for this operation (optional).
Returns:
Adjusted image(s), same shape and DType as `image`.
"""
with ops.name_scope(name, 'adjust_jpeg_quality', [image]) as name:
image = ops.convert_to_tensor(image, name='image')
# Remember the original dtype so we can convert back if needed.
orig_dtype = image.dtype
# Convert to uint8
image = convert_image_dtype(image, dtypes.uint8)
# Encode image to jpeg with given jpeg quality
image = gen_image_ops.encode_jpeg(image, quality=jpeg_quality)
# Decode jpeg image
image = gen_image_ops.decode_jpeg(image)
# Convert back to original dtype and return
return convert_image_dtype(image, orig_dtype)
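A quick usage sketch for the two new image ops (the input image here is a made-up placeholder): `adjust_jpeg_quality` round-trips the image through `encode_jpeg`/`decode_jpeg` at the requested quality, and `random_jpeg_quality` draws a quality from `[min_jpeg_quality, max_jpeg_quality)` and delegates to it.
import tensorflow as tf
image = tf.zeros([64, 64, 3], dtype=tf.uint8)  # placeholder RGB image
degraded = tf.image.adjust_jpeg_quality(image, jpeg_quality=25)
noisy = tf.image.random_jpeg_quality(
    image, min_jpeg_quality=20, max_jpeg_quality=80, seed=42)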
@tf_export('image.random_saturation')
def random_saturation(image, lower, upper, seed=None):
"""Adjust the saturation of an RGB image by a random factor.

View File

@ -2599,6 +2599,86 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
y = image_ops.resize_images(single_image, [55, 66])
self.assertTrue(y.op.name.startswith("resize_images"))
def _ResizeImageCall(self, x, max_h, max_w, preserve_aspect_ratio,
use_tensor_inputs):
if use_tensor_inputs:
target_max = ops.convert_to_tensor([max_h, max_w])
x_tensor = array_ops.placeholder(x.dtype, shape=[None] * x.ndim)
feed_dict = {x_tensor: x}
else:
target_max = [max_h, max_w]
x_tensor = x
feed_dict = {}
y = image_ops.resize_images(x_tensor, target_max,
preserve_aspect_ratio=preserve_aspect_ratio)
with self.test_session(use_gpu=True):
return y.eval(feed_dict=feed_dict)
def _assertResizeEqual(self, x, x_shape, y, y_shape,
preserve_aspect_ratio=True,
use_tensor_inputs_options=None):
use_tensor_inputs_options = use_tensor_inputs_options or [False, True]
target_height, target_width, _ = y_shape
x = np.array(x).reshape(x_shape)
y = np.array(y).reshape(y_shape)
for use_tensor_inputs in use_tensor_inputs_options:
y_tf = self._ResizeImageCall(x, target_height, target_width,
preserve_aspect_ratio, use_tensor_inputs)
self.assertAllClose(y, y_tf)
def _assertResizeCheckShape(self, x, x_shape, target_shape,
y_shape, preserve_aspect_ratio=True,
use_tensor_inputs_options=None):
use_tensor_inputs_options = use_tensor_inputs_options or [False, True]
target_height, target_width = target_shape
x = np.array(x).reshape(x_shape)
y = np.zeros(y_shape)
for use_tensor_inputs in use_tensor_inputs_options:
y_tf = self._ResizeImageCall(x, target_height, target_width,
preserve_aspect_ratio, use_tensor_inputs)
self.assertShapeEqual(y, ops.convert_to_tensor(y_tf))
def testPreserveAspectRatioMultipleImages(self):
x_shape = [10, 100, 100, 10]
x = np.random.uniform(size=x_shape)
self._assertResizeCheckShape(x, x_shape, [250, 250], [10, 250, 250, 10],
preserve_aspect_ratio=False)
def testPreserveAspectRatioNoOp(self):
x_shape = [10, 10, 10]
x = np.random.uniform(size=x_shape)
self._assertResizeEqual(x, x_shape, x, x_shape)
def testPreserveAspectRatioSmaller(self):
x_shape = [100, 100, 10]
x = np.random.uniform(size=x_shape)
self._assertResizeCheckShape(x, x_shape, [75, 50], [50, 50, 10])
def testPreserveAspectRatioSmallerMultipleImages(self):
x_shape = [10, 100, 100, 10]
x = np.random.uniform(size=x_shape)
self._assertResizeCheckShape(x, x_shape, [75, 50], [10, 50, 50, 10])
def testPreserveAspectRatioLarger(self):
x_shape = [100, 100, 10]
x = np.random.uniform(size=x_shape)
self._assertResizeCheckShape(x, x_shape, [150, 200], [150, 150, 10])
def testPreserveAspectRatioSameRatio(self):
x_shape = [1920, 1080, 3]
x = np.random.uniform(size=x_shape)
self._assertResizeCheckShape(x, x_shape, [3840, 2160], [3840, 2160, 3])
class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):

View File

@ -620,6 +620,35 @@ def _DigammaGrad(op, grad):
return grad * math_ops.polygamma(array_ops.constant(1, dtype=x.dtype), x)
@ops.RegisterGradient("BesselI0e")
def _BesselI0eGrad(op, grad):
"""Compute gradient of bessel_i0e(x) with respect to its argument."""
x = op.inputs[0]
y = op.outputs[0]
with ops.control_dependencies([grad]):
return grad * (math_ops.bessel_i1e(x) - math_ops.sign(x) * y)
@ops.RegisterGradient("BesselI1e")
def _BesselI1eGrad(op, grad):
"""Compute gradient of bessel_i1e(x) with respect to its argument."""
x = op.inputs[0]
y = op.outputs[0]
with ops.control_dependencies([grad]):
# For x = 0, the correct gradient is 0.5.
# However, the main branch gives NaN because of the division by x, so
# we impute the gradient manually.
# An alternative solution is to express the gradient via bessel_i0e and
# bessel_i2e, but the latter is not yet implemented in Eigen.
eps = np.finfo(x.dtype.as_numpy_dtype).eps
zeros = array_ops.zeros_like(x)
x_is_not_tiny = math_ops.abs(x) > eps
safe_x = array_ops.where(x_is_not_tiny, x, eps + zeros)
dy_dx = math_ops.bessel_i0e(safe_x) - y * (
math_ops.sign(safe_x) + math_ops.reciprocal(safe_x))
return grad * array_ops.where(x_is_not_tiny, dy_dx, 0.5 + zeros)
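As a sanity check on the two gradient registrations above (standard Bessel identities, not part of this change): writing $i_{0e}(x) = e^{-|x|} I_0(x)$ and $i_{1e}(x) = e^{-|x|} I_1(x)$, and using $I_0'(x) = I_1(x)$ and $I_1'(x) = I_0(x) - I_1(x)/x$,
$$\frac{d}{dx}\, i_{0e}(x) = i_{1e}(x) - \operatorname{sign}(x)\, i_{0e}(x), \qquad \frac{d}{dx}\, i_{1e}(x) = i_{0e}(x) - \Big(\operatorname{sign}(x) + \tfrac{1}{x}\Big)\, i_{1e}(x),$$
which matches the `dy_dx` expression; the singularity at $x = 0$ is removable with limit $1/2$, hence the imputed value of 0.5.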
@ops.RegisterGradient("Igamma")
def _IgammaGrad(op, grad):
"""Returns gradient of igamma(a, x) with respect to x."""

View File

@ -2954,6 +2954,67 @@ def polyval(coeffs, x, name=None):
p = c + p * x
return p
@tf_export("math.bessel_i0e")
def bessel_i0e(x, name=None):
"""Computes the Bessel i0e function of `x` element-wise.
Exponentially scaled modified Bessel function of order 0 defined as
`bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`.
This function is faster and numerically stabler than `bessel_i0(x)`.
Args:
x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
`float32`, `float64`.
name: A name for the operation (optional).
Returns:
A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
@compatibility(scipy)
Equivalent to scipy.special.i0e
@end_compatibility
"""
with ops.name_scope(name, "bessel_i0e", [x]) as name:
if isinstance(x, sparse_tensor.SparseTensor):
x_i0e = gen_math_ops.bessel_i0e(x.values, name=name)
return sparse_tensor.SparseTensor(
indices=x.indices, values=x_i0e, dense_shape=x.dense_shape)
else:
return gen_math_ops.bessel_i0e(x, name=name)
@tf_export("math.bessel_i1e")
def bessel_i1e(x, name=None):
"""Computes the Bessel i1e function of `x` element-wise.
Exponentially scaled modified Bessel function of order 1 defined as
`bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
This function is faster and numerically stabler than `bessel_i1(x)`.
Args:
x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
`float32`, `float64`.
name: A name for the operation (optional).
Returns:
A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
@compatibility(scipy)
Equivalent to scipy.special.i1e
@end_compatibility
"""
with ops.name_scope(name, "bessel_i1e", [x]) as name:
if isinstance(x, sparse_tensor.SparseTensor):
x_i1e = gen_math_ops.bessel_i1e(x.values, name=name)
return sparse_tensor.SparseTensor(
indices=x.indices, values=x_i1e, dense_shape=x.dense_shape)
else:
return gen_math_ops.bessel_i1e(x, name=name)
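Both wrappers accept either a dense `Tensor` or a `SparseTensor`; in the sparse case only `values` is transformed and the indices and dense shape are carried through unchanged. A small sketch with made-up inputs:
import tensorflow as tf
dense = tf.math.bessel_i1e(tf.constant([0.0, 1.0, 2.0]))
sparse = tf.SparseTensor(indices=[[0, 0], [1, 2]], values=[0.5, -1.5],
                         dense_shape=[2, 3])
sparse_i0e = tf.math.bessel_i0e(sparse)  # still a SparseTensor, same indices/shape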
# FFT ops were moved to tf.spectral. tf.fft symbols were part of the TensorFlow
# 1.0 API so we leave these here for backwards compatibility.
fft = gen_spectral_ops.fft

View File

@ -34,21 +34,54 @@ from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import weights_broadcast_ops
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import distribute as distribute_lib
from tensorflow.python.util.deprecation import deprecated
from tensorflow.python.util.tf_export import tf_export
def metric_variable(shape, dtype, validate_shape=True, name=None):
"""Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections."""
"""Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES)` collections.
return variable_scope.variable(
lambda: array_ops.zeros(shape, dtype),
trainable=False,
collections=[
ops.GraphKeys.LOCAL_VARIABLES, ops.GraphKeys.METRIC_VARIABLES
],
validate_shape=validate_shape,
name=name)
If running in a `DistributionStrategy` context, the variable will be
"tower local". This means:
* The returned object will be a container with separate variables
per replica/tower of the model.
* When writing to the variable, e.g. using `assign_add` in a metric
update, the update will be applied to the variable local to the
replica/tower.
* To get a metric's result value, we need to sum the variable values
across the replicas/towers before computing the final answer.
Furthermore, the final answer should be computed once instead of
in every replica/tower. Both of these are accomplished by
running the computation of the final result value inside
`tf.contrib.distribute.get_tower_context().merge_call(fn)`.
Inside the `merge_call()`, ops are only added to the graph once
and access to a tower-local variable in a computation returns
the sum across all replicas/towers.
Args:
shape: Shape of the created variable.
dtype: Type of the created variable.
validate_shape: (Optional) Whether shape validation is enabled for
the created variable.
name: (Optional) String name of the created variable.
Returns:
A (non-trainable) variable initialized to zero, or if inside a
`DistributionStrategy` scope a tower-local variable container.
"""
with distribute_lib.get_tower_context().tower_local_var_scope('sum'):
# Note that "tower local" implies trainable=False.
return variable_scope.variable(
lambda: array_ops.zeros(shape, dtype),
collections=[
ops.GraphKeys.LOCAL_VARIABLES, ops.GraphKeys.METRIC_VARIABLES
],
validate_shape=validate_shape,
name=name)
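Tying the docstring above to the `mean` change below, a hypothetical minimal metric built on `metric_variable` looks like the following sketch. It reuses this module's imports; `values` is assumed to be a float `Tensor` supplied by the caller, and the real `mean` metric below follows the same shape:
def _sketch_mean(values, metrics_collections=None):
  total = metric_variable([], dtypes.float32, name='total')
  count = metric_variable([], dtypes.float32, name='count')
  update_total_op = state_ops.assign_add(total, math_ops.reduce_sum(values))
  update_count_op = state_ops.assign_add(
      count, math_ops.to_float(array_ops.size(values)))
  def aggregate_across_towers(_, t, c):
    mean_t = _safe_div(t, c, 'value')
    if metrics_collections:
      ops.add_to_collections(metrics_collections, mean_t)
    return mean_t
  mean_t = distribute_lib.get_tower_context().merge_call(
      aggregate_across_towers, total, count)
  update_op = _safe_div(update_total_op, update_count_op, 'update_op')
  return mean_t, update_op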
def _remove_squeezable_dimensions(predictions, labels, weights):
@ -333,11 +366,15 @@ def mean(values,
with ops.control_dependencies([values]):
update_count_op = state_ops.assign_add(count, num_values)
mean_t = _safe_div(total, count, 'value')
update_op = _safe_div(update_total_op, update_count_op, 'update_op')
def aggregate_across_towers(_, t, c):
mean_t = _safe_div(t, c, 'value')
if metrics_collections:
ops.add_to_collections(metrics_collections, mean_t)
return mean_t
if metrics_collections:
ops.add_to_collections(metrics_collections, mean_t)
mean_t = distribute_lib.get_tower_context().merge_call(
aggregate_across_towers, total, count)
update_op = _safe_div(update_total_op, update_count_op, 'update_op')
if updates_collections:
ops.add_to_collections(updates_collections, update_op)
@ -572,6 +609,17 @@ def _confusion_matrix_at_thresholds(labels,
return values, update_ops
def _aggregate_variable(v, collections):
def f(distribution, value):
value = distribution.read_var(value)
if collections:
ops.add_to_collections(collections, value)
return value
return distribute_lib.get_tower_context().merge_call(f, v)
@tf_export('metrics.auc')
def auc(labels,
predictions,
@ -757,14 +805,18 @@ def auc(labels,
raise ValueError('Invalid summation_method: %s' % summation_method)
# sum up the areas of all the trapeziums
auc_value = compute_auc(values['tp'], values['fn'], values['tn'],
values['fp'], 'value')
def aggregate_auc(_, values):
auc_value = compute_auc(values['tp'], values['fn'], values['tn'],
values['fp'], 'value')
if metrics_collections:
ops.add_to_collections(metrics_collections, auc_value)
return auc_value
auc_value = distribute_lib.get_tower_context().merge_call(
aggregate_auc, values)
update_op = compute_auc(update_ops['tp'], update_ops['fn'],
update_ops['tn'], update_ops['fp'], 'update_op')
if metrics_collections:
ops.add_to_collections(metrics_collections, auc_value)
if updates_collections:
ops.add_to_collections(updates_collections, update_op)
@ -992,15 +1044,18 @@ def mean_per_class_accuracy(labels,
update_total_op = state_ops.scatter_add(total, labels, ones)
update_count_op = state_ops.scatter_add(count, labels, is_correct)
per_class_accuracy = _safe_div(count, total, None)
def aggregate_mean_accuracy(_, count, total):
per_class_accuracy = _safe_div(count, total, None)
mean_accuracy_v = math_ops.reduce_mean(
per_class_accuracy, name='mean_accuracy')
if metrics_collections:
ops.add_to_collections(metrics_collections, mean_accuracy_v)
return mean_accuracy_v
mean_accuracy_v = distribute_lib.get_tower_context().merge_call(
aggregate_mean_accuracy, count, total)
mean_accuracy_v = math_ops.reduce_mean(
per_class_accuracy, name='mean_accuracy')
update_op = _safe_div(update_count_op, update_total_op, name='update_op')
if metrics_collections:
ops.add_to_collections(metrics_collections, mean_accuracy_v)
if updates_collections:
ops.add_to_collections(updates_collections, update_op)
@ -1071,7 +1126,7 @@ def mean_iou(labels,
total_cm, update_op = _streaming_confusion_matrix(labels, predictions,
num_classes, weights)
def compute_mean_iou(name):
def compute_mean_iou(total_cm, name):
"""Compute the mean intersection-over-union via the confusion matrix."""
sum_over_row = math_ops.to_float(math_ops.reduce_sum(total_cm, 0))
sum_over_col = math_ops.to_float(math_ops.reduce_sum(total_cm, 1))
@ -1098,10 +1153,14 @@ def mean_iou(labels,
math_ops.reduce_sum(iou, name=name) / num_valid_entries, 0)
return result
mean_iou_v = compute_mean_iou('mean_iou')
def mean_iou_across_towers(_, v):
mean_iou_v = compute_mean_iou(v, 'mean_iou')
if metrics_collections:
ops.add_to_collections(metrics_collections, mean_iou_v)
return mean_iou_v
if metrics_collections:
ops.add_to_collections(metrics_collections, mean_iou_v)
mean_iou_v = distribute_lib.get_tower_context().merge_call(
mean_iou_across_towers, total_cm)
if updates_collections:
ops.add_to_collections(updates_collections, update_op)
@ -1310,12 +1369,16 @@ def mean_tensor(values,
with ops.control_dependencies([values]):
update_count_op = state_ops.assign_add(count, num_values)
mean_t = _safe_div(total, count, 'value')
def aggregate_across_towers(_, t, c):
mean_t = _safe_div(t, c, 'value')
if metrics_collections:
ops.add_to_collections(metrics_collections, mean_t)
return mean_t
mean_t = distribute_lib.get_tower_context().merge_call(
aggregate_across_towers, total, count)
update_op = _safe_div(update_total_op, update_count_op, 'update_op')
if metrics_collections:
ops.add_to_collections(metrics_collections, mean_t)
if updates_collections:
ops.add_to_collections(updates_collections, update_op)
@ -1413,12 +1476,9 @@ def _count_condition(values,
weights = math_ops.to_float(weights)
values = math_ops.multiply(values, weights)
value_tensor = array_ops.identity(count)
value_tensor = _aggregate_variable(count, metrics_collections)
update_op = state_ops.assign_add(count, math_ops.reduce_sum(values))
if metrics_collections:
ops.add_to_collections(metrics_collections, value_tensor)
if updates_collections:
ops.add_to_collections(updates_collections, update_op)
@ -1525,13 +1585,12 @@ def false_negatives_at_thresholds(labels,
values, update_ops = _confusion_matrix_at_thresholds(
labels, predictions, thresholds, weights=weights, includes=('fn',))
if metrics_collections:
ops.add_to_collections(metrics_collections, values['fn'])
fn_value = _aggregate_variable(values['fn'], metrics_collections)
if updates_collections:
ops.add_to_collections(updates_collections, update_ops['fn'])
return values['fn'], update_ops['fn']
return fn_value, update_ops['fn']
@tf_export('metrics.false_positives')
@ -1635,13 +1694,12 @@ def false_positives_at_thresholds(labels,
values, update_ops = _confusion_matrix_at_thresholds(
labels, predictions, thresholds, weights=weights, includes=('fp',))
if metrics_collections:
ops.add_to_collections(metrics_collections, values['fp'])
fp_value = _aggregate_variable(values['fp'], metrics_collections)
if updates_collections:
ops.add_to_collections(updates_collections, update_ops['fp'])
return values['fp'], update_ops['fp']
return fp_value, update_ops['fp']
@tf_export('metrics.true_negatives')
@ -1745,13 +1803,12 @@ def true_negatives_at_thresholds(labels,
values, update_ops = _confusion_matrix_at_thresholds(
labels, predictions, thresholds, weights=weights, includes=('tn',))
if metrics_collections:
ops.add_to_collections(metrics_collections, values['tn'])
tn_value = _aggregate_variable(values['tn'], metrics_collections)
if updates_collections:
ops.add_to_collections(updates_collections, update_ops['tn'])
return values['tn'], update_ops['tn']
return tn_value, update_ops['tn']
@tf_export('metrics.true_positives')
@ -1855,13 +1912,12 @@ def true_positives_at_thresholds(labels,
values, update_ops = _confusion_matrix_at_thresholds(
labels, predictions, thresholds, weights=weights, includes=('tp',))
if metrics_collections:
ops.add_to_collections(metrics_collections, values['tp'])
tp_value = _aggregate_variable(values['tp'], metrics_collections)
if updates_collections:
ops.add_to_collections(updates_collections, update_ops['tp'])
return values['tp'], update_ops['tp']
return tp_value, update_ops['tp']
@tf_export('metrics.precision')
@ -1945,13 +2001,17 @@ def precision(labels,
return array_ops.where(
math_ops.greater(tp + fp, 0), math_ops.div(tp, tp + fp), 0, name)
p = compute_precision(true_p, false_p, 'value')
def once_across_towers(_, true_p, false_p):
p = compute_precision(true_p, false_p, 'value')
if metrics_collections:
ops.add_to_collections(metrics_collections, p)
return p
p = distribute_lib.get_tower_context().merge_call(
once_across_towers, true_p, false_p)
update_op = compute_precision(true_positives_update_op,
false_positives_update_op, 'update_op')
if metrics_collections:
ops.add_to_collections(metrics_collections, p)
if updates_collections:
ops.add_to_collections(updates_collections, update_op)
@ -2025,13 +2085,17 @@ def precision_at_thresholds(labels,
def compute_precision(tp, fp, name):
return math_ops.div(tp, epsilon + tp + fp, name='precision_' + name)
prec = compute_precision(values['tp'], values['fp'], 'value')
def precision_across_towers(_, values):
prec = compute_precision(values['tp'], values['fp'], 'value')
if metrics_collections:
ops.add_to_collections(metrics_collections, prec)
return prec
prec = distribute_lib.get_tower_context().merge_call(
precision_across_towers, values)
update_op = compute_precision(update_ops['tp'], update_ops['fp'],
'update_op')
if metrics_collections:
ops.add_to_collections(metrics_collections, prec)
if updates_collections:
ops.add_to_collections(updates_collections, update_op)
@ -2050,7 +2114,7 @@ def recall(labels,
The `recall` function creates two local variables, `true_positives`
and `false_negatives`, that are used to compute the recall. This value is
ultimately returned as `recall`, an idempotent operation that simply divides
`true_positives` by the sum of `true_positives` and `false_negatives`.
`true_positives` by the sum of `true_positives` and `false_negatives`.
For estimation of the metric over a stream of data, the function creates an
`update_op` that updates these variables and returns the `recall`. `update_op`
@ -2117,13 +2181,17 @@ def recall(labels,
math_ops.greater(true_p + false_n, 0),
math_ops.div(true_p, true_p + false_n), 0, name)
rec = compute_recall(true_p, false_n, 'value')
def once_across_towers(_, true_p, false_n):
rec = compute_recall(true_p, false_n, 'value')
if metrics_collections:
ops.add_to_collections(metrics_collections, rec)
return rec
rec = distribute_lib.get_tower_context().merge_call(
once_across_towers, true_p, false_n)
update_op = compute_recall(true_positives_update_op,
false_negatives_update_op, 'update_op')
if metrics_collections:
ops.add_to_collections(metrics_collections, rec)
if updates_collections:
ops.add_to_collections(updates_collections, update_op)
@ -2552,11 +2620,17 @@ def recall_at_top_k(labels,
class_id=class_id,
weights=weights)
metric = math_ops.div(tp, math_ops.add(tp, fn), name=scope)
def aggregate_across_towers(_, tp, fn):
metric = math_ops.div(tp, math_ops.add(tp, fn), name=scope)
if metrics_collections:
ops.add_to_collections(metrics_collections, metric)
return metric
metric = distribute_lib.get_tower_context().merge_call(
aggregate_across_towers, tp, fn)
update = math_ops.div(
tp_update, math_ops.add(tp_update, fn_update), name='update')
if metrics_collections:
ops.add_to_collections(metrics_collections, metric)
if updates_collections:
ops.add_to_collections(updates_collections, update)
return metric, update
@ -2627,12 +2701,16 @@ def recall_at_thresholds(labels,
def compute_recall(tp, fn, name):
return math_ops.div(tp, epsilon + tp + fn, name='recall_' + name)
rec = compute_recall(values['tp'], values['fn'], 'value')
def recall_across_towers(_, values):
rec = compute_recall(values['tp'], values['fn'], 'value')
if metrics_collections:
ops.add_to_collections(metrics_collections, rec)
return rec
rec = distribute_lib.get_tower_context().merge_call(
recall_across_towers, values)
update_op = compute_recall(update_ops['tp'], update_ops['fn'], 'update_op')
if metrics_collections:
ops.add_to_collections(metrics_collections, rec)
if updates_collections:
ops.add_to_collections(updates_collections, update_op)
@ -2698,13 +2776,16 @@ def root_mean_squared_error(labels,
mse, update_mse_op = mean_squared_error(labels, predictions, weights, None,
None, name or
'root_mean_squared_error')
def once_across_towers(_, mse):
rmse = math_ops.sqrt(mse)
if metrics_collections:
ops.add_to_collections(metrics_collections, rmse)
return rmse
rmse = distribute_lib.get_tower_context().merge_call(
once_across_towers, mse)
rmse = math_ops.sqrt(mse)
update_rmse_op = math_ops.sqrt(update_mse_op)
if metrics_collections:
ops.add_to_collections(metrics_collections, rmse)
if updates_collections:
ops.add_to_collections(updates_collections, update_rmse_op)
@ -2797,15 +2878,19 @@ def sensitivity_at_specificity(labels,
return math_ops.div(tp[tf_index], tp[tf_index] + fn[tf_index] + kepsilon,
name)
sensitivity = compute_sensitivity_at_specificity(
values['tp'], values['tn'], values['fp'], values['fn'], 'value')
def aggregate_across_towers(_, values):
sensitivity = compute_sensitivity_at_specificity(
values['tp'], values['tn'], values['fp'], values['fn'], 'value')
if metrics_collections:
ops.add_to_collections(metrics_collections, sensitivity)
return sensitivity
sensitivity = distribute_lib.get_tower_context().merge_call(
aggregate_across_towers, values)
update_op = compute_sensitivity_at_specificity(
update_ops['tp'], update_ops['tn'], update_ops['fp'], update_ops['fn'],
'update_op')
if metrics_collections:
ops.add_to_collections(metrics_collections, sensitivity)
if updates_collections:
ops.add_to_collections(updates_collections, update_op)
@ -3070,11 +3155,16 @@ def _streaming_sparse_average_precision_at_top_k(labels,
total_update = state_ops.assign_add(total_var, batch_total, name='update')
# Divide total by max to get mean, for both vars and the update ops.
mean_average_precision = _safe_scalar_div(total_var, max_var, name='mean')
update = _safe_scalar_div(total_update, max_update, name=scope)
def aggregate_across_towers(_, total_var, max_var):
mean_average_precision = _safe_scalar_div(total_var, max_var, name='mean')
if metrics_collections:
ops.add_to_collections(metrics_collections, mean_average_precision)
return mean_average_precision
if metrics_collections:
ops.add_to_collections(metrics_collections, mean_average_precision)
mean_average_precision = distribute_lib.get_tower_context().merge_call(
aggregate_across_towers, total_var, max_var)
update = _safe_scalar_div(total_update, max_update, name=scope)
if updates_collections:
ops.add_to_collections(updates_collections, update)
@ -3351,11 +3441,17 @@ def precision_at_top_k(labels,
class_id=class_id,
weights=weights)
metric = math_ops.div(tp, math_ops.add(tp, fp), name=scope)
def aggregate_across_towers(_, tp, fp):
metric = math_ops.div(tp, math_ops.add(tp, fp), name=scope)
if metrics_collections:
ops.add_to_collections(metrics_collections, metric)
return metric
metric = distribute_lib.get_tower_context().merge_call(
aggregate_across_towers, tp, fp)
update = math_ops.div(
tp_update, math_ops.add(tp_update, fp_update), name='update')
if metrics_collections:
ops.add_to_collections(metrics_collections, metric)
if updates_collections:
ops.add_to_collections(updates_collections, update)
return metric, update
@ -3583,15 +3679,19 @@ def specificity_at_sensitivity(labels,
return math_ops.div(tn[tf_index], tn[tf_index] + fp[tf_index] + kepsilon,
name)
specificity = compute_specificity_at_sensitivity(
values['tp'], values['tn'], values['fp'], values['fn'], 'value')
def aggregate_across_towers(_, values):
specificity = compute_specificity_at_sensitivity(
values['tp'], values['tn'], values['fp'], values['fn'], 'value')
if metrics_collections:
ops.add_to_collections(metrics_collections, specificity)
return specificity
specificity = distribute_lib.get_tower_context().merge_call(
aggregate_across_towers, values)
update_op = compute_specificity_at_sensitivity(
update_ops['tp'], update_ops['tn'], update_ops['fp'], update_ops['fn'],
'update_op')
if metrics_collections:
ops.add_to_collections(metrics_collections, specificity)
if updates_collections:
ops.add_to_collections(updates_collections, update_op)

View File

@ -82,6 +82,54 @@ def lbeta(x, name='lbeta'):
return result
@tf_export('math.bessel_i0')
def bessel_i0(x, name='bessel_i0'):
"""Computes the Bessel i0 function of `x` element-wise.
Modified Bessel function of order 0.
It is preferable to use the numerically stabler function `i0e(x)` instead.
Args:
x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
`float32`, `float64`.
name: A name for the operation (optional).
Returns:
A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
@compatibility(scipy)
Equivalent to scipy.special.i0
@end_compatibility
"""
with ops.name_scope(name, [x]):
return math_ops.exp(math_ops.abs(x)) * math_ops.bessel_i0e(x)
@tf_export('math.bessel_i1')
def bessel_i1(x, name='bessel_i1'):
"""Computes the Bessel i1 function of `x` element-wise.
Modified Bessel function of order 1.
It is preferable to use the numerically stabler function `i1e(x)` instead.
Args:
x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
`float32`, `float64`.
name: A name for the operation (optional).
Returns:
A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
@compatibility(scipy)
Equivalent to scipy.special.i1
@end_compatibility
"""
with ops.name_scope(name, [x]):
return math_ops.exp(math_ops.abs(x)) * math_ops.bessel_i1e(x)
@tf_export('einsum', 'linalg.einsum')
def einsum(equation, *inputs, **kwargs):
"""A generalized contraction between tensors of arbitrary dimension.

View File

@ -29,6 +29,7 @@ from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import special_math_ops
from tensorflow.python.platform import test
from tensorflow.python.platform import tf_logging
class LBetaTest(test.TestCase):
@ -150,6 +151,33 @@ class LBetaTest(test.TestCase):
self.assertEqual(expected_result.get_shape(), lbeta_x.get_shape())
class BesselTest(test.TestCase):
def test_bessel_i0(self):
x_single = np.arange(-3, 3).reshape(1, 3, 2).astype(np.float32)
x_double = np.arange(-3, 3).reshape(1, 3, 2).astype(np.float64)
try:
from scipy import special # pylint: disable=g-import-not-at-top
self.assertAllClose(special.i0(x_single),
self.evaluate(special_math_ops.bessel_i0(x_single)))
self.assertAllClose(special.i0(x_double),
self.evaluate(special_math_ops.bessel_i0(x_double)))
except ImportError as e:
tf_logging.warn('Cannot test special functions: %s' % str(e))
def test_bessel_i1(self):
x_single = np.arange(-3, 3).reshape(1, 3, 2).astype(np.float32)
x_double = np.arange(-3, 3).reshape(1, 3, 2).astype(np.float64)
try:
from scipy import special # pylint: disable=g-import-not-at-top
self.assertAllClose(special.i1(x_single),
self.evaluate(special_math_ops.bessel_i1(x_single)))
self.assertAllClose(special.i1(x_double),
self.evaluate(special_math_ops.bessel_i1(x_double)))
except ImportError as e:
tf_logging.warn('Cannot test special functions: %s' % str(e))
class EinsumTest(test.TestCase):
simple_cases = [

View File

@ -79,12 +79,14 @@ def _parse_saved_model(export_dir):
constants.SAVED_MODEL_FILENAME_PB))
def _get_asset_tensors(export_dir, meta_graph_def_to_load):
def _get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None):
"""Gets the asset tensors, if defined in the meta graph def to load.
Args:
export_dir: Directory where the SavedModel is located.
meta_graph_def_to_load: The meta graph def from the SavedModel to be loaded.
import_scope: Optional `string` -- if specified, prepend this followed by
'/' to all returned asset tensor names.
Returns:
A dictionary of asset tensors, keyed by the name of the asset tensor. The
@ -104,7 +106,10 @@ def _get_asset_tensors(export_dir, meta_graph_def_to_load):
for asset_any_proto in assets_any_proto:
asset_proto = meta_graph_pb2.AssetFileDef()
asset_any_proto.Unpack(asset_proto)
asset_tensor_dict[asset_proto.tensor_info.name] = os.path.join(
tensor_name = asset_proto.tensor_info.name
if import_scope:
tensor_name = "%s/%s" % (import_scope, tensor_name)
asset_tensor_dict[tensor_name] = os.path.join(
compat.as_bytes(assets_directory),
compat.as_bytes(asset_proto.filename))
return asset_tensor_dict
@ -179,7 +184,7 @@ def maybe_saved_model_directory(export_dir):
@tf_export("saved_model.loader.load")
def load(sess, tags, export_dir, **saver_kwargs):
def load(sess, tags, export_dir, import_scope=None, **saver_kwargs):
"""Loads the model from a SavedModel as specified by tags.
Args:
@ -189,6 +194,10 @@ def load(sess, tags, export_dir, **saver_kwargs):
SavedModel `save()` API.
export_dir: Directory in which the SavedModel protocol buffer and variables
to be loaded are located.
import_scope: Optional `string` -- if specified, prepend this string
followed by '/' to all loaded tensor names. This scope is applied to
tensor instances loaded into the passed session, but it is *not* written
through to the static `MetaGraphDef` protocol buffer that is returned.
**saver_kwargs: Optional keyword arguments passed through to Saver.
Returns:
@ -216,7 +225,8 @@ def load(sess, tags, export_dir, **saver_kwargs):
)
# Build a saver by importing the meta graph def to load.
saver = tf_saver.import_meta_graph(meta_graph_def_to_load, **saver_kwargs)
saver = tf_saver.import_meta_graph(
meta_graph_def_to_load, import_scope=import_scope, **saver_kwargs)
if saver:
# Build the checkpoint path where the variables are located.
@ -232,8 +242,8 @@ def load(sess, tags, export_dir, **saver_kwargs):
"checkpoints were restored.")
# Get asset tensors, if any.
asset_tensors_dictionary = _get_asset_tensors(export_dir,
meta_graph_def_to_load)
asset_tensors_dictionary = _get_asset_tensors(
export_dir, meta_graph_def_to_load, import_scope=import_scope)
main_op_tensor = (
_get_main_op_tensor(meta_graph_def_to_load) or

View File

@ -1197,6 +1197,59 @@ class SavedModelTest(test.TestCase):
_validate_custom_saver("tag_1", "save_1/restore_all")
_validate_custom_saver("tag_2", "save_2/restore_all")
def testImportScope(self):
export_dir = self._get_export_dir("test_scoped_assets")
builder = saved_model_builder.SavedModelBuilder(export_dir)
# Build a SavedModel with a variable, an asset, and a constant tensor.
with self.test_session(graph=ops.Graph()) as sess:
self._init_and_validate_variable(sess, "v", 42)
asset_collection = self._build_asset_collection("foo.txt", "content_foo",
"asset_file_tensor")
constant_op.constant("constant value", name="constant_tensor_name")
builder.add_meta_graph_and_variables(
sess, ["tag_name"], assets_collection=asset_collection)
# Save the asset file path for later comparison.
asset_file_path = asset_collection[0].eval()
# Save the SavedModel to disk.
builder.save()
with self.test_session(graph=ops.Graph()) as sess:
# Restore the SavedModel under an import_scope in a new graph/session.
graph_proto = loader.load(
sess, ["tag_name"], export_dir, import_scope="scope_name")
# The loaded variable tensor should be scoped, but its contents should be
# unchanged.
self.assertEqual(
"scope_name/v:0",
ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].name)
self.assertEqual(
42,
ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
# The loaded asset tensor should be scoped, but the asset file path and
# contents should be unchanged.
asset_collection = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
self.assertEqual(1, len(asset_collection))
self.assertEqual(asset_file_path, asset_collection[0].eval())
self.assertEqual("scope_name/asset_file_tensor:0",
asset_collection[0].name)
# The static asset data inside graph_proto.collection_def should not be
# scoped.
self._validate_asset_collection(export_dir, graph_proto.collection_def,
"foo.txt", "content_foo",
"asset_file_tensor:0")
# The constant tensor should be scoped, but its contents should be
# unchanged.
self.assertEqual(
compat.as_bytes("constant value"),
ops.get_default_graph().get_tensor_by_name(
"scope_name/constant_tensor_name:0").eval())
def testClearDevices(self):
export_dir = self._get_export_dir("test_clear_devices")
builder = saved_model_builder.SavedModelBuilder(export_dir)

View File

@ -528,6 +528,8 @@ class DistributionStrategy(object):
* `d.update_non_slot(d.non_slot_devices(), fn)`: in cross-tower
context, like `d.update()` except with locality N.
* `d.fetch(t)`: Copy `t` with any locality to the client's CPU device.
TODO(josh11b): Deprecate `fetch`, switch to `read_var` for
reading tower-local variables.
The standard pattern for updating variables is to:
@ -614,8 +616,8 @@ class DistributionStrategy(object):
There will still be one component variable per tower, but there is
no requirement that they stay in sync. Instead, when saving them
or calling `fetch()`, we use the value that results when calling
`reduce()` on all the towers' variables.
or calling `fetch()/read_var()`, we use the value that
results when calling `reduce()` on all the towers' variables.
Note: tower-local implies not trainable. Instead, it is expected
that each tower will directly update (using `assign_add()` or
@ -646,6 +648,21 @@ class DistributionStrategy(object):
_require_distribution_strategy_scope(self)
return variable_scope.variable_creator_scope(create_tower_local_variable)
def read_var(self, v):
"""Reads the value of a variable.
Returns the aggregate value of a tower-local variable, or the
(read-only) value of any other variable.
Args:
v: A variable allocated within the scope of this `DistributionStrategy`.
Returns:
A tensor representing the value of `v`, aggregated across towers if
necessary.
"""
raise NotImplementedError("must be implemented in descendants")
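A minimal sketch of the intended call pattern, mirroring `_aggregate_variable` in metrics_impl.py above (the tower-local variable `v` is hypothetical, and the sketch assumes this module's `get_tower_context()`): inside a `merge_call` the strategy object arrives as the first argument, and `read_var` yields the cross-tower sum for a tower-local variable.
def _read(distribution, v):
  return distribution.read_var(v)
value = get_tower_context().merge_call(_read, v)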
def colocate_vars_with(self, colocate_with_variable):
"""Scope that controls which devices variables will be created on.
@ -904,6 +921,8 @@ class DistributionStrategy(object):
will attempt to avoid a copy by checking if the value is already
on the destination device.
TODO(josh11b): Switch to `read_var`.
Args:
val: Value (which may be mirrored) to copy.
destination: A device string to copy the value to.
@ -1197,6 +1216,9 @@ class _DefaultDistributionStrategy(DistributionStrategy):
with ops.colocate_with(colocate_with), UpdateContext(colocate_with):
return fn(*args, **kwargs)
def read_var(self, tower_local_var):
return array_ops.identity(tower_local_var)
def _fetch(self, var, destination, fn):
with ops.colocate_with(var):
var = fn(var)

View File

@ -1970,7 +1970,7 @@ def import_meta_graph(meta_graph_or_file, clear_devices=False,
return Saver(saver_def=meta_graph_def.saver_def, name=scope)
else:
if variables._all_saveable_objects(): # pylint: disable=protected-access
if variables._all_saveable_objects(scope=import_scope): # pylint: disable=protected-access
# Return the default saver instance for all graph variables.
return Saver()
else:

View File

@ -2339,6 +2339,46 @@ class MetaGraphTest(test.TestCase):
10, size=[1, 10])
})
def testImportIntoNamescopeWithoutVariables(self):
# Save a simple graph that contains no variables into a checkpoint.
test_dir = self._get_test_dir("no_vars_graph")
filename = os.path.join(test_dir, "ckpt")
graph_1 = ops_lib.Graph()
with session.Session(graph=graph_1) as sess:
constant_op.constant([1, 2, 3], name="x")
constant_op.constant([1, 2, 3], name="y")
saver = saver_module.Saver(allow_empty=True)
saver.save(sess, filename)
# Create a fresh graph.
graph_2 = ops_lib.Graph()
with session.Session(graph=graph_2) as sess:
# Restore the above checkpoint under scope "subgraph_1".
new_saver_1 = saver_module.import_meta_graph(
filename + ".meta", graph=graph_2, import_scope="subgraph_1")
# There are no variables to restore, so import_meta_graph should not
# return a Saver.
self.assertIsNone(new_saver_1)
# Create a variable in graph_2 under scope "my_scope".
variables.Variable(array_ops.zeros([10]), name="my_scope/my_var")
sess.run(variables.global_variables_initializer())
# Restore the checkpoint into a different scope "subgraph_2".
new_saver_2 = saver_module.import_meta_graph(
filename + ".meta", graph=graph_2, import_scope="subgraph_2")
# Because the variable does not live in scope "subgraph_2",
# import_meta_graph should not attempt to restore the variable. So,
# import_meta_graph still won't return a Saver instance.
self.assertIsNone(new_saver_2)
# However, if we restore the checkpoint under scope "my_scope",
# import_meta_graph will detect the variable and return a Saver for
# restoring it. This should happen even when the variable does not
# originate from graph_1.
new_saver_3 = saver_module.import_meta_graph(
filename + ".meta", graph=graph_2, import_scope="my_scope")
self.assertIsInstance(new_saver_3, saver_module.Saver)
def testImportIntoImplicitNamescope(self):
# Test that we can import a meta graph into an implicit namescope.
test_dir = self._get_test_dir("import_into_namescope")

View File

@ -24,17 +24,12 @@ limitations under the License.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef __APPLE__
#include <IOKit/kext/KextManager.h>
#include <mach-o/dyld.h>
#else
#if !defined(PLATFORM_WINDOWS)
#include <link.h>
#include <sys/sysmacros.h>
#include <unistd.h>
#endif
#include <sys/stat.h>
#endif
#include <algorithm>
#include <memory>
#include <vector>
@ -54,9 +49,7 @@ limitations under the License.
namespace stream_executor {
namespace cuda {
#ifdef __APPLE__
static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
#elif !defined(PLATFORM_WINDOWS)
#if !defined(PLATFORM_WINDOWS)
static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
#endif
@ -121,26 +114,7 @@ string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
}
void Diagnostician::LogDiagnosticInformation() {
#ifdef __APPLE__
CFStringRef kext_ids[1];
kext_ids[0] = kDriverKextIdentifier;
CFArrayRef kext_id_query = CFArrayCreate(nullptr, (const void**)kext_ids, 1, &kCFTypeArrayCallBacks);
CFDictionaryRef kext_infos = KextManagerCopyLoadedKextInfo(kext_id_query, nullptr);
CFRelease(kext_id_query);
CFDictionaryRef cuda_driver_info = nullptr;
if (CFDictionaryGetValueIfPresent(kext_infos, kDriverKextIdentifier, (const void**)&cuda_driver_info)) {
bool started = CFBooleanGetValue((CFBooleanRef)CFDictionaryGetValue(cuda_driver_info, CFSTR("OSBundleStarted")));
if (!started) {
LOG(INFO) << "kernel driver is installed, but does not appear to be running on this host "
<< "(" << port::Hostname() << ")";
}
} else {
LOG(INFO) << "kernel driver does not appear to be installed on this host "
<< "(" << port::Hostname() << ")";
}
CFRelease(kext_infos);
#elif !defined(PLATFORM_WINDOWS)
#if !defined(PLATFORM_WINDOWS)
if (access(kDriverVersionPath, F_OK) != 0) {
LOG(INFO) << "kernel driver does not appear to be running on this host "
<< "(" << port::Hostname() << "): "
@ -194,8 +168,7 @@ void Diagnostician::LogDiagnosticInformation() {
<< DriverVersionStatusToString(kernel_version);
#endif
// OS X kernel driver does not report version accurately
#if !defined(__APPLE__) && !defined(PLATFORM_WINDOWS)
#if !defined(PLATFORM_WINDOWS)
if (kernel_version.ok() && dso_version.ok()) {
WarnOnDsoKernelMismatch(dso_version, kernel_version);
}
@ -209,29 +182,6 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
port::error::NOT_FOUND,
"was unable to find libcuda.so DSO loaded into this program"));
#if defined(__APPLE__)
// OSX CUDA libraries have names like: libcuda_310.41.15_mercury.dylib
const string prefix("libcuda_");
const string suffix("_mercury.dylib");
for (uint32_t image_index = 0; image_index < _dyld_image_count(); ++image_index) {
const string path(_dyld_get_image_name(image_index));
const size_t suffix_pos = path.rfind(suffix);
const size_t prefix_pos = path.rfind(prefix, suffix_pos);
if (prefix_pos == string::npos ||
suffix_pos == string::npos) {
// no match
continue;
}
const size_t start = prefix_pos + prefix.size();
if (start >= suffix_pos) {
// version not included
continue;
}
const size_t length = suffix_pos - start;
const string version = path.substr(start, length);
result = StringToDriverVersion(version);
}
#else
#if !defined(PLATFORM_WINDOWS) && !defined(ANDROID_TEGRA)
// Callback used when iterating through DSOs. Looks for the driver-interfacing
// DSO and yields its version number into the callback data, when found.
@ -264,7 +214,6 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
};
dl_iterate_phdr(iterate_phdr, &result);
#endif
#endif
return result;
@ -310,38 +259,7 @@ void Diagnostician::WarnOnDsoKernelMismatch(
port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
#if defined(__APPLE__)
CFStringRef kext_ids[1];
kext_ids[0] = kDriverKextIdentifier;
CFArrayRef kext_id_query = CFArrayCreate(nullptr, (const void**)kext_ids, 1, &kCFTypeArrayCallBacks);
CFDictionaryRef kext_infos = KextManagerCopyLoadedKextInfo(kext_id_query, nullptr);
CFRelease(kext_id_query);
CFDictionaryRef cuda_driver_info = nullptr;
if (CFDictionaryGetValueIfPresent(kext_infos, kDriverKextIdentifier, (const void**)&cuda_driver_info)) {
// NOTE: OSX CUDA driver does not currently store the same driver version
// in kCFBundleVersionKey as is returned by cuDriverGetVersion
CFRelease(kext_infos);
const CFStringRef str = (CFStringRef)CFDictionaryGetValue(
cuda_driver_info, kCFBundleVersionKey);
const char *version = CFStringGetCStringPtr(str, kCFStringEncodingUTF8);
// version can be NULL in which case treat it as empty string
// see
// https://developer.apple.com/library/mac/documentation/CoreFoundation/Conceptual/CFStrings/Articles/AccessingContents.html#//apple_ref/doc/uid/20001184-100980-TPXREF112
if (version == NULL) {
return StringToDriverVersion("");
}
return StringToDriverVersion(version);
}
CFRelease(kext_infos);
auto status = port::Status(
port::error::INTERNAL,
port::StrCat(
"failed to read driver bundle version: ",
CFStringGetCStringPtr(kDriverKextIdentifier, kCFStringEncodingUTF8)));
return status;
#elif defined(PLATFORM_WINDOWS)
#if defined(PLATFORM_WINDOWS)
auto status =
port::Status(port::error::UNIMPLEMENTED,
"kernel reported driver version not implemented on Windows");

View File

@ -495,10 +495,10 @@ PersistentRnnPlan CreatePersistentRnnPlan(cudnnRNNDescriptor_t rnn_desc,
// Turns a BatchDescriptor structure into a cudnn tensor handle within a
// scope.
class ScopedTensorDescriptor {
class CudnnTensorDescriptor {
public:
ScopedTensorDescriptor(const dnn::BatchDescriptor& batch_descriptor,
cudnnDataType_t elem_type)
CudnnTensorDescriptor(const dnn::BatchDescriptor& batch_descriptor,
cudnnDataType_t elem_type)
: handle_(CreateTensorDescriptor()) {
switch (batch_descriptor.layout()) {
case dnn::DataLayout::kBatchYXDepth:
@ -540,15 +540,15 @@ class ScopedTensorDescriptor {
private:
TensorDescriptor handle_;
SE_DISALLOW_COPY_AND_ASSIGN(ScopedTensorDescriptor);
SE_DISALLOW_COPY_AND_ASSIGN(CudnnTensorDescriptor);
};
// Turns a FilterDescriptor structure into a cudnn filter handle within a
// scope.
class ScopedFilterDescriptor {
class CudnnFilterDescriptor {
public:
ScopedFilterDescriptor(const dnn::FilterDescriptor& filter_descriptor,
cudnnDataType_t elem_type)
CudnnFilterDescriptor(const dnn::FilterDescriptor& filter_descriptor,
cudnnDataType_t elem_type)
: handle_(CreateFilterDescriptor()) {
// TODO(b/23032134): Even if the filter layout is not supported,
// cudnnSetFilter4DDescriptor_v4 will return CUDNN_STATUS_SUCCESS because
@ -586,7 +586,7 @@ class ScopedFilterDescriptor {
private:
FilterDescriptor handle_; // Owned.
SE_DISALLOW_COPY_AND_ASSIGN(ScopedFilterDescriptor);
SE_DISALLOW_COPY_AND_ASSIGN(CudnnFilterDescriptor);
};
// A helper function to decide whether to enable the TENSOR_OP_MATH math type
@ -636,9 +636,9 @@ bool BatchnormSpatialPersistentEnabled() {
// Turns a ConvolutionDescriptor structure into a cudnn convolution handle
// within a scope.
class ScopedConvolutionDescriptor {
class CudnnConvolutionDescriptor {
public:
ScopedConvolutionDescriptor(
CudnnConvolutionDescriptor(
const dnn::ConvolutionDescriptor& convolution_descriptor,
cudnnDataType_t data_type)
: handle_(CreateConvolutionDescriptor()) {
@ -700,14 +700,14 @@ class ScopedConvolutionDescriptor {
private:
ConvolutionDescriptor handle_; // Owned.
SE_DISALLOW_COPY_AND_ASSIGN(ScopedConvolutionDescriptor);
SE_DISALLOW_COPY_AND_ASSIGN(CudnnConvolutionDescriptor);
};
// Turns a PoolingDescriptor structure into a cudnn pooling descriptor handle
// within a scope.
class ScopedPoolingDescriptor {
class CudnnPoolingDescriptor {
public:
explicit ScopedPoolingDescriptor(
explicit CudnnPoolingDescriptor(
const dnn::PoolingDescriptor& pooling_descriptor)
: handle_(CreatePoolingDescriptor()) {
const std::vector<int64> strides64 = pooling_descriptor.strides();
@ -739,13 +739,13 @@ class ScopedPoolingDescriptor {
private:
PoolingDescriptor handle_; // Owned.
SE_DISALLOW_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
SE_DISALLOW_COPY_AND_ASSIGN(CudnnPoolingDescriptor);
};
// Turns a NormalizeDescriptor structure into a cudnn LRN descriptor handle.
class ScopedNormalizeDescriptor {
class CudnnNormalizeDescriptor {
public:
explicit ScopedNormalizeDescriptor(
explicit CudnnNormalizeDescriptor(
const dnn::NormalizeDescriptor& normalize_descriptor)
: handle_(CreateLrnDescriptor()) {
// The range specifies that the indices in the closed range
@ -777,16 +777,16 @@ class ScopedNormalizeDescriptor {
private:
LrnDescriptor handle_; // Owned.
SE_DISALLOW_COPY_AND_ASSIGN(ScopedNormalizeDescriptor);
SE_DISALLOW_COPY_AND_ASSIGN(CudnnNormalizeDescriptor);
};
// Turns a ActivationDescriptor structure into a cudnn activation
// descriptor handle within a scope.
class ScopedActivationDescriptor {
class CudnnActivationDescriptor {
public:
ScopedActivationDescriptor(dnn::ActivationMode activation_mode,
cudnnNanPropagation_t nan_propagation,
double value_max)
CudnnActivationDescriptor(dnn::ActivationMode activation_mode,
cudnnNanPropagation_t nan_propagation,
double value_max)
: handle_(CreateActivationDescriptor()) {
double relu_ceiling = 0.0;
cudnnActivationMode_t mode;
@ -822,7 +822,7 @@ class ScopedActivationDescriptor {
private:
ActivationDescriptor handle_; // Owned.
SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivationDescriptor);
SE_DISALLOW_COPY_AND_ASSIGN(CudnnActivationDescriptor);
};
cudnnDataType_t ToCudnnDataType(
@ -888,21 +888,21 @@ int CudnnDataTypeToByteSize(cudnnDataType_t data_type) {
}
}
class ScopedDropoutDescriptor {
explicit ScopedDropoutDescriptor(DropoutDescriptor handle)
class CudnnDropoutDescriptor {
explicit CudnnDropoutDescriptor(DropoutDescriptor handle)
: handle_(std::move(handle)) {}
public:
ScopedDropoutDescriptor(ScopedDropoutDescriptor&&) = default;
CudnnDropoutDescriptor(CudnnDropoutDescriptor&&) = default;
static port::StatusOr<ScopedDropoutDescriptor> Create(
static port::StatusOr<CudnnDropoutDescriptor> Create(
const CudnnHandle& cudnn, float dropout, uint64 seed,
ScratchAllocator* state_allocator) {
DropoutDescriptor handle = CreateDropoutDescriptor();
if (dropout == 0.0f) {
// Return 'empty' dropout descriptor.
return ScopedDropoutDescriptor(std::move(handle));
return CudnnDropoutDescriptor(std::move(handle));
}
DeviceMemory<uint8> state_memory;
@ -917,14 +917,14 @@ class ScopedDropoutDescriptor {
handle.get(), cudnn.handle(), dropout, state_memory.opaque(),
state_memory.size(), seed));
return ScopedDropoutDescriptor(std::move(handle));
return CudnnDropoutDescriptor(std::move(handle));
}
cudnnDropoutDescriptor_t handle() const { return handle_.get(); }
private:
DropoutDescriptor handle_; // Owned.
SE_DISALLOW_COPY_AND_ASSIGN(ScopedDropoutDescriptor);
SE_DISALLOW_COPY_AND_ASSIGN(CudnnDropoutDescriptor);
};
class CudnnRnnParamsDescriptor {
@ -973,7 +973,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
cudnnRNNMode_t rnn_mode, cudnnDataType_t data_type,
cudnnDataType_t compute_type,
const dnn::AlgorithmConfig& algorithm_config,
ScopedDropoutDescriptor dropout_desc,
CudnnDropoutDescriptor dropout_desc,
CudnnRnnParamsDescriptor params_desc)
: rnn_desc_(std::move(rnn_desc)),
rnn_plan_(std::move(rnn_plan)),
@ -1002,8 +1002,8 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed,
ScratchAllocator* state_allocator) {
SE_ASSIGN_OR_RETURN(
ScopedDropoutDescriptor dropout_desc,
ScopedDropoutDescriptor::Create(cudnn, dropout, seed, state_allocator));
CudnnDropoutDescriptor dropout_desc,
CudnnDropoutDescriptor::Create(cudnn, dropout, seed, state_allocator));
cuda::RnnDescriptor rnn_desc = CreateRnnDescriptor();
cudnnRNNAlgo_t rnn_algo = ToCudnnRNNAlgo(algorithm_config.algorithm());
@ -1097,7 +1097,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
cudnnDataType_t data_type_;
cudnnDataType_t compute_type_;
dnn::AlgorithmConfig algorithm_config_;
ScopedDropoutDescriptor dropout_desc_;
CudnnDropoutDescriptor dropout_desc_;
CudnnRnnParamsDescriptor params_desc_;
SE_DISALLOW_COPY_AND_ASSIGN(CudnnRnnDescriptor);
};
@ -1926,10 +1926,9 @@ namespace {
// and backward filter.
port::StatusOr<cudnnConvolutionFwdAlgo_t> GetCudnnConvolutionForwardAlgo(
const CudnnHandle& cudnn, const ScopedTensorDescriptor& input_nd,
const ScopedFilterDescriptor& filter,
const ScopedConvolutionDescriptor& conv,
const ScopedTensorDescriptor& output_nd, bool specify_workspace_limit,
const CudnnHandle& cudnn, const CudnnTensorDescriptor& input_nd,
const CudnnFilterDescriptor& filter, const CudnnConvolutionDescriptor& conv,
const CudnnTensorDescriptor& output_nd, bool specify_workspace_limit,
size_t memory_limit_bytes) {
cudnnConvolutionFwdPreference_t preference =
specify_workspace_limit ? CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
@ -1943,10 +1942,10 @@ port::StatusOr<cudnnConvolutionFwdAlgo_t> GetCudnnConvolutionForwardAlgo(
port::StatusOr<cudnnConvolutionBwdDataAlgo_t>
GetCudnnConvolutionBackwardDataAlgo(const CudnnHandle& cudnn,
const ScopedTensorDescriptor& input_nd,
const ScopedFilterDescriptor& filter,
const ScopedConvolutionDescriptor& conv,
const ScopedTensorDescriptor& output_nd,
const CudnnTensorDescriptor& input_nd,
const CudnnFilterDescriptor& filter,
const CudnnConvolutionDescriptor& conv,
const CudnnTensorDescriptor& output_nd,
bool specify_workspace_limit,
size_t memory_limit_bytes) {
cudnnConvolutionBwdDataPreference_t preference =
@ -1962,10 +1961,10 @@ GetCudnnConvolutionBackwardDataAlgo(const CudnnHandle& cudnn,
port::StatusOr<cudnnConvolutionBwdFilterAlgo_t>
GetCudnnConvolutionBackwardFilterAlgo(const CudnnHandle& cudnn,
const ScopedTensorDescriptor& input_nd,
const ScopedFilterDescriptor& filter,
const ScopedConvolutionDescriptor& conv,
const ScopedTensorDescriptor& output_nd,
const CudnnTensorDescriptor& input_nd,
const CudnnFilterDescriptor& filter,
const CudnnConvolutionDescriptor& conv,
const CudnnTensorDescriptor& output_nd,
bool specify_workspace_limit,
size_t memory_limit_bytes) {
cudnnConvolutionBwdFilterPreference_t preference =
@ -1982,10 +1981,9 @@ GetCudnnConvolutionBackwardFilterAlgo(const CudnnHandle& cudnn,
port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace(
Stream* stream, const CudnnHandle& cudnn,
const dnn::AlgorithmDesc& algorithm_desc,
const ScopedTensorDescriptor& input_nd,
const ScopedFilterDescriptor& filter,
const ScopedConvolutionDescriptor& conv,
const ScopedTensorDescriptor& output_nd,
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
const CudnnConvolutionDescriptor& conv,
const CudnnTensorDescriptor& output_nd,
ScratchAllocator* scratch_allocator) {
// TODO(csigg): This has side effects on the convolution descriptor. It is
// functionally correct because the convolution is run with the algorithm of
@ -2025,10 +2023,9 @@ port::StatusOr<DeviceMemory<uint8>>
AllocateCudnnConvolutionBackwardDataWorkspace(
Stream* stream, const CudnnHandle& cudnn,
const dnn::AlgorithmDesc& algorithm_desc,
const ScopedTensorDescriptor& input_nd,
const ScopedFilterDescriptor& filter,
const ScopedConvolutionDescriptor& conv,
const ScopedTensorDescriptor& output_nd,
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
const CudnnConvolutionDescriptor& conv,
const CudnnTensorDescriptor& output_nd,
ScratchAllocator* scratch_allocator) {
// TODO(csigg): This has side effects on the convolution descriptor. It is
// functionally correct because the convolution is run with the algorithm of
@ -2070,10 +2067,9 @@ port::StatusOr<DeviceMemory<uint8>>
AllocateCudnnConvolutionBackwardFilterWorkspace(
Stream* stream, const CudnnHandle& cudnn,
const dnn::AlgorithmDesc& algorithm_desc,
const ScopedTensorDescriptor& input_nd,
const ScopedFilterDescriptor& filter,
const ScopedConvolutionDescriptor& conv,
const ScopedTensorDescriptor& output_nd,
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
const CudnnConvolutionDescriptor& conv,
const CudnnTensorDescriptor& output_nd,
ScratchAllocator* scratch_allocator) {
// TODO(csigg): This has side effects on the convolution descriptor. It is
// functionally correct because the convolution is run with the algorithm of
@ -2114,11 +2110,10 @@ AllocateCudnnConvolutionBackwardFilterWorkspace(
port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
Stream* stream, const CudnnHandle& cudnn,
const dnn::AlgorithmConfig& algorithm_config,
const ScopedTensorDescriptor& input_nd,
const ScopedFilterDescriptor& filter,
const ScopedConvolutionDescriptor& conv,
const ScopedTensorDescriptor& output_nd,
ScratchAllocator* scratch_allocator, DeviceMemory<uint8>* scratch) {
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
const CudnnConvolutionDescriptor& conv,
const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
DeviceMemory<uint8>* scratch) {
dnn::AlgorithmDesc algo_desc = algorithm_config.algorithm();
if (algorithm_config.algorithm().is_default()) {
// Pick fastest algorithm within memory limit according to cuDNN's
@ -2164,11 +2159,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
Stream* stream, const CudnnHandle& cudnn,
const dnn::AlgorithmConfig& algorithm_config,
const ScopedTensorDescriptor& input_nd,
const ScopedFilterDescriptor& filter,
const ScopedConvolutionDescriptor& conv,
const ScopedTensorDescriptor& output_nd,
ScratchAllocator* scratch_allocator, DeviceMemory<uint8>* scratch) {
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
const CudnnConvolutionDescriptor& conv,
const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
DeviceMemory<uint8>* scratch) {
dnn::AlgorithmDesc algo_desc = algorithm_config.algorithm();
if (algorithm_config.algorithm().is_default()) {
// Pick fastest algorithm within memory limit according to cuDNN's
@ -2214,11 +2208,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
Stream* stream, const CudnnHandle& cudnn,
const dnn::AlgorithmConfig& algorithm_config,
const ScopedTensorDescriptor& input_nd,
const ScopedFilterDescriptor& filter,
const ScopedConvolutionDescriptor& conv,
const ScopedTensorDescriptor& output_nd,
ScratchAllocator* scratch_allocator, DeviceMemory<uint8>* scratch) {
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
const CudnnConvolutionDescriptor& conv,
const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
DeviceMemory<uint8>* scratch) {
dnn::AlgorithmDesc algo_desc = algorithm_config.algorithm();
if (algorithm_config.algorithm().is_default()) {
// Pick fastest algorithm within memory limit according to cuDNN's
@ -2387,11 +2380,11 @@ port::Status CudnnSupport::DoConvolveImpl(
const dnn::AlgorithmConfig& algorithm_config,
dnn::ProfileResult* output_profile_result) {
cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
ScopedTensorDescriptor input_nd(input_descriptor, cudnn_type);
ScopedTensorDescriptor output_nd(output_descriptor, cudnn_type);
ScopedFilterDescriptor filter(filter_descriptor, cudnn_type);
ScopedConvolutionDescriptor conv(convolution_descriptor,
GetConvComputeType<T>());
CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type);
CudnnTensorDescriptor output_nd(output_descriptor, cudnn_type);
CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
CudnnConvolutionDescriptor conv(convolution_descriptor,
GetConvComputeType<T>());
auto cudnn = cudnn_->GetHandle(parent_, stream);
// Alpha is the scaling factor for input.
@@ -2493,14 +2486,14 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
"Relu activation.");
}
ScopedTensorDescriptor conv_input_nd(
CudnnTensorDescriptor conv_input_nd(
conv_input_descriptor, static_cast<cudnnDataType_t>(cudnn_data_type));
ScopedTensorDescriptor output_nd(
CudnnTensorDescriptor output_nd(
output_descriptor, static_cast<cudnnDataType_t>(cudnn_data_type));
ScopedFilterDescriptor filter(filter_descriptor,
static_cast<cudnnDataType_t>(cudnn_data_type));
ScopedTensorDescriptor bias_nd(bias_descriptor, CUDNN_DATA_FLOAT);
ScopedConvolutionDescriptor conv(
CudnnFilterDescriptor filter(filter_descriptor,
static_cast<cudnnDataType_t>(cudnn_data_type));
CudnnTensorDescriptor bias_nd(bias_descriptor, CUDNN_DATA_FLOAT);
CudnnConvolutionDescriptor conv(
convolution_descriptor, static_cast<cudnnDataType_t>(cudnn_compute_type));
auto cudnn = cudnn_->GetHandle(parent_, stream);
@@ -2528,7 +2521,7 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
// activation descriptor. Note that this will change the nan propagation
// behavior from separate conv, bias, and relu (which by default is
// CUDNN_PROPAGATE_NAN).
ScopedActivationDescriptor activation_desc(
CudnnActivationDescriptor activation_desc(
activation_mode, CUDNN_NOT_PROPAGATE_NAN, output_descriptor.value_max());
auto side_input_data_ptr = (side_input_scale == 0) ? output_data->opaque()
: side_input_data.opaque();
@@ -2740,8 +2733,8 @@ port::Status CudnnSupport::DoBatchNormalizationForwardImpl(
DeviceMemory<U>* saved_mean, DeviceMemory<U>* saved_inv_var,
bool is_training, std::function<const DeviceMemory<U>&()> var_to_inv_var,
std::function<void()> inv_var_to_var) {
ScopedTensorDescriptor x_descriptor(x_desc, ToCudnnDataType(input_data_type));
ScopedTensorDescriptor scale_offset_descriptor(
CudnnTensorDescriptor x_descriptor(x_desc, ToCudnnDataType(input_data_type));
CudnnTensorDescriptor scale_offset_descriptor(
scale_offset_desc, ToCudnnDataType(scale_data_type));
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
#if CUDNN_VERSION >= 7000
@@ -2825,9 +2818,9 @@ port::Status CudnnSupport::DoBatchNormalizationBackwardImpl(
const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
DeviceMemory<U>* offset_backprop) {
ScopedTensorDescriptor x_descriptor(
CudnnTensorDescriptor x_descriptor(
x_desc, static_cast<cudnnDataType_t>(cudnn_input_type));
ScopedTensorDescriptor scale_offset_descriptor(
CudnnTensorDescriptor scale_offset_descriptor(
scale_offset_desc, static_cast<cudnnDataType_t>(cudnn_scale_type));
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
#if CUDNN_VERSION >= 7000
@@ -3017,9 +3010,9 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
dnn::DataType output_type, float scale,
DeviceMemoryBase* output_data) {
float beta = 0.0f;
ScopedTensorDescriptor input_tensor_desc(
CudnnTensorDescriptor input_tensor_desc(
input_desc, ToCudnnDataType(input_type, input_desc.layout()));
ScopedTensorDescriptor output_tensor_desc(
CudnnTensorDescriptor output_tensor_desc(
output_desc, ToCudnnDataType(output_type, output_desc.layout()));
auto cudnn = cudnn_->GetHandle(parent_, stream);
auto status = [&] {
@@ -3056,11 +3049,11 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl(
auto cudnn = cudnn_->GetHandle(parent_, stream);
ScopedTensorDescriptor out_back_nd(output_descriptor, cudnn_type);
ScopedTensorDescriptor in_back_nd(input_descriptor, cudnn_type);
ScopedFilterDescriptor filter(filter_descriptor, cudnn_type);
ScopedConvolutionDescriptor conv(convolution_descriptor,
GetConvComputeType<T>());
CudnnTensorDescriptor out_back_nd(output_descriptor, cudnn_type);
CudnnTensorDescriptor in_back_nd(input_descriptor, cudnn_type);
CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
CudnnConvolutionDescriptor conv(convolution_descriptor,
GetConvComputeType<T>());
const bool is_profiling = output_profile_result != nullptr;
@@ -3192,11 +3185,11 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
auto cudnn = cudnn_->GetHandle(parent_, stream);
ScopedTensorDescriptor out_back_nd(output_descriptor, cudnn_type);
ScopedTensorDescriptor input_nd(input_descriptor, cudnn_type);
ScopedFilterDescriptor filter(filter_descriptor, cudnn_type);
ScopedConvolutionDescriptor conv(convolution_descriptor,
GetConvComputeType<T>());
CudnnTensorDescriptor out_back_nd(output_descriptor, cudnn_type);
CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type);
CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
CudnnConvolutionDescriptor conv(convolution_descriptor,
GetConvComputeType<T>());
const bool is_profiling = output_profile_result != nullptr;
@@ -3338,8 +3331,8 @@ port::Status CudnnSupport::DoConvolveBackwardBiasImpl(
const dnn::BatchDescriptor& bias_descriptor,
DeviceMemory<T>* backward_bias_data) {
cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
ScopedTensorDescriptor input_nd(input_descriptor, cudnn_type);
ScopedTensorDescriptor bias_nd(bias_descriptor, cudnn_type);
CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type);
CudnnTensorDescriptor bias_nd(bias_descriptor, cudnn_type);
// Alpha is the scaling factor for input.
float alpha = 1.0;
@@ -3526,7 +3519,7 @@ bool CudnnSupport::DoBiasAdd(Stream* stream,
const DeviceMemory<float>& biases,
const dnn::BatchDescriptor& dimensions,
DeviceMemory<float>* output_data) {
ScopedTensorDescriptor input_descriptor(dimensions, CUDNN_DATA_FLOAT);
CudnnTensorDescriptor input_descriptor(dimensions, CUDNN_DATA_FLOAT);
dnn::BatchDescriptor bias_dimensions;
bias_dimensions.set_count(1)
@@ -3534,7 +3527,7 @@ bool CudnnSupport::DoBiasAdd(Stream* stream,
.set_height(1)
.set_width(1)
.set_layout(dnn::DataLayout::kBatchYXDepth);
ScopedTensorDescriptor bias_descriptor(bias_dimensions, CUDNN_DATA_FLOAT);
CudnnTensorDescriptor bias_descriptor(bias_dimensions, CUDNN_DATA_FLOAT);
// cudnnAddTensor after R3 is in-place, so we need to copy input_data to
// output_data before doing the addition, unless the input and
@@ -3570,10 +3563,10 @@ bool CudnnSupport::DoActivate(Stream* stream,
const DeviceMemory<float>& input_data,
DeviceMemory<float>* output_data,
uint64 options) {
ScopedActivationDescriptor activation_desc(
CudnnActivationDescriptor activation_desc(
activation_mode, CUDNN_PROPAGATE_NAN, dimensions.value_max());
ScopedTensorDescriptor input_nd(dimensions, CUDNN_DATA_FLOAT);
CudnnTensorDescriptor input_nd(dimensions, CUDNN_DATA_FLOAT);
// Alpha is the input scaling factor.
float alpha = 1.0;
// Beta is the output scaling factor.
@@ -3600,9 +3593,9 @@ bool CudnnSupport::DoPoolForward(
// Beta is the scaling factor for output.
double beta = 0.0;
ScopedTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_DOUBLE);
ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_DOUBLE);
ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
CudnnTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_DOUBLE);
CudnnTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_DOUBLE);
CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
auto cudnn = cudnn_->GetHandle(parent_, stream);
auto status = [&] {
@@ -3625,9 +3618,9 @@ bool CudnnSupport::DoPoolForward(
// Beta is the scaling factor for output.
float beta = 0.0;
ScopedTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_FLOAT);
ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_FLOAT);
ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
CudnnTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_FLOAT);
CudnnTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_FLOAT);
CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
auto cudnn = cudnn_->GetHandle(parent_, stream);
auto status = [&] {
@@ -3650,9 +3643,9 @@ bool CudnnSupport::DoPoolForward(
// Beta is the scaling factor for output.
float beta = 0.0;
ScopedTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_HALF);
ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_HALF);
ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
CudnnTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_HALF);
CudnnTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_HALF);
CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
auto cudnn = cudnn_->GetHandle(parent_, stream);
auto status = [&] {
RETURN_IF_CUDNN_ERROR(cudnnPoolingForward(
@@ -3676,9 +3669,9 @@ bool CudnnSupport::DoPoolBackward(
// Beta is the scaling factor for output.
double beta = 0.0;
ScopedTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_DOUBLE);
ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_DOUBLE);
ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
CudnnTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_DOUBLE);
CudnnTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_DOUBLE);
CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
auto cudnn = cudnn_->GetHandle(parent_, stream);
auto status = [&] {
@@ -3705,9 +3698,9 @@ bool CudnnSupport::DoPoolBackward(
// Beta is the scaling factor for output.
float beta = 0.0;
ScopedTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_FLOAT);
ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_FLOAT);
ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
CudnnTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_FLOAT);
CudnnTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_FLOAT);
CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
auto cudnn = cudnn_->GetHandle(parent_, stream);
auto status = [&] {
@@ -3734,9 +3727,9 @@ bool CudnnSupport::DoPoolBackward(
// Beta is the scaling factor for output.
float beta = 0.0;
ScopedTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_HALF);
ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_HALF);
ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
CudnnTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_HALF);
CudnnTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_HALF);
CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
auto cudnn = cudnn_->GetHandle(parent_, stream);
auto status = [&] {
@@ -3771,8 +3764,8 @@ bool CudnnSupport::DoNormalizeWithDimensions(
return false;
}
ScopedTensorDescriptor dims(dimensions, CUDNN_DATA_FLOAT);
ScopedNormalizeDescriptor normalize(normalize_descriptor);
CudnnTensorDescriptor dims(dimensions, CUDNN_DATA_FLOAT);
CudnnNormalizeDescriptor normalize(normalize_descriptor);
// Alpha is the scaling factor for input.
float alpha = 1.0f;
@@ -3808,8 +3801,8 @@ bool CudnnSupport::DoNormalizeBackwardWithDimensions(
return false;
}
ScopedTensorDescriptor dims(dimensions, CUDNN_DATA_FLOAT);
ScopedNormalizeDescriptor normalize(normalize_descriptor);
CudnnTensorDescriptor dims(dimensions, CUDNN_DATA_FLOAT);
CudnnNormalizeDescriptor normalize(normalize_descriptor);
float alpha = 1.0f;
float beta = 0.0f;
@@ -3932,9 +3925,9 @@ bool CudnnSupport::DeriveOutputBatchDescriptor(
const dnn::FilterDescriptor& filter_descriptor,
const dnn::ConvolutionDescriptor& convolution_descriptor,
dnn::BatchDescriptor* output_batch_descriptor) {
ScopedTensorDescriptor input_nd(batch_descriptor, CUDNN_DATA_FLOAT);
ScopedFilterDescriptor filter(filter_descriptor, CUDNN_DATA_FLOAT);
ScopedConvolutionDescriptor conv(convolution_descriptor, CUDNN_DATA_FLOAT);
CudnnTensorDescriptor input_nd(batch_descriptor, CUDNN_DATA_FLOAT);
CudnnFilterDescriptor filter(filter_descriptor, CUDNN_DATA_FLOAT);
CudnnConvolutionDescriptor conv(convolution_descriptor, CUDNN_DATA_FLOAT);
int dn = batch_descriptor.ndims() + 2;
std::vector<int> dims(dn); // in BDYX
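The three GetCudnnConvolution*Algorithm helpers above apply the same policy when the caller leaves the algorithm unspecified: ask cuDNN for the fastest algorithm whose workspace fits within the scratch allocator's limit. A minimal sketch of that selection policy, in illustrative Python rather than the actual StreamExecutor/cuDNN API (all names below are hypothetical):

from typing import NamedTuple, Optional, Sequence

class AlgoChoice(NamedTuple):
  # Hypothetical stand-in for one cuDNN algorithm candidate.
  algo_id: int
  time_ms: float          # measured (or heuristic) runtime
  workspace_bytes: int    # scratch memory the algorithm requires

def pick_fastest_within_limit(candidates: Sequence[AlgoChoice],
                              scratch_limit_bytes: int) -> Optional[AlgoChoice]:
  """Returns the fastest candidate whose workspace fits the scratch limit."""
  viable = [c for c in candidates if c.workspace_bytes <= scratch_limit_bytes]
  if not viable:
    return None  # caller falls back to a no-workspace algorithm or reports an error
  return min(viable, key=lambda c: c.time_ms)

# Example: with a 64 MiB limit, the 1 GiB-workspace algorithm is skipped.
choices = [AlgoChoice(0, 1.2, 0), AlgoChoice(1, 0.4, 1 << 30), AlgoChoice(2, 0.7, 1 << 20)]
assert pick_fastest_within_limit(choices, 64 << 20).algo_id == 2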

View File

@@ -15,9 +15,6 @@ limitations under the License.
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
#if defined(__APPLE__)
#include <mach-o/dyld.h>
#endif
#if defined(PLATFORM_WINDOWS)
#include <windows.h>
#define PATH_MAX MAX_PATH
@@ -179,19 +176,11 @@ bool CUDAExecutor::FindOnDiskForComputeCapability(
// would return /usr/bin.
static string GetBinaryDir(bool strip_exe) {
char exe_path[PATH_MAX] = {0};
#if defined(__APPLE__)
uint32_t buffer_size = 0U;
_NSGetExecutablePath(nullptr, &buffer_size);
char unresolved_path[buffer_size];
_NSGetExecutablePath(unresolved_path, &buffer_size);
CHECK_ERR(realpath(unresolved_path, exe_path) ? 1 : -1);
#else
#if defined(PLATFORM_WINDOWS)
HMODULE hModule = GetModuleHandle(NULL);
GetModuleFileName(hModule, exe_path, MAX_PATH);
#else
CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
#endif
#endif
// Make sure it's null-terminated:
exe_path[sizeof(exe_path) - 1] = 0;
@@ -854,10 +843,7 @@ CudaContext* CUDAExecutor::cuda_context() { return context_; }
// For anything more complicated/prod-focused than this, you'll likely want to
// turn to gsys' topology modeling.
static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) {
#if defined(__APPLE__)
LOG(INFO) << "OS X does not support NUMA - returning NUMA node zero";
return 0;
#elif defined(PLATFORM_WINDOWS)
#if defined(PLATFORM_WINDOWS)
// Windows support for NUMA is not currently implemented. Return node 0.
return 0;
#elif defined(__aarch64__)
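With the Apple-specific branch removed, GetBinaryDir above resolves the running binary's location via GetModuleFileName on Windows and readlink("/proc/self/exe") on Linux. A rough Python equivalent of the same idea, purely for illustration (using sys.executable off Linux is an approximation, not part of this change):

import os
import sys

def get_binary_dir(strip_exe=True):
  """Best-effort path of the running executable, mirroring GetBinaryDir."""
  if sys.platform.startswith('linux'):
    exe_path = os.path.realpath('/proc/self/exe')  # same trick as the readlink call above
  else:
    exe_path = os.path.realpath(sys.executable)    # assumption: good enough off Linux
  return os.path.dirname(exe_path) if strip_exe else exe_path

print(get_binary_dir())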

View File

@@ -5,12 +5,21 @@ licenses(["notice"]) # Apache 2.0
exports_files(["LICENSE"])
load("//tensorflow/tools/api/generator:api_gen.bzl", "TENSORFLOW_API_INIT_FILES")
py_library(
name = "doc_srcs",
srcs = ["doc_srcs.py"],
srcs_version = "PY2AND3",
)
py_binary(
name = "create_python_api",
srcs = ["create_python_api.py"],
srcs_version = "PY2AND3",
visibility = ["//visibility:public"],
deps = [
":doc_srcs",
"//tensorflow/python:no_contrib",
],
)
@@ -24,3 +33,18 @@ py_test(
"//tensorflow/python:client_testlib",
],
)
py_test(
name = "tensorflow_doc_srcs_test",
srcs = ["doc_srcs_test.py"],
args = [
"--package=tensorflow.python",
] + TENSORFLOW_API_INIT_FILES,
main = "doc_srcs_test.py",
srcs_version = "PY2AND3",
deps = [
":doc_srcs",
"//tensorflow/python:client_testlib",
"//tensorflow/python:no_contrib",
],
)

View File

@@ -26,6 +26,7 @@ import sys
from tensorflow.python.util import tf_decorator
from tensorflow.python.util import tf_export
from tensorflow.tools.api.generator import doc_srcs
API_ATTRS = tf_export.API_ATTRS
@@ -36,10 +37,9 @@ _SYMBOLS_TO_SKIP_EXPLICITLY = {
# would have side effects.
'tensorflow.python.platform.flags.FLAGS'
}
_GENERATED_FILE_HEADER = """\"\"\"Imports for Python API.
This file is MACHINE GENERATED! Do not edit.
Generated by: tensorflow/tools/api/generator/create_python_api.py script.
_GENERATED_FILE_HEADER = """# This file is MACHINE GENERATED! Do not edit.
# Generated by: tensorflow/tools/api/generator/create_python_api.py script.
\"\"\"%s
\"\"\"
from __future__ import print_function
@@ -252,6 +252,44 @@ def get_module(dir_path, relative_to_dir):
return dir_path.replace('/', '.').strip('.')
def get_module_docstring(module_name, package):
"""Get docstring for the given module.
This method looks for docstring in the following order:
1. Checks if module has a docstring specified in doc_srcs.
2. Checks if module has a docstring source module specified
in doc_srcs. If it does, gets docstring from that module.
3. Checks if module with module_name exists under base package.
If it does, gets docstring from that module.
4. Returns a default docstring.
Args:
module_name: module name relative to tensorflow
(excluding 'tensorflow.' prefix) to get a docstring for.
package: Base python package containing modules with the target tf_export
decorators.
Returns:
One-line docstring to describe the module.
"""
# Module under base package to get a docstring from.
docstring_module_name = module_name
if module_name in doc_srcs.TENSORFLOW_DOC_SOURCES:
docsrc = doc_srcs.TENSORFLOW_DOC_SOURCES[module_name]
if docsrc.docstring:
return docsrc.docstring
if docsrc.docstring_module_name:
docstring_module_name = docsrc.docstring_module_name
docstring_module_name = package + '.' + docstring_module_name
if (docstring_module_name in sys.modules and
sys.modules[docstring_module_name].__doc__):
return sys.modules[docstring_module_name].__doc__
return 'Public API for tf.%s namespace.' % module_name
def create_api_files(
output_files, package, root_init_template, output_dir, api_name):
"""Creates __init__.py files for the Python API.
@@ -295,7 +333,9 @@ def create_api_files(
continue
contents = ''
if module or not root_init_template:
contents = _GENERATED_FILE_HEADER + text + _GENERATED_FILE_FOOTER
contents = (
_GENERATED_FILE_HEADER %
get_module_docstring(module, package) + text + _GENERATED_FILE_FOOTER)
else:
# Read base init file
with open(root_init_template, 'r') as root_init_template_file:
@@ -308,7 +348,7 @@ def create_api_files(
raise ValueError(
'Missing outputs for python_api_gen genrule:\n%s.'
'Make sure all required outputs are in the '
'tensorflow/tools/api/generator/BUILD file.' %
'tensorflow/tools/api/generator/api_gen.bzl file.' %
',\n'.join(sorted(missing_output_files)))

View File

@@ -0,0 +1,65 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Specifies sources of doc strings for API modules."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
# Specifies docstring source for a module.
# Only one of docstring or docstring_module_name should be set.
# * If docstring is set, then we will use this docstring
# for the module.
# * If docstring_module_name is set, then we will copy the docstring
# from docstring source module.
DocSource = collections.namedtuple(
'DocSource', ['docstring', 'docstring_module_name'])
# Each attribute of DocSource is optional.
DocSource.__new__.__defaults__ = (None,) * len(DocSource._fields)
TENSORFLOW_DOC_SOURCES = {
'app': DocSource(docstring_module_name='platform.app'),
'compat': DocSource(docstring_module_name='util.compat'),
'distributions': DocSource(
docstring_module_name='ops.distributions.distributions'),
'bitwise': DocSource(docstring_module_name='ops.bitwise_ops'),
'errors': DocSource(docstring_module_name='framework.errors'),
'gfile': DocSource(docstring_module_name='platform.gfile'),
'graph_util': DocSource(docstring_module_name='framework.graph_util'),
'image': DocSource(docstring_module_name='ops.image_ops'),
'keras.estimator': DocSource(docstring_module_name='estimator.keras'),
'linalg': DocSource(docstring_module_name='ops.linalg_ops'),
'logging': DocSource(docstring_module_name='ops.logging_ops'),
'losses': DocSource(docstring_module_name='ops.losses.losses'),
'manip': DocSource(docstring_module_name='ops.manip_ops'),
'math': DocSource(docstring_module_name='ops.math_ops'),
'metrics': DocSource(docstring_module_name='ops.metrics'),
'nn': DocSource(docstring_module_name='ops.nn_ops'),
'nn.rnn_cell': DocSource(docstring_module_name='ops.rnn_cell'),
'python_io': DocSource(docstring_module_name='lib.io.python_io'),
'resource_loader': DocSource(
docstring_module_name='platform.resource_loader'),
'sets': DocSource(docstring_module_name='ops.sets'),
'sparse': DocSource(docstring_module_name='ops.sparse_ops'),
'spectral': DocSource(docstring_module_name='ops.spectral_ops'),
'strings': DocSource(docstring_module_name='ops.string_ops'),
'sysconfig': DocSource(docstring_module_name='platform.sysconfig'),
'test': DocSource(docstring_module_name='platform.test'),
'train': DocSource(docstring_module_name='training.training'),
'train.queue_runner': DocSource(
docstring_module_name='training.queue_runner'),
}

View File

@@ -0,0 +1,80 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""Tests for tensorflow.tools.api.generator.doc_srcs."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import importlib
import sys
from tensorflow.python.platform import test
from tensorflow.tools.api.generator import doc_srcs
FLAGS = None
class DocSrcsTest(test.TestCase):
def testModulesAreValidAPIModules(self):
for module_name in doc_srcs.TENSORFLOW_DOC_SOURCES:
# Convert module_name to corresponding __init__.py file path.
file_path = module_name.replace('.', '/')
if file_path:
file_path += '/'
file_path += '__init__.py'
if file_path not in FLAGS.outputs:
self.assertFalse('%s is not a valid API module' % module_name)
def testHaveDocstringOrDocstringModule(self):
for module_name, docsrc in doc_srcs.TENSORFLOW_DOC_SOURCES.items():
if docsrc.docstring and docsrc.docstring_module_name:
self.assertFalse(
'%s contains a DocSource that has both a docstring and a '
'docstring_module_name. '
'Only one of "docstring" or "docstring_module_name" should be set.'
% (module_name))
def testDocstringModulesAreValidModules(self):
for _, docsrc in doc_srcs.TENSORFLOW_DOC_SOURCES.items():
if docsrc.docstring_module_name:
doc_module_name = '.'.join([
FLAGS.package, docsrc.docstring_module_name])
if doc_module_name not in sys.modules:
self.assertFalse(
'docsources_module %s is not a valid module under %s.' %
(docsrc.docstring_module_name, FLAGS.package))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'outputs', metavar='O', type=str, nargs='+',
help='create_python_api output files.')
parser.add_argument(
'--package', type=str,
help='Base package that imports modules containing the target tf_export '
'decorators.')
FLAGS, unparsed = parser.parse_known_args()
importlib.import_module(FLAGS.package)
# Now update argv, so that unittest library does not get confused.
sys.argv = [sys.argv[0]] + unparsed
test.main()

View File

@@ -20,6 +20,10 @@ tf_module {
name: "adjust_hue"
argspec: "args=[\'image\', \'delta\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "adjust_jpeg_quality"
argspec: "args=[\'image\', \'jpeg_quality\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "adjust_saturation"
argspec: "args=[\'image\', \'saturation_factor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -144,6 +148,10 @@ tf_module {
name: "random_hue"
argspec: "args=[\'image\', \'max_delta\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "random_jpeg_quality"
argspec: "args=[\'image\', \'min_jpeg_quality\', \'max_jpeg_quality\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "random_saturation"
argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -166,7 +174,7 @@ tf_module {
}
member_method {
name: "resize_images"
argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\'], varargs=None, keywords=None, defaults=[\'0\', \'False\'], "
argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\'], "
}
member_method {
name: "resize_nearest_neighbor"

View File

@@ -1,5 +1,21 @@
path: "tensorflow.math"
tf_module {
member_method {
name: "bessel_i0"
argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'bessel_i0\'], "
}
member_method {
name: "bessel_i0e"
argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "bessel_i1"
argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'bessel_i1\'], "
}
member_method {
name: "bessel_i1e"
argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "polyval"
argspec: "args=[\'coeffs\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "

View File

@@ -2,7 +2,7 @@ path: "tensorflow.saved_model.loader"
tf_module {
member_method {
name: "load"
argspec: "args=[\'sess\', \'tags\', \'export_dir\'], varargs=None, keywords=saver_kwargs, defaults=None"
argspec: "args=[\'sess\', \'tags\', \'export_dir\', \'import_scope\'], varargs=None, keywords=saver_kwargs, defaults=[\'None\'], "
}
member_method {
name: "maybe_saved_model_directory"

View File

@@ -778,11 +778,9 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
actual = "@grpc//:grpc_python_plugin",
)
# gRPC has three empty C++ functions which it wants the user to define
# at build time. https://github.com/grpc/grpc/issues/13590
native.bind(
name = "grpc_lib",
actual = "@grpc//:grpc++_unsecure",
actual = "@grpc//:grpc++",
)
# Needed by gRPC