fix merge issues

2017-06-03 00:00:53 -04:00 · 2017-06-03 00:00:53 -04:00 · 5efd272aab
commit 5efd272aab
parent c9d2f432b6 563f05ff67
378 changed files with 9541 additions and 4762 deletions
--- a/RELEASE.md
+++ b/RELEASE.md
@ -41,6 +41,15 @@
  be replaced by calling `embedding_lookup` or `layers.dense` as pre- or post-
  processing of the rnn.  For RNN decoding, this functionality has been replaced
  with an alternative API in `tf.contrib.seq2seq`.
 * Intel MKL Integration (https://software.intel.com/en-us/articles/tensorflow-optimizations-on-modern-intel-architecture). Intel developed a number of
  optimized deep learning primitives: In addition to matrix multiplication and
  convolution, these building blocks include:
  Direct batched convolution
  Pooling: maximum, minimum, average
  Normalization: LRN, batch normalization
  Activation: rectified linear unit (ReLU)
  Data manipulation: multi-dimensional transposition (conversion), split,
  concat, sum and scale.
 * TensorForest Estimator now supports SavedModel export for serving.
 * Support client-provided ClusterSpecs and propagate them to all workers to enable the creation of dynamic TensorFlow clusters.
 * TensorFlow C library now available for Windows.
--- a/8
+++ b/8
@ -2,11 +2,11 @@ workspace(name = "org_tensorflow")
 http_archive(
    name = "io_bazel_rules_closure",
-    sha256 = "4be8a887f6f38f883236e77bb25c2da10d506f2bf1a8e5d785c0f35574c74ca4",
+    sha256 = "edc91f556b762fc5212d1050d00b12e40dd0b0b1c1d5d96886b59e9a30a6cae4",
-    strip_prefix = "rules_closure-aac19edc557aec9b603cd7ffe359401264ceff0d",
+    strip_prefix = "rules_closure-3f07fb6a58870afbb36051bd5d54da4479561cc6",
    urls = [
-        "http://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/aac19edc557aec9b603cd7ffe359401264ceff0d.tar.gz",  # 2017-05-10
+        "http://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/3f07fb6a58870afbb36051bd5d54da4479561cc6.tar.gz",  # 2017-05-31
-        "https://github.com/bazelbuild/rules_closure/archive/aac19edc557aec9b603cd7ffe359401264ceff0d.tar.gz",
+        "https://github.com/bazelbuild/rules_closure/archive/3f07fb6a58870afbb36051bd5d54da4479561cc6.tar.gz",
    ],
 )
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@ -393,6 +393,9 @@ filegroup(
        "//tensorflow/tensorboard/demo:all_files",
        "//tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize:all_files",
        "//tensorflow/tensorboard/plugins:all_files",
        "//tensorflow/tensorboard/plugins/audio:all_files",
        "//tensorflow/tensorboard/plugins/distributions:all_files",
        "//tensorflow/tensorboard/plugins/graphs:all_files",
        "//tensorflow/tensorboard/plugins/histograms:all_files",
        "//tensorflow/tensorboard/plugins/images:all_files",
        "//tensorflow/tensorboard/plugins/projector:all_files",
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@ -805,6 +805,7 @@ void TF_GraphSetTensorShape(TF_Graph* graph, TF_Output output,
  }
  std::vector<tensorflow::shape_inference::DimensionHandle> dim_vec;
  dim_vec.reserve(num_dims);
  for (int i = 0; i < num_dims; ++i) {
    dim_vec.push_back(ic->MakeDim(dims[i]));
  }
--- a/tensorflow/cc/client/client_session.cc
+++ b/tensorflow/cc/client/client_session.cc
@ -113,10 +113,12 @@ Status ClientSession::Run(const RunOptions& run_options, const FeedType& inputs,
    feeds.emplace_back(feed.first.name(), feed.second.tensor);
  }
  std::vector<string> output_tensor_names;
  output_tensor_names.reserve(fetch_outputs.size());
  for (auto const& output : fetch_outputs) {
    output_tensor_names.push_back(output.name());
  }
  std::vector<string> target_node_names;
  target_node_names.reserve(run_outputs.size());
  for (auto const& output : run_outputs) {
    target_node_names.push_back(output.node()->name());
  }
--- a/tensorflow/cc/framework/gradient_checker.cc
+++ b/tensorflow/cc/framework/gradient_checker.cc
@ -44,6 +44,7 @@ Status ComputeTheoreticalJacobianTranspose(
  size_t x_num = x_shapes.size();
  // Call AddSymbolicGradients to get 'dxs' (we will feed 'dys').
  OutputList dys;
  dys.reserve(y_shapes.size());
  for (const auto& y_shape : y_shapes) {
    // TODO(suharshs): This currently assumes that all x's are the same type.
    dys.push_back(Cast(scope, Const(scope, 1.0, y_shape), xs[0].type()));
--- a/tensorflow/cc/framework/testutil.cc
+++ b/tensorflow/cc/framework/testutil.cc
@ -15,6 +15,8 @@ limitations under the License.
 #include "tensorflow/cc/framework/testutil.h"
 #include <utility>
 #include "tensorflow/cc/client/client_session.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/graph/default_device.h"
@ -30,7 +32,7 @@ void GetTensors(const Scope& scope, OutputList tensors,
 void GetTensor(const Scope& scope, Output tensor, Tensor* out) {
  std::vector<Tensor> outputs;
-  GetTensors(scope, {tensor}, &outputs);
+  GetTensors(scope, {std::move(tensor)}, &outputs);
  *out = outputs[0];
 }
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@ -350,6 +350,7 @@ Status CompileXla(xla::CompileOnlyClient* client,
  compile_result->program_shape = *pshape_or.ValueOrDie();
  xla::ProgramShape* pshape = &compile_result->program_shape;
  std::vector<const xla::Shape*> arg_layouts;
  arg_layouts.reserve(pshape->parameters_size());
  for (int i = 0; i < pshape->parameters_size(); ++i) {
    arg_layouts.push_back(pshape->mutable_parameters(i));
  }
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@ -218,6 +218,7 @@ cc_library(
    deps = [
        ":common",
        ":graph_to_functiondef",
        ":union_find",
        "//tensorflow/compiler/jit/graphcycles",
        "//tensorflow/compiler/jit/kernels:parallel_check_op",
        "//tensorflow/compiler/jit/kernels:xla_local_launch_op",
@ -237,6 +238,11 @@ cc_library(
    ],
 )
 cc_library(
    name = "union_find",
    hdrs = ["union_find.h"],
 )
 cc_test(
    name = "compilation_passes_test",
    size = "small",
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <utility>
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 #include "tensorflow/cc/framework/ops.h"
@ -101,12 +103,12 @@ Node* Input(const GraphDefBuilder::Options& opts) {
 }
 Node* Unary(ops::NodeOut a, const GraphDefBuilder::Options& opts) {
-  return ops::UnaryOp("UnaryTest", a, opts);
+  return ops::UnaryOp("UnaryTest", std::move(a), opts);
 }
 Node* Binary(ops::NodeOut a, ops::NodeOut b,
             const GraphDefBuilder::Options& opts) {
-  return ops::BinaryOp("BinaryTest", a, b, opts);
+  return ops::BinaryOp("BinaryTest", std::move(a), std::move(b), opts);
 }
 Node* AddNLike(const std::vector<ops::NodeOut>& inputs,
@ -127,7 +129,7 @@ Node* RetOp(int index, ops::NodeOut a, const GraphDefBuilder::Options& opts) {
  if (opts.HaveError()) return nullptr;
  NodeBuilder node_builder(opts.GetNameForOp("Retval"), "_Retval",
                           opts.op_registry());
-  node_builder.Input(a).Attr("index", index);
+  node_builder.Input(std::move(a)).Attr("index", index);
  return opts.FinalizeBuilder(&node_builder);
 }
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
 #include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
 #include "tensorflow/compiler/jit/union_find.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/common_runtime/function.h"
@ -206,70 +207,12 @@ Status FindCompilationCandidates(
  return Status::OK();
 }
-// Union-Find data structure used to compute clusters. We use our own
+struct Cluster {
-// implementation because we want one key feature: when merging clusters, we
+  // Identifies the node that represents this cluster in the cycle detection
-// need to know which value becomes the representative of the merged clusters.
+  // graph.
-// We use the representatives to name nodes in a cycle detection graph, and we
+  int representative = -1;
 // need to control which node is named.
 // TODO(phawkins): consider merging this code with union-find implementations
 // in Tensorflow, e.g., in SimplePlacer.
 class Cluster {
 public:
  Cluster();
  int Size() { return FindRoot()->size_; }
  // Merges this cluster with 'other'. This cluster's representative becomes
  // the representative of the merged cluster; the representative of 'other'
  // is ignored.
  void Merge(Cluster* other);
  // Each cluster has an associated integer 'representative', initialized to -1
  // by default.
  int GetRepresentative() { return FindRoot()->representative_; }
  void SetRepresentative(int representative) {
    FindRoot()->representative_ = representative;
  }
 private:
  // Finds the root element of the cluster. Performs path compression.
  Cluster* FindRoot();
  int representative_;
  int rank_;
  int size_;  // Size of the cluster.
  Cluster* parent_;
 };
 Cluster::Cluster()
    : representative_(-1), rank_(0), size_(1), parent_(nullptr) {}
 void Cluster::Merge(Cluster* other) {
  Cluster* a = FindRoot();
  Cluster* b = other->FindRoot();
  if (a == b) return;
  if (a->rank_ > b->rank_) {
    b->parent_ = a;
    a->size_ += b->size_;
    return;
  }
  a->parent_ = b;
  if (a->rank_ == b->rank_) {
    b->rank_++;
  }
  b->representative_ = a->representative_;
  b->size_ += a->size_;
 }
 Cluster* Cluster::FindRoot() {
  if (!parent_) return this;
  // Path compression: update intermediate nodes to point to the root of the
  // equivalence class.
  parent_ = parent_->FindRoot();
  return parent_;
 }
 }  // anonymous namespace
 bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef) {
@ -432,10 +375,11 @@ Status MarkForCompilationPass::RunImpl(
  // Each compilation candidate belongs to a cluster. The cluster's
  // representative
  // names the node in the 'cycles' graph that represents the cluster.
-  std::vector<Cluster> clusters(graph->num_node_ids());
+  std::vector<UnionFind<Cluster>> clusters(graph->num_node_ids());
-  std::deque<Cluster*> worklist;
+  std::deque<UnionFind<Cluster>*> worklist;
  for (Node* node : compilation_candidates) {
-    clusters[node->id()].SetRepresentative(node->id());
+    Cluster& cluster = clusters[node->id()].Get();
    cluster.representative = node->id();
    worklist.push_back(&clusters[node->id()]);
  }
@ -445,7 +389,7 @@ Status MarkForCompilationPass::RunImpl(
  // Repeatedly contract edges between clusters that are on the same device,
  // provided the contraction would not create a cycle.
  while (!worklist.empty()) {
-    int from = worklist.front()->GetRepresentative();
+    int from = worklist.front()->Get().representative;
    worklist.pop_front();
    Node* node_from = graph->FindNodeId(from);
@ -518,7 +462,7 @@ Status MarkForCompilationPass::RunImpl(
  // Count the number of elements in each cluster.
  std::vector<int> cluster_sizes(graph->num_node_ids());
  for (const Node* n : compilation_candidates) {
-    int cluster = clusters[n->id()].GetRepresentative();
+    int cluster = clusters[n->id()].Get().representative;
    cluster_sizes[cluster]++;
  }
@ -532,7 +476,7 @@ Status MarkForCompilationPass::RunImpl(
  //   if compilation is enabled, otherwise there will be no such candidates).
  const int min_cluster_size = flags->tf_xla_min_cluster_size;
  for (Node* n : compilation_candidates) {
-    int cluster = clusters[n->id()].GetRepresentative();
+    int cluster = clusters[n->id()].Get().representative;
    // Compile if the user marked this node _XlaCompile=true
    bool compile_attr = false;
--- a/tensorflow/compiler/jit/union_find.h
+++ b/tensorflow/compiler/jit/union_find.h
@ -0,0 +1,81 @@
 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #ifndef TENSORFLOW_COMPILER_JIT_UNION_FIND_H_
 #define TENSORFLOW_COMPILER_JIT_UNION_FIND_H_
 namespace tensorflow {
 // Union-Find (disjoint-set) data structure.
 //
 // Every UnionFind<T> object is one element. Elements start out as singleton
 // clusters and are combined with Merge(). Each cluster has one associated
 // value of type T, stored on the cluster's root element; when merging
 // clusters the caller controls which value becomes the value of the merged
 // cluster (see Merge()). T must be default-constructible and copy-assignable.
 //
 // Elements refer to each other through raw pointers, so an element that has
 // been merged must stay at a stable address for as long as any element of
 // its cluster is used.
 template <typename T>
 class UnionFind {
 public:
  // Creates a singleton cluster. The value is value-initialized so that Get()
  // is well-defined before the first assignment, even for trivial T (e.g.
  // int), while class types still run their default constructor.
  UnionFind() : rank_(0), size_(1), parent_(nullptr), value_() {}
  // Returns the number of elements in a cluster.
  int Size() { return FindRoot()->size_; }
  // Merges this cluster with 'other'. This cluster's value becomes
  // the value of the merged cluster; the value of 'other' is ignored.
  // Merging two elements that already share a cluster is a no-op.
  void Merge(UnionFind* other);
  // Each cluster has an associated value. Retrieves the value associated
  // with this cluster. The returned reference aliases the root's storage, so
  // writing through it changes the value seen by every element of the cluster.
  T& Get() { return FindRoot()->value_; }
 private:
  // Finds the root element of the cluster. Performs path compression.
  UnionFind* FindRoot();
  int rank_;           // Upper bound on the height of the tree rooted here.
  int size_;           // Size of the cluster; only meaningful on the root.
  UnionFind* parent_;  // nullptr when this element is the root.
  T value_;            // Cluster value; only meaningful on the root.
 };
 template <typename T>
 void UnionFind<T>::Merge(UnionFind* other) {
  UnionFind<T>* a = FindRoot();
  UnionFind<T>* b = other->FindRoot();
  if (a == b) return;
  if (a->rank_ > b->rank_) {
    // 'a' stays the root, so its value is already the cluster value.
    b->parent_ = a;
    a->size_ += b->size_;
    return;
  }
  // Union by rank: 'b' becomes the root. The rank only grows when both trees
  // had the same rank.
  a->parent_ = b;
  if (a->rank_ == b->rank_) {
    b->rank_++;
  }
  // 'b' is the new root but this cluster's value must win, so copy it over.
  b->value_ = a->value_;
  b->size_ += a->size_;
 }
 template <typename T>
 UnionFind<T>* UnionFind<T>::FindRoot() {
  if (!parent_) return this;
  // Path compression: update intermediate nodes to point to the root of the
  // equivalence class.
  parent_ = parent_->FindRoot();
  return parent_;
 }
 }  // namespace tensorflow
 #endif  // TENSORFLOW_COMPILER_JIT_UNION_FIND_H_
--- a/tensorflow/compiler/tf2xla/kernels/fill_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fill_op.cc
@ -50,6 +50,7 @@ class FillOp : public XlaOpKernel {
    // Convert the dims literal into a vector that we can pass to
    // ComputationBuilder.
    std::vector<int64> broadcast;
    broadcast.reserve(dims_literal.shape().dimensions(0));
    for (int i = 0; i < dims_literal.shape().dimensions(0); ++i) {
      broadcast.push_back(xla::LiteralUtil::Get<int>(dims_literal, {i}));
    }
--- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc
@ -50,6 +50,7 @@ class SliceOp : public XlaOpKernel {
    // slice will be an empty handle if the output has no elements.
    CHECK_EQ(begin.size(), size.size());
    std::vector<int64> limits;
    limits.reserve(begin.size());
    for (int i = 0; i < begin.size(); ++i) {
      limits.push_back(begin[i] + size[i]);
    }
--- a/tensorflow/compiler/tf2xla/literal_util.h
+++ b/tensorflow/compiler/tf2xla/literal_util.h
@ -18,6 +18,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2XLA_LITERAL_UTIL_H_
 #define TENSORFLOW_COMPILER_TF2XLA_LITERAL_UTIL_H_
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@ -58,14 +58,13 @@ StatusOr<std::unique_ptr<Literal>> Client::Transfer(
        "server provided response without a literal in "
        "TransferToClient request");
  }
-
+  return MakeUnique<Literal>(response.literal());
  return WrapUnique(response.release_literal());
 }
 StatusOr<std::unique_ptr<GlobalData>> Client::TransferToServer(
    const Literal& literal, const DeviceHandle* device_handle) {
  TransferToServerRequest request;
-  *request.mutable_literal() = literal;
+  *request.mutable_literal() = literal.ToProto();
  if (device_handle) {
    *request.mutable_device_handle() = *device_handle;
  }
@ -93,7 +92,7 @@ StatusOr<std::unique_ptr<GlobalData>> Client::TransferToServer(
 Status Client::TransferToInfeed(const Literal& literal, int64 replica_id,
                                const DeviceHandle* device_handle) {
  TransferToInfeedRequest request;
-  *request.mutable_literal() = literal;
+  *request.mutable_literal() = literal.ToProto();
  if (device_handle) {
    *request.mutable_device_handle() = *device_handle;
  }
@ -141,7 +140,8 @@ StatusOr<std::unique_ptr<Literal>> Client::TransferFromOutfeed(
        "TransferToClient request");
  }
-  return WrapUnique(response.release_literal());
+  Literal literal(response.literal());
  return MakeUnique<Literal>(literal);
 }
 Status Client::ResetDevice() {
--- a/tensorflow/compiler/xla/client/client.h
+++ b/tensorflow/compiler/xla/client/client.h
@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/service_interface.h"
 #include "tensorflow/compiler/xla/statusor.h"
--- a/tensorflow/compiler/xla/client/computation_builder.cc
+++ b/tensorflow/compiler/xla/client/computation_builder.cc
@ -165,9 +165,10 @@ ComputationDataHandle ComputationBuilder::ConstantOp(
  }
  ConstantRequest request;
-  Literal* literal = request.mutable_literal();
+  Literal literal;
-  populate(literal);
+  populate(&literal);
-  VLOG(3) << "created constant: " << literal->ShortDebugString();
+  *request.mutable_literal() = literal.ToProto();
  VLOG(3) << "created constant: " << request.literal().ShortDebugString();
  OpRequest op_request;
  *op_request.mutable_constant_request() = request;
  *op_request.mutable_computation() = computation_.handle();
--- a/tensorflow/compiler/xla/client/global_data.cc
+++ b/tensorflow/compiler/xla/client/global_data.cc
@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include <string>
 #include <utility>
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/logging.h"
@ -23,7 +24,7 @@ limitations under the License.
 namespace xla {
 GlobalData::GlobalData(ServiceInterface* parent, GlobalDataHandle handle)
-    : handle_(handle), parent_(parent) {}
+    : handle_(std::move(handle)), parent_(parent) {}
 GlobalData::~GlobalData() {
  UnregisterRequest request;
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@ -222,8 +222,9 @@ tensorflow::Status LocalExecutable::RecordArguments(
    SessionModule* session_module) {
  session_module->clear_arguments();
  for (const ShapedBuffer* argument : arguments) {
-    TF_RETURN_IF_ERROR(
+    Literal literal;
-        LiteralFromShapedBuffer(*argument, session_module->add_arguments()));
+    TF_RETURN_IF_ERROR(LiteralFromShapedBuffer(*argument, &literal));
    *session_module->add_arguments() = literal.ToProto();
  }
  return tensorflow::Status::OK();
 }
@ -231,9 +232,13 @@ tensorflow::Status LocalExecutable::RecordArguments(
 tensorflow::Status LocalExecutable::RecordResult(
    const ShapedBuffer* result, SessionModule* session_module) {
  session_module->clear_result();
-  return LiteralFromShapedBuffer(*result, session_module->mutable_result());
+  Literal literal(session_module->result());
  TF_RETURN_IF_ERROR(LiteralFromShapedBuffer(*result, &literal));
  *session_module->mutable_result() = literal.ToProto();
  return tensorflow::Status::OK();
 }
 // TODO(dnovillo) Change signature to return StatusOr<Literal>.
 tensorflow::Status LocalExecutable::LiteralFromShapedBuffer(
    const ShapedBuffer& shaped_buffer, Literal* literal) {
  TF_ASSIGN_OR_RETURN(
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@ -856,5 +856,26 @@ TEST_F(LiteralUtilTest, ConvertR4) {
  EXPECT_TRUE(LiteralUtil::Equal(*expected, *converted));
 }
 TEST_F(LiteralUtilTest, CopyFromProto_Bool) {
  // Builds rank-1 PRED protos of every length in [0, 25) holding an
  // alternating pattern, constructs a Literal from each, and checks that the
  // element count and every element match the proto.
  LiteralProto proto;
  proto.mutable_shape()->set_element_type(PRED);
  for (int len = 0; len < 25; ++len) {
    proto.mutable_shape()->clear_dimensions();
    proto.mutable_shape()->add_dimensions(len);
    proto.clear_preds();
    for (int i = 0; i < len; ++i) {
      proto.add_preds((i % 2) == (len % 2));
    }
    Literal literal(proto);
    ASSERT_EQ(len, literal.preds_size());
    int index = 0;
    for (const auto& pred : literal.preds()) {
      EXPECT_EQ((index % 2) == (len % 2), pred);
      ++index;
    }
  }
 }
 }  // namespace
 }  // namespace xla
--- a/tensorflow/compiler/xla/packed_literal_reader.cc
+++ b/tensorflow/compiler/xla/packed_literal_reader.cc
@ -60,8 +60,8 @@ StatusOr<std::unique_ptr<Literal>> PackedLiteralReader::Read(
  int64 elements = ShapeUtil::ElementsIn(shape);
  LiteralUtil::Resize(elements, std::numeric_limits<float>::quiet_NaN(),
                      result.get());
-  tensorflow::protobuf::RepeatedField<float>* field = result->mutable_f32s();
+  std::vector<float>* field = result->mutable_f32s();
-  char* data = tensorflow::bit_cast<char*>(field->mutable_data());
+  char* data = tensorflow::bit_cast<char*>(field->data());
  uint64 bytes = elements * sizeof(float);
  tensorflow::StringPiece sp;
  auto s = file_->Read(offset_, bytes, &sp, data);
--- a/tensorflow/compiler/xla/packed_literal_reader.h
+++ b/tensorflow/compiler/xla/packed_literal_reader.h
@ -18,6 +18,7 @@ limitations under the License.
 #include <memory>
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/reference_util.h"
 #include <array>
 #include <utility>
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
@ -331,7 +332,8 @@ ReferenceUtil::ConvArray4DGeneralDimensions(
    std::pair<int64, int64> kernel_stride, Padding padding,
    ConvolutionDimensionNumbers dimension_numbers) {
  return ConvArray4DGeneralDimensionsDilated(lhs, rhs, kernel_stride, padding,
-                                             {1, 1}, {1, 1}, dimension_numbers);
+                                             {1, 1}, {1, 1},
                                             std::move(dimension_numbers));
 }
 /* static */ std::unique_ptr<Array4D<float>>
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@ -529,6 +529,7 @@ cc_library(
    srcs = ["transfer_manager.cc"],
    hdrs = ["transfer_manager.h"],
    deps = [
        "//tensorflow/compiler/xla:literal_util",
        "//tensorflow/compiler/xla:shape_util",
        "//tensorflow/compiler/xla:status_macros",
        "//tensorflow/compiler/xla:statusor",
@ -1680,10 +1681,8 @@ cc_library(
    deps = [
        ":buffer_assignment",
        ":hlo",
        ":hlo_ordering",
        ":hlo_proto",
        "//tensorflow/compiler/xla:status",
        "//tensorflow/compiler/xla:util",
        "//tensorflow/core:lib",
    ],
 )
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@ -171,6 +171,7 @@ StatusOr<std::vector<GlobalDataHandle>> AllocationTracker::DeconstructTuple(
          executor, allocation->device_memory(), allocation->shape()));
  std::vector<GlobalDataHandle> element_handles;
  element_handles.reserve(element_bases.size());
  for (int i = 0; i < element_bases.size(); ++i) {
    element_handles.push_back(RegisterInternal(
        allocation->backend(), allocation->device_ordinal(), element_bases[i],
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@ -229,7 +229,8 @@ Status InstructionCopier::RecordAmbiguousOrNonDistinctIndices(
  // Mapping from LogicalBuffer to index (used to detect non-distinct indices).
  FlatMap<const LogicalBuffer*, std::vector<ShapeIndex>>
      buffer_to_source_indices;
-  TF_RETURN_IF_ERROR(points_to.ForEachElement([this, &buffer_to_source_indices](
+  TF_RETURN_IF_ERROR(points_to.ForEachElement(
      [this, &buffer_to_source_indices](
          const ShapeIndex& index, bool /*is_leaf*/,
          const std::vector<const LogicalBuffer*>& buffers) {
        if (buffers.size() > 1) {
@ -449,10 +450,14 @@ RevertReadOnlyIndicesForEntryParamsAndConstants(
    FlatMap<const HloInstruction*, HloInstruction*>* shared_copies) {
  const HloInstruction* init_hlo = while_hlo->operand(0);
  const PointsToSet& points_to = points_to_analysis.GetPointsToSet(init_hlo);
  // Mapping from LogicalBuffer to index (used to detect non-distinct indices).
  FlatSet<const LogicalBuffer*> buffer_set;
  ShapeTree<HloInstruction*> copy_overrides(init_hlo->shape());
  TF_RETURN_IF_ERROR(points_to.ForEachElement(
-      [init_hlo, read_only_indices, shared_copies, &copy_overrides](
+      [init_hlo, read_only_indices, shared_copies, &buffer_set,
-          const ShapeIndex& index, bool /*is_leaf*/,
+       &copy_overrides](const ShapeIndex& index, bool /*is_leaf*/,
                        const std::vector<const LogicalBuffer*>& buffers) {
        // Look for read-only entry parameters.
        if (!read_only_indices->element(index)) {
@ -468,6 +473,7 @@ RevertReadOnlyIndicesForEntryParamsAndConstants(
          if (!is_entry_parameter && !is_constant) {
            continue;
          }
          // We have found an entry parameter or constant that is read-only in
          // the while body. These buffers are managed by the caller, and cannot
          // be aliased with non-parameter buffers. Revert this read-only index,
@ -476,16 +482,17 @@ RevertReadOnlyIndicesForEntryParamsAndConstants(
          // Optimization to allow multiple while loops that share the same
          // read-only entry parameters (or constants) to share a single copy.
-          // Only unambiguous array-shaped buffers are allowed, to reduce code
+          // Only unambiguous and distinct array-shaped buffers are allowed, to
-          // complexity. The shape of the entry parameter must be identical to
+          // reduce code complexity. The shape of the entry parameter must be
-          // the shape of the init_hlo at this index, to ensure there were no
+          // identical to the shape of the init_hlo at this index, to ensure
-          // intervening bitcast or GTE instructions, which are also hard to
+          // there were no intervening bitcast or GTE instructions, which are
-          // handle.
+          // also hard to handle.
          const Shape& pointee_shape = pointee->shape();
          const Shape& init_shape =
              ShapeUtil::GetSubshape(init_hlo->shape(), index);
          if (buffers.size() == 1 && ShapeUtil::IsArray(pointee_shape) &&
-              ShapeUtil::Equal(pointee_shape, init_shape)) {
+              ShapeUtil::Equal(pointee_shape, init_shape) &&
              buffer_set.count(buffer) < 1) {
            HloInstruction** copy = &(*shared_copies)[pointee];
            if (*copy == nullptr) {
              *copy =
@ -496,6 +503,9 @@ RevertReadOnlyIndicesForEntryParamsAndConstants(
            *copy_overrides.mutable_element(index) = *copy;
          }
          // Tracks whether this current buffer is distinct.
          buffer_set.insert(buffer);
          // We've already reverted the read-only index and handled the
          // single-copy optimization above, so there's nothing more to do.
          break;
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@ -44,13 +44,20 @@ class CopyInsertionTest : public HloTestBase {
    EXPECT_IS_OK(copy_insertion.Run(module).status());
    // Verify the points to set of the root of the computation after copy
-    // insertion contains no constants or parameters.
+    // insertion contains no constants or parameters, and is distinct and
    // non-ambiguous.
    auto points_to_analysis =
        TuplePointsToAnalysis::Run(module).ConsumeValueOrDie();
    const auto& points_to = points_to_analysis->GetPointsToSet(
        module->entry_computation()->root_instruction());
    EXPECT_TRUE(points_to.IsDistinct());
    EXPECT_TRUE(!points_to.IsAmbiguous());
    tensorflow::gtl::FlatSet<const LogicalBuffer*> maybe_live_out_buffers =
        points_to_analysis
            ->GetPointsToSet(module->entry_computation()->root_instruction())
            .CreateFlattenedSet();
    for (const LogicalBuffer* buffer : maybe_live_out_buffers) {
      EXPECT_NE(buffer->instruction()->opcode(), HloOpcode::kConstant);
      EXPECT_NE(buffer->instruction()->opcode(), HloOpcode::kParameter);
@ -390,6 +397,47 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
    return builder.Build();
  }
  // Builds a While body computation with three output tuple elements: the
  // first is the incremented induction variable, and the remaining two pass
  // the corresponding input tuple elements through unchanged.
  //
  // EX: Body({in0, in1, in2})
  //   out0 = Add(in0, 1)
  //   out1 = in1
  //   out2 = in2
  //   Tuple(out0, out1, out2)
  std::unique_ptr<HloComputation> BuildDependentBodyComputation2() {
    auto builder = HloComputation::Builder(TestName() + ".Body");
    const Shape& loop_state_shape = ShapeUtil::MakeTupleShape(
        {induction_variable_shape_, data_shape_, data_shape_});
    auto loop_state = builder.AddInstruction(
        HloInstruction::CreateParameter(0, loop_state_shape, "loop_state"));
    // Update the induction variable GTE(0).
    auto induction_variable =
        builder.AddInstruction(HloInstruction::CreateGetTupleElement(
            induction_variable_shape_, loop_state, 0));
    auto inc = builder.AddInstruction(
        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
    // add0 = Add(in0, 1)
    auto add0 = builder.AddInstruction(HloInstruction::CreateBinary(
        induction_variable->shape(), HloOpcode::kAdd, induction_variable, inc));
    // data1 = GTE(1).
    HloInstruction* data1 = builder.AddInstruction(
        HloInstruction::CreateGetTupleElement(data_shape_, loop_state, 1));
    // data2 = GTE(2).
    HloInstruction* data2 = builder.AddInstruction(
        HloInstruction::CreateGetTupleElement(data_shape_, loop_state, 2));
    // Create output Tuple. The last instruction added is the one the EX sketch
    // above shows as the body's result.
    builder.AddInstruction(HloInstruction::CreateTuple({add0, data1, data2}));
    return builder.Build();
  }
  // Builds a While body computation with read-only tuple element 0.
  // EX:
  // Body({in0, in1})
@ -408,6 +456,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
    // Update data GTE(1).
    auto data = builder.AddInstruction(
        HloInstruction::CreateGetTupleElement(data_shape_, loop_state, 1));
    // Use 'induction_variable' in computation with no path to output tuple.
    auto update = builder.AddInstruction(
        HloInstruction::CreateBroadcast(data_shape_, induction_variable, {8}));
@ -431,6 +480,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
    // Create param instruction to access loop state.
    const Shape& loop_state_shape =
        nested ? nested_loop_state_shape_ : loop_state_shape_;
    auto loop_state = builder.AddInstruction(
        HloInstruction::CreateParameter(0, loop_state_shape, "loop_state"));
    // Update the induction variable GTE(0).
@ -972,7 +1022,8 @@ TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinct) {
                                  op::Copy(old_init->operand(1)->operand(0)))));
 }
-// Tests while init instruction buffer which interferes with while result buffer.
+// Tests while init instruction buffer which interferes with while result
 // buffer.
 //
 //     init_data = Broadcast(...)
 //     add_unrelated = Add(init_data) // takes a reference to cause interference
@ -989,5 +1040,81 @@ TEST_F(WhileCopyInsertionTest, InitPointsToInterfering) {
                                               op::Copy(old_init->operand(1))));
 }
 // Tests while init instruction buffer which has a non-distinct points-to set:
 //
 //     init = Tuple(Parameter(S32, {}), Parameter(F32, {8}),
 //                  Parameter(F32, {8}))
 //
 // where the second and third tuple elements are the same parameter *and* the
 // tuple is shared by another while instruction.
 //
 // Verifies that the resulting points-to set is distinct in the resulting Tuple
 // (non-identical Copys). In other words, verifies that copy sharing does not
 // insert identical copies into the resulting tuple.
 TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinctUsedByTwoWhileLoops) {
  auto condition1 = module_.AddEmbeddedComputation(BuildConditionComputation());
  auto condition2 = module_.AddEmbeddedComputation(BuildConditionComputation());
  // Loop bodies that increment tuple element 0 and forward tuple elements 1
  // and 2 of the loop state unchanged.
  auto body1 = module_.AddEmbeddedComputation(BuildDependentBodyComputation2());
  auto body2 = module_.AddEmbeddedComputation(BuildDependentBodyComputation2());
  auto builder = HloComputation::Builder(TestName() + ".While");
  auto iter_param = builder.AddInstruction(
      HloInstruction::CreateParameter(0, induction_variable_shape_, "iter"));
  auto data_param = builder.AddInstruction(
      HloInstruction::CreateParameter(1, data_shape_, "data"));
  // Loop init tuple contains two identical parameter buffers.
  auto loop_init = builder.AddInstruction(
      HloInstruction::CreateTuple({iter_param, data_param, data_param}));
  const Shape& loop_state_shape = ShapeUtil::MakeTupleShape(
      {induction_variable_shape_, data_shape_, data_shape_});
  // Two while loops share the same loop init tuple.
  auto while_hlo1 = builder.AddInstruction(HloInstruction::CreateWhile(
      loop_state_shape, condition1, body1, loop_init));
  auto while_hlo2 = builder.AddInstruction(HloInstruction::CreateWhile(
      loop_state_shape, condition2, body2, loop_init));
  module_.AddEntryComputation(builder.Build());
  auto points_to_analysis =
      TuplePointsToAnalysis::Run(&module_).ConsumeValueOrDie();
  // Asserts that the init tuples before copy insertion are non-distinct.
  ASSERT_FALSE(
      points_to_analysis->GetPointsToSet(while_hlo1->operand(0)).IsDistinct());
  ASSERT_FALSE(
      points_to_analysis->GetPointsToSet(while_hlo2->operand(0)).IsDistinct());
  // Remember the pre-insertion init operands so the copies can be matched
  // against the instructions they were made from.
  auto old_init1 = while_hlo1->operand(0);
  auto old_init2 = while_hlo2->operand(0);
  InsertCopies(&module_);
  EXPECT_THAT(while_hlo1->operand(0),
              op::Tuple(op::Copy(old_init1->operand(0)),
                        op::Copy(old_init1->operand(1)),
                        op::Copy(old_init1->operand(2))));
  EXPECT_THAT(while_hlo2->operand(0),
              op::Tuple(op::Copy(old_init2->operand(0)),
                        op::Copy(old_init2->operand(1)),
                        op::Copy(old_init2->operand(2))));
  // Verifies the init tuples after copy insertion are distinct. The analysis
  // is re-run because InsertCopies modified the module.
  points_to_analysis = TuplePointsToAnalysis::Run(&module_).ConsumeValueOrDie();
  const auto& points_to1 =
      points_to_analysis->GetPointsToSet(while_hlo1->operand(0));
  EXPECT_TRUE(points_to1.IsDistinct());
  const auto& points_to2 =
      points_to_analysis->GetPointsToSet(while_hlo2->operand(0));
  EXPECT_TRUE(points_to2.IsDistinct());
 }
 }  // namespace
 }  // namespace xla
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@ -18,6 +18,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/types.h"
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DFS_HLO_VISITOR_WITH_DEFAULT_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_DFS_HLO_VISITOR_WITH_DEFAULT_H_
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/types.h"
--- a/tensorflow/compiler/xla/service/execution_tracker.cc
+++ b/tensorflow/compiler/xla/service/execution_tracker.cc
@ -31,7 +31,7 @@ AsyncExecution::AsyncExecution(Backend* backend,
    : backend_(CHECK_NOTNULL(backend)),
      streams_(std::move(streams)),
      profile_(profile),
-      result_(result) {
+      result_(std::move(result)) {
  for (const auto& stream : streams_) {
    CHECK(stream != nullptr);
  }
--- a/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc
@ -254,6 +254,7 @@ TEST_F(HloScheduleTest, LatticeMatMul) {
  //      d40      -- layer 4
  HloComputation::Builder builder("entry_computation");
  std::vector<HloInstruction*> params;
  params.reserve(6);
  for (int i = 0; i < 6; ++i) {
    params.push_back(builder.AddInstruction(HloInstruction::CreateParameter(
        i, f32_2x2_, /*name=*/tensorflow::strings::Printf("param%d", i))));
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@ -1631,6 +1631,7 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildKernelThunk(
  // Compute the input buffer indices.
  std::vector<BufferAllocation::Slice> io_buffers;
  io_buffers.reserve(io_hlos.size());
  for (const HloInstruction* io_hlo : io_hlos) {
    io_buffers.push_back(GetAllocationSlice(*LatestNonGteAncestor(io_hlo)));
  }
--- a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
@ -86,6 +86,7 @@ TEST_F(StreamAssignmentTest, LatticeMatMul) {
  //      d40      -- layer 4
  HloComputation::Builder builder("entry_computation");
  std::vector<HloInstruction*> params;
  params.reserve(6);
  for (int i = 0; i < 6; ++i) {
    params.push_back(builder.AddInstruction(HloInstruction::CreateParameter(
        i, f32_2x2_, /*name=*/tensorflow::strings::Printf("param%d", i))));
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@ -46,7 +46,7 @@ message HloInstructionProto {
  xla.OpMetadata metadata = 7;
  // Literal, only present for kConstant.
-  xla.Literal literal = 8;
+  xla.LiteralProto literal = 8;
  // Parameter info, only present for kParameter.
  int64 parameter_number = 9;
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@ -311,7 +311,6 @@ void ComputeComputationPostOrder(
  visited->insert(computation);
  post_order->push_back(computation);
  return;
 }
 }  // namespace
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@ -65,7 +65,7 @@ using ::tensorflow::strings::StrCat;
      WrapUnique(new HloInstruction(HloOpcode::kTrace, ShapeUtil::MakeNil()));
  instruction->operands_.push_back(operand);
  instruction->literal_.reset(new Literal);
-  *instruction->literal_->mutable_u8s() += tag;
+  instruction->literal_->append_u8s(tag);
  return instruction;
 }
@ -1484,6 +1484,7 @@ string HloInstruction::ToString(bool compact_operands,
  }
  if (!slice_starts_.empty() && !slice_limits_.empty()) {
    std::vector<string> bounds;
    bounds.reserve(slice_starts_.size());
    for (int i = 0; i < slice_starts_.size(); ++i) {
      bounds.push_back(
          StrCat("[", slice_starts_[i], ":", slice_limits_[i], "]"));
@ -1550,7 +1551,7 @@ HloInstructionProto HloInstruction::ToProto() const {
  *proto.mutable_metadata() = metadata_;
  switch (opcode_) {
    case HloOpcode::kConstant:
-      *proto.mutable_literal() = *literal_;
+      *proto.mutable_literal() = literal_->ToProto();
      break;
    case HloOpcode::kParameter:
      proto.set_parameter_number(parameter_number_);
@ -1647,10 +1648,10 @@ void HloInstruction::set_tracing(HloInstruction* trace_instruction) {
  trace_instruction_ = trace_instruction;
 }
-const string& HloInstruction::tracing_tag() const {
+string HloInstruction::TracingTag() const {
  CHECK_EQ(HloOpcode::kTrace, opcode());
  CHECK(literal_ != nullptr);
-  return literal_->u8s();
+  return literal_->u8s_string();
 }
 bool HloInstruction::IsFused() const {
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@ -30,6 +30,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
@ -535,7 +536,7 @@ class HloInstruction {
  // Returns a tag to be used in tracing.
  //
  // Precondition: opcode() == HloOpcode::kTrace
-  const string& tracing_tag() const;
+  string TracingTag() const;
  // Returns whether the instruction is a constant.
  bool IsConstant() const;
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@ -151,7 +151,26 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
        return true;
      };
-      if (std::all_of(hlo->users().begin(), hlo->users().end(),
+      // An "effectively unary" operation is one that has one "large"
      // input with the others being negligible in terms of memory usage.
      // We use "has a smaller true rank than the output" as a heuristic
      // for "negligible" memory usage.
      auto effectively_unary = [](HloInstruction* hlo) {
        if (hlo->operands().size() == 1) {
          return true;
        }
        auto output_rank = ShapeUtil::TrueRank(hlo->shape());
        return std::count_if(
                   hlo->operands().begin(), hlo->operands().end(),
                   [output_rank](HloInstruction* operand) {
                     return ((operand->opcode() != HloOpcode::kBroadcast) &&
                             ShapeUtil::TrueRank(operand->shape()) >=
                                 output_rank);
                   }) <= 1;
      };
      if (effectively_unary(hlo) ||
          std::all_of(hlo->users().begin(), hlo->users().end(),
                      user_fusable_into_hlo)) {
        all_consumers_fusable.insert(hlo);
      }
--- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
@ -156,21 +156,67 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfParameterUnfused) {
 TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusable) {
  HloComputation::Builder builder(TestName());
-  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
+  auto shape = ShapeUtil::MakeShape(F32, {16, 16});
-      0, ShapeUtil::MakeShape(F32, {16, 16}), "0"));
+  auto param0 =
-  HloInstruction* unary1 = builder.AddInstruction(HloInstruction::CreateUnary(
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "0"));
-      ShapeUtil::MakeShape(S32, {}), HloOpcode::kFloor, param0));
+  auto param1 =
-  builder.AddInstruction(HloInstruction::CreateSend(unary1, 0));
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "1"));
-  HloInstruction* unary2 = builder.AddInstruction(HloInstruction::CreateUnary(
+  HloInstruction* binary1 = builder.AddInstruction(
-      ShapeUtil::MakeShape(S32, {}), HloOpcode::kAbs, unary1));
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
  builder.AddInstruction(HloInstruction::CreateSend(binary1, 0));
  HloInstruction* unary = builder.AddInstruction(
      HloInstruction::CreateUnary(shape, HloOpcode::kAbs, binary1));
  auto module = MakeUnique<HloModule>(TestName());
  auto computation = module->AddEntryComputation(builder.Build());
-  EXPECT_EQ(unary2, computation->root_instruction());
+  EXPECT_EQ(unary, computation->root_instruction());
  EXPECT_FALSE(
      InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true)
          .Run(module.get())
          .ValueOrDie());
 }
 // Verifies that the fusion pass reports a change (Run returns true) for a
 // single-operand instruction that has two users, a Send and an Abs, when
 // may_duplicate is enabled.
 TEST_F(InstructionFusionTest, AllowUnaryDuplication) {
  HloComputation::Builder builder(TestName());
  auto shape = ShapeUtil::MakeShape(F32, {16, 16});
  auto param0 =
      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "0"));
  // unary1 = Floor(param0); it is consumed by both the Send and unary2 below.
  HloInstruction* unary1 = builder.AddInstruction(
      HloInstruction::CreateUnary(shape, HloOpcode::kFloor, param0));
  // NOTE(review): the Send user is presumably the one that cannot be fused --
  // confirm against the pass's fusability rules.
  builder.AddInstruction(HloInstruction::CreateSend(unary1, 0));
  HloInstruction* unary2 = builder.AddInstruction(
      HloInstruction::CreateUnary(shape, HloOpcode::kAbs, unary1));
  auto module = MakeUnique<HloModule>(TestName());
  auto computation = module->AddEntryComputation(builder.Build());
  EXPECT_EQ(unary2, computation->root_instruction());
  EXPECT_TRUE(
      InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true)
          .Run(module.get())
          .ValueOrDie());
 }
 // Verifies that the fusion pass reports a change (Run returns true) for a
 // two-operand Add whose operands are a rank-1 {16} parameter and a rank-2
 // {16, 16} parameter, when the Add has both a Send user and an Abs user and
 // may_duplicate is enabled.
 TEST_F(InstructionFusionTest, AllowEffectiveUnaryDuplication) {
  auto shape = ShapeUtil::MakeShape(F32, {16, 16});
  auto small_shape = ShapeUtil::MakeShape(F32, {16});
  HloComputation::Builder builder(TestName());
  // param0 has a lower rank than the Add's output shape.
  auto param0 = builder.AddInstruction(
      HloInstruction::CreateParameter(0, small_shape, "0"));
  // param1 has the same shape as the Add's output.
  auto param1 =
      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "1"));
  HloInstruction* binary1 = builder.AddInstruction(
      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
  builder.AddInstruction(HloInstruction::CreateSend(binary1, 0));
  HloInstruction* unary = builder.AddInstruction(
      HloInstruction::CreateUnary(shape, HloOpcode::kAbs, binary1));
  auto module = MakeUnique<HloModule>(TestName());
  auto computation = module->AddEntryComputation(builder.Build());
  EXPECT_EQ(unary, computation->root_instruction());
  EXPECT_TRUE(
      InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true)
          .Run(module.get())
          .ValueOrDie());
 }
 }  // namespace xla
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
@ -27,6 +27,7 @@ limitations under the License.
 #include "external/llvm/include/llvm/IR/Module.h"
 #include "external/llvm/include/llvm/IR/Value.h"
 #include "external/llvm/include/llvm/Support/raw_ostream.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@ -77,8 +77,10 @@ tensorflow::Status RecordArguments(
    SessionModule* module) {
  module->clear_arguments();
  for (const Allocation* allocation : arg_allocations) {
-    TF_RETURN_IF_ERROR(LiteralFromAllocation(allocation, allocation->shape(),
+    Literal argument;
-                                             module->add_arguments()));
+    TF_RETURN_IF_ERROR(
        LiteralFromAllocation(allocation, allocation->shape(), &argument));
    *module->add_arguments() = argument.ToProto();
  }
  return tensorflow::Status::OK();
 }
@ -87,8 +89,11 @@ tensorflow::Status RecordArguments(
 tensorflow::Status RecordResult(const Allocation* result_allocation,
                                SessionModule* module) {
  module->clear_result();
-  return LiteralFromAllocation(result_allocation, result_allocation->shape(),
+  Literal result;
-                               module->mutable_result());
+  TF_RETURN_IF_ERROR(LiteralFromAllocation(
      result_allocation, result_allocation->shape(), &result));
  *module->mutable_result() = result.ToProto();
  return tensorflow::Status::OK();
 }
 }  // namespace
@ -649,6 +654,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
        ResolveAndValidateArguments(request.arguments(), execute_backend_.get(),
                                    executor->device_ordinal()));
    std::vector<se::DeviceMemoryBase> arguments;
    arguments.reserve(arg_allocations.size());
    for (const Allocation* allocation : arg_allocations) {
      arguments.push_back(allocation->device_memory());
    }
@ -677,6 +683,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
      BuildExecutables(versioned_handles, std::move(module_configs),
                       execute_backend_.get(), executors));
  std::vector<Executable*> executable_ptrs;
  executable_ptrs.reserve(executables.size());
  for (const auto& executable : executables) {
    executable_ptrs.push_back(executable.get());
  }
@ -752,6 +759,7 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
          << module_config->entry_computation_layout().ToString();
  std::vector<se::DeviceMemoryBase> arguments;
  arguments.reserve(arg_allocations.size());
  for (const Allocation* allocation : arg_allocations) {
    arguments.push_back(allocation->device_memory());
  }
@ -820,6 +828,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
          << module_config->entry_computation_layout().ToString();
  std::vector<se::DeviceMemoryBase> arguments;
  arguments.reserve(arg_allocations.size());
  for (const Allocation* allocation : arg_allocations) {
    arguments.push_back(allocation->device_memory());
  }
@ -908,13 +917,15 @@ tensorflow::Status Service::TransferToClient(const TransferToClientRequest* arg,
    literal_shape = &allocation->shape();
  }
-  return LiteralFromAllocation(allocation, *literal_shape,
+  Literal literal;
-                               result->mutable_literal());
+  auto status = LiteralFromAllocation(allocation, *literal_shape, &literal);
  *result->mutable_literal() = literal.ToProto();
  return status;
 }
 tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
                                             TransferToServerResponse* result) {
-  const Literal& literal = arg->literal();
+  Literal literal = Literal(arg->literal());
  const Shape& shape = literal.shape();
  if (ShapeUtil::IsTuple(shape) && execute_backend_->Replicas().size() > 1) {
@ -978,7 +989,7 @@ tensorflow::Status Service::TransferToInfeed(const TransferToInfeedRequest* arg,
  }
  return execute_backend_->transfer_manager()->TransferLiteralToInfeed(
-      executor, arg->literal());
+      executor, Literal(arg->literal()));
 }
 tensorflow::Status Service::TransferFromOutfeed(
@ -1001,8 +1012,12 @@ tensorflow::Status Service::TransferFromOutfeed(
    executor = execute_backend_->Replicas()[arg->replica_id()];
  }
-  return execute_backend_->transfer_manager()->TransferLiteralFromOutfeed(
+  Literal literal;
-      executor, arg->shape_with_layout(), result->mutable_literal());
+  TF_RETURN_IF_ERROR(
      execute_backend_->transfer_manager()->TransferLiteralFromOutfeed(
          executor, arg->shape_with_layout(), &literal));
  *result->mutable_literal() = literal.ToProto();
  return tensorflow::Status::OK();
 }
 tensorflow::Status Service::ResetDevice(const ResetDeviceRequest* arg,
--- a/tensorflow/compiler/xla/service/session.proto
+++ b/tensorflow/compiler/xla/service/session.proto
@ -75,10 +75,10 @@ message SessionModule {
  repeated SessionComputation embedded_computations = 2;
  // The arguments passed to the computation.
-  repeated Literal arguments = 3;
+  repeated LiteralProto arguments = 3;
  // The result of the computation.
-  Literal result = 4;
+  LiteralProto result = 4;
  // The name of the platform used to run the computation.
  string execution_platform = 5;
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@ -20,6 +20,7 @@ limitations under the License.
 #include <set>
 #include <vector>
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
--- a/tensorflow/compiler/xla/service/transfer_manager_test.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager_test.cc
@ -121,7 +121,7 @@ TEST_F(CpuTransferManagerTest, TransferR1U8FromDevice) {
  const Shape shape = ShapeUtil::MakeShape(U8, {4});
  TF_CHECK_OK(transfer_manager_.TransferLiteralFromDevice(
      stream_exec_, memptr, shape, shape, &literal));
-  CHECK_EQ("klmn", literal.u8s());
+  CHECK_EQ("klmn", literal.u8s_string());
 }
 TEST_F(CpuTransferManagerTest, TransferBufferFromDevice) {
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@ -2275,7 +2275,7 @@ void ComputationLowerer::Visit(
      const ConstantRequest& constant_request =
          request.request().constant_request();
      hlo_instruction = add_instruction(HloInstruction::CreateConstant(
-          LiteralUtil::CloneToUnique(constant_request.literal())));
+          LiteralUtil::CloneToUnique(Literal(constant_request.literal()))));
      break;
    }
@ -2467,6 +2467,7 @@ void ComputationLowerer::Visit(
      // to append dimensions on the left the broadcast_dimensions should just
      // be the n highest dimension numbers of the output shape where n is
      // the number of input dimensions.
      broadcast_dimensions.reserve(ShapeUtil::Rank(operand->shape()));
      for (int i = 0; i < ShapeUtil::Rank(operand->shape()); ++i) {
        broadcast_dimensions.push_back(i +
                                       ShapeUtil::Rank(request.output_shape()) -
--- a/tensorflow/compiler/xla/service/user_computation_test.cc
+++ b/tensorflow/compiler/xla/service/user_computation_test.cc
@ -50,7 +50,7 @@ TEST_F(UserComputationTest, SimpleComputation) {
  ConstantRequest constant_request;
  *constant_request.mutable_literal() =
-      *LiteralUtil::CreateR1<float>({123.0f, 42.0f});
+      LiteralUtil::CreateR1<float>({123.0f, 42.0f})->ToProto();
  TF_ASSIGN_OR_ASSERT_OK(ComputationDataHandle constant_handle,
                         computation.AddConstantInstruction(constant_request));
@ -160,12 +160,13 @@ TEST_F(UserComputationTest, EliminateScalarBroadcast) {
  UserComputation computation("TheComputation", handle);
  ConstantRequest a_request;
-  *a_request.mutable_literal() = *LiteralUtil::CreateR1<float>({123.0f, 42.0f});
+  *a_request.mutable_literal() =
      LiteralUtil::CreateR1<float>({123.0f, 42.0f})->ToProto();
  TF_ASSIGN_OR_ASSERT_OK(ComputationDataHandle a_handle,
                         computation.AddConstantInstruction(a_request));
  ConstantRequest b_request;
-  *b_request.mutable_literal() = *LiteralUtil::CreateR0<float>(1.0f);
+  *b_request.mutable_literal() = LiteralUtil::CreateR0<float>(1.0f)->ToProto();
  TF_ASSIGN_OR_ASSERT_OK(ComputationDataHandle b_handle,
                         computation.AddConstantInstruction(b_request));
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@ -44,6 +44,7 @@ struct ShapeTreeNode {
  // Children of this node.
  std::vector<std::unique_ptr<ShapeTreeNode>> children;
  ShapeTreeNode() = default;
  explicit ShapeTreeNode(const T& data) : data(data) {}
  ShapeTreeNode(const ShapeTreeNode& other)
@ -85,8 +86,9 @@ class ShapeTree {
 public:
  // Default constructor creates a tree with a nil shape (i.e. an empty tuple).
  ShapeTree() : ShapeTree(ShapeUtil::MakeNil()) {}
-  // Create ShapeTree with the given shape, and default T values for all nodes.
+  // Create ShapeTree with the given shape, and default-constructed T values for
-  explicit ShapeTree(const Shape& shape) : ShapeTree(shape, T()) {}
+  // all nodes.
  explicit ShapeTree(const Shape& shape);
  // Create ShapeTree with the given shape, and init_value for all nodes.
  ShapeTree(const Shape& shape, const T& init_value);
@ -127,6 +129,19 @@ class ShapeTree {
      const ShapeIndex& /*index*/, bool /*is_leaf*/, T* /*data*/)>;
  Status ForEachMutableElement(const MutableVisitorFunction& func);
  // Copy the subtree of values from 'other' rooted at ShapeIndex
  // 'source_base_index' into the subtree of value in this ShapeTree rooted at
  // 'target_base_index'.
  //
  // Precondition: The subshape of other.shape() at index source_base_index must
  // be compatible with the subshape of shape() at index target_base_index.
  void CopySubtreeFrom(const ShapeTree<T>& other,
                       const ShapeIndex& source_base_index,
                       const ShapeIndex& target_base_index);
  bool operator==(const ShapeTree<T>& other) const;
  bool operator!=(const ShapeTree<T>& other) const { return !(*this == other); }
 private:
  using Node = internal::ShapeTreeNode<T>;
@ -134,6 +149,10 @@ class ShapeTree {
  // the given 'init_value'.
  void InitChildren(const Shape& shape, const T& init_value, Node* node);
  // Initialize node->children based on 'shape'. All children have
  // default-constructed data values.
  void InitChildren(const Shape& shape, Node* node);
  // Helpers for traversing the shape via ForEachElement. The helpers
  // recursively traverse the subtree rooted at "index" (defined as in
  // ShapeUtil::GetSubshape).
@ -165,6 +184,24 @@ void ShapeTree<T>::InitChildren(const Shape& shape, const T& init_value,
  }
 }
 // Recursively appends one child to 'node' per tuple element of 'shape', so
 // the node hierarchy mirrors the tuple nesting of the shape. Every child is
 // created with 'new Node()', i.e. with a default-constructed data value.
 template <typename T>
 void ShapeTree<T>::InitChildren(const Shape& shape, Node* node) {
  // A non-tuple shape has no sub-elements, so there is nothing to add.
  if (!ShapeUtil::IsTuple(shape)) {
    return;
  }
  const auto element_count = ShapeUtil::TupleElementCount(shape);
  for (int element = 0; element < element_count; ++element) {
    node->children.emplace_back(new Node());
    Node* child = node->children.back().get();
    InitChildren(shape.tuple_shapes(element), child);
  }
 }
 // Constructs a ShapeTree whose node structure mirrors 'shape'. The root node
 // is value-initialized via root_(), and the remaining nodes are created by
 // the InitChildren overload that takes no initial value.
 template <typename T>
 ShapeTree<T>::ShapeTree(const Shape& shape) : root_(), shape_(shape) {
  // The shape_ field is just used to hold the structure of the shape.
  // It should not be relied upon to store layout information.
  LayoutUtil::ClearLayout(&shape_);
  InitChildren(shape_, &root_);
 }
 template <typename T>
 ShapeTree<T>::ShapeTree(const Shape& shape, const T& init_value)
    : root_(init_value), shape_(shape) {
@ -240,6 +277,48 @@ Status ShapeTree<T>::ForEachMutableElement(const MutableVisitorFunction& func) {
  return ForEachMutableHelper(func, &root_, &index);
 }
 // Copies the values in the subtree of 'other' rooted at 'source_base_index'
 // into the subtree of this tree rooted at 'target_base_index'. Elements of
 // this tree outside the target subtree are left untouched.
 //
 // CHECK-fails if the subshape of shape() at 'target_base_index' is not
 // compatible with the subshape of other.shape() at 'source_base_index'.
 template <typename T>
 void ShapeTree<T>::CopySubtreeFrom(const ShapeTree<T>& other,
                                    const ShapeIndex& source_base_index,
                                    const ShapeIndex& target_base_index) {
  CHECK(ShapeUtil::Compatible(
      ShapeUtil::GetSubshape(shape(), target_base_index),
      ShapeUtil::GetSubshape(other.shape(), source_base_index)));
  // The visitor does not touch any member of this tree directly (it writes
  // through 'data'), so 'this' is deliberately not captured.
  ForEachMutableElement(
      [&other, &source_base_index, &target_base_index](
          const ShapeIndex& index, bool /*is_leaf*/, T* data) {
        // Copy the data element only if index is in the subtree rooted at
        // target_base_index, i.e. target_base_index is a prefix of index.
        if (index.size() < target_base_index.size()) {
          return Status::OK();
        }
        for (size_t i = 0; i < target_base_index.size(); ++i) {
          if (index[i] != target_base_index[i]) {
            return Status::OK();
          }
        }
        // Construct source element index to copy from: the source base
        // followed by the part of index below the target base.
        ShapeIndex source_index = source_base_index;
        for (size_t i = target_base_index.size(); i < index.size(); ++i) {
          source_index.push_back(index[i]);
        }
        *data = other.element(source_index);
        return Status::OK();
      })
      .IgnoreError();
 }
 // Returns true iff no element of this tree compares unequal (via operator!=)
 // to the element of 'other' at the same index.
 //
 // Only the indices of this tree are visited and the shapes themselves are not
 // compared, so callers are expected to pass trees with the same structure:
 // each visited index is looked up with other.element(index), and any
 // additional elements in 'other' are never examined.
 template <typename T>
 bool ShapeTree<T>::operator==(const ShapeTree<T>& other) const {
  bool equal = true;
  // The visitor only needs 'other' and the result flag, so 'this' is not
  // captured.
  ForEachElement([&other, &equal](const ShapeIndex& index, bool /*is_leaf*/,
                                  const T& data) {
    if (data != other.element(index)) {
      equal = false;
    }
    return Status::OK();
  })
      .IgnoreError();
  return equal;
 }
 }  // namespace xla
 #endif  // TENSORFLOW_COMPILER_XLA_SHAPE_TREE_H_
--- a/tensorflow/compiler/xla/shape_tree_test.cc
+++ b/tensorflow/compiler/xla/shape_tree_test.cc
@ -245,5 +245,139 @@ TEST_F(ShapeTreeTest, InvalidIndexingNestedTuple) {
  EXPECT_DEATH(shape_tree.element({0, 0}), "");
 }
 // Verifies that a ShapeTree can hold a move-only element type
 // (std::unique_ptr<int>) when built with the shape-only constructor, and that
 // its elements start out as null pointers.
 TEST_F(ShapeTreeTest, ShapeTreeOfNonCopyableType) {
  ShapeTree<std::unique_ptr<int>> shape_tree{tuple_shape_};
  EXPECT_EQ(shape_tree.element({2}).get(), nullptr);
  // Elements can be replaced by move-assigning through mutable_element.
  *shape_tree.mutable_element({2}) = MakeUnique<int>(42);
  EXPECT_EQ(*shape_tree.element({2}), 42);
 }
 TEST_F(ShapeTreeTest, CopySubtreeFromArrayShape) {
  // Test CopySubtreeFrom method for a single value copied between array-shaped
  // ShapeTrees.
  ShapeTree<int> source(array_shape_);
  *source.mutable_element(/*index=*/{}) = 42;
  ShapeTree<int> destination(array_shape_, 123);
  // Sanity check: the destination holds its init value before the copy.
  EXPECT_EQ(destination.element(/*index=*/{}), 123);
  // Empty base indices select the root element of both trees.
  destination.CopySubtreeFrom(source, /*source_base_index=*/{},
                              /*target_base_index=*/{});
  EXPECT_EQ(destination.element(/*index=*/{}), 42);
 }
 TEST_F(ShapeTreeTest, FullCopySubtreeFromTupleShape) {
  // Test CopySubtreeFrom method for a copy of all elements from one
  // tuple-shaped ShapeTree to another.
  ShapeTree<int> source(tuple_shape_);
  // Give the root and each tuple element a distinct value.
  *source.mutable_element(/*index=*/{}) = 10;
  *source.mutable_element(/*index=*/{0}) = 11;
  *source.mutable_element(/*index=*/{1}) = 12;
  *source.mutable_element(/*index=*/{2}) = 13;
  ShapeTree<int> destination(tuple_shape_, 0);
  // Empty base indices copy the whole tree, root included.
  destination.CopySubtreeFrom(source, /*source_base_index=*/{},
                              /*target_base_index=*/{});
  EXPECT_EQ(destination.element(/*index=*/{}), 10);
  EXPECT_EQ(destination.element(/*index=*/{0}), 11);
  EXPECT_EQ(destination.element(/*index=*/{1}), 12);
  EXPECT_EQ(destination.element(/*index=*/{2}), 13);
 }
 TEST_F(ShapeTreeTest, SingleElementCopySubtreeFromTupleShape) {
  // Test CopySubtreeFrom method for a copy of a single element from one
  // tuple-shaped ShapeTree to another.
  ShapeTree<int> source(tuple_shape_);
  *source.mutable_element(/*index=*/{}) = 10;
  *source.mutable_element(/*index=*/{0}) = 11;
  *source.mutable_element(/*index=*/{1}) = 12;
  *source.mutable_element(/*index=*/{2}) = 13;
  ShapeTree<int> destination(tuple_shape_, 0);
  // Copy source element {0} (value 11) into destination element {1}.
  destination.CopySubtreeFrom(source, /*source_base_index=*/{0},
                              /*target_base_index=*/{1});
  // Only destination element {1} changes; everything else keeps its init
  // value of 0.
  EXPECT_EQ(destination.element(/*index=*/{}), 0);
  EXPECT_EQ(destination.element(/*index=*/{0}), 0);
  EXPECT_EQ(destination.element(/*index=*/{1}), 11);
  EXPECT_EQ(destination.element(/*index=*/{2}), 0);
 }
 TEST_F(ShapeTreeTest, CopySubtreeIntoNestedShape) {
  // Test CopySubtreeFrom method for a copy of a tuple-shaped ShapeTree into a
  // nested-tuple-shaped ShapeTree.
  ShapeTree<int> source(
      ShapeUtil::MakeTupleShape({array_shape_, array_shape_}));
  *source.mutable_element(/*index=*/{}) = 10;
  *source.mutable_element(/*index=*/{0}) = 11;
  *source.mutable_element(/*index=*/{1}) = 12;
  ShapeTree<int> destination(nested_tuple_shape_, 0);
  // Copy the whole source tree into the destination subtree rooted at {2, 0}.
  destination.CopySubtreeFrom(source, /*source_base_index=*/{},
                              /*target_base_index=*/{2, 0});
  // Elements outside the {2, 0} subtree keep their init value of 0.
  EXPECT_EQ(destination.element(/*index=*/{}), 0);
  EXPECT_EQ(destination.element(/*index=*/{0}), 0);
  EXPECT_EQ(destination.element(/*index=*/{1}), 0);
  EXPECT_EQ(destination.element(/*index=*/{1, 0}), 0);
  EXPECT_EQ(destination.element(/*index=*/{1, 1}), 0);
  EXPECT_EQ(destination.element(/*index=*/{2}), 0);
  // The source root and its two children land at {2, 0}, {2, 0, 0} and
  // {2, 0, 1} respectively.
  EXPECT_EQ(destination.element(/*index=*/{2, 0}), 10);
  EXPECT_EQ(destination.element(/*index=*/{2, 0, 0}), 11);
  EXPECT_EQ(destination.element(/*index=*/{2, 0, 1}), 12);
  EXPECT_EQ(destination.element(/*index=*/{2, 1}), 0);
 }
 TEST_F(ShapeTreeTest, CopySubtreeFromNestedShape) {
  // Test CopySubtreeFrom method for a copy from a nested-tuple-shape.
  ShapeTree<int> source(nested_tuple_shape_, 42);
  // Overwrite only the subtree rooted at {1}; every other source element
  // keeps the init value 42.
  *source.mutable_element(/*index=*/{1}) = 10;
  *source.mutable_element(/*index=*/{1, 0}) = 11;
  *source.mutable_element(/*index=*/{1, 1}) = 12;
  ShapeTree<int> destination(
      ShapeUtil::MakeTupleShape({array_shape_, array_shape_}), 0);
  // Copy the source subtree rooted at {1} onto the whole destination tree.
  destination.CopySubtreeFrom(source, /*source_base_index=*/{1},
                              /*target_base_index=*/{});
  EXPECT_EQ(destination.element(/*index=*/{}), 10);
  EXPECT_EQ(destination.element(/*index=*/{0}), 11);
  EXPECT_EQ(destination.element(/*index=*/{1}), 12);
 }
 // Tests ShapeTree operator== and operator!= on array-shaped and tuple-shaped
 // trees.
 TEST_F(ShapeTreeTest, OperatorEquals) {
  {
    // Array-shaped trees: equality is decided by the single root value.
    ShapeTree<int> a(array_shape_, 123);
    ShapeTree<int> b(array_shape_, 42);
    ShapeTree<int> c(array_shape_, 42);
    EXPECT_FALSE(a == b);
    EXPECT_TRUE(a != b);
    EXPECT_TRUE(b == c);
  }
  {
    // Tuple-shaped trees: 'a' differs from 'b' at elements {0} and {1}, while
    // 'b' and 'c' are assigned identical values. Any element not assigned
    // here is left at its default value in all three trees.
    ShapeTree<int> a(tuple_shape_);
    *a.mutable_element(/*index=*/{}) = 10;
    *a.mutable_element(/*index=*/{0}) = 11;
    *a.mutable_element(/*index=*/{1}) = 12;
    ShapeTree<int> b(tuple_shape_);
    *b.mutable_element(/*index=*/{}) = 10;
    *b.mutable_element(/*index=*/{0}) = 42;
    *b.mutable_element(/*index=*/{1}) = 11;
    ShapeTree<int> c(tuple_shape_);
    *c.mutable_element(/*index=*/{}) = 10;
    *c.mutable_element(/*index=*/{0}) = 42;
    *c.mutable_element(/*index=*/{1}) = 11;
    EXPECT_FALSE(a == b);
    EXPECT_TRUE(a != b);
    EXPECT_TRUE(b == c);
    EXPECT_FALSE(b != c);
  }
 }
 }  // namespace
 }  // namespace xla
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@ -122,7 +122,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
  for (const auto& shape : parameters) {
    *program_shape.add_parameters() = shape;
  }
-  *program_shape.mutable_result() = result;
+  *program_shape.mutable_result() = std::move(result);
  return program_shape;
 }
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@ -829,6 +829,7 @@ TEST_P(ArrayElementwiseOpTestParamCount, SquareManyValues) {
  const int count = GetParam();
  ComputationBuilder builder(client_, TestName());
  std::vector<float> values;
  values.reserve(count);
  for (int i = 0; i < count; ++i) {
    values.push_back(i / static_cast<float>(count));
  }
@ -836,6 +837,7 @@ TEST_P(ArrayElementwiseOpTestParamCount, SquareManyValues) {
  auto exp = builder.Pow(x, builder.ConstantR0<float>(2.0f));
  std::vector<float> expected;
  expected.reserve(values.size());
  for (float value : values) {
    expected.push_back(value * value);
  }
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@ -179,7 +179,7 @@ void ClientLibraryTestBase::ComputeAndCompareR1U8(
  VLOG(1) << "expected: " << LiteralUtil::ToString(*expected_literal);
  VLOG(1) << "actual:   " << LiteralUtil::ToString(*actual);
-  EXPECT_EQ(expected, actual->u8s());
+  EXPECT_EQ(expected, actual->u8s_string());
 }
 void ClientLibraryTestBase::ComputeAndCompareTuple(
--- a/tensorflow/compiler/xla/tests/concat_test.cc
+++ b/tensorflow/compiler/xla/tests/concat_test.cc
@ -442,6 +442,39 @@ XLA_TEST_F(ConcatTest, ConcatSeveralR1S32s) {
  ComputeAndCompareR1<int32>(&builder, expected, {});
 }
 XLA_TEST_F(ConcatTest, ConcatR3WeirdDims) {
  // Concatenates a 9x17x1 array with a 9x17x256 array along dimension 2 and
  // compares the computed result against an expected array built on the host.
  ComputationBuilder builder(client_, TestName());
  Array3D<float> arr0(9, 17, 1);
  arr0.Fill(1);
  Array3D<float> arr1(9, 17, 256);
  arr1.Fill(2);
  // Build the expected result: for every (i, j), the last dimension holds the
  // elements of arr0 followed by the elements of arr1.
  Array3D<float> expected(9, 17, arr0.n3() + arr1.n3());
  for (int64 i = 0; i < expected.n1(); ++i) {
    for (int64 j = 0; j < expected.n2(); ++j) {
      int64 kk = 0;
      // Iterate over pointers to the operands. A braced list of the arrays
      // themselves would copy both of them into the initializer_list on every
      // (i, j) iteration.
      for (const Array3D<float>* arr : {&arr0, &arr1}) {
        for (int64 k = 0; k < arr->n3(); ++k, ++kk) {
          expected(i, j, kk) = (*arr)(i, j, k);
        }
      }
    }
  }
  // Pass both operands in as parameters 0 and 1 of the computation.
  ComputationDataHandle h0;
  auto p0 = CreateR3Parameter<float>(arr0, /*parameter_number=*/0, "p0",
                                     &builder, &h0);
  ComputationDataHandle h1;
  auto p1 = CreateR3Parameter<float>(arr1, /*parameter_number=*/1, "p1",
                                     &builder, &h1);
  auto concatenated = builder.ConcatInDim({h0, h1}, 2);
  ComputeAndCompareR3<float>(&builder, expected, {p0.get(), p1.get()});
 }
 // Describes a binary rank-2 concatenation test.
 struct R2BinarySpec {
  int64 lhs_dim0;
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@ -262,7 +262,7 @@ class NearComparator {
    max_abs_err_ = 0.0;
    *miscompares_.mutable_shape() =
        ShapeUtil::ChangeElementType(actual.shape(), PRED);
-    miscompares_.mutable_preds()->Resize(
+    miscompares_.mutable_preds()->resize(
        ShapeUtil::ElementsIn(miscompares_.shape()), false);
    multi_index_.resize(expected.shape().dimensions_size(), 0);
@ -389,7 +389,7 @@ class NearComparator {
        tensorflow::strings::Printf("tempfile-%s-%llx-%s", Hostname().c_str(),
                                    now_usec, name.c_str()));
    TF_CHECK_OK(tensorflow::WriteBinaryProto(tensorflow::Env::Default(),
-                                             filename, literal));
+                                             filename, literal.ToProto()));
    LOG(ERROR) << "wrote to " << name << " file: " << filename;
  }
--- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
@ -83,9 +83,10 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) {
  LOG(INFO) << "results: [" << tensorflow::str_util::Join(results, ", ") << "]";
  EXPECT_EQ(3, results.size());
  for (const string& result : results) {
-    Literal literal;
+    LiteralProto literal_proto;
    TF_CHECK_OK(tensorflow::ReadBinaryProto(tensorflow::Env::Default(), result,
-                                            &literal));
+                                            &literal_proto));
    Literal literal(literal_proto);
    if (result.find("expected") != string::npos) {
      EXPECT_EQ("2", LiteralUtil::ToString(literal));
    } else if (result.find("actual") != string::npos) {
--- a/tensorflow/compiler/xla/tests/log_test.cc
+++ b/tensorflow/compiler/xla/tests/log_test.cc
@ -47,6 +47,7 @@ TEST_F(LogTest, LogTenValues) {
  builder.Log(x);
  std::vector<float> expected;
  expected.reserve(input.size());
  for (float f : input) {
    expected.push_back(std::log(f));
  }
--- a/tensorflow/compiler/xla/tests/params_test.cc
+++ b/tensorflow/compiler/xla/tests/params_test.cc
@ -246,6 +246,7 @@ XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) {
  }
  std::vector<GlobalData*> param_data;
  param_data.reserve(param_data_owner.size());
  for (const std::unique_ptr<GlobalData>& data : param_data_owner) {
    param_data.push_back(data.get());
  }
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@ -37,6 +37,7 @@ class SliceTest : public ClientLibraryTestBase {
  template <typename NativeT>
  void RunSliceTenToTwo() {
    std::vector<NativeT> constant;
    constant.reserve(10);
    for (int i = 0; i < 10; ++i) {
      constant.push_back(static_cast<NativeT>(i));
    }
--- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
@ -64,6 +64,7 @@ TEST_F(VecOpsSimpleTest, ExpManyValues) {
  for (int count : {63, 64, 65, 127, 128, 129, 17 * 4096}) {
    ComputationBuilder builder(client_, TestName());
    std::vector<float> exponents;
    exponents.reserve(count);
    for (int i = 0; i < count; ++i) {
      exponents.push_back(i / static_cast<float>(count));
    }
@ -71,6 +72,7 @@ TEST_F(VecOpsSimpleTest, ExpManyValues) {
    auto exp = builder.Exp(x);
    std::vector<float> expected;
    expected.reserve(exponents.size());
    for (float exponent : exponents) {
      expected.push_back(std::exp(exponent));
    }
--- a/tensorflow/compiler/xla/text_literal_reader.h
+++ b/tensorflow/compiler/xla/text_literal_reader.h
@ -18,6 +18,7 @@ limitations under the License.
 #include <memory>
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
--- a/tensorflow/compiler/xla/text_literal_writer.h
+++ b/tensorflow/compiler/xla/text_literal_writer.h
@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_TEXT_LITERAL_WRITER_H_
 #define TENSORFLOW_COMPILER_XLA_TEXT_LITERAL_WRITER_H_
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
@ -81,6 +81,7 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
        client->GetComputationShape(computation).ConsumeValueOrDie();
    std::vector<const Shape*> layouts;
    layouts.reserve(program_shape->parameters_size());
    for (int i = 0; i < program_shape->parameters_size(); ++i) {
      layouts.push_back(&program_shape->parameters(i));
    }
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
@ -56,6 +56,7 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool compile) {
          client->GetComputationShape(computation).ConsumeValueOrDie();
      std::vector<const Shape*> layouts;
      layouts.reserve(program_shape->parameters_size());
      for (int i = 0; i < program_shape->parameters_size(); ++i) {
        layouts.push_back(&program_shape->parameters(i));
      }
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@ -66,7 +66,8 @@ StatusOr<std::unique_ptr<Literal>> ReplayComputation(
  if (use_fake_data) {
    arguments = MakeFakeArgumentsOrDie(computation, client);
  } else {  // use recorded data if available
-    for (const Literal& literal : module.arguments()) {
+    for (const auto& proto : module.arguments()) {
      Literal literal(proto);
      TF_ASSIGN_OR_RETURN(std::unique_ptr<GlobalData> data,
                          client->TransferToServer(literal));
      arguments.push_back(std::move(data));
@ -74,6 +75,7 @@ StatusOr<std::unique_ptr<Literal>> ReplayComputation(
  }
  std::vector<GlobalData*> execute_arguments;
  execute_arguments.reserve(arguments.size());
  for (auto& argument : arguments) {
    execute_arguments.push_back(argument.get());
  }
@ -100,7 +102,7 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool use_fake_data) {
    if (module.has_result()) {
      fprintf(stdout, "was %s:%s\n",
              ShapeUtil::HumanString(module.result().shape()).c_str(),
-              LiteralUtil::ToString(module.result()).c_str());
+              LiteralUtil::ToString(Literal(module.result())).c_str());
    }
  }
 }
--- a/tensorflow/compiler/xla/tools/show_literal.cc
+++ b/tensorflow/compiler/xla/tools/show_literal.cc
@ -37,9 +37,10 @@ int main(int argc, char **argv) {
                << " <path-to-serialized-literal-proto>";
  }
-  xla::Literal literal;
+  xla::LiteralProto literal_proto;
  TF_CHECK_OK(tensorflow::ReadBinaryProto(tensorflow::Env::Default(), argv[1],
-                                          &literal));
+                                          &literal_proto));
-  LOG(INFO) << "literal: " << literal.ShortDebugString();
+  xla::Literal literal(literal_proto);
  LOG(INFO) << "literal: " << literal_proto.ShortDebugString();
  fprintf(stderr, "%s\n", xla::LiteralUtil::ToString(literal).c_str());
 }
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@ -92,11 +92,11 @@ message TransferToClientRequest {
 }
 message TransferToClientResponse {
-  Literal literal = 1;
+  LiteralProto literal = 1;
 }
 message TransferToServerRequest {
-  Literal literal = 1;
+  LiteralProto literal = 1;
  DeviceHandle device_handle = 2;
 }
@ -105,7 +105,7 @@ message TransferToServerResponse {
 }
 message TransferToInfeedRequest {
-  Literal literal = 1;
+  LiteralProto literal = 1;
  int64 replica_id = 2;
  DeviceHandle device_handle = 3;
 }
@ -123,7 +123,7 @@ message TransferFromOutfeedRequest {
 }
 message TransferFromOutfeedResponse {
-  Literal literal = 1;
+  LiteralProto literal = 1;
 }
 message ResetDeviceRequest {
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@ -275,7 +275,7 @@ message ChannelHandle {
 //
 // Transfers to/from the client are encoded in literal form, and the structure
 // of the repeated fields is implied by the shape.
-message Literal {
+message LiteralProto {
  Shape shape = 1;
  repeated bool preds = 2;
  bytes u8s = 3;
@ -285,7 +285,7 @@ message Literal {
  repeated uint64 u64s = 7;
  repeated float f32s = 8;
  repeated double f64s = 9;
-  repeated Literal tuple_literals = 10;
+  repeated LiteralProto tuple_literals = 10;
  bytes f16s = 11;  // Note: the F16s are encoded in little endian byte order
 }
@ -337,7 +337,7 @@ message Window {
 // field in OpRequest.
 message ConstantRequest {
-  Literal literal = 2;
+  LiteralProto literal = 2;
 }
 message GetTupleElementRequest {
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@ -85,6 +85,7 @@ cc_library(
        "//tensorflow/contrib/input_pipeline:input_pipeline_ops_kernels",
        "//tensorflow/contrib/layers:sparse_feature_cross_op_kernel",
        "//tensorflow/contrib/nccl:nccl_kernels",
        "//tensorflow/contrib/seq2seq:beam_search_ops_kernels",
        "//tensorflow/contrib/tensor_forest:tensor_forest_kernels",
        "//tensorflow/contrib/text:all_kernels",
    ],
@ -100,6 +101,7 @@ cc_library(
        "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib",
        "//tensorflow/contrib/layers:sparse_feature_cross_op_op_lib",
        "//tensorflow/contrib/nccl:nccl_ops_op_lib",
        "//tensorflow/contrib/seq2seq:beam_search_ops_op_lib",
        "//tensorflow/contrib/tensor_forest:tensor_forest_ops_op_lib",
        "//tensorflow/contrib/text:all_ops",
    ],
--- a/tensorflow/contrib/batching/kernels/batch_kernels.cc
+++ b/tensorflow/contrib/batching/kernels/batch_kernels.cc
@ -347,6 +347,7 @@ class BatchResource : public ResourceBase {
      // Concatenate the tasks' ith input tensors into a big output tensor.
      std::vector<Tensor> to_concatenate;
      to_concatenate.reserve(batch->num_tasks());
      for (int task_idx = 0; task_idx < batch->num_tasks(); ++task_idx) {
        to_concatenate.push_back(batch->task(task_idx).inputs.at(i));
      }
--- a/tensorflow/contrib/batching/shared_batch_scheduler_test.cc
+++ b/tensorflow/contrib/batching/shared_batch_scheduler_test.cc
@ -139,6 +139,7 @@ TEST(SharedBatchSchedulerTest, ObeyBatchSizeConstraint) {
                   &callback_data](std::unique_ptr<Batch<FakeTask>> batch) {
    ASSERT_TRUE(batch->IsClosed());
    std::vector<size_t> batch_data;
    batch_data.reserve(batch->num_tasks());
    for (int i = 0; i < batch->num_tasks(); ++i) {
      batch_data.push_back(batch->mutable_task(i)->size());
    }
--- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc
@ -295,6 +295,7 @@ void ExpectVecsEquiv(const std::vector<float>& vec1,
 std::vector<float> GetWeightsByIndex(const std::vector<float>& weights,
                                     const std::vector<int>& indices) {
  std::vector<float> res;
  res.reserve(indices.size());
  for (const int index : indices) {
    res.push_back(weights[index]);
  }
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@ -236,6 +236,9 @@ add_python_module("tensorflow/tensorboard")
 add_python_module("tensorflow/tensorboard/backend")
 add_python_module("tensorflow/tensorboard/backend/event_processing")
 add_python_module("tensorflow/tensorboard/plugins")
 add_python_module("tensorflow/tensorboard/plugins/audio")
 add_python_module("tensorflow/tensorboard/plugins/distributions")
 add_python_module("tensorflow/tensorboard/plugins/graphs")
 add_python_module("tensorflow/tensorboard/plugins/histograms")
 add_python_module("tensorflow/tensorboard/plugins/images")
 add_python_module("tensorflow/tensorboard/plugins/projector")
@ -536,6 +539,7 @@ set(tf_python_op_gen_main_srcs
    "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen.cc"
    "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen_main.cc"
    "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen.h"
    "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen_internal.h"
 )
 add_library(tf_python_op_gen_main OBJECT ${tf_python_op_gen_main_srcs})
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@ -209,10 +209,11 @@ if (tensorflow_BUILD_PYTHON_TESTS)
      # Broken TensorBoard tests due to different paths in windows
      "${tensorflow_source_dir}/tensorflow/tensorboard/backend/application_test.py"
      "${tensorflow_source_dir}/tensorflow/tensorboard/lib/python/http_util_test.py"
      "${tensorflow_source_dir}/tensorflow/tensorboard/plugins/audio/audio_plugin_test.py"
      "${tensorflow_source_dir}/tensorflow/tensorboard/plugins/images/images_plugin_test.py"
      # Broken tensorboard test due to cmake issues.
      "${tensorflow_source_dir}/tensorflow/tensorboard/plugins/debugger/plugin_test.py"
      "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py"
      "${tensorflow_source_dir}/tensorflow/tensorboard/plugins/images/images_plugin_test.py"
      # tensor_forest tests (also note that we exclude the hybrid tests for now)
      "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/kernel_tests/count_extremely_random_stats_op_test.py"  # Results in wrong order.
      "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/kernel_tests/sample_inputs_op_test.py"  # Results in wrong order.
--- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
@ -150,7 +150,8 @@ class MapDatasetTest(test.TestCase):
              results.append(sess.run(get_next))
            except errors.OutOfRangeError:
              return
-        threads = [self.checkedThread(target=iterator_thread) for _ in range(8)]
+        threads = [self.checkedThread(target=iterator_thread)
                   for _ in range(64)]
        for t in threads:
          t.start()
        for t in threads:
--- a/tensorflow/contrib/factorization/kernels/clustering_ops.cc
+++ b/tensorflow/contrib/factorization/kernels/clustering_ops.cc
@ -375,8 +375,8 @@ class NearestNeighborsOp : public OpKernel {
      const Eigen::Ref<const Eigen::VectorXf>& points_half_squared_norm,
      const Eigen::Ref<const MatrixXfRowMajor>& centers,
      const Eigen::Ref<const Eigen::VectorXf>& centers_half_squared_norm,
-      Eigen::Ref<MatrixXi64RowMajor> nearest_center_indices,
+      const Eigen::Ref<MatrixXi64RowMajor>& nearest_center_indices,
-      Eigen::Ref<MatrixXfRowMajor> nearest_center_distances) {
+      const Eigen::Ref<MatrixXfRowMajor>& nearest_center_distances) {
    CHECK_LE(k, centers.rows());
    if (centers.rows() <= kNearestNeighborsCentersMaxBlockSize) {
      FindKNearestCentersOneBlock(k, points, points_half_squared_norm, centers,
--- a/tensorflow/contrib/factorization/python/ops/clustering_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
@ -164,9 +164,10 @@ class KMeans(object):
      with ops.colocate_with(inp):
        # Computes Euclidean distance. Note the first and third terms are
        # broadcast additions.
-        squared_distance = (math_ops.reduce_sum(
+        squared_distance = (
-            math_ops.square(inp), 1, keep_dims=True) - 2 * math_ops.matmul(
+            math_ops.reduce_sum(math_ops.square(inp), 1, keep_dims=True) -
-                inp, clusters, transpose_b=True) + array_ops.transpose(
+            2 * math_ops.matmul(inp, clusters, transpose_b=True) +
            array_ops.transpose(
                math_ops.reduce_sum(
                    math_ops.square(clusters), 1, keep_dims=True)))
        output.append(squared_distance)
@ -229,12 +230,12 @@ class KMeans(object):
        clusters = nn_impl.l2_normalize(clusters, dim=1)
    for inp, score in zip(inputs, scores):
      with ops.colocate_with(inp):
-        (indices,
+        (indices, distances) = gen_clustering_ops.nearest_neighbors(
-         distances) = gen_clustering_ops.nearest_neighbors(inp, clusters, 1)
+            inp, clusters, 1)
        if self._distance_metric == COSINE_DISTANCE:
          distances *= 0.5
-        output.append(
+        output.append((score, array_ops.squeeze(distances),
-            (score, array_ops.squeeze(distances), array_ops.squeeze(indices)))
+                       array_ops.squeeze(indices)))
    return zip(*output)
  def _init_clusters_random(self):
@ -265,9 +266,7 @@ class KMeans(object):
            (not self._use_mini_batch or
             self._mini_batch_steps_per_iteration > 1))
-  def _initialize_clusters(self,
+  def _initialize_clusters(self, cluster_centers, cluster_centers_initialized,
                           cluster_centers,
                           cluster_centers_initialized,
                           cluster_centers_updated):
    """Returns an op to initialize the cluster centers."""
@ -294,21 +293,19 @@ class KMeans(object):
    with ops.colocate_with(cluster_centers_initialized):
      initialized = control_flow_ops.with_dependencies(
-          [clusters_init],
+          [clusters_init], array_ops.identity(cluster_centers_initialized))
          array_ops.identity(cluster_centers_initialized))
    with ops.colocate_with(cluster_centers):
-      assign_centers = state_ops.assign(cluster_centers, clusters_init,
+      assign_centers = state_ops.assign(
-                                        validate_shape=False)
+          cluster_centers, clusters_init, validate_shape=False)
      if cluster_centers_updated != cluster_centers:
-        assign_centers = control_flow_ops.group(
+        assign_centers = control_flow_ops.group(assign_centers,
-            assign_centers,
+                                                state_ops.assign(
-            state_ops.assign(cluster_centers_updated, clusters_init,
+                                                    cluster_centers_updated,
                                                    clusters_init,
                                                    validate_shape=False))
    assign_centers = control_flow_ops.with_dependencies(
-          [assign_centers],
+        [assign_centers], state_ops.assign(cluster_centers_initialized, True))
-          state_ops.assign(cluster_centers_initialized, True))
+    return control_flow_ops.cond(initialized, control_flow_ops.no_op,
      return control_flow_ops.cond(initialized,
                                   control_flow_ops.no_op,
                                 lambda: assign_centers).op
  def _create_variables(self):
@ -327,19 +324,16 @@ class KMeans(object):
        cluster_centers_updated back to cluster_centers.
    """
    init_value = array_ops.constant([], dtype=dtypes.float32)
-    cluster_centers = variable_scope.variable(init_value,
+    cluster_centers = variable_scope.variable(
-                                              name='clusters',
+        init_value, name='clusters', validate_shape=False)
-                                              validate_shape=False)
+    cluster_centers_initialized = variable_scope.variable(
-    cluster_centers_initialized = variable_scope.variable(False,
+        False, dtype=dtypes.bool, name='initialized')
                                                          dtype=dtypes.bool,
                                                          name='initialized')
    if self._use_mini_batch and self._mini_batch_steps_per_iteration > 1:
      # Copy of cluster centers actively updated each step according to
      # mini-batch update rule.
-      cluster_centers_updated = variable_scope.variable(init_value,
+      cluster_centers_updated = variable_scope.variable(
-                                                        name='clusters_updated',
+          init_value, name='clusters_updated', validate_shape=False)
                                                        validate_shape=False)
      # How many steps till we copy the updated clusters to cluster_centers.
      update_in_steps = variable_scope.variable(
          self._mini_batch_steps_per_iteration,
@ -347,20 +341,15 @@ class KMeans(object):
          name='update_in_steps')
      # Count of points assigned to cluster_centers_updated.
      cluster_counts = variable_scope.variable(
-          array_ops.zeros([self._num_clusters],
+          array_ops.zeros([self._num_clusters], dtype=dtypes.int64))
                          dtype=dtypes.int64))
    else:
      cluster_centers_updated = cluster_centers
      update_in_steps = None
-      cluster_counts = (variable_scope.variable(array_ops.ones(
+      cluster_counts = (variable_scope.variable(
-          [self._num_clusters],
+          array_ops.ones([self._num_clusters], dtype=dtypes.int64))
          dtype=dtypes.int64))
                        if self._use_mini_batch else None)
-    return (cluster_centers,
+    return (cluster_centers, cluster_centers_initialized, cluster_counts,
-            cluster_centers_initialized,
+            cluster_centers_updated, update_in_steps)
            cluster_counts,
            cluster_centers_updated,
            update_in_steps)
  @classmethod
  def _l2_normalize_data(cls, inputs):
@ -391,11 +380,8 @@ class KMeans(object):
    """
    # Implementation of kmeans.
    inputs = self._inputs
-    (cluster_centers_var,
+    (cluster_centers_var, cluster_centers_initialized, total_counts,
-     cluster_centers_initialized,
+     cluster_centers_updated, update_in_steps) = self._create_variables()
     total_counts,
     cluster_centers_updated,
     update_in_steps) = self._create_variables()
    init_op = self._initialize_clusters(cluster_centers_var,
                                        cluster_centers_initialized,
                                        cluster_centers_updated)
@ -409,8 +395,7 @@ class KMeans(object):
    all_scores, scores, cluster_idx = self._infer_graph(inputs, cluster_centers)
    if self._use_mini_batch:
      sync_updates_op = self._mini_batch_sync_updates_op(
-          update_in_steps,
+          update_in_steps, cluster_centers_var, cluster_centers_updated,
          cluster_centers_var, cluster_centers_updated,
          total_counts)
      assert sync_updates_op is not None
      with ops.control_dependencies([sync_updates_op]):
@ -421,15 +406,15 @@ class KMeans(object):
      training_op = self._full_batch_training_op(inputs, cluster_idx,
                                                 cluster_centers_var)
-    return (all_scores, cluster_idx, scores,
+    return (all_scores, cluster_idx, scores, cluster_centers_initialized,
-            cluster_centers_initialized, init_op, training_op)
+            init_op, training_op)
-  def _mini_batch_sync_updates_op(self, update_in_steps,
+  def _mini_batch_sync_updates_op(self, update_in_steps, cluster_centers_var,
-                                  cluster_centers_var, cluster_centers_updated,
+                                  cluster_centers_updated, total_counts):
                                  total_counts):
    if self._use_mini_batch and self._mini_batch_steps_per_iteration > 1:
      assert update_in_steps is not None
      with ops.colocate_with(update_in_steps):
        def _f():
          # Note that there is a race condition here, so we do best-effort
          # updates here. We reset update_in_steps first so that other workers
@ -437,33 +422,36 @@ class KMeans(object):
          # before resetting total_counts to avoid large updates to
          # cluster_centers_updated based on partially updated
          # cluster_center_vars.
-          with ops.control_dependencies([state_ops.assign(
+          with ops.control_dependencies([
-              update_in_steps,
+              state_ops.assign(update_in_steps,
-              self._mini_batch_steps_per_iteration - 1)]):
+                               self._mini_batch_steps_per_iteration - 1)
-            with ops.colocate_with(cluster_centers_updated):
+          ]):
            with ops.colocate_with(
                cluster_centers_updated, ignore_existing=True):
              if self._distance_metric == COSINE_DISTANCE:
-                cluster_centers = nn_impl.l2_normalize(cluster_centers_updated,
+                cluster_centers = nn_impl.l2_normalize(
-                                                       dim=1)
+                    cluster_centers_updated, dim=1)
              else:
                cluster_centers = cluster_centers_updated
            with ops.colocate_with(cluster_centers_var):
-              with ops.control_dependencies([state_ops.assign(
+              with ops.control_dependencies(
-                  cluster_centers_var,
+                  [state_ops.assign(cluster_centers_var, cluster_centers)]):
-                  cluster_centers)]):
+                with ops.colocate_with(
-                with ops.colocate_with(cluster_centers_var):
+                    cluster_centers_var, ignore_existing=True):
                  with ops.control_dependencies([
                      state_ops.assign(total_counts,
-                                       array_ops.zeros_like(total_counts))]):
+                                       array_ops.zeros_like(total_counts))
                  ]):
                    return array_ops.identity(update_in_steps)
        return control_flow_ops.cond(
-            update_in_steps <= 0,
+            update_in_steps <= 0, _f,
            _f,
            lambda: state_ops.assign_sub(update_in_steps, 1))
    else:
      return control_flow_ops.no_op()
-  def _mini_batch_training_op(self, inputs, cluster_idx_list,
+  def _mini_batch_training_op(self, inputs, cluster_idx_list, cluster_centers,
-                              cluster_centers, total_counts):
+                              total_counts):
    """Creates an op for training for mini batch case.
    Args:
@ -487,17 +475,15 @@ class KMeans(object):
        unique_ids, unique_idx = array_ops.unique(cluster_idx)
        num_unique_cluster_idx = array_ops.size(unique_ids)
        # Fetch the old values of counts and cluster_centers.
-        with ops.colocate_with(total_counts):
+        with ops.colocate_with(total_counts, ignore_existing=True):
          old_counts = array_ops.gather(total_counts, unique_ids)
        # TODO(agarwal): This colocation seems to run into problems. Fix it.
-        # with ops.colocate_with(cluster_centers):
+        with ops.colocate_with(cluster_centers, ignore_existing=True):
          old_cluster_centers = array_ops.gather(cluster_centers, unique_ids)
        # Locally aggregate the increment to counts.
        count_updates = math_ops.unsorted_segment_sum(
-            array_ops.ones_like(
+            array_ops.ones_like(unique_idx, dtype=total_counts.dtype),
-                unique_idx, dtype=total_counts.dtype),
+            unique_idx, num_unique_cluster_idx)
            unique_idx,
            num_unique_cluster_idx)
        # Locally compute the sum of inputs mapped to each id.
        # For a cluster with old cluster value x, old count n, and with data
        # d_1,...d_k newly assigned to it, we recompute the new value as
@ -507,13 +493,12 @@ class KMeans(object):
            inp, unique_idx, num_unique_cluster_idx)
        # Shape to enable broadcasting count_updates and learning_rate to inp.
        # It extends the shape with 1's to match the rank of inp.
-        broadcast_shape = array_ops.concat(
+        broadcast_shape = array_ops.concat([
-            [
+            array_ops.reshape(num_unique_cluster_idx, [1]),
-                array_ops.reshape(num_unique_cluster_idx, [1]), array_ops.ones(
+            array_ops.ones(
                array_ops.reshape(array_ops.rank(inp) - 1, [1]),
                dtype=dtypes.int32)
-            ],
+        ], 0)
            0)
        # Subtract k * x, see comment above.
        cluster_center_updates -= math_ops.cast(
            array_ops.reshape(count_updates, broadcast_shape),
@ -524,14 +509,10 @@ class KMeans(object):
        # scale by 1 / (n + k), see comment above.
        cluster_center_updates *= learning_rate
        # Apply the updates.
-      update_counts = state_ops.scatter_add(
+      update_counts = state_ops.scatter_add(total_counts, unique_ids,
          total_counts,
          unique_ids,
                                            count_updates)
      update_cluster_centers = state_ops.scatter_add(
-          cluster_centers,
+          cluster_centers, unique_ids, cluster_center_updates)
          unique_ids,
          cluster_center_updates)
      update_ops.extend([update_counts, update_cluster_centers])
    return control_flow_ops.group(*update_ops)
@ -552,7 +533,7 @@ class KMeans(object):
    cluster_counts = []
    epsilon = constant_op.constant(1e-6, dtype=inputs[0].dtype)
    for inp, cluster_idx in zip(inputs, cluster_idx_list):
-      with ops.colocate_with(inp):
+      with ops.colocate_with(inp, ignore_existing=True):
        cluster_sums.append(
            math_ops.unsorted_segment_sum(inp, cluster_idx, self._num_clusters))
        cluster_counts.append(
@ -561,7 +542,7 @@ class KMeans(object):
                    array_ops.ones(
                        array_ops.reshape(array_ops.shape(inp)[0], [-1])),
                    [-1, 1]), cluster_idx, self._num_clusters))
-    with ops.colocate_with(cluster_centers):
+    with ops.colocate_with(cluster_centers, ignore_existing=True):
      new_clusters_centers = math_ops.add_n(cluster_sums) / (math_ops.cast(
          math_ops.add_n(cluster_counts), cluster_sums[0].dtype) + epsilon)
      if self._clusters_l2_normalized():
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
@ -94,6 +94,7 @@ TEST(FfmpegLibTest, TestRoundTripGeneratedWav) {
  }
  std::vector<float> sine_wave;
  sine_wave.reserve(20000);
  for (int i = 0; i < 20000; ++i) {
    sine_wave.push_back(std::sin(6.28 * 440.0 * i / 20000.0));
  }
--- a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
+++ b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
@ -494,6 +494,7 @@ class SparseFeatureCrossOp : public OpKernel {
    ExtractFeatureData(indices_list_in, batch_size, &feature_counts,
                       &feature_start_indices);
    columns.reserve(values_list_in.size());
    for (int i = 0; i < values_list_in.size(); ++i) {
      columns.emplace_back(new SparseTensorColumn<InternalType>(
          values_list_in[i], std::move(feature_counts[i]),
--- a/tensorflow/contrib/learn/python/learn/estimators/init.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/init.py
@ -308,6 +308,7 @@ from tensorflow.contrib.learn.python.learn.estimators.estimator import infer_rea
 from tensorflow.contrib.learn.python.learn.estimators.estimator import SKCompat
 from tensorflow.contrib.learn.python.learn.estimators.head import binary_svm_head
 from tensorflow.contrib.learn.python.learn.estimators.head import Head
 from tensorflow.contrib.learn.python.learn.estimators.head import loss_only_head
 from tensorflow.contrib.learn.python.learn.estimators.head import multi_class_head
 from tensorflow.contrib.learn.python.learn.estimators.head import multi_head
 from tensorflow.contrib.learn.python.learn.estimators.head import multi_label_head
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@ -429,6 +429,23 @@ def multi_label_head(n_classes,
      loss_fn=_wrap_custom_loss_fn(loss_fn) if loss_fn else None)
 def loss_only_head(loss_fn, head_name=None):
  """Creates a Head that contains only loss terms.
  Loss only head holds additional loss terms to be added to other heads and
  usually represents additional regularization terms in the objective function.
  Args:
    loss_fn: a function that takes no argument and returns a list of
        scalar tensors. A returned list is summed into a single loss.
    head_name: a name for the head. When omitted or empty, the head is named
        "loss_only_head".
  Returns:
    An instance of `Head` to hold the additional losses.
  """
  return _LossOnlyHead(loss_fn, head_name=head_name)
 def multi_head(heads, loss_weights=None):
  """Creates a MultiHead stemming from same logits/hidden layer.
@ -1406,6 +1423,80 @@ class _MultiLabelHead(_SingleHead):
    return metrics
 class _LossOnlyHead(Head):
  """`Head` implementation for additional loss terms.
  This class only holds loss terms unrelated to any other heads (labels),
  e.g. regularization.
  Common usage:
  This is often combined with other heads in a multi head setup.
    ```python
    head = multi_head([
        head1, head2,
        loss_only_head(regularizer_loss_fn, head_name='regularizer')])
    ```
  """
  def __init__(self, loss_fn, head_name=None):
    # `loss_fn` is stored uncalled; it is only invoked when the model fn ops
    # are created for a non-INFER mode.
    self._loss_fn = loss_fn
    # Fall back to a fixed name when no (or an empty) name is supplied.
    self.head_name = head_name or "loss_only_head"
  @property
  def logits_dimension(self):
    """Always 0: this head does not consume any logits."""
    return 0
  def create_model_fn_ops(self,
                          features,
                          mode,
                          labels=None,
                          train_op_fn=None,
                          logits=None,
                          logits_input=None,
                          scope=None):
    """See `_Head.create_model_fn_ops`.
    Args:
      features: Not used.
      mode: Estimator's `ModeKeys`.
      labels: Not used.
      train_op_fn: Function that takes a scalar loss and returns an op to
          optimize with the loss. Required in TRAIN mode.
      logits: Not used.
      logits_input: Not used.
      scope: Optional scope for variable_scope. When `None`, the variable
          scope is named after this head's `head_name`.
    Returns:
      A `ModelFnOps` object with empty `predictions` and `eval_metric_ops`.
      Its `loss` and `train_op` are `None` in INFER mode.
    Raises:
      ValueError: if `mode` is not recognized (presumably raised by
          `_check_mode_valid`), or if `train_op_fn` is `None` in TRAIN mode.
    """
    _check_mode_valid(mode)
    loss = None
    train_op = None
    # No loss is built for inference; TRAIN and EVAL both evaluate `loss_fn`.
    if mode != model_fn.ModeKeys.INFER:
      with variable_scope.variable_scope(scope, default_name=self.head_name):
        loss = self._loss_fn()
        # A list of loss terms is collapsed into one tensor; any other return
        # value (including a tuple) is used as-is.
        if isinstance(loss, list):
          loss = math_ops.add_n(loss)
        logging_ops.scalar_summary(
            _summary_key(self.head_name, mkey.LOSS), loss)
        if mode == model_fn.ModeKeys.TRAIN:
          if train_op_fn is None:
            raise ValueError("train_op_fn can not be None in TRAIN mode")
          with ops.name_scope(None, "train_op", (loss,)):
            train_op = train_op_fn(loss)
    return model_fn.ModelFnOps(
        mode=mode,
        loss=loss,
        train_op=train_op,
        predictions={},
        eval_metric_ops={})
 class _MultiHead(Head):
  """`Head` implementation for multi objective learning.
@ -1525,6 +1616,9 @@ class _MultiHead(Head):
      if isinstance(logits, dict):
        head_logits_pairs = []
        for head in self._heads:
          if isinstance(head, _LossOnlyHead):
            head_logits_pairs.append((head, None))
          else:
            head_logits_pairs.append((head, logits[head.head_name]))
      else:
        # Split logits for each head.
@ -1606,6 +1700,8 @@ class _MultiHead(Head):
    predictions = {}
    output_alternatives = {}
    for head, m in zip(self._heads, all_model_fn_ops):
      if isinstance(head, _LossOnlyHead):
        continue
      head_name = head.head_name
      output_alternatives[head_name] = m.output_alternatives[head_name]
      for k, v in m.predictions.items():
--- a/tensorflow/contrib/learn/python/learn/estimators/head_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head_test.py
@ -1638,6 +1638,21 @@ class BinarySvmHeadTest(test.TestCase):
      }, model_fn_ops)
 class LossOnlyHead(test.TestCase):
  """Tests for the head returned by `head_lib.loss_only_head`."""
  def testNoPredictionsAndNoMetrics(self):
    """In TRAIN mode the head exposes a loss but no predictions or metrics."""
    # The loss function returns a plain constant rather than a list, so the
    # head uses the value directly.
    head = head_lib.loss_only_head(lambda: 1, head_name="const")
    model_fn_ops = head.create_model_fn_ops(
        features={},
        mode=model_fn.ModeKeys.TRAIN,
        train_op_fn=head_lib.no_op_train_fn)
    self.assertDictEqual(model_fn_ops.predictions, {})
    self.assertDictEqual(model_fn_ops.eval_metric_ops, {})
    self.assertIsNotNone(model_fn_ops.loss)
    with session.Session() as sess:
      # The evaluated loss must be exactly the constant from the loss function.
      self.assertEqual(1, sess.run(model_fn_ops.loss))
 class MultiHeadTest(test.TestCase):
  def testInvalidHeads(self):
@ -1672,7 +1687,8 @@ class MultiHeadTest(test.TestCase):
        n_classes=3, label_name="label1", head_name="head1")
    head2 = head_lib.multi_class_head(
        n_classes=4, label_name="label2", head_name="head2")
-    head = head_lib.multi_head((head1, head2))
+    head3 = head_lib.loss_only_head(lambda: 1.0, head_name="const")
    head = head_lib.multi_head((head1, head2, head3))
    labels = {
        "label1": (1,),
        "label2": (1,)
@ -1691,7 +1707,7 @@ class MultiHeadTest(test.TestCase):
    self.assertIsNone(model_fn_ops.output_alternatives)
    with session.Session() as sess:
-      self.assertAlmostEqual(2.224, sess.run(model_fn_ops.loss), places=3)
+      self.assertAlmostEqual(3.224, sess.run(model_fn_ops.loss), places=3)
  def testTrain_withHeadWeights(self):
    head1 = head_lib.multi_class_head(
--- a/tensorflow/contrib/lookup/lookup_ops.py
+++ b/tensorflow/contrib/lookup/lookup_ops.py
@ -871,7 +871,7 @@ def index_table_from_file(vocabulary_file=None,
  ```
  Args:
-    vocabulary_file: The vocabulary filename.
+    vocabulary_file: The vocabulary filename, may be a constant scalar `Tensor`.
    num_oov_buckets: The number of out-of-vocabulary buckets.
    vocab_size: Number of the elements in the vocabulary, if known.
    default_value: The value to use for out-of-vocabulary feature values.
@ -889,8 +889,9 @@ def index_table_from_file(vocabulary_file=None,
    ValueError: If `num_oov_buckets` is negative or `vocab_size` is not greater
      than zero.
  """
-  if not vocabulary_file:
+  if vocabulary_file is None or (
-    raise ValueError("vocabulary_file must be specified.")
+      isinstance(vocabulary_file, str) and not vocabulary_file):
    raise ValueError("vocabulary_file must be specified and must not be empty.")
  if num_oov_buckets < 0:
    raise ValueError("num_oov_buckets must be greater or equal than 0, got %d."
                     % num_oov_buckets)
--- a/tensorflow/contrib/lookup/lookup_ops_test.py
+++ b/tensorflow/contrib/lookup/lookup_ops_test.py
@ -1187,6 +1187,18 @@ class IndexTableFromFile(test.TestCase):
      lookup_ops.tables_initializer().run()
      self.assertAllEqual((1, 2, 3), ids.eval())
  def test_string_index_table_from_file_tensor_filename(self):
    """The vocabulary filename may be given as a constant string `Tensor`."""
    vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
    with self.test_session():
      # Wrap the path in a tensor to exercise the non-`str` filename path.
      vocabulary_file = constant_op.constant(vocabulary_file)
      table = lookup.index_table_from_file(
          vocabulary_file=vocabulary_file, num_oov_buckets=1)
      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
      # Looking up before the table initializers have run must fail.
      self.assertRaises(errors_impl.OpError, ids.eval)
      lookup_ops.tables_initializer().run()
      # NOTE(review): expected ids depend on the contents written by
      # `_createVocabFile`; "tarkus" is presumably out-of-vocabulary and lands
      # in the single OOV bucket - confirm against the helper.
      self.assertAllEqual((1, 2, 3), ids.eval())
  def test_int32_index_table_from_file(self):
    vocabulary_file = self._createVocabFile(
        "f2i_vocab2.txt", values=("42", "1", "-1000"))
@ -1245,7 +1257,13 @@ class IndexTableFromFile(test.TestCase):
              860),  # 3 + fingerprint("toccata") mod 300.
          ids.eval())
-  def test_index_table_from_file_with_only_oov_buckets(self):
+  def test_index_table_from_file_fails_with_empty_vocabulary_file_name(self):
    self.assertRaises(
        ValueError,
        lookup.index_table_from_file,
        vocabulary_file="")
  def test_index_table_from_file_fails_with_empty_vocabulary(self):
    self.assertRaises(
        ValueError,
        lookup.index_table_from_file,
--- a/tensorflow/contrib/metrics/init.py
+++ b/tensorflow/contrib/metrics/init.py
@ -23,6 +23,7 @@ See the @{$python/contrib.metrics} guide.
@@streaming_precision
@@streaming_precision_at_thresholds
@@streaming_auc
@@streaming_curve_points
@@streaming_recall_at_k
@@streaming_mean_absolute_error
@@streaming_mean_iou
@ -76,6 +77,7 @@ from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_accuracy
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_auc
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_concat
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_covariance
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_curve_points
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_false_negatives
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_false_negatives_at_thresholds
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_false_positives
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@ -733,6 +733,102 @@ def streaming_true_negatives_at_thresholds(
  return values['tn'], update_ops['tn']
 def streaming_curve_points(labels=None,
                           predictions=None,
                           weights=None,
                           num_thresholds=200,
                           metrics_collections=None,
                           updates_collections=None,
                           curve='ROC',
                           name=None):
  """Computes curve (ROC or PR) values for a prespecified number of points.
  The `streaming_curve_points` function creates four local variables,
  `true_positives`, `true_negatives`, `false_positives` and `false_negatives`
  that are used to compute the curve values. To discretize the curve, a linearly
  spaced set of thresholds is used to compute one point per threshold:
  (false positive rate, recall) for 'ROC' or (recall, precision) for 'PR'.
  For best results, `predictions` should be distributed approximately uniformly
  in the range [0, 1] and not peaked around 0 or 1.
  For estimation of the metric over a stream of data, the function creates an
  `update_op` operation that updates these variables.
  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
  Args:
    labels: A `Tensor` whose shape matches `predictions`. Will be cast to
      `bool`.
    predictions: A floating point `Tensor` of arbitrary shape and whose values
      are in the range `[0, 1]`.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `labels` dimension).
    num_thresholds: The number of thresholds to use when discretizing the
      curve. The two end thresholds are always included, so values below 2
      still produce 2 thresholds.
    metrics_collections: An optional list of collections that `points` should
      be added to.
    updates_collections: An optional list of collections that `update_op` should
      be added to.
    curve: Specifies the name of the curve to be computed, 'ROC' [default] or
      'PR' for the Precision-Recall-curve.
    name: An optional variable_scope name.
  Returns:
    points: A `Tensor` with shape [num_thresholds, 2] that contains points of
      the curve. Each row is (false positive rate, recall) for 'ROC' and
      (recall, precision) for 'PR'.
    update_op: An operation that increments the `true_positives`,
      `true_negatives`, `false_positives` and `false_negatives` variables.
  Raises:
    ValueError: If `curve` is neither 'ROC' nor 'PR', if `predictions` and
      `labels` have mismatched shapes, or if
      `weights` is not `None` and its shape doesn't match `predictions`, or if
      either `metrics_collections` or `updates_collections` are not a list or
      tuple.
  """
  with variable_scope.variable_scope(name, 'curve_points', (labels, predictions,
                                                            weights)):
    if curve != 'ROC' and curve != 'PR':
      raise ValueError('curve must be either ROC or PR, %s unknown' % (curve))
    kepsilon = 1e-7  # to account for floating point imprecisions
    # Interior thresholds are evenly spaced strictly between 0 and 1; the two
    # end thresholds are nudged just outside [0, 1] by `kepsilon`.
    thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
                  for i in range(num_thresholds - 2)]
    thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
    values, update_ops = _streaming_confusion_matrix_at_thresholds(
        labels=labels,
        predictions=predictions,
        thresholds=thresholds,
        weights=weights)
    # Add epsilons to avoid dividing by 0.
    epsilon = 1.0e-6
    def compute_points(tp, fn, tn, fp):
      """Computes the (x, y) curve coordinates from confusion counts."""
      rec = math_ops.div(tp + epsilon, tp + fn + epsilon)
      if curve == 'ROC':
        # Unlike `rec`, only the denominator is offset here, so the rate is 0
        # (not 1) when there are no negative examples at all.
        fp_rate = math_ops.div(fp, fp + tn + epsilon)
        return fp_rate, rec
      else:  # curve == 'PR'.
        prec = math_ops.div(tp + epsilon, tp + fp + epsilon)
        return rec, prec
    xs, ys = compute_points(values['tp'], values['fn'], values['tn'],
                            values['fp'])
    # Pair up the per-threshold x and y values as rows of a 2-column tensor.
    points = array_ops.stack([xs, ys], axis=1)
    # A single op that runs every confusion-count update.
    update_op = control_flow_ops.group(*update_ops.values())
    if metrics_collections:
      ops.add_to_collections(metrics_collections, points)
    if updates_collections:
      ops.add_to_collections(updates_collections, update_op)
    return points, update_op
 def streaming_auc(predictions, labels, weights=None, num_thresholds=200,
                  metrics_collections=None, updates_collections=None,
                  curve='ROC', name=None):
@ -2372,6 +2468,7 @@ __all__ = [
    'sparse_recall_at_top_k',
    'streaming_accuracy',
    'streaming_auc',
    'streaming_curve_points',
    'streaming_false_negatives',
    'streaming_false_negatives_at_thresholds',
    'streaming_false_positives',
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@ -1327,6 +1327,99 @@ class StreamingRecallTest(test.TestCase):
      self.assertEqual(0, recall.eval())
 class StreamingCurvePointsTest(test.TestCase):
  """Tests for `metric_ops.streaming_curve_points`."""
  def setUp(self):
    # Fixed seed and a fresh graph keep the random inputs and variable names
    # reproducible across test methods.
    np.random.seed(1)
    ops.reset_default_graph()
  def testVars(self):
    """The metric creates the four confusion-count local variables."""
    metric_ops.streaming_curve_points(
        predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
    _assert_local_variables(
        self,
        ('curve_points/true_positives:0', 'curve_points/false_negatives:0',
         'curve_points/false_positives:0', 'curve_points/true_negatives:0'))
  def testMetricsCollection(self):
    """`points` is added to the requested metrics collection."""
    my_collection_name = '__metrics__'
    points, _ = metric_ops.streaming_curve_points(
        labels=array_ops.ones((10, 1)),
        predictions=array_ops.ones((10, 1)),
        metrics_collections=[my_collection_name])
    self.assertListEqual(ops.get_collection(my_collection_name), [points])
  def testUpdatesCollection(self):
    """`update_op` is added to the requested updates collection."""
    my_collection_name = '__updates__'
    _, update_op = metric_ops.streaming_curve_points(
        labels=array_ops.ones((10, 1)),
        predictions=array_ops.ones((10, 1)),
        updates_collections=[my_collection_name])
    self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
  def _testValueTensorIsIdempotent(self, curve):
    """Checks `points` is unchanged after a second update on the same batch."""
    predictions = constant_op.constant(
        np.random.uniform(size=(10, 3)), dtype=dtypes_lib.float32)
    labels = constant_op.constant(
        np.random.uniform(high=2, size=(10, 3)), dtype=dtypes_lib.float32)
    points, update_op = metric_ops.streaming_curve_points(
        labels, predictions=predictions, curve=curve)
    with self.test_session() as sess:
      sess.run(variables.local_variables_initializer())
      sess.run(update_op)
      initial_points = points.eval()
      # The inputs are constants, so a second update accumulates the same
      # batch again; the points are compared within `assertAllClose` tolerance.
      sess.run(update_op)
      self.assertAllClose(initial_points, points.eval())
  def testValueTensorIsIdempotentROC(self):
    self._testValueTensorIsIdempotent(curve='ROC')
  def testValueTensorIsIdempotentPR(self):
    self._testValueTensorIsIdempotent(curve='PR')
  def _testCase(self, labels, predictions, curve, expected_points):
    """Runs one update with 3 thresholds and compares the resulting points."""
    with self.test_session() as sess:
      predictions_tensor = constant_op.constant(
          predictions, dtype=dtypes_lib.float32)
      labels_tensor = constant_op.constant(labels, dtype=dtypes_lib.float32)
      # `num_thresholds=3` gives three rows in `expected_points`; presumably
      # they follow threshold order (just below 0, 0.5, just above 1) - confirm.
      points, update_op = metric_ops.streaming_curve_points(
          labels=labels_tensor,
          predictions=predictions_tensor,
          num_thresholds=3,
          curve=curve)
      sess.run(variables.local_variables_initializer())
      sess.run(update_op)
      self.assertAllClose(expected_points, points.eval())
  def testEdgeCasesROC(self):
    # Single-example inputs; each expected row is (false positive rate, recall).
    self._testCase([[1]], [[1]], 'ROC', [[0, 1], [0, 1], [0, 0]])
    self._testCase([[0]], [[0]], 'ROC', [[1, 1], [0, 1], [0, 1]])
    self._testCase([[0]], [[1]], 'ROC', [[1, 1], [1, 1], [0, 1]])
    self._testCase([[1]], [[0]], 'ROC', [[0, 1], [0, 0], [0, 0]])
  def testManyValuesROC(self):
    self._testCase([[1.0, 0.0, 0.0, 1.0, 1.0, 1.0]],
                   [[0.2, 0.3, 0.4, 0.6, 0.7, 0.8]], 'ROC',
                   [[1.0, 1.0], [0.0, 0.75], [0.0, 0.0]])
  def testEdgeCasesPR(self):
    # Single-example inputs; each expected row is (recall, precision).
    self._testCase([[1]], [[1]], 'PR', [[1, 1], [1, 1], [0, 1]])
    self._testCase([[0]], [[0]], 'PR', [[1, 0], [1, 1], [1, 1]])
    self._testCase([[0]], [[1]], 'PR', [[1, 0], [1, 0], [1, 1]])
    self._testCase([[1]], [[0]], 'PR', [[1, 1], [0, 1], [0, 1]])
  def testManyValuesPR(self):
    self._testCase([[1.0, 0.0, 0.0, 1.0, 1.0, 1.0]],
                   [[0.2, 0.3, 0.4, 0.6, 0.7, 0.8]], 'PR',
                   [[1.0, 4.0 / 6.0], [0.75, 1.0], [0.0, 1.0]])
 class StreamingAUCTest(test.TestCase):
  def setUp(self):
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
@ -226,8 +226,8 @@ class TestBeamStep(test.TestCase):
 class BeamSearchDecoderTest(test.TestCase):
  def _testDynamicDecodeRNN(self, time_major, has_attention):
-    encoder_sequence_length = [3, 2, 3, 1, 1]
+    encoder_sequence_length = np.array([3, 2, 3, 1, 1])
-    decoder_sequence_length = [2, 0, 1, 2, 3]
+    decoder_sequence_length = np.array([2, 0, 1, 2, 3])
    batch_size = 5
    decoder_max_time = 4
    input_depth = 7
@ -245,6 +245,7 @@ class BeamSearchDecoderTest(test.TestCase):
      batch_size_tensor = constant_op.constant(batch_size)
      embedding = np.random.randn(vocab_size, embedding_dim).astype(np.float32)
      cell = rnn_cell.LSTMCell(cell_depth)
      initial_state = cell.zero_state(batch_size, dtypes.float32)
      if has_attention:
        inputs = array_ops.placeholder_with_default(
            np.random.randn(batch_size, decoder_max_time,
@ -258,6 +259,8 @@ class BeamSearchDecoderTest(test.TestCase):
            num_units=attention_depth,
            memory=tiled_inputs,
            memory_sequence_length=tiled_sequence_length)
        initial_state = beam_search_decoder.tile_batch(
            initial_state, multiplier=beam_width)
        cell = attention_wrapper.AttentionWrapper(
            cell=cell,
            attention_mechanism=attention_mechanism,
@ -265,6 +268,9 @@ class BeamSearchDecoderTest(test.TestCase):
            alignment_history=False)
      cell_state = cell.zero_state(
          dtype=dtypes.float32, batch_size=batch_size_tensor * beam_width)
      if has_attention:
        cell_state = cell_state.clone(
            cell_state=initial_state)
      bsd = beam_search_decoder.BeamSearchDecoder(
          cell=cell,
          embedding=embedding,
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@ -72,27 +72,8 @@ class FinalBeamSearchDecoderOutput(
  pass
-def tile_batch(t, multiplier, name=None):
+def _tile_batch(t, multiplier):
-  """Tile the batch dimension of tensor t.
+  """Core single-tensor implementation of tile_batch."""
  This function takes a tensor t shaped `[batch_size, s0, s1, ...]` composed of
  minibatch entries `t[0], ..., t[batch_size - 1]` and tiles it to have a shape
  `[batch_size * multiplier, s0, s1, ...]` composed of minibatch entries
  `t[0], t[0], ..., t[1], t[1], ...` where each minibatch entry is repeated
  `multiplier` times.
  Args:
    t: `Tensor` shaped `[batch_size, ...]`.
    multiplier: Python int.
    name: Name scope for any created operations.
  Returns:
    A `Tensor` shaped `[batch_size * multiplier, ...]`.
  Raises:
    ValueError: if `t` does not have a statically known rank or it's < 1.
  """
  with ops.name_scope(name, "tile_batch", [t, multiplier]):
  t = ops.convert_to_tensor(t, name="t")
  shape_t = array_ops.shape(t)
  if t.shape.ndims is None or t.shape.ndims < 1:
@ -110,6 +91,34 @@ def tile_batch(t, multiplier, name=None):
  return tiled
 def tile_batch(t, multiplier, name=None):
  """Tile the batch dimension of a (possibly nested structure of) tensor(s) t.
  For each tensor t in a (possibly nested structure) of tensors,
  this function takes a tensor t shaped `[batch_size, s0, s1, ...]` composed of
  minibatch entries `t[0], ..., t[batch_size - 1]` and tiles it to have a shape
  `[batch_size * multiplier, s0, s1, ...]` composed of minibatch entries
  `t[0], t[0], ..., t[1], t[1], ...` where each minibatch entry is repeated
  `multiplier` times.
  Args:
    t: A `Tensor` shaped `[batch_size, ...]`, or a (possibly nested structure
      of) such tensors.
    multiplier: Python int.
    name: Name scope for any created operations.
  Returns:
    A (possibly nested structure of) `Tensor` shaped
    `[batch_size * multiplier, ...]`.
  Raises:
    ValueError: if tensor(s) `t` do not have a statically known rank or
    the rank is < 1.
  """
  # The flattened list is only used to pass every tensor to the name scope.
  flat_t = nest.flatten(t)
  with ops.name_scope(name, "tile_batch", flat_t + [multiplier]):
    # Tile each tensor individually with the single-tensor helper; the result
    # is built by mapping over the original structure `t`.
    return nest.map_structure(lambda t_: _tile_batch(t_, multiplier), t)
 def _check_maybe(t):
  if isinstance(t, tensor_array_ops.TensorArray):
    raise TypeError(
--- a/tensorflow/contrib/session_bundle/session_bundle_test.cc
+++ b/tensorflow/contrib/session_bundle/session_bundle_test.cc
@ -270,7 +270,7 @@ class SessionBundleTest : public ::testing::Test {
  // MetaGraphDef.
  // Returns the path of the export.
  // ** Should only be called once per test **
-  string SetupExport(MetaGraphDefTwiddler twiddler) {
+  string SetupExport(const MetaGraphDefTwiddler& twiddler) {
    return SetupExport(twiddler, kVariablesFilename, kMetaGraphDefFilename);
  }
  // SetupExport that allows for the variables and meta_graph_def filenames
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@ -62,6 +62,7 @@ licenses(["notice"])  # Apache 2.0
 load(
    "//tensorflow:tensorflow.bzl",
    "full_path",
    "if_android",
    "if_ios",
    "if_x86",
--- a/tensorflow/core/common_runtime/device.cc
+++ b/tensorflow/core/common_runtime/device.cc
@ -30,7 +30,11 @@ Device::Device(Env* env, const DeviceAttributes& device_attributes)
  rmgr_ = new ResourceMgr(parsed_name_.job);
 }
-Device::~Device() { delete rmgr_; }
+Device::~Device() {
  if (rmgr_ != nullptr) {
    DeleteResourceMgr();
  }
 }
 // static
 DeviceAttributes Device::BuildDeviceAttributes(
--- a/tensorflow/core/common_runtime/device.h
+++ b/tensorflow/core/common_runtime/device.h
@ -60,7 +60,9 @@ class Device : public DeviceBase {
  const string& name() const { return device_attributes_.name(); }
  // Parsed name of this device
-  const DeviceNameUtils::ParsedName parsed_name() const { return parsed_name_; }
+  const DeviceNameUtils::ParsedName& parsed_name() const {
    return parsed_name_;
  }
  // Describes what kind of device this is.  This is intended to be
  // human-readable and not computer-parsed, except that two devices
@ -149,6 +151,12 @@ class Device : public DeviceBase {
    return BuildDeviceAttributes(name, device, memory_limit, locality, "");
  }
 protected:
  // Deletes the ResourceMgr and resets `rmgr_` to nullptr, so that the
  // `rmgr_ != nullptr` guard in ~Device() does not delete it a second time.
  // NOTE(review): protected, presumably so subclasses can release resources
  // before their own teardown - confirm against the callers.
  void DeleteResourceMgr() {
    delete rmgr_;
    rmgr_ = nullptr;
  }
 private:
  const DeviceAttributes device_attributes_;
  DeviceNameUtils::ParsedName parsed_name_;
--- a/tensorflow/core/common_runtime/device_set.cc
+++ b/tensorflow/core/common_runtime/device_set.cc
@ -53,7 +53,7 @@ Device* DeviceSet::FindDeviceByName(const string& name) const {
 // static
 int DeviceSet::DeviceTypeOrder(const DeviceType& d) {
-  return DeviceFactory::DevicePriority(d.type());
+  return DeviceFactory::DevicePriority(d.type_string());
 }
 static bool DeviceTypeComparator(const DeviceType& a, const DeviceType& b) {
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@ -1231,7 +1231,7 @@ Status FunctionDefToBodyHelper(
  GraphConstructorOptions opts;
  opts.allow_internal_ops = true;
  opts.expect_device_spec = false;
-  Status s = ConvertGraphDefToGraph(opts, result.gdef, graph);
+  Status s = ConvertNodeDefsToGraph(opts, result.nodes, graph);
  if (!s.ok()) {
    delete graph;
  } else {
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@ -93,7 +93,7 @@ class FunctionTest : public ::testing::Test {
    GraphConstructorOptions opts;
    opts.allow_internal_ops = true;
    opts.expect_device_spec = false;
-    TF_CHECK_OK(ConvertGraphDefToGraph(opts, result.gdef, g));
+    TF_CHECK_OK(ConvertNodeDefsToGraph(opts, result.nodes, g));
    const int version = g->versions().producer();
    LocalExecutorParams params;
@ -949,7 +949,7 @@ GraphDef Optimize(const std::function<bool(Graph* g)>& pass,
  GraphConstructorOptions opts;
  opts.allow_internal_ops = true;
  opts.expect_device_spec = false;
-  TF_CHECK_OK(ConvertGraphDefToGraph(opts, result.gdef, g.get()));
+  TF_CHECK_OK(ConvertNodeDefsToGraph(opts, result.nodes, g.get()));
  pass(g.get());
  std::unique_ptr<Graph> g1(new Graph(OpRegistry::Global()));
  CopyGraph(*g, g1.get());
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@ -324,6 +324,7 @@ static void BM_AllocationDelayed(int iters, int delay) {
  int size_index = 0;
  std::vector<void*> ptrs;
  ptrs.reserve(delay);
  for (int i = 0; i < delay; i++) {
    ptrs.push_back(nullptr);
  }
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
@ -123,10 +123,12 @@ void Benchmark::RunWithArgs(
  }
  // Gets inputs' and outputs' rendezvous keys.
  std::vector<std::pair<string, Tensor>> in;
  in.reserve(inputs.size());
  for (const auto& p : inputs) {
    in.push_back({GetRendezvousKey(p.first), p.second});
  }
  std::vector<string> out;
  out.reserve(outputs.size());
  for (const auto& n : outputs) {
    out.push_back(GetRendezvousKey(n));
  }
--- a/tensorflow/core/common_runtime/session_factory.cc
+++ b/tensorflow/core/common_runtime/session_factory.cc
@ -94,6 +94,7 @@ Status SessionFactory::GetFactory(const SessionOptions& options,
    // TODO(mrry): Consider providing a system-default fallback option
    // in this case.
    std::vector<string> factory_types;
    factory_types.reserve(candidate_factories.size());
    for (const auto& candidate_factory : candidate_factories) {
      factory_types.push_back(candidate_factory.first);
    }
--- a/Show More
+++ b/Show More