diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index 42fa139282a..8810b8731ae 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -388,6 +388,16 @@ tf_gen_op_wrappers_cc(
     visibility = ["//tensorflow:internal"],
 )
 
+tf_gen_op_wrappers_cc(
+    name = "functional_ops",
+    include_internal_ops = 1,
+    op_lib_names = [
+        "functional_ops",
+    ],
+    pkg = "//tensorflow/core",
+    visibility = ["//tensorflow:internal"],
+)
+
 tf_gen_op_wrappers_cc(
     name = "resource_variable_ops",
     include_internal_ops = 1,
diff --git a/tensorflow/compiler/xla/reference_util.h b/tensorflow/compiler/xla/reference_util.h
index 03276121294..f58f0bdc9f5 100644
--- a/tensorflow/compiler/xla/reference_util.h
+++ b/tensorflow/compiler/xla/reference_util.h
@@ -422,7 +422,7 @@ class ReferenceUtil {
   static std::unique_ptr<Array2D<T1>> ApplyElementwise2D(
       F&& f, const Array2D<T1>& array1, const Array2D<Ts>&... arrays) {
     AssertSameSize2D(array1, arrays...);
-    auto result = MakeUnique<Array2D<T1>>(array1.n1(), array1.n1());
+    auto result = MakeUnique<Array2D<T1>>(array1.n1(), array1.n2());
     for (int64 i = 0; i < array1.n1(); ++i) {
       for (int64 j = 0; j < array1.n2(); ++j) {
         (*result)(i, j) = f(array1(i, j), arrays(i, j)...);
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
index 240da35ef19..dc002846e9e 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
@@ -24,6 +24,11 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
                                       int64 operand_index) {
   HloInstruction* producer = consumer->mutable_operand(operand_index);
 
+  // Output fusion is not currently supported on CPUs.
+  if (producer->opcode() == HloOpcode::kFusion) {
+    return false;
+  }
+
   // Condition for consumer: must be elementwise or a fusion op
   // (which necessarily only contains elementwise operations)
   if (!(consumer->opcode() == HloOpcode::kFusion ||
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
index 34a44ad4054..a36dcbbd2fa 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
@@ -46,6 +46,11 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
                                       int64 operand_index) {
   HloInstruction* producer = consumer->mutable_operand(operand_index);
 
+  // Output fusion is not currently supported on GPUs.
+  if (producer->opcode() == HloOpcode::kFusion) {
+    return false;
+  }
+
   // RNG operations are not currently parallel-friendly on GPU.
   if (producer->opcode() == HloOpcode::kRng) {
     return false;
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 179e1832654..66fb0599752 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -1570,7 +1570,9 @@ string HloInstruction::ToCategory() const {
           return "non-elementwise fusion";
         }
       case FusionKind::kInput:
-        return "reduce fusion";
+        return "input fusion";
+      case FusionKind::kOutput:
+        return "output fusion";
       case FusionKind::kTransposeDot:
         return "dot fusion";
       case FusionKind::kConvBackwardFilter:
@@ -1618,7 +1620,6 @@ bool HloInstruction::IsFusable() const {
 
   // Some kinds of instructions don't make sense to fuse.
   switch (opcode_) {
-    case HloOpcode::kFusion:
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
     case HloOpcode::kParameter:
@@ -2186,6 +2187,8 @@ string ToString(HloInstruction::FusionKind kind) {
       return "kLoop";
     case HloInstruction::FusionKind::kInput:
       return "kInput";
+    case HloInstruction::FusionKind::kOutput:
+      return "kOutput";
     case HloInstruction::FusionKind::kTransposeDot:
       return "kTransposeDot";
     case HloInstruction::FusionKind::kConvBackwardFilter:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 5ec17c80048..43935690dff 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -54,7 +54,8 @@ class HloInstruction {
  public:
   enum class FusionKind {
     kLoop,                // Fused into a loop.
-    kInput,               // Fused into a reduction kernel.
+    kInput,               // Op's input is fused into the op itself.
+    kOutput,              // Op's output is fused into the op itself.
     kTransposeDot,        // Fused into a dot with transposed operands.
     kConvBackwardFilter,  // Fused into a backward filter convolution.
     kConvBackwardInput,   // Fused into a backward input convolution.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index 15e457f932c..25f2922bf8e 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -379,7 +379,12 @@ def multi_label_head(n_classes,
                      loss_fn=None):
   """Creates a Head for multi label classification.
 
-  The Head uses sigmoid cross entropy loss.
+  Multi-label classification handles the case where each example may have zero
+  or more associated labels, from a discrete set.  This is distinct from
+  `multi_class_head` which has exactly one label from a discrete set.
+
+  This head by default uses sigmoid cross entropy loss, which expects as input
+  a multi-hot tensor of shape `(batch_size, num_classes)`.
 
   Args:
     n_classes: Integer, number of classes, must be >= 2
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 1617addba05..119bc0f8997 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2417,6 +2417,9 @@ tf_cc_test(
         ":test_main",
         ":testlib",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:functional_ops",
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:function_ops",
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 13e20568fff..3644279b920 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -1001,25 +1001,19 @@ string NewName(const Node* n, bool pretty) {
 void ToGraphDef(const Graph* g, GraphDef* gdef, bool pretty) {
   // We visit nodes in forward topological sort order, which is a
   // possible execution order of the graph.
-  std::vector<size_t> pending(g->num_node_ids());
-  std::deque<const Node*> ready;
-  for (const Node* n : g->nodes()) {
-    pending[n->id()] = n->in_edges().size();
-    if (pending[n->id()] == 0) ready.push_back(n);
-  }
   gtl::InlinedVector<const Edge*, 4> inputs;
   gdef->Clear();
   gdef->mutable_versions()->CopyFrom(g->versions());
-  while (!ready.empty()) {
-    const Node* n = ready.front();
-    ready.pop_front();
-    for (const Edge* e : n->out_edges()) {
-      const Node* next = e->dst();
-      if (--pending[next->id()] == 0) {
-        ready.push_back(next);
-      }
+
+  std::vector<Node*> start_nodes;
+  for (Node* n : g->nodes()) {
+    if (n->out_edges().empty()) {
+      start_nodes.push_back(n);
     }
-    if (!n->IsOp()) continue;
+  }
+
+  ReverseDFSFrom(*g, start_nodes, nullptr, [gdef, pretty, &inputs](Node* n) {
+    if (!n->IsOp()) return;
     NodeDef* ndef = gdef->add_node();
     ndef->set_name(NewName(n, pretty));
     ndef->set_op(n->type_string());
@@ -1054,7 +1048,7 @@ void ToGraphDef(const Graph* g, GraphDef* gdef, bool pretty) {
         ndef->add_input(strings::StrCat(srcname, ":", e->src_output()));
       }
     }
-  }
+  });
 }
 
 string DebugString(const Graph* g) {
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index 8f70ab8783c..af1ff6aec03 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -17,6 +17,10 @@ limitations under the License.
 
 #include <atomic>
 
+#include "tensorflow/cc/ops/array_ops_internal.h"
+#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/functional_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/executor.h"
@@ -28,10 +32,12 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
 
@@ -58,13 +64,8 @@ class FunctionTest : public ::testing::Test {
       : device_(DeviceFactory::NewDevice("CPU", {},
                                          "/job:localhost/replica:0/task:0")) {}
 
-  ~FunctionTest() override {
-    delete exec_;
-    delete device_;
-  }
-
   void Create(const FunctionDef& fdef, InstantiateAttrValueSlice attrs) {
-    delete exec_;
+    exec_ = nullptr;
     InstantiationResult result;
     TF_CHECK_OK(InstantiateFunction(fdef, attrs, GetOpSig, &result));
 
@@ -79,15 +80,18 @@ class FunctionTest : public ::testing::Test {
 
     const int version = g->versions().producer();
     LocalExecutorParams params;
-    params.device = device_;
+    params.device = device_.get();
     params.create_kernel = [this, version](const NodeDef& ndef,
                                            OpKernel** kernel) {
-      return CreateNonCachedKernel(device_, nullptr, ndef, version, kernel);
+      return CreateNonCachedKernel(device_.get(), nullptr, ndef, version,
+                                   kernel);
     };
     params.delete_kernel = [](OpKernel* kernel) {
       DeleteNonCachedKernel(kernel);
     };
-    TF_CHECK_OK(NewLocalExecutor(params, g, &exec_));
+    Executor* exec;
+    TF_CHECK_OK(NewLocalExecutor(params, g, &exec));
+    exec_.reset(exec);
   }
 
   void Run(const std::vector<Tensor>& args, std::vector<Tensor*> rets) {
@@ -105,8 +109,8 @@ class FunctionTest : public ::testing::Test {
     }
   }
 
-  Device* device_ = nullptr;
-  Executor* exec_ = nullptr;
+  std::unique_ptr<Device> device_;
+  std::unique_ptr<Executor> exec_;
   DataTypeVector arg_types_;
   DataTypeVector ret_types_;
 };
@@ -136,21 +140,15 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
       : device_(DeviceFactory::NewDevice("CPU", {},
                                          "/job:localhost/replica:0/task:0")) {}
 
-  ~FunctionLibraryRuntimeTest() override {
-    delete lib_;
-    delete lib_def_;
-    delete device_;
-  }
-
   void Init(const std::vector<FunctionDef>& flib) {
     FunctionDefLibrary proto;
     for (const auto& fdef : flib) *(proto.add_function()) = fdef;
-    delete lib_def_;
-    lib_def_ = new FunctionLibraryDefinition(OpRegistry::Global(), proto);
-    delete lib_;
+    lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto));
     OptimizerOptions opts;
-    lib_ = NewFunctionLibraryRuntime(nullptr, Env::Default(), device_,
-                                     TF_GRAPH_DEF_VERSION, lib_def_, opts);
+    lib_.reset(NewFunctionLibraryRuntime(nullptr, Env::Default(), device_.get(),
+                                         TF_GRAPH_DEF_VERSION, lib_def_.get(),
+                                         opts));
+    fdef_lib_ = lib_def_->ToProto();
   }
 
   Status Run(const string& name, InstantiateAttrValueSlice attrs,
@@ -190,7 +188,8 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     return Status::OK();
   }
 
-  Graph* GetFuncBody(const string& name, InstantiateAttrValueSlice attrs) {
+  std::unique_ptr<Graph> GetFuncBody(const string& name,
+                                     InstantiateAttrValueSlice attrs) {
     FunctionLibraryRuntime::Handle handle;
     Status status = lib_->Instantiate(name, attrs, &handle);
     if (!status.ok()) {
@@ -199,12 +198,13 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     }
     const FunctionBody* fbody = lib_->GetFunctionBody(handle);
     CHECK_NOTNULL(fbody);
-    Graph* ret = new Graph(lib_def_);
-    CopyGraph(*fbody->graph, ret);
+    std::unique_ptr<Graph> ret(new Graph(lib_def_.get()));
+    CopyGraph(*fbody->graph, ret.get());
     return ret;
   }
 
-  Graph* GetGradBody(const string& func, InstantiateAttrValueSlice attrs) {
+  std::unique_ptr<Graph> GetGradBody(const string& func,
+                                     InstantiateAttrValueSlice attrs) {
     FunctionLibraryRuntime::Handle handle;
     Status status = lib_->Instantiate(func, attrs, &handle);
     if (!status.ok()) {
@@ -213,17 +213,17 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     }
     const FunctionBody* fbody = lib_->GetFunctionBody(handle);
     CHECK_NOTNULL(fbody);
-    FunctionBody* gbody = SymbolicGradient(*fbody);
+    std::unique_ptr<FunctionBody> gbody(SymbolicGradient(*fbody));
     CHECK_NOTNULL(gbody);
-    Graph* ret = new Graph(lib_def_);
-    CopyGraph(*gbody->graph, ret);
-    delete gbody;
+    std::unique_ptr<Graph> ret(new Graph(lib_def_.get()));
+    CopyGraph(*gbody->graph, ret.get());
     return ret;
   }
 
-  Device* device_ = nullptr;
-  FunctionLibraryDefinition* lib_def_ = nullptr;
-  FunctionLibraryRuntime* lib_ = nullptr;
+  std::unique_ptr<Device> device_;
+  std::unique_ptr<FunctionLibraryDefinition> lib_def_;
+  std::unique_ptr<FunctionLibraryRuntime> lib_;
+  FunctionDefLibrary fdef_lib_;
 };
 
 TEST_F(FunctionLibraryRuntimeTest, IsStateful) {
@@ -254,113 +254,174 @@ TEST_F(FunctionLibraryRuntimeTest, XTimesN) {
   test::ExpectTensorEqual<float>(y, test::AsTensor<float>({16, 32, 48, 64}));
 }
 
+// Adds a function call to 'scope.
+// TODO(phawkins): replace with C++ API for calling functions, when that exists.
+Output Call(Scope* scope, const string& op_name, const string& fn_name,
+            gtl::ArraySlice<Input> inputs) {
+  NodeDef def;
+  NodeDefBuilder builder(op_name, fn_name, scope->graph()->op_registry());
+  for (const Input& input : inputs) {
+    builder.Input(input.node()->name(), input.index(),
+                  input.node()->output_type(input.index()));
+  }
+  TF_CHECK_OK(builder.Finalize(&def));
+  Status status;
+  Node* n = scope->graph()->AddNode(def, &status);
+  TF_CHECK_OK(status);
+  for (int i = 0; i < inputs.size(); ++i) {
+    scope->graph()->AddEdge(inputs[i].node(), inputs[i].index(), n, i);
+  }
+  return Output(n);
+}
+
 TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctions) {
   Init({test::function::XTimesTwo(), test::function::XTimesFour(),
         test::function::XTimes16()});
-  Graph* g = GetFuncBody("XTimes16", {{"T", DT_FLOAT}});
+  std::unique_ptr<Graph> g = GetFuncBody("XTimes16", {{"T", DT_FLOAT}});
   ASSERT_TRUE(g != nullptr);
-  const char* e0 = R"P(
-(n2:float) -> (n4:float) {
-  n3 = XTimesFour[T=float](n2)
-  n4 = XTimesFour[T=float](n3)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(g));
 
-  ExpandInlineFunctions(lib_, g);
-  const char* e1 = R"P(
-(n2:float) -> (n17:float) {
-  n10 = Identity[T=float](n2)
-  n7 = XTimesTwo[T=float](n10)
-  n8 = XTimesTwo[T=float](n7)
-  n11 = Identity[T=float](n8)
-  n16 = Identity[T=float](n11)
-  n13 = XTimesTwo[T=float](n16)
-  n14 = XTimesTwo[T=float](n13)
-  n17 = Identity[T=float](n14)
-}
-)P";
-  EXPECT_EQ(e1, DebugString(g));
+  {
+    Scope s = Scope::NewRootScope();
+    TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
+    auto arg = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto a = Call(&s, "x4", "XTimesFour", {arg});
+    auto b = Call(&s, "y", "XTimesFour", {a});
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), b, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
 
-  ExpandInlineFunctions(lib_, g);
-  const char* e2 = R"P(
-(n2:float) -> (n17:float) {
-  n18 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n25 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n32 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n39 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n19 = Cast[DstT=float, SrcT=int64](n18)
-  n26 = Cast[DstT=float, SrcT=int64](n25)
-  n33 = Cast[DstT=float, SrcT=int64](n32)
-  n40 = Cast[DstT=float, SrcT=int64](n39)
-  n10 = Identity[T=float](n2)
-  n23 = Identity[T=float](n10)
-  n21 = Mul[T=float](n23, n19)
-  n24 = Identity[T=float](n21)
-  n30 = Identity[T=float](n24)
-  n28 = Mul[T=float](n30, n26)
-  n31 = Identity[T=float](n28)
-  n11 = Identity[T=float](n31)
-  n16 = Identity[T=float](n11)
-  n37 = Identity[T=float](n16)
-  n35 = Mul[T=float](n37, n33)
-  n38 = Identity[T=float](n35)
-  n44 = Identity[T=float](n38)
-  n42 = Mul[T=float](n44, n40)
-  n45 = Identity[T=float](n42)
-  n17 = Identity[T=float](n45)
-}
-)P";
-  EXPECT_EQ(e2, DebugString(g));
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  ExpandInlineFunctions(lib_.get(), g.get());
+  {
+    Scope s = Scope::NewRootScope();
+    TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto func0 = ops::Identity(s.WithOpName("Func/_0"), x);
+    auto x4_x2 = Call(&s, "x4/x2", "XTimesTwo", {func0});
+    auto x4_y = Call(&s, "x4/y", "XTimesTwo", {x4_x2});
+    auto func1 = ops::Identity(s.WithOpName("Func/_1"), x4_y);
+    auto func2 = ops::Identity(s.WithOpName("Func/_2"), func1);
+    auto y_x2 = Call(&s, "y/x2", "XTimesTwo", {func2});
+    auto y_y = Call(&s, "y/y", "XTimesTwo", {y_x2});
+    auto func3 = ops::Identity(s.WithOpName("Func/_3"), y_y);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), func3, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  ExpandInlineFunctions(lib_.get(), g.get());
+  GraphDef e2;
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto x4_x2_two = ops::Const<int64>(s.WithOpName("x4/x2/two"), 2LL);
+    auto x4_y_two = ops::Const<int64>(s.WithOpName("x4/y/two"), 2LL);
+    auto y_x2_two = ops::Const<int64>(s.WithOpName("y/x2/two"), 2LL);
+    auto y_y_two = ops::Const<int64>(s.WithOpName("y/y/two"), 2LL);
+    auto x4_x2_scale =
+        ops::Cast(s.WithOpName("x4/x2/scale"), x4_x2_two, DT_FLOAT);
+    auto x4_y_scale = ops::Cast(s.WithOpName("x4/y/scale"), x4_y_two, DT_FLOAT);
+    auto y_x2_scale = ops::Cast(s.WithOpName("y/x2/scale"), y_x2_two, DT_FLOAT);
+    auto y_y_scale = ops::Cast(s.WithOpName("y/y/scale"), y_y_two, DT_FLOAT);
+    auto func0 = ops::Identity(s.WithOpName("Func/_0"), x);
+    auto func4 = ops::Identity(s.WithOpName("Func/_4"), func0);
+    auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), func4, x4_x2_scale);
+    auto func5 = ops::Identity(s.WithOpName("Func/_5"), x4_x2_y);
+    auto func6 = ops::Identity(s.WithOpName("Func/_6"), func5);
+    auto x4_y_y = ops::Mul(s.WithOpName("x4/y/y"), func6, x4_y_scale);
+    auto func7 = ops::Identity(s.WithOpName("Func/_7"), x4_y_y);
+    auto func1 = ops::Identity(s.WithOpName("Func/_1"), func7);
+    auto func2 = ops::Identity(s.WithOpName("Func/_2"), func1);
+    auto func8 = ops::Identity(s.WithOpName("Func/_8"), func2);
+    auto y_x2_y = ops::Mul(s.WithOpName("y/x2/y"), func8, y_x2_scale);
+    auto func9 = ops::Identity(s.WithOpName("Func/_9"), y_x2_y);
+    auto func10 = ops::Identity(s.WithOpName("Func/_10"), func9);
+    auto y_y_y = ops::Mul(s.WithOpName("y/y/y"), func10, y_y_scale);
+    auto func11 = ops::Identity(s.WithOpName("Func/_11"), y_y_y);
+    auto func3 = ops::Identity(s.WithOpName("Func/_3"), func11);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), func3, 0);
+    TF_ASSERT_OK(s.ToGraphDef(&e2));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(e2, actual);
+  }
 
   // No further inlining.
-  ExpandInlineFunctions(lib_, g);
-  EXPECT_EQ(e2, DebugString(g));
+  ExpandInlineFunctions(lib_.get(), g.get());
+  {
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(e2, actual);
+  }
 
   // Get rid of redundant Identity nodes.
-  RemoveIdentityNodes(g);
-  const char* e3 = R"P(
-(n2:float) -> (n42:float) {
-  n18 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n25 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n32 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n39 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n19 = Cast[DstT=float, SrcT=int64](n18)
-  n26 = Cast[DstT=float, SrcT=int64](n25)
-  n33 = Cast[DstT=float, SrcT=int64](n32)
-  n40 = Cast[DstT=float, SrcT=int64](n39)
-  n21 = Mul[T=float](n2, n19)
-  n28 = Mul[T=float](n21, n26)
-  n35 = Mul[T=float](n28, n33)
-  n42 = Mul[T=float](n35, n40)
-}
-)P";
-  EXPECT_EQ(e3, DebugString(g));
-  delete g;
+  RemoveIdentityNodes(g.get());
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto x4_x2_two = ops::Const<int64>(s.WithOpName("x4/x2/two"), 2LL);
+    auto x4_y_two = ops::Const<int64>(s.WithOpName("x4/y/two"), 2LL);
+    auto y_x2_two = ops::Const<int64>(s.WithOpName("y/x2/two"), 2LL);
+    auto y_y_two = ops::Const<int64>(s.WithOpName("y/y/two"), 2LL);
+    auto x4_x2_scale =
+        ops::Cast(s.WithOpName("x4/x2/scale"), x4_x2_two, DT_FLOAT);
+    auto x4_y_scale = ops::Cast(s.WithOpName("x4/y/scale"), x4_y_two, DT_FLOAT);
+    auto y_x2_scale = ops::Cast(s.WithOpName("y/x2/scale"), y_x2_two, DT_FLOAT);
+    auto y_y_scale = ops::Cast(s.WithOpName("y/y/scale"), y_y_two, DT_FLOAT);
+    auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), x, x4_x2_scale);
+    auto x4_y_y = ops::Mul(s.WithOpName("x4/y/y"), x4_x2_y, x4_y_scale);
+    auto y_x2_y = ops::Mul(s.WithOpName("y/x2/y"), x4_y_y, y_x2_scale);
+    auto y_y_y = ops::Mul(s.WithOpName("y/y/y"), y_x2_y, y_y_scale);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y_y_y, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, OptimizeGraph) {
   Init({test::function::XTimesTwo(), test::function::XTimesFour(),
         test::function::XTimes16()});
-  std::unique_ptr<Graph> g(GetFuncBody("XTimes16", {{"T", DT_FLOAT}}));
+  std::unique_ptr<Graph> g = GetFuncBody("XTimes16", {{"T", DT_FLOAT}});
   ASSERT_TRUE(g != nullptr);
-  ExpandInlineFunctions(lib_, g.get());
-  OptimizeGraph(lib_, &g);
-  const char* e0 = R"P(
-(n2:float) -> (n7:float) {
-  n8 = Const[dtype=float, value=Tensor<type: float shape: [] values: 2>]()
-  n4 = Mul[T=float](n2, n8)
-  n5 = Mul[T=float](n4, n8)
-  n6 = Mul[T=float](n5, n8)
-  n7 = Mul[T=float](n6, n8)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(g.get()));
+  ExpandInlineFunctions(lib_.get(), g.get());
+  OptimizeGraph(lib_.get(), &g);
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto x4_x2_scale = ops::Const<float>(
+        s.WithOpName("x4/x2/scale/_12__cf__2")
+            .WithDevice("/job:localhost/replica:0/task:0/cpu:0"),
+        2.0f);
+    auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), x, x4_x2_scale);
+    auto x4_y_y = ops::Mul(s.WithOpName("x4/y/y"), x4_x2_y, x4_x2_scale);
+    auto y_x2_y = ops::Mul(s.WithOpName("y/x2/y"), x4_y_y, x4_x2_scale);
+    auto y_y_y = ops::Mul(s.WithOpName("y/y/y"), y_x2_y, x4_x2_scale);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y_y_y, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, ManySwapsNodeDef) {
   auto func = FDH::Create(  // Creates a FunctionDef using NodeDefs
-      // Name
+                            // Name
       "ManySwapsNodeDef",
       // Input
       {"x: float", "y: float"},
@@ -379,9 +440,9 @@ TEST_F(FunctionLibraryRuntimeTest, ManySwapsNodeDef) {
       // Return
       {{"o", "g:output"}});
   Init({test::function::Swap(), func});
-  std::unique_ptr<Graph> g(GetFuncBody("ManySwapsNodeDef", {}));
+  std::unique_ptr<Graph> g = GetFuncBody("ManySwapsNodeDef", {});
   ASSERT_TRUE(g != nullptr);
-  OptimizeGraph(lib_, &g);
+  OptimizeGraph(lib_.get(), &g);
   const char* e0 = R"P(
 (n3:float, n2:float) -> (n3:float) {
 }
@@ -412,24 +473,35 @@ TEST_F(FunctionLibraryRuntimeTest, ControlDeps) {
        {{"o"}, "Add", {"x2:z:0", "y2:z:0"}, {{"T", DT_FLOAT}}}},
       {{"o", "o:z:0"}});
   Init({test::function::Swap(), func});
-  std::unique_ptr<Graph> g(GetFuncBody("ManySwapsFirst", {}));
+  std::unique_ptr<Graph> g = GetFuncBody("ManySwapsFirst", {});
   ASSERT_TRUE(g != nullptr);
-  OptimizeGraph(lib_, &g);
+  OptimizeGraph(lib_.get(), &g);
 
-  // NOTE: We can remove n8, n9, n10, n11 with a control edge n8->n5.
+  // NOTE: We can remove func0, func1, func2, func9 with a control edge n8->n5.
   // But we don't have a pass doing that.
-  const char* e0 = R"P(
-(n3:float, n2:float) -> (n6:float) {
-  n4 = Mul[T=float](n3, n3)
-  n8 = NoOp() @ n4
-  n9 = Identity[T=float](n3) @ n8
-  n10 = Identity[T=float](n2) @ n8
-  n11 = NoOp() @ n9, n10
-  n5 = Mul[T=float](n2, n2) @ n11
-  n6 = Add[T=float](n4, n5)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(g.get()));
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto x2 = ops::Mul(s.WithOpName("x2"), x, x);
+    auto func0 = ops::NoOp(s.WithOpName("Func/_0").WithControlDependencies(x2));
+    auto func1 = ops::Identity(
+        s.WithOpName("Func/_1").WithControlDependencies({func0}), x);
+    auto func2 = ops::Identity(
+        s.WithOpName("Func/_2").WithControlDependencies({func0}), y);
+    auto func9 = ops::NoOp(s.WithOpName("Func/_9").WithControlDependencies(
+        {func1.output.op(), func2.output.op()}));
+    auto y2 =
+        ops::Mul(s.WithOpName("y2").WithControlDependencies({func9}), y, y);
+    auto o = ops::Add(s.WithOpName("o"), x2, y2);
+    auto ret = ops::_Retval(s.WithOpName("o_RetVal"), o, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, Error_NotFound) {
@@ -476,84 +548,136 @@ TEST_F(FunctionLibraryRuntimeTest, Error_InstantiaionError) {
 TEST_F(FunctionLibraryRuntimeTest, Gradient_XTimesTwo) {
   Init({test::function::XTimesTwo(), test::function::XTimesFour(),
         test::function::XTimes16()});
-  auto f = GetFuncBody("XTimesTwo", {{"T", DT_FLOAT}});
-  const char* e0 = R"P(
-(n4:float) -> (n5:float) {
-  n2 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n3 = Cast[DstT=float, SrcT=int64](n2)
-  n5 = Mul[T=float](n4, n3)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(f));
-  delete f;
-  std::unique_ptr<Graph> g(GetGradBody("XTimesTwo", {{"T", DT_FLOAT}}));
-  const char* e1 = R"P(
-(n4:float, n6:float) -> (n7:float) {
-  n2 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n3 = Cast[DstT=float, SrcT=int64](n2)
-  n5 = Mul[T=float](n4, n3)
-  n7 = SymbolicGradient[Tin={float, float, float}, Tout={float, float}, f=Mul[T=float]](n4, n3, n6)
-}
-)P";
-  EXPECT_EQ(e1, DebugString(g.get()));
+  std::unique_ptr<Graph> f = GetFuncBody("XTimesTwo", {{"T", DT_FLOAT}});
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto two = ops::Const(s.WithOpName("two"), 2LL);
+    auto scale = ops::Cast(s.WithOpName("scale"), two, DT_FLOAT);
+    auto y = ops::Mul(s.WithOpName("y"), x, scale);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
 
-  OptimizeGraph(lib_, &g);
-  const char* e2 = R"P(
-(n2:float, n3:float) -> (n9:float) {
-  n10 = Const[dtype=float, value=Tensor<type: float shape: [] values: 2>]()
-  n11 = Const[dtype=int32, value=Tensor<type: int32 shape: [0] values: >]()
-  n6 = Shape[T=float, out_type=int32](n2)
-  n5 = Mul[T=float](n3, n10)
-  n7 = BroadcastGradientArgs[T=int32](n6, n11)
-  n8 = Sum[T=float, Tidx=int32, keep_dims=false](n5, n7)
-  n9 = Reshape[T=float, Tshape=int32](n8, n6)
-}
-)P";
-  EXPECT_EQ(e2, DebugString(g.get()));
+    GraphDef actual;
+    f->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  std::unique_ptr<Graph> g = GetGradBody("XTimesTwo", {{"T", DT_FLOAT}});
+
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto func0 = ops::_Arg(s.WithOpName("Func/_0"), DT_FLOAT, 1);
+    auto two = ops::Const(s.WithOpName("two"), 2LL);
+    auto scale = ops::Cast(s.WithOpName("scale"), two, DT_FLOAT);
+    auto y = ops::Mul(s.WithOpName("y"), x, scale);
+    NameAttrList fn;
+    fn.set_name("Mul");
+    (*fn.mutable_attr())["T"].set_type(DT_FLOAT);
+    auto func1 = ops::SymbolicGradient(
+        s.WithOpName("Func/_1"), std::initializer_list<Input>{x, scale, func0},
+        {DT_FLOAT, DT_FLOAT}, fn);
+    auto func2 = ops::_Retval(s.WithOpName("Func/_2"), func1[0], 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  OptimizeGraph(lib_.get(), &g);
+
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto func0 = ops::_Arg(s.WithOpName("Func/_0"), DT_FLOAT, 1);
+    auto scale =
+        ops::Const(s.WithOpName("scale/_5__cf__6")
+                       .WithDevice("/job:localhost/replica:0/task:0/cpu:0"),
+                   2.0f);
+    auto func1_gx = ops::Mul(s.WithOpName("Func/_1/gx"), func0, scale);
+    auto func1_sx = ops::Shape(s.WithOpName("Func/_1/sx"), x);
+    auto const0 =
+        ops::Const(s.WithOpName("Func/_1/sy/_6__cf__7")
+                       .WithDevice("/job:localhost/replica:0/task:0/cpu:0"),
+                   0, {0});
+    auto func1_rx = ops::internal::BroadcastGradientArgs(
+        s.WithOpName("Func/_1/rx"), func1_sx, const0);
+    auto func1_sum_gx =
+        ops::Sum(s.WithOpName("Func/_1/sum_gx"), func1_gx, func1_rx.r0);
+    auto func1_dx =
+        ops::Reshape(s.WithOpName("Func/_1/dx"), func1_sum_gx, func1_sx);
+    auto func2 = ops::_Retval(s.WithOpName("Func/_2"), func1_dx, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, Gradient_Add) {
   Init({});
   auto T = DT_FLOAT;
-  auto g = GetFuncBody("SymbolicGradient",
-                       {{"f", FDH::FunctionRef("Add", {{"T", T}})}});
-  const char* e0 = R"P(
-(n7:float, n5:float, n2:float) -> (n14:float, n11:float) {
-  n3 = Identity[T=float](n2)
-  n4 = Identity[T=float](n2)
-  n6 = Shape[T=float, out_type=int32](n5)
-  n8 = Shape[T=float, out_type=int32](n7)
-  n9 = BroadcastGradientArgs[T=int32](n8, n6)
-  n10 = Sum[T=float, Tidx=int32, keep_dims=false](n3, n9:1)
-  n13 = Sum[T=float, Tidx=int32, keep_dims=false](n4, n9)
-  n11 = Reshape[T=float, Tshape=int32](n10, n6)
-  n14 = Reshape[T=float, Tshape=int32](n13, n8)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(g));
-  delete g;
+  std::unique_ptr<Graph> g = GetFuncBody(
+      "SymbolicGradient", {{"f", FDH::FunctionRef("Add", {{"T", T}})}});
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto dz = ops::_Arg(s.WithOpName("dz"), DT_FLOAT, 2);
+    auto gx = ops::Identity(s.WithOpName("gx"), dz);
+    auto gy = ops::Identity(s.WithOpName("gy"), dz);
+    auto sx = ops::Shape(s.WithOpName("sx"), x);
+    auto sy = ops::Shape(s.WithOpName("sy"), y);
+    auto rx = ops::internal::BroadcastGradientArgs(s.WithOpName("rx"), sx, sy);
+    auto sum_gx = ops::Sum(s.WithOpName("sum_gx"), gx, rx.r0);
+    auto sum_gy = ops::Sum(s.WithOpName("sum_gy"), gy, rx.r1);
+    auto dx = ops::Reshape(s.WithOpName("dx"), sum_gx, sx);
+    auto dy = ops::Reshape(s.WithOpName("dy"), sum_gy, sy);
+    auto dx_ret = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0);
+    auto dy_ret = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, Gradient_Mul) {
   Init({});
   auto T = DT_FLOAT;
-  auto g = GetFuncBody("SymbolicGradient",
-                       {{"f", FDH::FunctionRef("Mul", {{"T", T}})}});
-  const char* e0 = R"P(
-(n6:float, n3:float, n2:float) -> (n14:float, n11:float) {
-  n4 = Mul[T=float](n2, n3)
-  n5 = Shape[T=float, out_type=int32](n3)
-  n7 = Mul[T=float](n6, n2)
-  n8 = Shape[T=float, out_type=int32](n6)
-  n9 = BroadcastGradientArgs[T=int32](n8, n5)
-  n10 = Sum[T=float, Tidx=int32, keep_dims=false](n7, n9:1)
-  n13 = Sum[T=float, Tidx=int32, keep_dims=false](n4, n9)
-  n11 = Reshape[T=float, Tshape=int32](n10, n5)
-  n14 = Reshape[T=float, Tshape=int32](n13, n8)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(g));
-  delete g;
+  std::unique_ptr<Graph> g = GetFuncBody(
+      "SymbolicGradient", {{"f", FDH::FunctionRef("Mul", {{"T", T}})}});
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto dz = ops::_Arg(s.WithOpName("dz"), DT_FLOAT, 2);
+    auto gx = ops::Mul(s.WithOpName("gx"), dz, y);
+    auto sx = ops::Shape(s.WithOpName("sx"), x);
+    auto gy = ops::Mul(s.WithOpName("gy"), x, dz);
+    auto sy = ops::Shape(s.WithOpName("sy"), y);
+    auto rx = ops::internal::BroadcastGradientArgs(s.WithOpName("rx"), sx, sy);
+    auto sum_gx = ops::Sum(s.WithOpName("sum_gx"), gx, rx.r0);
+    auto sum_gy = ops::Sum(s.WithOpName("sum_gy"), gy, rx.r1);
+    auto dx = ops::Reshape(s.WithOpName("dx"), sum_gx, sx);
+    auto dy = ops::Reshape(s.WithOpName("dy"), sum_gy, sy);
+    auto dx_ret = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0);
+    auto dy_ret = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, Gradient_AddSum) {
@@ -570,108 +694,170 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_AddSum) {
                           });
 
   // TestGrad = Test'(x, y)
-  auto grad =
-      FDH::Define("TestGrad", {"x:float", "y:float"}, {"dx:float", "dy:float"},
-                  {}, {FDH::Const<float>("dz", 1),
-                       {{"grad0", "grad1"},
-                        "SymbolicGradient",
-                        {"x", "y", "dz"},
-                        {
-                            {"f", FDH::FunctionRef("Test")},
-                            {"Tin", DataTypeSlice{T, T, T}},
-                            {"Tout", DataTypeSlice{T, T}},
-                        }},
-                       {{"dx"}, "Identity", {"grad0"}, {{"T", DT_FLOAT}}},
-                       {{"dy"}, "Identity", {"grad1"}, {{"T", DT_FLOAT}}}});
+  auto grad = FDH::Define("TestGrad", {"x:float", "y:float"},
+                          {"dx:float", "dy:float"}, {},
+                          {FDH::Const<float>("dz", 1),
+                           {{"grad0", "grad1"},
+                            "SymbolicGradient",
+                            {"x", "y", "dz"},
+                            {
+                                {"f", FDH::FunctionRef("Test")},
+                                {"Tin", DataTypeSlice{T, T, T}},
+                                {"Tout", DataTypeSlice{T, T}},
+                            }},
+                           {{"dx"}, "Identity", {"grad0"}, {{"T", DT_FLOAT}}},
+                           {{"dy"}, "Identity", {"grad1"}, {{"T", DT_FLOAT}}}});
 
   Init({test, grad});
 
-  std::unique_ptr<Graph> g(GetFuncBody("TestGrad", {}));
+  std::unique_ptr<Graph> g = GetFuncBody("TestGrad", {});
   ASSERT_TRUE(g != nullptr);
-  const char* e0 = R"P(
-(n4:float, n3:float) -> (n8:float, n6:float) {
-  n2 = Const[dtype=float, value=Tensor<type: float shape: [] values: 1>]()
-  n5 = SymbolicGradient[Tin={float, float, float}, Tout={float, float}, f=Test](n4, n3, n2)
-  n6 = Identity[T=float](n5:1)
-  n8 = Identity[T=float](n5)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(g.get()));
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto dz = ops::Const(s.WithOpName("dz"), 1.0f);
+    NameAttrList fn;
+    fn.set_name("Test");
+    auto grad0 = ops::SymbolicGradient(s.WithOpName("grad0"),
+                                       std::initializer_list<Input>{x, y, dz},
+                                       {DT_FLOAT, DT_FLOAT}, fn);
+    auto dx = ops::Identity(s.WithOpName("dx"), grad0[0]);
+    auto dy = ops::Identity(s.WithOpName("dy"), grad0[1]);
+    auto dx_retval = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0);
+    auto dy_retval = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
 
-  ExpandInlineFunctions(lib_, g.get());
-  const char* e1 = R"P(
-(n4:float, n3:float) -> (n8:float, n6:float) {
-  n10 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 1>]()
-  n11 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  n2 = Const[dtype=float, value=Tensor<type: float shape: [] values: 1>]()
-  n26 = Identity[T=float](n2)
-  n25 = Identity[T=float](n3)
-  n24 = Identity[T=float](n4)
-  n14 = Add[T=float](n24, n25)
-  n15 = Rank[T=float](n14)
-  n16 = Range[Tidx=int32](n11, n15, n10)
-  n20 = ZerosLike[T=int32](n15)
-  n17 = Sum[T=float, Tidx=int32, keep_dims=false](n14, n16)
-  n19 = SymbolicGradient[Tin={float, int32, float}, Tout={float, int32}, f=Sum[T=float, Tidx=int32, keep_dims=false]](n14, n16, n26)
-  n21 = SymbolicGradient[Tin={float, float, float}, Tout={float, float}, f=Add[T=float]](n24, n25, n19)
-  n27 = Identity[T=float](n21)
-  n28 = Identity[T=float](n21:1)
-  n8 = Identity[T=float](n27)
-  n6 = Identity[T=float](n28)
-}
-)P";
-  EXPECT_EQ(e1, DebugString(g.get()));
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 
-  OptimizeGraph(lib_, &g);
-  const char* e2 = R"P(
-(n4:float, n3:float) -> (n25:float, n23:float) {
-  n2 = Const[dtype=float, value=Tensor<type: float shape: [] values: 1>]()
-  n7 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 1>]()
-  n8 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  n19 = Shape[T=float, out_type=int32](n3)
-  n9 = Add[T=float](n4, n3)
-  n20 = Shape[T=float, out_type=int32](n4)
-  n10 = Rank[T=float](n9)
-  n14 = Shape[T=float, out_type=int32](n9)
-  n21 = BroadcastGradientArgs[T=int32](n20, n19)
-  n11 = Range[Tidx=int32](n8, n10, n7)
-  n12 = Shape[T=int32, out_type=int32](n11)
-  n13 = Fill[T=int32](n12, n7)
-  n15 = DynamicStitch[N=2, T=int32](n11, n11, n14, n13)
-  n16 = Reshape[T=float, Tshape=int32](n2, n15)
-  n17 = Div[T=int32](n14, n15)
-  n18 = Tile[T=float, Tmultiples=int32](n16, n17)
-  n22 = Sum[T=float, Tidx=int32, keep_dims=false](n18, n21:1)
-  n24 = Sum[T=float, Tidx=int32, keep_dims=false](n18, n21)
-  n23 = Reshape[T=float, Tshape=int32](n22, n19)
-  n25 = Reshape[T=float, Tshape=int32](n24, n20)
-}
-)P";
-  EXPECT_EQ(e2, DebugString(g.get()));
+  ExpandInlineFunctions(lib_.get(), g.get());
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto dz = ops::Const(s.WithOpName("dz"), 1.0f);
+    auto grad0_zero = ops::Const(s.WithOpName("grad0/zero"), 0);
+    auto grad0_one = ops::Const(s.WithOpName("grad0/one"), 1);
+    auto func0 = ops::Identity(s.WithOpName("Func/_0"), x);
+    auto func1 = ops::Identity(s.WithOpName("Func/_1"), y);
+    auto func2 = ops::Identity(s.WithOpName("Func/_2"), dz);
+    auto grad0_z = ops::Add(s.WithOpName("grad0/z"), func0, func1);
+    auto grad0_r = ops::Rank(s.WithOpName("grad0/r"), grad0_z);
+    auto grad0_indices = ops::Range(s.WithOpName("grad0/indices"), grad0_zero,
+                                    grad0_r, grad0_one);
+    auto grad0_l = ops::Sum(s.WithOpName("grad0/l"), grad0_z, grad0_indices);
+
+    NameAttrList sum;
+    sum.set_name("Sum");
+    (*sum.mutable_attr())["T"].set_type(DT_FLOAT);
+    (*sum.mutable_attr())["Tidx"].set_type(DT_INT32);
+    (*sum.mutable_attr())["keep_dims"].set_b(false);
+    auto grad0_func1 = ops::SymbolicGradient(
+        s.WithOpName("grad0/Func/_1"),
+        std::initializer_list<Input>{grad0_z, grad0_indices, func2},
+        {DT_FLOAT, DT_INT32}, sum);
+
+    auto grad0_func2 = ops::ZerosLike(s.WithOpName("grad0/Func/_2"), grad0_r);
+
+    NameAttrList add;
+    add.set_name("Add");
+    (*add.mutable_attr())["T"].set_type(DT_FLOAT);
+    auto grad0_func3 = ops::SymbolicGradient(
+        s.WithOpName("grad0/Func/_3"),
+        std::initializer_list<Input>{func0, func1, grad0_func1[0]},
+        {DT_FLOAT, DT_FLOAT}, add);
+
+    auto func3 = ops::Identity(s.WithOpName("Func/_3"), grad0_func3[0]);
+    auto func4 = ops::Identity(s.WithOpName("Func/_4"), grad0_func3[1]);
+    auto dx = ops::Identity(s.WithOpName("dx"), func3);
+    auto dy = ops::Identity(s.WithOpName("dy"), func4);
+    auto dx_retval = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0);
+    auto dy_retval = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1);
+
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  OptimizeGraph(lib_.get(), &g);
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto dz = ops::Const(s.WithOpName("dz"), 1.0f);
+    auto grad0_zero = ops::Const(s.WithOpName("grad0/zero"), 0);
+    auto grad0_one = ops::Const(s.WithOpName("grad0/one"), 1);
+    auto grad0_z = ops::Add(s.WithOpName("grad0/z"), x, y);
+    auto grad0_r = ops::Rank(s.WithOpName("grad0/r"), grad0_z);
+    auto grad0_indices = ops::Range(s.WithOpName("grad0/indices"), grad0_zero,
+                                    grad0_r, grad0_one);
+    auto i_shape =
+        ops::Shape(s.WithOpName("grad0/Func/_1/i_shape"), grad0_indices);
+    auto stitch_val = ops::Fill(s.WithOpName("grad0/Func/_1/stitch_val1"),
+                                i_shape, grad0_one);
+    auto x_shape = ops::Shape(s.WithOpName("grad0/Func/_1/x_shape"), grad0_z);
+    auto y_shape = ops::DynamicStitch(
+        s.WithOpName("grad0/Func/_1/y_shape"),
+        std::initializer_list<Input>{grad0_indices, grad0_indices},
+        std::initializer_list<Input>{x_shape, stitch_val});
+    auto dy_reshaped =
+        ops::Reshape(s.WithOpName("grad0/Func/_1/dy_reshaped"), dz, y_shape);
+    auto tile_scaling =
+        ops::Div(s.WithOpName("grad0/Func/_1/tile_scaling"), x_shape, y_shape);
+    auto func1_dx =
+        ops::Tile(s.WithOpName("grad0/Func/_1/dx"), dy_reshaped, tile_scaling);
+
+    auto sx = ops::Shape(s.WithOpName("grad0/Func/_3/sx"), x);
+    auto sy = ops::Shape(s.WithOpName("grad0/Func/_3/sy"), y);
+    auto rx = ops::internal::BroadcastGradientArgs(
+        s.WithOpName("grad0/Func/_3/rx"), sx, sy);
+    auto sum_gx =
+        ops::Sum(s.WithOpName("grad0/Func/_3/sum_gx"), func1_dx, rx.r0);
+    auto sum_gy =
+        ops::Sum(s.WithOpName("grad0/Func/_3/sum_gy"), func1_dx, rx.r1);
+    auto dx = ops::Reshape(s.WithOpName("grad0/Func/_3/dx"), sum_gx, sx);
+    auto dy = ops::Reshape(s.WithOpName("grad0/Func/_3/dy"), sum_gy, sy);
+
+    auto dx_retval = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0);
+    auto dy_retval = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1);
+
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 namespace {
 
 bool DoNothing(Graph* g) { return false; }
 
-string Optimize(const std::function<bool(Graph* g)>& pass,
-                const FunctionDef& fdef) {
+GraphDef Optimize(const std::function<bool(Graph* g)>& pass,
+                  const FunctionDef& fdef) {
   InstantiationResult result;
   InstantiateAttrValueMap empty;
   TF_CHECK_OK(InstantiateFunction(fdef, empty, GetOpSig, &result));
-  Graph* g = new Graph(OpRegistry::Global());
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
   GraphConstructorOptions opts;
   opts.allow_internal_ops = true;
   opts.expect_device_spec = false;
-  TF_CHECK_OK(ConvertGraphDefToGraph(opts, result.gdef, g));
-  pass(g);
-  Graph* g1 = new Graph(OpRegistry::Global());
-  CopyGraph(*g, g1);
-  delete g;
+  TF_CHECK_OK(ConvertGraphDefToGraph(opts, result.gdef, g.get()));
+  pass(g.get());
+  std::unique_ptr<Graph> g1(new Graph(OpRegistry::Global()));
+  CopyGraph(*g, g1.get());
+  g = nullptr;
   GraphDef gdef;
   g1->ToGraphDef(&gdef);
-  delete g1;
-  return DebugString(gdef);
+  return gdef;
 }
 
 }  // end namespace
@@ -700,21 +886,25 @@ TEST(OptimizationTest, RemoveDeadNodes) {
        {{"keep_me"}, "RandomUniform", {"o"}, {{"T", T}, {"dtype", DT_FLOAT}}},
        // y = Add<T>(a, o)
        {{"y"}, "Add", {"a", "o"}, {{"T", T}}}});
-  const char* e0 = R"S(
-(x:int32) -> (y:int32) {
-  o = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 1>]()
-  keep_me = RandomUniform[T=int32, dtype=float, seed2=0, seed=0](o)
-  x1 = Add[T=int32](o, o)
-  a = Square[T=int32](x)
-  y = Add[T=int32](a, o)
-  x2 = Mul[T=int32](a, x1)
-  x3 = Mul[T=int32](x1, x2)
-}
-)S";
-  EXPECT_EQ(Optimize(DoNothing, func), e0);
+
+  GraphDef expected;
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_INT32, 0);
+    auto o = ops::Const(s.WithOpName("o"), 1);
+    auto keep_me = ops::RandomUniform(s.WithOpName("keep_me"), {o}, DT_FLOAT);
+    auto x1 = ops::Add(s.WithOpName("x1"), o, o);
+    auto a = ops::Square(s.WithOpName("a"), x);
+    auto y = ops::Add(s.WithOpName("y"), a, o);
+    auto x2 = ops::Mul(s.WithOpName("x2"), a, x1);
+    auto x3 = ops::Mul(s.WithOpName("x3"), x1, x2);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y, 0);
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+  }
+  TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func));
 
   // TODO(zhifengc): Comes up another test case.
-  EXPECT_EQ(Optimize(::tensorflow::RemoveDeadNodes, func), e0);
+  TF_EXPECT_GRAPH_EQ(expected, Optimize(::tensorflow::RemoveDeadNodes, func));
 }
 
 TEST(OptimizationTest, RemoveIdentityNodes_Ref) {
@@ -735,23 +925,19 @@ TEST(OptimizationTest, RemoveIdentityNodes_Ref) {
        {{"v_read"}, "Identity", {"v"}, {{"T", T}}},
        // returns v + v
        {{"ret"}, "Add", {"v_read", "v_read"}, {{"T", T}}}});
-  const char* e0 = R"S(
-() -> (ret:float) {
-  v = VariableV2[container="", dtype=float, shape=[], shared_name=""]()
-  v_read = Identity[T=float](v)
-  ret = Add[T=float](v_read, v_read)
-}
-)S";
-  EXPECT_EQ(Optimize(DoNothing, func), e0);
 
-  const char* e1 = R"S(
-() -> (ret:float) {
-  v = VariableV2[container="", dtype=float, shape=[], shared_name=""]()
-  v_read = Identity[T=float](v)
-  ret = Add[T=float](v_read, v_read)
-}
-)S";
-  EXPECT_EQ(Optimize(::tensorflow::RemoveIdentityNodes, func), e1);
+  GraphDef expected;
+  {
+    Scope s = Scope::NewRootScope();
+    auto v = ops::Variable(s.WithOpName("v"), PartialTensorShape({}), DT_FLOAT);
+    auto v_read = ops::Identity(s.WithOpName("v_read"), v);
+    auto ret = ops::Add(s.WithOpName("ret"), v_read, v_read);
+    auto ret_retval = ops::_Retval(s.WithOpName("ret_RetVal"), ret, 0);
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+  }
+  TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func));
+  TF_EXPECT_GRAPH_EQ(expected,
+                     Optimize(::tensorflow::RemoveIdentityNodes, func));
 }
 
 TEST(OptimizationTest, RemoveIdentityNodes) {
@@ -782,28 +968,38 @@ TEST(OptimizationTest, RemoveIdentityNodes) {
         {"x3"}},
        // y = Add<T>(a, o)
        {{"y"}, "Add", {"a", "o"}, {{"T", T}}}});
-  const char* e0 = R"S(
-(x:int32) -> (y:int32) {
-  o = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 1>]()
-  a = Square[T=int32](x)
-  y = Add[T=int32](a, o)
-  x1 = Identity[T=int32](a)
-  x2 = Identity[T=int32](x1)
-  x3 = Identity[T=int32](x2)
-  keep_me = RandomUniform[T=int32, dtype=float, seed2=0, seed=0](o) @ x3
-}
-)S";
-  EXPECT_EQ(Optimize(DoNothing, func), e0);
 
-  const char* e1 = R"S(
-(x:int32) -> (y:int32) {
-  o = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 1>]()
-  a = Square[T=int32](x)
-  y = Add[T=int32](a, o)
-  keep_me = RandomUniform[T=int32, dtype=float, seed2=0, seed=0](o) @ a
-}
-)S";
-  EXPECT_EQ(Optimize(::tensorflow::RemoveIdentityNodes, func), e1);
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_INT32, 0);
+    auto o = ops::Const(s.WithOpName("o"), 1);
+    auto a = ops::Square(s.WithOpName("a"), x);
+    auto y = ops::Add(s.WithOpName("y"), a, o);
+    auto x1 = ops::Identity(s.WithOpName("x1"), a);
+    auto x2 = ops::Identity(s.WithOpName("x2"), x1);
+    auto x3 = ops::Identity(s.WithOpName("x3"), x2);
+    auto keep_me = ops::RandomUniform(
+        s.WithOpName("keep_me").WithControlDependencies(x3), {o}, DT_FLOAT);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func));
+  }
+
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_INT32, 0);
+    auto o = ops::Const(s.WithOpName("o"), 1);
+    auto a = ops::Square(s.WithOpName("a"), x);
+    auto y = ops::Add(s.WithOpName("y"), a, o);
+    auto keep_me = ops::RandomUniform(
+        s.WithOpName("keep_me").WithControlDependencies(a), {o}, DT_FLOAT);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected,
+                       Optimize(::tensorflow::RemoveIdentityNodes, func));
+  }
 }
 
 TEST(OptimizationTest, RemoveListArrayConverter) {
@@ -840,49 +1036,63 @@ TEST(OptimizationTest, RemoveListArrayConverter) {
       // Return values
       {{"o", "o:sum"}});
 
-  const char* e0 = R"P(
-(i:float) -> (o:float) {
-  zero = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  s = Split[T=float, num_split=4](zero, i)
-  a = _ArrayToList[N=4, T=float, out_types={float, float, float, float}](s, s:1, s:2, s:3)
-  r = Mul[T=float](a:2, a:3)
-  l = Mul[T=float](a, a:1)
-  x = _ListToArray[N=2, T=float, Tin={float, float}](l, r)
-  o = AddN[N=2, T=float](x, x:1)
-}
-)P";
-  EXPECT_EQ(Optimize(DoNothing, func), e0);
+  {
+    Scope scope = Scope::NewRootScope();
+    auto i = ops::_Arg(scope.WithOpName("i"), DT_FLOAT, 0);
+    auto zero = ops::Const(scope.WithOpName("zero"), 0);
+    auto s = ops::Split(scope.WithOpName("s"), zero, i, 4);
+    auto a = ops::_ArrayToList(scope.WithOpName("a"), s.output,
+                               {DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT});
+    auto r = ops::Mul(scope.WithOpName("r"), a[2], a[3]);
+    auto l = ops::Mul(scope.WithOpName("l"), a[0], a[1]);
+    auto x = ops::_ListToArray(scope.WithOpName("x"),
+                               std::initializer_list<Input>{l, r}, DT_FLOAT, 2);
+    auto o = ops::AddN(scope.WithOpName("o"), x.output);
+    auto o_ret = ops::_Retval(scope.WithOpName("o_RetVal"), o, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(scope.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func));
+  }
 
-  const char* e1 = R"P(
-(i:float) -> (o:float) {
-  zero = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  s = Split[T=float, num_split=4](zero, i)
-  r = Mul[T=float](Func/_2, Func/_3)
-  l = Mul[T=float](Func/_0, Func/_1)
-  o = AddN[N=2, T=float](Func/_4, Func/_5)
-  Func/_0 = Identity[T=float](s)
-  Func/_1 = Identity[T=float](s:1)
-  Func/_2 = Identity[T=float](s:2)
-  Func/_3 = Identity[T=float](s:3)
-  Func/_4 = Identity[T=float](l)
-  Func/_5 = Identity[T=float](r)
-}
-)P";
-  EXPECT_EQ(Optimize(RemoveListArrayConverter, func), e1);
+  {
+    Scope scope = Scope::NewRootScope();
+    auto i = ops::_Arg(scope.WithOpName("i"), DT_FLOAT, 0);
+    auto zero = ops::Const(scope.WithOpName("zero"), 0);
+    auto s = ops::Split(scope.WithOpName("s"), zero, i, 4);
+    auto func_0 = ops::Identity(scope.WithOpName("Func/_0"), s[0]);
+    auto func_1 = ops::Identity(scope.WithOpName("Func/_1"), s[1]);
+    auto func_2 = ops::Identity(scope.WithOpName("Func/_2"), s[2]);
+    auto func_3 = ops::Identity(scope.WithOpName("Func/_3"), s[3]);
+    auto r = ops::Mul(scope.WithOpName("r"), func_2, func_3);
+    auto l = ops::Mul(scope.WithOpName("l"), func_0, func_1);
+    auto func_4 = ops::Identity(scope.WithOpName("Func/_4"), l);
+    auto func_5 = ops::Identity(scope.WithOpName("Func/_5"), r);
+    auto o = ops::AddN(scope.WithOpName("o"),
+                       std::initializer_list<Input>{func_4, func_5});
+    auto o_ret = ops::_Retval(scope.WithOpName("o_RetVal"), o, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(scope.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected, Optimize(RemoveListArrayConverter, func));
+  }
 
-  const char* e2 = R"P(
-(i:float) -> (o:float) {
-  zero = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  s = Split[T=float, num_split=4](zero, i)
-  r = Mul[T=float](s:2, s:3)
-  l = Mul[T=float](s, s:1)
-  o = AddN[N=2, T=float](l, r)
-}
-)P";
-  auto remove_listarray_and_identity = [](Graph* g) {
-    return RemoveListArrayConverter(g) && RemoveIdentityNodes(g);
-  };
-  EXPECT_EQ(Optimize(remove_listarray_and_identity, func), e2);
+  {
+    Scope scope = Scope::NewRootScope();
+    auto i = ops::_Arg(scope.WithOpName("i"), DT_FLOAT, 0);
+    auto zero = ops::Const(scope.WithOpName("zero"), 0);
+    auto s = ops::Split(scope.WithOpName("s"), zero, i, 4);
+    auto r = ops::Mul(scope.WithOpName("r"), s[2], s[3]);
+    auto l = ops::Mul(scope.WithOpName("l"), s[0], s[1]);
+    auto o =
+        ops::AddN(scope.WithOpName("o"), std::initializer_list<Input>{l, r});
+    auto o_ret = ops::_Retval(scope.WithOpName("o_RetVal"), o, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(scope.ToGraphDef(&expected));
+
+    auto remove_listarray_and_identity = [](Graph* g) {
+      return RemoveListArrayConverter(g) && RemoveIdentityNodes(g);
+    };
+    TF_EXPECT_GRAPH_EQ(expected, Optimize(remove_listarray_and_identity, func));
+  }
 }
 
 TEST(OptimizationTest, RemoveListArrayConverter_WithContolDeps) {
@@ -911,33 +1121,47 @@ TEST(OptimizationTest, RemoveListArrayConverter_WithContolDeps) {
         {"x"}}},
       {{"o", "o:sum"}});
 
-  const char* e0 = R"P(
-(i:float) -> (o:float) {
-  dummy = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  x = _ListToArray[N=2, T=float, Tin={float, float}](i, i) @ dummy
-  o = AddN[N=2, T=float](x, x:1) @ x
-}
-)P";
-  EXPECT_EQ(Optimize(DoNothing, func), e0);
+  {
+    Scope s = Scope::NewRootScope();
+    auto i = ops::_Arg(s.WithOpName("i"), DT_FLOAT, 0);
+    auto dummy = ops::Const(s.WithOpName("dummy"), 0);
+    auto x = ops::_ListToArray(s.WithOpName("x").WithControlDependencies(dummy),
+                               std::initializer_list<Input>{i, i}, DT_FLOAT, 2);
+    auto o =
+        ops::AddN(s.WithOpName("o").WithControlDependencies({x.output[0].op()}),
+                  x.output);
+    auto o_ret = ops::_Retval(s.WithOpName("o_RetVal"), o, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func));
+  }
 
-  const char* e1 = R"P(
-(i:float) -> (o:float) {
-  dummy = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  o = AddN[N=2, T=float](Func/_0, Func/_1) @ Func/_3
-  Func/_0 = Identity[T=float](i) @ Func/_2
-  Func/_1 = Identity[T=float](i) @ Func/_2
-  Func/_2 = NoOp() @ dummy
-  Func/_3 = NoOp() @ Func/_0, Func/_1
-}
-)P";
-  EXPECT_EQ(Optimize(RemoveListArrayConverter, func), e1);
+  GraphDef expected;
+  {
+    Scope s = Scope::NewRootScope();
+    auto i = ops::_Arg(s.WithOpName("i"), DT_FLOAT, 0);
+    auto dummy = ops::Const(s.WithOpName("dummy"), 0);
+    auto func_2 =
+        ops::NoOp(s.WithOpName("Func/_2").WithControlDependencies(dummy));
+    auto func_0 = ops::Identity(
+        s.WithOpName("Func/_0").WithControlDependencies({func_2}), i);
+    auto func_1 = ops::Identity(
+        s.WithOpName("Func/_1").WithControlDependencies({func_2}), i);
+    auto func_3 = ops::NoOp(s.WithOpName("Func/_3").WithControlDependencies(
+        {func_0.output.op(), func_1.output.op()}));
+    auto o = ops::AddN(s.WithOpName("o").WithControlDependencies({func_3}),
+                       std::initializer_list<Input>{func_0, func_1});
+    auto o_ret = ops::_Retval(s.WithOpName("o_RetVal"), o, 0);
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+  }
+  TF_EXPECT_GRAPH_EQ(expected, Optimize(RemoveListArrayConverter, func));
 
   auto remove_listarray_and_identity = [](Graph* g) {
     return RemoveListArrayConverter(g) && RemoveIdentityNodes(g);
   };
   // NOTE: We are not removing Identity nodes with any control
   // dependencies yet.
-  EXPECT_EQ(Optimize(remove_listarray_and_identity, func), e1);
+  TF_EXPECT_GRAPH_EQ(expected, Optimize(remove_listarray_and_identity, func));
 }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc
index 452cfdda9e6..3573cc6ec21 100644
--- a/tensorflow/core/framework/attr_value_util.cc
+++ b/tensorflow/core/framework/attr_value_util.cc
@@ -400,16 +400,33 @@ void SetAttrValue(gtl::ArraySlice<NameAttrList> value, AttrValue* out) {
   }
 }
 
+// Wrapper around protocol buffer serialization that requests deterministic
+// serialization, in particular for Map fields, which serialize in a random
+// order by default. Returns true on success.
+template <typename T>
+static bool DeterministicSerialization(const T& t, string* result) {
+  const int size = t.ByteSize();
+  *result = string(size, '\0');
+  ::tensorflow::protobuf::io::ArrayOutputStream array_stream(&(*result)[0],
+                                                             size);
+  ::tensorflow::protobuf::io::CodedOutputStream output_stream(&array_stream);
+  output_stream.SetSerializationDeterministic(true);
+  t.SerializeWithCachedSizes(&output_stream);
+  return !output_stream.HadError() && size == output_stream.ByteCount();
+}
+
 bool AreAttrValuesEqual(const AttrValue& a, const AttrValue& b) {
   string a_str, b_str;
-  a.SerializeToString(&a_str);
-  b.SerializeToString(&b_str);
+  DeterministicSerialization(a, &a_str);
+  DeterministicSerialization(b, &b_str);
   // Note: it should be safe to compare proto serializations of the attr
   // values since at most one field should be set in each (indeed, it
   // must be the same field if they are to compare equal).
   // Exception: there are multiple equivalent representations of
   // TensorProtos.  So a return value of true implies a == b, but not the
   // converse.
+  // TODO(phawkins): this is incorrect for NameAttrList attributes that may
+  // contain nested AttrValue maps.
   return a_str == b_str;
 }
 
diff --git a/tensorflow/core/graph/algorithm.cc b/tensorflow/core/graph/algorithm.cc
index 38f011ecaf1..3bfba3fc4ee 100644
--- a/tensorflow/core/graph/algorithm.cc
+++ b/tensorflow/core/graph/algorithm.cc
@@ -23,8 +23,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-void DFS(const Graph& g, std::function<void(Node*)> enter,
-         std::function<void(Node*)> leave) {
+void DFS(const Graph& g, const std::function<void(Node*)>& enter,
+         const std::function<void(Node*)>& leave) {
   // Stack of work to do.
   struct Work {
     Node* node;
@@ -61,15 +61,23 @@ void DFS(const Graph& g, std::function<void(Node*)> enter,
   }
 }
 
-void ReverseDFS(const Graph& g, std::function<void(Node*)> enter,
-                std::function<void(Node*)> leave) {
+void ReverseDFS(const Graph& g, const std::function<void(Node*)>& enter,
+                const std::function<void(Node*)>& leave) {
+  ReverseDFSFrom(g, {g.sink_node()}, enter, leave);
+}
+
+void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
+                    const std::function<void(Node*)>& enter,
+                    const std::function<void(Node*)>& leave) {
   // Stack of work to do.
   struct Work {
     Node* node;
     bool leave;  // Are we entering or leaving n?
   };
-  std::vector<Work> stack;
-  stack.push_back(Work{g.sink_node(), false});
+  std::vector<Work> stack(start.size());
+  for (int i = 0; i < start.size(); ++i) {
+    stack[i] = Work{start[i], false};
+  }
 
   std::vector<bool> visited(g.num_node_ids(), false);
   while (!stack.empty()) {
diff --git a/tensorflow/core/graph/algorithm.h b/tensorflow/core/graph/algorithm.h
index 74aace80722..01d36e0a124 100644
--- a/tensorflow/core/graph/algorithm.h
+++ b/tensorflow/core/graph/algorithm.h
@@ -21,20 +21,28 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace tensorflow {
 
 // Perform a depth-first-search on g starting at the source node.
 // If enter is not empty, calls enter(n) before visiting any children of n.
 // If leave is not empty, calls leave(n) after visiting all children of n.
-extern void DFS(const Graph& g, std::function<void(Node*)> enter,
-                std::function<void(Node*)> leave);
+extern void DFS(const Graph& g, const std::function<void(Node*)>& enter,
+                const std::function<void(Node*)>& leave);
 
 // Perform a reverse depth-first-search on g starting at the sink node.
 // If enter is not empty, calls enter(n) before visiting any parents of n.
 // If leave is not empty, calls leave(n) after visiting all parents of n.
-extern void ReverseDFS(const Graph& g, std::function<void(Node*)> enter,
-                       std::function<void(Node*)> leave);
+extern void ReverseDFS(const Graph& g, const std::function<void(Node*)>& enter,
+                       const std::function<void(Node*)>& leave);
+
+// Perform a reverse depth-first-search on g starting at the 'start' nodes.
+// If enter is not empty, calls enter(n) before visiting any parents of n.
+// If leave is not empty, calls leave(n) after visiting all parents of n.
+extern void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
+                           const std::function<void(Node*)>& enter,
+                           const std::function<void(Node*)>& leave);
 
 // Stores in *order the post-order numbering of all nodes
 // in graph found via a depth first search starting at the source node.
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index e784c2df443..22f4708d032 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -90,6 +90,23 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "robust_stats",
+    srcs = ["robust_stats.cc"],
+    hdrs = ["robust_stats.h"],
+    visibility = ["//visibility:public"],
+)
+
+cc_test(
+    name = "robust_stats_test",
+    srcs = ["robust_stats_test.cc"],
+    deps = [
+        ":robust_stats",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "utils",
     srcs = ["utils.cc"],
@@ -116,3 +133,37 @@ cc_library(
         "//tensorflow/core:lib",
     ],
 )
+
+cc_library(
+    name = "virtual_scheduler",
+    srcs = ["virtual_scheduler.cc"],
+    hdrs = ["virtual_scheduler.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/costs:cost_estimator",
+    ],
+)
+
+cc_library(
+    name = "measuring_cost_estimator",
+    srcs = ["measuring_cost_estimator.cc"],
+    hdrs = ["measuring_cost_estimator.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":robust_stats",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:grappler_item_builder",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/costs:cost_estimator",
+        "//tensorflow/core/kernels:ops_util",
+    ],
+)
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
new file mode 100644
index 00000000000..6179dc05c1e
--- /dev/null
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
@@ -0,0 +1,133 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/measuring_cost_estimator.h"
+
+#include <limits>
+
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/costs/robust_stats.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+namespace grappler {
+
+MeasuringCostEstimator::MeasuringCostEstimator(Cluster* cluster,
+                                               int measurement_steps,
+                                               int measurement_threads)
+    : measurement_steps_(measurement_steps),
+      measurement_threads_(measurement_threads) {
+  CHECK_GE(measurement_steps, 1);
+  if (measurement_threads > 0) {
+    thread_pool_.reset(new thread::ThreadPool(
+        Env::Default(), SanitizeThreadSuffix("measurements"),
+        measurement_threads));
+  }
+  cluster_ = cluster;
+}
+
+Status MeasuringCostEstimator::Initialize(const GrapplerItem& item) {
+  feed_ = item.feed;
+  fetch_ = item.fetch;
+  return cluster_->Initialize(item);
+}
+
+Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
+                                            CostGraphDef* cost_graph,
+                                            Costs* costs) const {
+  std::vector<double> times(measurement_steps_);
+  BlockingCounter barrier(measurement_steps_);
+
+  mutex status_mu;
+  Status status;
+
+  auto measurement_fn = [&](const int step) {
+    const Costs::MicroSeconds start = Env::Default()->NowMicros();
+
+    RunMetadata metadata;
+    const Status local_status =
+        cluster_->Run(optimized_graph, feed_, fetch_, &metadata);
+    {
+      mutex_lock lock(status_mu);
+      status.Update(local_status);
+    }
+    if (step < 0) {
+      // Discard the first iteration as it triggers the warmup, and therefore
+      // takes much longer than a normal step.
+      return;
+    }
+    if (!local_status.ok()) {
+      // Discard the data if the run wasn't sucessful.
+      barrier.DecrementCount();
+      return;
+    }
+
+    const Costs::MicroSeconds finish = Env::Default()->NowMicros();
+    const double time = (finish - start).count() * 1e3;
+    times[step] = time;
+
+    if (cost_graph && (step + 1 == measurement_steps_)) {
+      metadata.mutable_cost_graph()->Swap(cost_graph);
+    }
+
+    barrier.DecrementCount();
+  };
+
+  // Initialize the computation and warm up TensorFlow.
+  measurement_fn(-1);
+
+  if (!status.ok()) {
+    LOG(ERROR) << "Failed to run start measurements: "
+               << status.error_message();
+    costs->execution_time = Costs::Duration::max();
+    return status;
+  }
+
+  // Run "measurement_steps_" and measure the time.
+  if (measurement_threads_ > 0) {
+    for (int i = 0; i < measurement_steps_; ++i) {
+      thread_pool_->Schedule([i, &measurement_fn]() { measurement_fn(i); });
+    }
+    barrier.Wait();
+  } else {
+    for (int i = 0; i < measurement_steps_ && status.ok(); ++i) {
+      measurement_fn(i);
+    }
+  }
+
+  if (!status.ok()) {
+    LOG(ERROR) << "Failed to measure graph performance: "
+               << status.error_message();
+    costs->execution_time = Costs::Duration::max();
+    costs->max_execution_time = Costs::Duration::max();
+    costs->min_execution_time = 0;
+    return status;
+  }
+
+  // Compute the average time of the measure steps. Use Huber statistics
+  // to filter out outliers.
+  RobustStats stats(times);
+  costs->execution_time = Costs::Duration(stats.mean());
+  costs->max_execution_time = Costs::Duration(stats.hi());
+  costs->min_execution_time = Costs::Duration(stats.lo());
+
+  return Status::OK();
+}
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.h b/tensorflow/core/grappler/costs/measuring_cost_estimator.h
new file mode 100644
index 00000000000..a84853f6c71
--- /dev/null
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.h
@@ -0,0 +1,76 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_
+#define TENSORFLOW_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/grappler/costs/cost_estimator.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+class CostGraphDef;
+class GraphDef;
+}  // namespace tensorflow
+
+namespace tensorflow {
+namespace grappler {
+
+class Cluster;
+struct GrapplerItem;
+
+// Estimate the cost of running a Grappler item by actually running the
+// corresponding TensorFlow graph on the specified cluster and measuring the
+// runtimes.
+class MeasuringCostEstimator : public CostEstimator {
+ public:
+  // Run the model for measurement_steps to measure its average cost.
+  // When measurement_threads is greater than 0, use a threadpool of as many
+  // threads to run the measurements; otherwise, run them serially. Does not
+  // take ownership of cluster.
+  explicit MeasuringCostEstimator(Cluster* cluster, int measurement_steps,
+                                  int measurement_threads);
+  ~MeasuringCostEstimator() override {}
+
+  // Initalizes the estimator for the specified grappler item.
+  // This implementation always returns OK.
+  Status Initialize(const GrapplerItem& item) override;
+
+  // Runs the optimized version of the graph on the cluster, measure
+  // the runtimes of each operation, and annotated the CostGraphDef
+  // with the corresponding measurements.
+  // Returns the average latency for the whole graph.
+  Status PredictCosts(const GraphDef& optimized_graph, CostGraphDef* cost_graph,
+                      Costs* overall_cost) const override;
+
+ private:
+  Cluster* cluster_;  // Not owned.
+  int measurement_steps_;
+  int measurement_threads_;
+  std::vector<std::pair<string, Tensor>> feed_;
+  std::vector<string> fetch_;
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_
diff --git a/tensorflow/core/grappler/costs/robust_stats.cc b/tensorflow/core/grappler/costs/robust_stats.cc
new file mode 100644
index 00000000000..87cda1c0d2e
--- /dev/null
+++ b/tensorflow/core/grappler/costs/robust_stats.cc
@@ -0,0 +1,151 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/robust_stats.h"
+#include <algorithm>
+
+namespace tensorflow {
+namespace grappler {
+
+// Given a sorted vector of values, calculate the median.
+// Returns 0 for an empty vector.  Does not verify sortedness.
+static double SortedMedian(const std::vector<double> &values) {
+  const int n = values.size();
+  if (n == 0) return 0.0;
+  if (n & 1) {
+    return values[n / 2];
+  } else {
+    return (values[n / 2] + values[n / 2 - 1]) / 2.0;
+  }
+}
+
+// Given a vector of values (sorted or not), calculate the median.
+static double Median(std::vector<double> &&values) {
+  const size_t n = values.size();
+  if (n == 0) return 0;
+  const auto middle = values.begin() + (n / 2);
+  // Put the middle value in its place.
+  std::nth_element(values.begin(), middle, values.end());
+  if (n & 1) {
+    return *middle;
+  }
+  // Return the average of the two elements, the max_element lower than
+  // *middle is found between begin and middle as a post-cond of
+  // nth_element.
+  const auto lower_middle = std::max_element(values.begin(), middle);
+  // Preventing overflow. We know that '*lower_middle <= *middle'.
+  // If both are on oposite sides of zero, the sum won't overflow, otherwise
+  // the difference won't overflow.
+  if (*lower_middle <= 0 && *middle >= 0) {
+    return (*lower_middle + *middle) / 2;
+  }
+  return *lower_middle + (*middle - *lower_middle) / 2;
+}
+
+// Given a set of values, calculates the scaled Median Absolute Deviation (a
+// robust approximation to the standard deviation).  This is calculated as the
+// median of the absolute deviations from the median, scaled by 1.4826.  Its
+// advantage over the standard deviation is that it is not (as) affected by
+// outlier values.  Returns a pair<median, mad>.
+static std::pair<double, double> ScaledMedianAbsoluteDeviation(
+    const std::vector<double> &sorted_values) {
+  double median = SortedMedian(sorted_values);
+
+  // Next, we calculate the absolute deviations from the median,
+  // find the median of the resulting data, and scale by 1.4826.
+  std::vector<double> deviations;
+  deviations.reserve(sorted_values.size());
+  for (double d : sorted_values) {
+    deviations.push_back(std::abs(d - median));
+  }
+  double mad = Median(std::move(deviations)) * 1.4826;
+  return std::pair<double, double>(median, mad);
+}
+
+RobustStats::RobustStats(const std::vector<double> &values)
+    : RobustStats(std::vector<double>(values)) {}
+
+RobustStats::RobustStats(std::vector<double> &&values) {
+  std::sort(values.begin(), values.end());
+  lo_ = values[0];
+  hi_ = values.back();
+  HuberMAD(values);
+}
+
+// Computes an updated mean using Huber's weighting function (values beyond
+// the margin are weighted by margin / abs(value - mean).
+double UpdateHuberMean(const std::vector<double> &sorted_values, double mean,
+                       double margin) {
+  int num_within = 0;
+  double sum = 0.0;
+
+  for (double d : sorted_values) {
+    if (d < mean - margin) {
+      sum -= margin;
+    } else if (d > mean + margin) {
+      sum += margin;
+    } else {
+      sum += d;
+      ++num_within;
+    }
+  }
+
+  // It is possible, for a set with an interquartile distance of 0, i.e., with
+  // more than half of the values at the median, to encounter the case where
+  // the Huber mean drifts slightly off the median and there are no values
+  // within the margin.  In that case, just return the old mean, and the caller
+  // will quit.
+  if (num_within > 0) {
+    return sum / num_within;
+  } else {
+    return mean;
+  }
+}
+
+// Given a list of values, this approximates the stddev using the MAD and then
+// uses it to compute a Huber robust mean (sandwich mean).  A margin of
+// c*stddev is defined around the current mean, and values are weighted by
+// margin / abs(value - mean) if outside the margin, or 1 if inside.  This
+// computes the mean iteratively, because each time it changes the margin
+// shifts a bit.  It typically settles very quickly, but it's possible for it
+// to be unstable.  We limit it to 10 iterations.
+//
+void RobustStats::HuberMAD(const std::vector<double> &sorted_values) {
+  const std::pair<double, double> median_mad =
+      ScaledMedianAbsoluteDeviation(sorted_values);
+  mean_ = median_mad.first;
+  stddev_ = median_mad.second;
+
+  // c = 1.345 is the commonly used cutoff with 95% efficiency at the normal.
+  // We're using c = 1.5 to be a little more conservative, and because that's
+  // the default in S-plus.
+  // TODO(dehnert): Specialize Stats for integral types so we don't implement
+  // methods that don't make sense.
+  const double c = 1.5;
+  const double margin = c * stddev_;
+
+  // Iterate 10 times, or until the Huber mean stabilizes.
+  // If the margin is zero, we don't want mean to drift from the median.
+  if (margin > 0.0) {
+    for (int k = 0; k < 10; ++k) {
+      double old_mean = mean_;
+      mean_ = UpdateHuberMean(sorted_values, mean_, margin);
+      if (mean_ == old_mean) break;
+    }
+  }
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/robust_stats.h b/tensorflow/core/grappler/costs/robust_stats.h
new file mode 100644
index 00000000000..9d8f5bc970a
--- /dev/null
+++ b/tensorflow/core/grappler/costs/robust_stats.h
@@ -0,0 +1,42 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_COSTS_ROBUST_STATS_H_
+#define TENSORFLOW_GRAPPLER_COSTS_ROBUST_STATS_H_
+
+#include <vector>
+namespace tensorflow {
+namespace grappler {
+class RobustStats {
+ public:
+  RobustStats(const std::vector<double>& values);
+  RobustStats(std::vector<double>&& values);
+
+  double lo() const { return lo_; }
+  double hi() const { return hi_; }
+  double mean() const { return mean_; }
+
+ private:
+  void HuberMAD(const std::vector<double>& values);
+
+  double lo_;
+  double hi_;
+  double mean_;
+  double stddev_;
+};
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_COSTS_ROBUST_STATS_H_
diff --git a/tensorflow/core/grappler/costs/robust_stats_test.cc b/tensorflow/core/grappler/costs/robust_stats_test.cc
new file mode 100644
index 00000000000..924097b126d
--- /dev/null
+++ b/tensorflow/core/grappler/costs/robust_stats_test.cc
@@ -0,0 +1,63 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/robust_stats.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class RobustStatsTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    for (double d = 1.0; d <= 5.0; d += 1.0) {
+      values1_.push_back(5.0 - d);
+      values1_.push_back(5.0 + d);
+      values2_.push_back(25.0 - 2 * d);
+      values2_.push_back(25.0 + 2 * d);
+      values3_.push_back(-3.0 - d);
+      values3_.push_back(-3.0 + d);
+    }
+    values1_.push_back(5.0);  // Odd # elements, mean is 5.0
+    values3_.push_back(197.0);
+    values3_.push_back(-203.0);  // Even # elements, mean is -3.0
+  }
+
+  std::vector<double> values1_;
+  std::vector<double> values2_;
+  std::vector<double> values3_;
+};
+
+TEST_F(RobustStatsTest, Simple) {
+  RobustStats s1(values1_);
+  EXPECT_EQ(5.0, s1.mean());
+  EXPECT_EQ(0.0, s1.lo());
+  EXPECT_EQ(10.0, s1.hi());
+
+  RobustStats s2(values2_);
+  EXPECT_EQ(25.0, s2.mean());
+  EXPECT_EQ(15.0, s2.lo());
+  EXPECT_EQ(35.0, s2.hi());
+
+  RobustStats s3(values3_);
+  EXPECT_EQ(-3.0, s3.mean());
+  EXPECT_EQ(-203.0, s3.lo());
+  EXPECT_EQ(197.0, s3.hi());
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
new file mode 100644
index 00000000000..8f77d7677ac
--- /dev/null
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -0,0 +1,215 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/virtual_scheduler.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+Costs CombineCosts(const Costs& left, const Costs& right) {
+  CHECK_NE(left.max_memory, kMemoryUnknown);
+  CHECK_NE(left.max_per_op_buffers, kMemoryUnknown);
+  CHECK_NE(left.max_per_op_streaming, kMemoryUnknown);
+
+  Costs result = left;
+  result.execution_time += right.execution_time;
+  if (right.max_memory != kMemoryUnknown) {
+    result.max_memory += right.max_memory;
+  }
+  if (right.max_per_op_buffers != kMemoryUnknown) {
+    result.max_per_op_buffers =
+        std::max(left.max_per_op_buffers, right.max_per_op_buffers);
+  }
+  if (right.max_per_op_streaming != kMemoryUnknown) {
+    result.max_per_op_streaming =
+        std::max(left.max_per_op_streaming, right.max_per_op_streaming);
+  }
+  VLOG(2) << "costs execution_time=" << result.execution_time.count()
+          << " max_memory=" << result.max_memory
+          << " max_per_op_buffers=" << result.max_per_op_buffers
+          << " max_per_op_streaming=" << result.max_per_op_streaming;
+  return result;
+}
+}  // namespace
+
+VirtualScheduler::VirtualScheduler(const GraphDef& graph,
+                                   const std::vector<string>& fetch_nodes)
+    : graph_costs_(Costs::ZeroCosts()),
+      // TODO(dyoon): Use a better way than FIFO.
+      ready_nodes_(new FIFOManager()) {
+  // First, get the nodes that would run to output fetch_nodes.
+  std::vector<const NodeDef*> nodes =
+      ComputeTransitiveFanin(graph, fetch_nodes);
+
+  // TODO(dyoon): this is a bit inefficient as name_to_node is already built in
+  // ComputeTransitiveFanin().
+  std::unordered_map<string, const NodeDef*> name_to_node;
+  for (const auto& node : graph.node()) {
+    name_to_node[node.name()] = &node;
+  }
+
+  // Build node_map.
+  for (const auto* node : nodes) {
+    auto& node_state = GetNodeStateOrCreateIt(node);
+    // TODO(dyoon): add SendRecv considering devices and control dependency.
+    for (const string& input : node->input()) {
+      const NodeDef* in = name_to_node[NodeName(input)];
+      CHECK(in);
+      node_state.inputs.push_back(in);
+      auto& input_node_state = GetNodeStateOrCreateIt(in);
+      input_node_state.outputs.push_back(node);
+    }
+    if (node->input().empty()) {
+      node_state.time_ready =
+          Costs::Duration();  // Node without input: ready at time 0.
+      ready_nodes_->AddNode(node);
+    }
+  }
+}
+
+const NodeDef* VirtualScheduler::GetCurrNode() const {
+  return ready_nodes_->GetCurrNode();
+}
+
+NodeState& VirtualScheduler::GetNodeStateOrCreateIt(const NodeDef* node) {
+  auto it = node_map_.find(node);
+  if (it == node_map_.end()) {
+    it = node_map_.emplace(node, NodeState()).first;
+  }
+  return it->second;
+}
+
+bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
+  // Update graph_costs_ and per-op costs.
+  graph_costs_ = CombineCosts(graph_costs_, node_costs);
+  const auto* node = GetCurrNode();
+  const auto& op_name = node->op();
+
+  auto it = op_to_cost_.find(op_name);
+  if (it == op_to_cost_.end()) {
+    it = op_to_cost_.emplace(op_name, Costs::ZeroCosts()).first;
+  }
+  auto& op_cost = it->second;
+  op_cost = CombineCosts(op_cost, node_costs);
+
+  // Update node and device states.
+  auto& node_state = node_map_[node];
+  auto& device = device_[node->device()];
+  device.nodes_executed.push_back(node);
+  // Node is scheduled when the device is available AND all the inputs are
+  // ready; hence, time_scheduled is time_ready if time_ready > device curr
+  // time.
+  node_state.time_scheduled =
+      std::max(device.GetCurrTime(), node_state.time_ready);
+  // Override device curr time with the time_scheduled.
+  device.device_costs.execution_time = node_state.time_scheduled;
+  device.device_costs = CombineCosts(device.device_costs, node_costs);
+  auto curr_time = device.GetCurrTime();
+  node_state.time_finished = curr_time;
+
+  // Update device's per-op cost.
+  {
+    auto it = device.op_to_cost.find(op_name);
+    if (it == device.op_to_cost.end()) {
+      it = device.op_to_cost.emplace(op_name, Costs::ZeroCosts()).first;
+    }
+    auto& op_cost = it->second;
+    op_cost = CombineCosts(op_cost, node_costs);
+
+    VLOG(2) << "Op scheduled -- name: " << node->name()
+            << ", op: " << node->op() << ", device: " << node->device()
+            << ", ready: " << node_state.time_ready.count()
+            << ", scheduled: " << node_state.time_scheduled.count()
+            << ", finished: " << node_state.time_finished.count();
+
+    // Increment num_inputs_ready of the output nodes.
+    for (auto* output : node_state.outputs) {
+      auto& output_state = node_map_[output];
+      output_state.num_inputs_ready++;
+      if (output_state.num_inputs_ready == output_state.inputs.size()) {
+        // This output node is now ready.
+        output_state.time_ready = curr_time;
+        ready_nodes_->AddNode(output);
+      }
+    }
+
+    // Increment num_outputs_executed of the input nodes.
+    for (auto* input : node_state.inputs) {
+      auto& input_state = node_map_[input];
+      input_state.num_outputs_executed++;
+      if (input_state.num_outputs_executed == input_state.outputs.size()) {
+        // All the outputs are executed; no reference to this input nodel
+        input_state.time_no_reference = curr_time;
+        // TODO(dyoon): collect device memory usage; note that this input node
+        // use device memory between time_scheduled and time_no_reference.
+      }
+    }
+  }
+
+  // Remove the current node; assume FIFO.
+  ready_nodes_->RemoveCurrNode();
+  return !ready_nodes_->Empty();  // True if not empty.
+}
+
+Costs VirtualScheduler::Summary() const {
+  // Print out basic execution summary.
+  VLOG(1) << "Expected execution time: " << graph_costs_.execution_time.count();
+  VLOG(1) << "Expected max memory: " << graph_costs_.max_memory;
+  VLOG(1) << "Expected max per-op buffers: " << graph_costs_.max_per_op_buffers;
+  VLOG(1) << "Expected max per-op streaming buffers: "
+          << graph_costs_.max_per_op_streaming;
+
+  VLOG(1) << "Per-op execution time:";
+  for (const auto& op_cost_pair : op_to_cost_) {
+    const auto& op = op_cost_pair.first;
+    const auto& cost = op_cost_pair.second.execution_time.count();
+    if (cost) {  // Skip printing out zero-cost ops.
+      VLOG(1) << " + " << op << " : " << cost;
+    }
+  }
+
+  // Print per device summary
+  VLOG(1) << "Devices:";
+  Costs critical_path_costs = Costs::ZeroCosts();
+
+  for (const auto& device : device_) {
+    const auto& name = device.first;
+    const auto& state = device.second;
+    VLOG(1) << "Device = " << name
+            << ", num_nodes = " << state.nodes_executed.size()
+            << ", execution_time = " << state.GetCurrTime().count();
+    VLOG(1) << "Per-op execution time:";
+    for (const auto& op_cost_pair : state.op_to_cost) {
+      const auto& op = op_cost_pair.first;
+      const auto& cost = op_cost_pair.second.execution_time.count();
+      if (cost) {  // Skip printing out zero-cost ops.
+        VLOG(1) << " + " << op << " : " << cost;
+      }
+    }
+    if (critical_path_costs.execution_time <= state.GetCurrTime()) {
+      critical_path_costs = state.device_costs;
+    }
+  }
+
+  VLOG(1) << "Critical path execution time: "
+          << critical_path_costs.execution_time.count();
+  return critical_path_costs;
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
new file mode 100644
index 00000000000..b7785c94e04
--- /dev/null
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -0,0 +1,116 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
+
+#include <list>
+#include <memory>
+#include <unordered_map>
+
+#include "tensorflow/core/grappler/costs/cost_estimator.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+struct NodeState {
+  std::vector<const NodeDef*> inputs;
+  std::vector<const NodeDef*> outputs;
+  int num_inputs_ready;
+  int num_outputs_executed;
+  Costs::Duration time_ready;
+  Costs::Duration time_scheduled;
+  Costs::Duration time_finished;
+  Costs::Duration time_no_reference;
+
+  // Node will be ready to be executed at time_ready, scheduled at
+  // time_scheduled, and finishes execution at time_finished.
+  // Between time_scheduled and time_no_reference, the node's output tensor
+  // needs to be on the device, using up device memory.
+
+  NodeState() {
+    num_inputs_ready = 0;
+    num_outputs_executed = 0;
+    time_ready = Costs::Duration::max();
+    time_scheduled = Costs::Duration::max();
+    time_finished = Costs::Duration::max();
+    time_no_reference = Costs::Duration::max();
+  }
+};
+
+struct DeviceState {
+  std::vector<const NodeDef*> nodes_executed;
+  Costs device_costs;
+  std::map<string, Costs> op_to_cost;  // Per-op cost.
+
+  DeviceState() { device_costs = Costs::ZeroCosts(); }
+
+  Costs::Duration GetCurrTime() const { return device_costs.execution_time; }
+};
+
+// ReadyNodeManager (abstract class):
+// Keeps ready nodes and picks the best one to be scheduled.
+class ReadyNodeManager {
+ public:
+  ReadyNodeManager() {}
+  virtual ~ReadyNodeManager() {}
+  virtual void AddNode(const NodeDef* node) = 0;
+  virtual const NodeDef* GetCurrNode() const = 0;
+  virtual void RemoveCurrNode() = 0;
+  virtual bool Empty() const = 0;
+};
+
+class FIFOManager : public ReadyNodeManager {
+ public:
+  FIFOManager() : ReadyNodeManager() {}
+  ~FIFOManager() override {}
+  void AddNode(const NodeDef* node) override { nodes_.push_back(node); }
+  const NodeDef* GetCurrNode() const override { return nodes_.front(); }
+  void RemoveCurrNode() override { nodes_.pop_front(); }
+  bool Empty() const override { return nodes_.empty(); }
+
+ private:
+  std::list<const NodeDef*> nodes_;
+};
+}  // namespace
+
+// The virtual scheduler emulates execution of nodes in a graph, considering
+// dependencies, device, etc.
+class VirtualScheduler {
+ public:
+  VirtualScheduler(const GraphDef& graph,
+                   const std::vector<string>& fetch_nodes);
+
+  const NodeDef* GetCurrNode() const;
+  bool MarkCurrNodeExecuted(const Costs& node_costs);
+
+  Costs Summary() const;
+
+ private:
+  NodeState& GetNodeStateOrCreateIt(const NodeDef* node);
+
+  Costs graph_costs_;                   // Graph cost.
+  std::map<string, Costs> op_to_cost_;  // Per-op cost.
+  std::unique_ptr<ReadyNodeManager> ready_nodes_;
+  std::unordered_map<const NodeDef*, NodeState> node_map_;
+  std::unordered_map<string, DeviceState> device_;
+};
+
+}  // namespace grappler
+}  // end namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 29b4d63bbf8..0847d1279b8 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -2109,7 +2109,9 @@ tf_kernel_library(
 tf_kernel_library(
     name = "matrix_triangular_solve_op",
     prefix = "matrix_triangular_solve_op",
-    deps = LINALG_DEPS,
+    deps = LINALG_DEPS + if_cuda([
+        "//tensorflow/core/platform/default/build_config:cublas_plugin",
+    ]),
 )
 
 tf_kernel_library(
@@ -2350,6 +2352,8 @@ tf_kernel_library(
         "//conditions:default": [],
     }) + if_mkl([
         "//third_party/mkl:intel_binary_blob",
+    ]) + if_cuda([
+        "//tensorflow/core/platform/default/build_config:cublas_plugin",
     ]),
 )
 
@@ -2630,6 +2634,7 @@ tf_kernel_library(
         ],
         "//conditions:default": [],
     }) + if_cuda([
+        "//tensorflow/core/platform/default/build_config:cublas_plugin",
         "//tensorflow/core/platform/default/build_config:cudnn_plugin",
     ]),
 )
diff --git a/tensorflow/docs_src/performance/performance_models.md b/tensorflow/docs_src/performance/performance_models.md
index 027ecb195ed..d415c29aa13 100644
--- a/tensorflow/docs_src/performance/performance_models.md
+++ b/tensorflow/docs_src/performance/performance_models.md
@@ -328,12 +328,16 @@ The downside is that all the weights read are from the previous training step.
 So it is a different algorithm from SGD. But it is possible to improve its
 convergence by adjusting learning rate and other hyperparameters.
 
-### Executing the script
+## Executing the script
 
 This section lists the core command line arguments and a few basic examples for
 executing the main script
 ([tf_cnn_benchmarks.py](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py)).
 
+> Note: `tf_cnn_benchmarks.py` uses the config `force_gpu_compatible`,
+> which was introduced after TensorFlow 1.1. Until TensorFlow 1.2 is released
+> building from source is advised.
+
 #### Base command line arguments
 
 *   **`model`**: Model to use, e.g. `resnet50`, `inception3`, `vgg16`, and
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index e408506cb06..a96052a3ae5 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -139,6 +139,82 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
 
+def make_input_layer(features,
+                     feature_columns,
+                     weight_collections=None,
+                     trainable=True):
+  """Returns a dense `Tensor` as input layer based on given `feature_columns`.
+
+  Generally a single example in training data is described with FeatureColumns.
+  At the first layer of the model, this column oriented data should be converted
+  to a single `Tensor`.
+
+  Example:
+
+  ```python
+  price = numeric_column('price')
+  keywords_embedded = embedding_column(
+      categorical_column_with_hash_bucket("keywords", 10K), dimensions=16)
+  all_feature_columns = [price, keywords_embedded, ...]
+  dense_tensor = make_input_layer(features, all_feature_columns)
+  for units in [128, 64, 32]:
+    dense_tensor = tf.layers.dense(dense_tensor, units, tf.nn.relu)
+  prediction = tf.layers.dense(dense_tensor, 1)
+  ```
+
+  Args:
+    features: A mapping from key to tensors. `FeatureColumn`s look up via these
+      keys. For example `numeric_column('price') will look at 'price' key in
+      this dict. Values can be a `SparseTensor` or a `Tensor` depends on
+      corresponding `FeatureColumn`.
+    feature_columns: An iterable containing all the `FeatureColumn`s. All items
+      should be instances of classes derived from `_DenseColumn` such as
+      `numeric_column`, `embedding_column`, `bucketized_column`,
+      `indicator_column`. If you have categorical features, you can wrap them
+      with with an `embedding_column` or `indicator_column`.
+    weight_collections: A list of collection names to which the Variable will be
+      added. Note that, variables will also be added to collections
+      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
+    trainable: If `True` also add the variable to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+
+  Returns:
+    A `Tensor` which represents input layer of a model. Its shape
+    is (batch_size, first_layer_dimension) and its dtype is `float32`.
+    first_layer_dimension is determined based on given `feature_columns`.
+
+  Raises:
+    ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
+  """
+  _check_feature_columns(feature_columns)
+  for column in feature_columns:
+    if not isinstance(column, _DenseColumn):
+      raise ValueError(
+          'Items of feature_columns must be a _DenseColumn. '
+          'You can wrap a categorical column with an '
+          'embedding_column or indicator_column. Given: {}'.format(column))
+  weight_collections = list(weight_collections or [])
+  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
+  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
+  with variable_scope.variable_scope(
+      None, default_name='make_input_layer', values=features.values()):
+    builder = _LazyBuilder(features)
+    output_tensors = []
+    for column in sorted(feature_columns, key=lambda x: x.name):
+      with variable_scope.variable_scope(None, default_name=column.name):
+        tensor = column._get_dense_tensor(  # pylint: disable=protected-access
+            builder,
+            weight_collections=weight_collections,
+            trainable=trainable)
+        num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
+        batch_size = array_ops.shape(tensor)[0]
+        tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
+        output_tensors.append(tensor)
+    return array_ops.concat(output_tensors, 1)
+
+
 def make_linear_model(features,
                       feature_columns,
                       units=1,
@@ -156,10 +232,21 @@ def make_linear_model(features,
   while `make_input_layer` explicitly requires wrapping each of them with an
   `embedding_column` or an `indicator_column`.
 
+  Example:
+
+  ```python
+  price = numeric_column('price')
+  price_buckets = bucketized_column(price, boundaries=[0., 10., 100., 1000.])
+  keywords = categorical_column_with_hash_bucket("keywords", 10K)
+  all_feature_columns = [price_buckets, keywords, ...]
+  prediction = make_linear_model(features, all_feature_columns)
+  ```
+
   Args:
-    features: A mapping from key to tensors. 'string' key means a base feature.
-      It can have `_FeatureColumn` as a key too. That means that FeatureColumn
-      is already transformed by the input pipeline.
+    features: A mapping from key to tensors. `FeatureColumn`s look up via these
+      keys. For example `numeric_column('price')` will look at 'price' key in
+      this dict. Values are `Tensor` or `SparseTensor` depending on
+      corresponding `FeatureColumn`.
     feature_columns: An iterable containing all the FeatureColumns. All items
       should be instances of classes derived from FeatureColumn.
     units: units: An integer, dimensionality of the output space. Default
@@ -191,9 +278,10 @@ def make_linear_model(features,
       raise ValueError('Items of feature_columns must be either a _DenseColumn '
                        'or _CategoricalColumn. Given: {}'.format(column))
   weight_collections = list(weight_collections or [])
-  weight_collections += [
-      ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.MODEL_VARIABLES
-  ]
+  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
+  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
   with variable_scope.variable_scope(
       None, default_name='make_linear_model', values=features.values()):
     weigthed_sums = []
@@ -228,7 +316,8 @@ def numeric_column(key,
                    normalizer_fn=None):
   """Represents real valued or numerical features.
 
-  An example:
+  Example:
+
   ```python
   price = numeric_column('price')
   all_feature_columns = [price, ...]
@@ -298,7 +387,8 @@ def bucketized_column(source_column, boundaries):
   `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
   `[1., 2.)`, and `[2., +inf)`.
 
-  An example:
+  Example:
+
   ```python
   price = numeric_column('price')
   bucketized_price = bucketized_column(price, boundaries=[...])
@@ -349,7 +439,8 @@ def categorical_column_with_hash_bucket(key,
   want to distribute your inputs into a finite number of buckets by hashing.
   output_id = Hash(input_feature_string) % bucket_size
 
-  An example:
+  Example:
+
   ```python
   keywords = categorical_column_with_hash_bucket("keywords", 10K)
   all_feature_columns = [keywords, ...]
@@ -471,7 +562,7 @@ class _DenseColumn(_FeatureColumn):
 
   @abc.abstractproperty
   def _variable_shape(self):
-    """Returns shape of variable which is compatible with _get_dense_tensor."""
+    """Returns a `TensorShape` of variable compatible with _get_dense_tensor."""
     pass
 
   @abc.abstractmethod
@@ -480,6 +571,7 @@ class _DenseColumn(_FeatureColumn):
 
     The output of this function will be used by model-buildier-functions. For
     example the pseudo code of `make_input_layer` will be like that:
+
     ```python
     def make_input_layer(features, feature_columns, ...):
       outputs = [fc._get_dense_tensor(...) for fc in feature_columns]
@@ -503,7 +595,7 @@ def _create_dense_column_weighted_sum(
       builder,
       weight_collections=weight_collections,
       trainable=trainable)
-  num_elements = tensor_shape.TensorShape(column._variable_shape).num_elements()  # pylint: disable=protected-access
+  num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
   batch_size = array_ops.shape(tensor)[0]
   tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
   weight = variable_scope.get_variable(
@@ -615,12 +707,15 @@ class _LazyBuilder(object):
     """Creates a `_LazyBuilder`.
 
     Args:
-      features: A mapping from feature column to tensors. A `string` key
+      features: A mapping from feature column to objects that are `Tensor` or
+        `SparseTensor`, or can be converted to same via
+        `sparse_tensor.convert_to_tensor_or_sparse_tensor`. A `string` key
         signifies a base feature (not-transformed). A `FeatureColumn` key
         means that this `Tensor` is the output of an existing `FeatureColumn`
         which can be reused.
     """
-    self._columns_to_tensors = features.copy()
+    self._features = features.copy()
+    self._feature_tensors = {}
 
   def get(self, key):
     """Returns a `Tensor` for the given key.
@@ -640,9 +735,16 @@ class _LazyBuilder(object):
       ValueError: if key is not found or a transformed `Tensor` cannot be
         computed.
     """
-    if key in self._columns_to_tensors:
-      # Feature_column is already transformed or it's a raw feature.
-      return self._columns_to_tensors[key]
+    if key in self._feature_tensors:
+      # FeatureColumn is already transformed or converted.
+      return self._feature_tensors[key]
+
+    if key in self._features:
+      # FeatureColumn is a raw feature.
+      feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
+          self._features[key])
+      self._feature_tensors[key] = feature_tensor
+      return feature_tensor
 
     if not isinstance(key, (str, _FeatureColumn)):
       raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
@@ -653,11 +755,13 @@ class _LazyBuilder(object):
 
     column = key
     logging.debug('Transforming feature_column %s.', column)
-    transformed = column._transform_feature(self)  # pylint: disable=protected-access
+    # pylint: disable=protected-access
+    transformed = column._transform_feature(self)
+    # pylint: enable=protected-access
     if transformed is None:
       raise ValueError('Column {} is not supported.'.format(column.name))
-    self._columns_to_tensors[column] = transformed
-    return self._columns_to_tensors[column]
+    self._feature_tensors[column] = transformed
+    return transformed
 
 
 def _check_feature_columns(feature_columns):
@@ -709,7 +813,7 @@ class _NumericColumn(_DenseColumn,
 
   @property
   def _variable_shape(self):
-    return self.shape
+    return tensor_shape.TensorShape(self.shape)
 
   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     del weight_collections
@@ -738,7 +842,8 @@ class _BucketizedColumn(_DenseColumn, _CategoricalColumn,
 
   @property
   def _variable_shape(self):
-    return tuple(self.source_column.shape) + (len(self.boundaries) + 1,)
+    return tensor_shape.TensorShape(
+        tuple(self.source_column.shape) + (len(self.boundaries) + 1,))
 
   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     del weight_collections
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 32d6a4e8f0a..bc626533104 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -65,7 +65,7 @@ class LazyColumnTest(test.TestCase):
       def _parse_example_config(self):
         pass
 
-    builder = fc._LazyBuilder(features={'a': constant_op.constant([[2], [3.]])})
+    builder = fc._LazyBuilder(features={'a': [[2], [3.]]})
     column = TransformCounter()
     self.assertEqual(0, column.num_transform)
     builder.get(column)
@@ -88,7 +88,7 @@ class LazyColumnTest(test.TestCase):
       def _parse_example_config(self):
         pass
 
-    builder = fc._LazyBuilder(features={'a': constant_op.constant([[2], [3.]])})
+    builder = fc._LazyBuilder(features={'a': [[2], [3.]]})
     column = Transformer()
     self.assertEqual('Output', builder.get(column))
     self.assertEqual('Output', builder.get(column))
@@ -108,13 +108,13 @@ class LazyColumnTest(test.TestCase):
       def _parse_example_config(self):
         pass
 
-    features = {'a': constant_op.constant([[2], [3.]])}
+    features = {'a': [[2], [3.]]}
     builder = fc._LazyBuilder(features=features)
     builder.get(Transformer())
     self.assertEqual(['a'], list(features.keys()))
 
   def test_error_if_feature_is_not_found(self):
-    builder = fc._LazyBuilder(features={'a': constant_op.constant([[2], [3.]])})
+    builder = fc._LazyBuilder(features={'a': [[2], [3.]]})
     with self.assertRaisesRegexp(ValueError,
                                  'bbb is not in features dictionary'):
       builder.get('bbb')
@@ -135,7 +135,7 @@ class LazyColumnTest(test.TestCase):
       def _parse_example_config(self):
         pass
 
-    builder = fc._LazyBuilder(features={'a': constant_op.constant([[2], [3.]])})
+    builder = fc._LazyBuilder(features={'a': [[2], [3.]]})
     with self.assertRaisesRegexp(ValueError,
                                  'NotAProperColumn is not supported'):
       builder.get(NotAProperColumn())
@@ -145,7 +145,7 @@ class LazyColumnTest(test.TestCase):
     class NotAFeatureColumn(object):
       pass
 
-    builder = fc._LazyBuilder(features={'a': constant_op.constant([[2], [3.]])})
+    builder = fc._LazyBuilder(features={'a': [[2], [3.]]})
     with self.assertRaisesRegexp(
         TypeError, '"key" must be either a "str" or "_FeatureColumn".'):
       builder.get(NotAFeatureColumn())
@@ -273,7 +273,7 @@ class NumericColumnTest(test.TestCase):
 
     price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two)
     builder = fc._LazyBuilder({
-        'price': constant_op.constant([[1., 2.], [5., 6.]])
+        'price': [[1., 2.], [5., 6.]]
     })
     output = builder.get(price)
     with self.test_session():
@@ -286,7 +286,7 @@ class NumericColumnTest(test.TestCase):
 
     price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two)
     builder = fc._LazyBuilder({
-        'price': constant_op.constant([[1., 2.], [5., 6.]])
+        'price': [[1., 2.], [5., 6.]]
     })
     self.assertEqual(builder.get(price), price._get_dense_tensor(builder))
 
@@ -315,7 +315,7 @@ class NumericColumnTest(test.TestCase):
   def test_make_linear_model(self):
     price = fc.numeric_column('price')
     with ops.Graph().as_default():
-      features = {'price': constant_op.constant([[1.], [5.]])}
+      features = {'price': [[1.], [5.]]}
       predictions = fc.make_linear_model(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
@@ -402,7 +402,7 @@ class BucketizedColumnTest(test.TestCase):
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       builder = fc._LazyBuilder({
-          'price': constant_op.constant([[-1., 1.], [5., 6.]])
+          'price': [[-1., 1.], [5., 6.]]
       })
       transformed_tensor = builder.get(bucketized_price)
       with _initialized_session():
@@ -414,7 +414,7 @@ class BucketizedColumnTest(test.TestCase):
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       builder = fc._LazyBuilder({
-          'price': constant_op.constant([[-1.], [1.], [5.], [6.]])
+          'price': [[-1.], [1.], [5.], [6.]]
       })
       with _initialized_session():
         bucketized_price_tensor = bucketized_price._get_dense_tensor(builder)
@@ -432,7 +432,7 @@ class BucketizedColumnTest(test.TestCase):
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       builder = fc._LazyBuilder({
-          'price': constant_op.constant([[-1., 1.], [5., 6.]])
+          'price': [[-1., 1.], [5., 6.]]
       })
       with _initialized_session():
         bucketized_price_tensor = bucketized_price._get_dense_tensor(builder)
@@ -448,7 +448,7 @@ class BucketizedColumnTest(test.TestCase):
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       builder = fc._LazyBuilder({
-          'price': constant_op.constant([[-1.], [1.], [5.], [6.]])
+          'price': [[-1.], [1.], [5.], [6.]]
       })
       with _initialized_session() as sess:
         id_weight_pair = bucketized_price._get_sparse_tensors(builder)
@@ -465,7 +465,7 @@ class BucketizedColumnTest(test.TestCase):
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       builder = fc._LazyBuilder({
-          'price': constant_op.constant([[-1., 1.], [5., 6.]])
+          'price': [[-1., 1.], [5., 6.]]
       })
       with _initialized_session() as sess:
         id_weight_pair = bucketized_price._get_sparse_tensors(builder)
@@ -502,7 +502,7 @@ class BucketizedColumnTest(test.TestCase):
     price = fc.numeric_column('price', shape=[1])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
-      features = {'price': constant_op.constant([[-1.], [1.], [5.], [6.]])}
+      features = {'price': [[-1.], [1.], [5.], [6.]]}
       predictions = fc.make_linear_model(features, [bucketized_price])
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
@@ -527,7 +527,7 @@ class BucketizedColumnTest(test.TestCase):
     price = fc.numeric_column('price', shape=[2])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
-      features = {'price': constant_op.constant([[-1., 1.], [5., 6.]])}
+      features = {'price': [[-1., 1.], [5., 6.]]}
       predictions = fc.make_linear_model(features, [bucketized_price])
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
@@ -621,15 +621,15 @@ class SparseColumnHashedTest(test.TestCase):
     float_fc = fc.categorical_column_with_hash_bucket(
         'a_float', 10, dtype=dtypes.string)
     int_tensor = sparse_tensor.SparseTensor(
-        values=constant_op.constant([101]),
+        values=[101],
         indices=[[0, 0]],
         dense_shape=[1, 1])
     string_tensor = sparse_tensor.SparseTensor(
-        values=constant_op.constant(['101']),
+        values=['101'],
         indices=[[0, 0]],
         dense_shape=[1, 1])
     float_tensor = sparse_tensor.SparseTensor(
-        values=constant_op.constant([101.]),
+        values=[101.],
         indices=[[0, 0]],
         dense_shape=[1, 1])
     builder = fc._LazyBuilder({
@@ -745,7 +745,7 @@ class MakeLinearModelTest(test.TestCase):
   def test_dense_bias(self):
     price = fc.numeric_column('price')
     with ops.Graph().as_default():
-      features = {'price': constant_op.constant([[1.], [5.]])}
+      features = {'price': [[1.], [5.]]}
       predictions = fc.make_linear_model(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
@@ -848,7 +848,7 @@ class MakeLinearModelTest(test.TestCase):
   def test_dense_multi_output(self):
     price = fc.numeric_column('price')
     with ops.Graph().as_default():
-      features = {'price': constant_op.constant([[1.], [5.]])}
+      features = {'price': [[1.], [5.]]}
       predictions = fc.make_linear_model(features, [price], units=3)
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
@@ -885,7 +885,7 @@ class MakeLinearModelTest(test.TestCase):
   def test_dense_multi_dimension(self):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
-      features = {'price': constant_op.constant([[1., 2.], [5., 6.]])}
+      features = {'price': [[1., 2.], [5., 6.]]}
       predictions = fc.make_linear_model(features, [price])
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
@@ -913,7 +913,7 @@ class MakeLinearModelTest(test.TestCase):
   def test_dense_multi_dimension_multi_output(self):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
-      features = {'price': constant_op.constant([[1., 2.], [5., 6.]])}
+      features = {'price': [[1., 2.], [5., 6.]]}
       predictions = fc.make_linear_model(features, [price], units=3)
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
@@ -928,7 +928,7 @@ class MakeLinearModelTest(test.TestCase):
   def test_raises_if_shape_mismatch(self):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
-      features = {'price': constant_op.constant([[1.], [5.]])}
+      features = {'price': [[1.], [5.]]}
       predictions = fc.make_linear_model(features, [price])
       with _initialized_session():
         with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
@@ -937,7 +937,7 @@ class MakeLinearModelTest(test.TestCase):
   def test_dense_reshaping(self):
     price = fc.numeric_column('price', shape=[1, 2])
     with ops.Graph().as_default():
-      features = {'price': constant_op.constant([[[1., 2.]], [[5., 6.]]])}
+      features = {'price': [[[1., 2.]], [[5., 6.]]]}
       predictions = fc.make_linear_model(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
@@ -953,8 +953,8 @@ class MakeLinearModelTest(test.TestCase):
     price2 = fc.numeric_column('price2')
     with ops.Graph().as_default():
       features = {
-          'price1': constant_op.constant([[1., 2.], [5., 6.]]),
-          'price2': constant_op.constant([[3.], [4.]])
+          'price1': [[1., 2.], [5., 6.]],
+          'price2': [[3.], [4.]]
       }
       predictions = fc.make_linear_model(features, [price1, price2])
       bias = get_linear_model_bias()
@@ -973,7 +973,7 @@ class MakeLinearModelTest(test.TestCase):
   def test_dense_collection(self):
     price = fc.numeric_column('price')
     with ops.Graph().as_default() as g:
-      features = {'price': constant_op.constant([[1.], [5.]])}
+      features = {'price': [[1.], [5.]]}
       fc.make_linear_model(features, [price], weight_collections=['my-vars'])
       my_vars = g.get_collection('my-vars')
       bias = get_linear_model_bias()
@@ -998,7 +998,7 @@ class MakeLinearModelTest(test.TestCase):
   def test_dense_trainable_default(self):
     price = fc.numeric_column('price')
     with ops.Graph().as_default() as g:
-      features = {'price': constant_op.constant([[1.], [5.]])}
+      features = {'price': [[1.], [5.]]}
       fc.make_linear_model(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
@@ -1022,7 +1022,7 @@ class MakeLinearModelTest(test.TestCase):
   def test_dense_trainable_false(self):
     price = fc.numeric_column('price')
     with ops.Graph().as_default() as g:
-      features = {'price': constant_op.constant([[1.], [5.]])}
+      features = {'price': [[1.], [5.]]}
       fc.make_linear_model(features, [price], trainable=False)
       trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       self.assertEqual([], trainable_vars)
@@ -1074,5 +1074,89 @@ class MakeLinearModelTest(test.TestCase):
       self.assertIn('wire_cast', my_vars[2].name)
 
 
+class MakeInputLayerTest(test.TestCase):
+
+  def test_should_be_dense_column(self):
+    with self.assertRaisesRegexp(ValueError, 'must be a _DenseColumn'):
+      fc.make_input_layer(
+          features={'a': [[0]]},
+          feature_columns=[
+              fc.categorical_column_with_hash_bucket('wire_cast', 4)
+          ])
+
+  def test_does_not_support_dict_columns(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Expected feature_columns to be iterable, found dict.'):
+      fc.make_input_layer(
+          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+
+  def test_raises_if_duplicate_name(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Duplicate feature column name found for columns'):
+      fc.make_input_layer(
+          features={'a': [[0]]},
+          feature_columns=[fc.numeric_column('a'),
+                           fc.numeric_column('a')])
+
+  def test_one_column(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      net = fc.make_input_layer(features, [price])
+      with _initialized_session():
+        self.assertAllClose([[1.], [5.]], net.eval())
+
+  def test_multi_dimension(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      net = fc.make_input_layer(features, [price])
+      with _initialized_session():
+        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+
+  def test_raises_if_shape_mismatch(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      net = fc.make_input_layer(features, [price])
+      with _initialized_session():
+        with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
+          net.eval()
+
+  def test_reshaping(self):
+    price = fc.numeric_column('price', shape=[1, 2])
+    with ops.Graph().as_default():
+      features = {'price': [[[1., 2.]], [[5., 6.]]]}
+      net = fc.make_input_layer(features, [price])
+      with _initialized_session():
+        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+
+  def test_multi_column(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1., 2.], [5., 6.]],
+          'price2': [[3.], [4.]]
+      }
+      net = fc.make_input_layer(features, [price1, price2])
+      with _initialized_session():
+        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], net.eval())
+
+  def test_column_order(self):
+    price_a = fc.numeric_column('price_a')
+    price_b = fc.numeric_column('price_b')
+    with ops.Graph().as_default():
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+      }
+      net1 = fc.make_input_layer(features, [price_a, price_b])
+      net2 = fc.make_input_layer(features, [price_b, price_a])
+      with _initialized_session():
+        self.assertAllClose([[1., 3.]], net1.eval())
+        self.assertAllClose([[1., 3.]], net2.eval())
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 69d1a6f60e1..245dcc96db7 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -774,6 +774,11 @@ class VariableScopeTest(test.TestCase):
         self.assertEqual([v.name
                           for v in scope.global_variables()], ["foo/b:0"])
 
+  def testGetVariableWithRefDtype(self):
+    v = variable_scope.get_variable("v", shape=[3, 4], dtype=dtypes.float32)
+    # Ensure it is possible to do get_variable with a _ref dtype passed in.
+    _ = variable_scope.get_variable("w", shape=[5, 6], dtype=v.dtype)
+
 
 def axis0_into1_partitioner(shape=None, **unused_kwargs):
   part = [1] * len(shape)
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 76719f35b80..43addbe5a52 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -280,6 +280,17 @@ class _VariableStore(object):
       raise ValueError(
           "Passed a custom_getter which is not callable: %s" % custom_getter)
 
+    # If a *_ref type is passed in an error would be triggered further down the
+    # stack. We prevent this using base_dtype to get a non-ref version of the
+    # type, before doing anything else. When _ref types are removed in favour of
+    # resources, this line can be removed.
+    try:
+      dtype = dtype.base_dtype
+    except AttributeError:
+      # .base_dtype not existing means that we will try and use the raw dtype
+      # which was passed in - this might be a NumPy type which is valid.
+      pass
+
     # This is the main logic of get_variable.  However, custom_getter
     # may override this logic.  So we save it as a callable and pass
     # it to custom_getter.
diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py
index 93e64b4ab0b..277c11386dd 100644
--- a/tensorflow/python/training/supervisor.py
+++ b/tensorflow/python/training/supervisor.py
@@ -994,7 +994,7 @@ class SVSummaryThread(coordinator.LooperThread):
       summary_strs = self._sess.run(self._sv.summary_op)
       global_step = None
     if self._sv.summary_writer:
-      logging.info("Recording summary at step %d.", global_step)
+      logging.info("Recording summary at step %s.", global_step)
       self._sv.summary_writer.add_summary(summary_strs, global_step)
 
 
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index e441321fc86..9b8e23babd6 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -227,7 +227,7 @@ string ToString(CUresult result) {
 // created by StreamExecutor (to ensure that the CUDA runtime didn't create a
 // context behind our backs).
 CUcontext CurrentContext() {
-  CUcontext current  = CUDADriver::CurrentContextOrDie();
+  CUcontext current = CUDADriver::CurrentContextOrDie();
   if (current != nullptr && !CreatedContexts::Has(current)) {
     LOG(FATAL) << "current context was not created by the StreamExecutor "
                   "cuda_driver API: "
@@ -480,27 +480,56 @@ bool DeviceOptionsToContextFlags(DeviceOptions device_options, int *flags) {
     CUdevice device, DeviceOptions device_options, CudaContext** context) {
   *context = nullptr;
 
-  CUcontext former_context = CurrentContext();
-  if (former_context != nullptr) {
-    LOG(WARNING) << "creating context when one is currently active; existing: "
-                 << former_context;
-  }
-
   int flags = 0;
   if (!DeviceOptionsToContextFlags(device_options, &flags)) {
     LOG(WARNING) << "could not convert all device options into context flags";
   }
 
   CUresult res;
+  CUcontext former_context;
   CUcontext new_context;
   {
     // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
     // context creation: see http://b/13248943
 
 #if CUDA_VERSION >= 7000
-    res = cuDevicePrimaryCtxSetFlags(device, flags);
+    {
+      unsigned int former_primary_context_flags;
+      int former_primary_context_is_active;
+      CHECK_EQ(CUDA_SUCCESS,
+               cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
+                                          &former_primary_context_is_active));
+      if (former_primary_context_flags != flags) {
+        if (former_primary_context_is_active) {
+          LOG(ERROR)
+              << "The primary context is active and has a different flag set ("
+              << former_primary_context_flags << ") than the desired flag set ("
+              << flags << ").";
+        } else {
+          CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
+        }
+      }
+    }
+
+    former_context = CUDADriver::CurrentContextOrDie();
     res = cuDevicePrimaryCtxRetain(&new_context, device);
+    if (former_context != nullptr) {
+      if (former_context == new_context) {
+        VLOG(2) << "The primary context " << former_context
+                << " exists before initializing the StreamExecutor.";
+      } else {
+        LOG(WARNING) << "A non-primary context " << former_context
+                     << " exists before initializing the StreamExecutor. We "
+                        "haven't verified StreamExecutor works with that.";
+      }
+    }
 #else
+    former_context = CurrentContext();
+    if (former_context != nullptr) {
+      LOG(WARNING)
+          << "creating context when one is currently active; existing: "
+          << former_context;
+    }
     res = cuCtxCreate(&new_context, flags, device);
 #endif
   }