diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 729d84a07b0..7f80bf94132 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -37,7 +37,10 @@ config_setting(
 
 package_group(
     name = "internal",
-    packages = ["//tensorflow/..."],
+    packages = [
+        "//learning/vis/...",
+        "//tensorflow/...",
+    ],
 )
 
 sh_binary(
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 54c33c9dffa..58557e9ba29 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -482,7 +482,6 @@ static void TF_Run_Helper(
     result = session->PRun(handle, input_pairs, output_tensor_names, &outputs);
   }
   if (!result.ok()) {
-    LOG(ERROR) << result.error_message();
     status->status = result;
     return;
   }
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index 7c347c6cf67..8e6c3d7e62b 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -73,6 +73,46 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "grad_op_registry",
+    srcs = ["framework/grad_op_registry.cc"],
+    hdrs = ["framework/grad_op_registry.h"],
+    deps = [
+        ":ops",
+        ":scope",
+    ],
+)
+
+cc_library(
+    name = "math_grad",
+    srcs = ["gradients/math_grad.cc"],
+    deps = [
+        ":cc_ops",
+        ":grad_op_registry",
+        ":ops",
+        ":scope",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+    ],
+)
+
+tf_cc_test(
+    name = "gradients/math_grad_test",
+    deps = [
+        ":cc_ops",
+        ":grad_op_registry",
+        ":math_grad",
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_gen_op_wrappers_cc(
     name = "cc_ops",
     op_lib_names = [
diff --git a/tensorflow/cc/framework/grad_op_registry.cc b/tensorflow/cc/framework/grad_op_registry.cc
new file mode 100644
index 00000000000..b83e7de61c6
--- /dev/null
+++ b/tensorflow/cc/framework/grad_op_registry.cc
@@ -0,0 +1,42 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/framework/grad_op_registry.h"
+
+namespace tensorflow {
+namespace ops {
+
+// static
+GradOpRegistry* GradOpRegistry::Global() {
+  static GradOpRegistry* grad_op_registry = new GradOpRegistry;
+  return grad_op_registry;
+}
+
+bool GradOpRegistry::Register(const string& op, GradFunc func) {
+  CHECK(registry_.insert({op, func}).second) << "Existing gradient for " << op;
+  return true;
+}
+
+Status GradOpRegistry::Lookup(const string& op, GradFunc* func) {
+  auto iter = registry_.find(op);
+  if (iter == registry_.end()) {
+    return errors::NotFound("No gradient defined for op: ", op);
+  }
+  *func = iter->second;
+  return Status::OK();
+}
+
+}  // end namespace ops
+}  // namespace tensorflow
diff --git a/tensorflow/cc/framework/grad_op_registry.h b/tensorflow/cc/framework/grad_op_registry.h
new file mode 100644
index 00000000000..b8a15219e52
--- /dev/null
+++ b/tensorflow/cc/framework/grad_op_registry.h
@@ -0,0 +1,75 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_
+#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_
+
+#include <unordered_map>
+
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope.h"
+
+namespace tensorflow {
+namespace ops {
+
+// GradFunc is the signature for all gradient functions in GradOpRegistry.
+// Implementations should add operations to compute the gradient outputs of 'op'
+// (returned in 'grad_outputs') using 'scope' and 'grad_inputs'.
+typedef Status (*GradFunc)(const Scope& scope, const Operation& op,
+                           const std::vector<Output>& grad_inputs,
+                           std::vector<Output>* grad_outputs);
+
+// GradOpRegistry maintains a static registry of gradient functions.
+// Gradient functions are indexed in the registry by the forward op name (i.e.
+// "MatMul" -> MatMulGrad func).
+class GradOpRegistry {
+ public:
+  // Registers 'func' as the the gradient function for 'op'.
+  // Returns true if registration was succesful, check fails otherwise.
+  bool Register(const string& op, GradFunc func);
+
+  // Sets 'func' to the gradient function for 'op' and returns Status OK if
+  // the gradient function for 'op' exists in the registry.
+  // Note that 'func' can be null for ops that have registered no-gradient with
+  // the registry.
+  // Returns error status otherwise.
+  Status Lookup(const string& op, GradFunc* func);
+
+  // Returns a pointer to the global gradient function registry.
+  static GradOpRegistry* Global();
+
+ private:
+  std::unordered_map<string, GradFunc> registry_;
+};
+
+}  // namespace ops
+
+// Macros used to define gradient functions for ops.
+#define REGISTER_GRADIENT_OP(name, fn) \
+  REGISTER_GRADIENT_OP_UNIQ_HELPER(__COUNTER__, name, fn)
+
+#define REGISTER_NO_GRADIENT_OP(name) \
+  REGISTER_GRADIENT_OP_UNIQ_HELPER(__COUNTER__, name, nullptr)
+
+#define REGISTER_GRADIENT_OP_UNIQ_HELPER(ctr, name, fn) \
+  REGISTER_GRADIENT_OP_UNIQ(ctr, name, fn)
+
+#define REGISTER_GRADIENT_OP_UNIQ(ctr, name, fn) \
+  static bool unused_ret_val_##ctr =             \
+      ::tensorflow::ops::GradOpRegistry::Global()->Register(name, fn)
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_
diff --git a/tensorflow/cc/framework/ops.cc b/tensorflow/cc/framework/ops.cc
index dd107a57fd0..52ef39f5070 100644
--- a/tensorflow/cc/framework/ops.cc
+++ b/tensorflow/cc/framework/ops.cc
@@ -18,6 +18,44 @@ limitations under the License.
 namespace tensorflow {
 namespace ops {
 
+Operation::Operation(Node* n) : inputs_(GetInputs(n)), node_(n) {}
+
+Output Operation::input(int i) const {
+  CHECK_NOTNULL(node_);
+  CHECK_GE(i, 0);
+  CHECK_LT(i, node_->num_inputs());
+  // Handle the case where the input was unknown at the time this
+  // Operation was constructed.
+  if (inputs_[i].first == nullptr && inputs_[i].second == -1) {
+    for (const Edge* e : node_->in_edges()) {
+      if (e->IsControlEdge()) continue;
+      if (e->dst_input() == i) {
+        return Output(e->src(), e->src_output());
+      }
+    }
+  }
+  return Output(inputs_[i].first, inputs_[i].second);
+}
+
+Output Operation::output(int i) const {
+  CHECK_NOTNULL(node_);
+  CHECK_GE(i, 0);
+  CHECK_LT(i, node_->num_outputs());
+  return Output(node_, i);
+}
+
+Operation::Inputs Operation::GetInputs(Node* node) {
+  Operation::Inputs inputs;
+  if (node != nullptr) {
+    inputs.resize(node->num_inputs(), {nullptr, -1});
+    for (const Edge* e : node->in_edges()) {
+      if (e->IsControlEdge()) continue;
+      inputs[e->dst_input()] = std::make_pair(e->src(), e->src_output());
+    }
+  }
+  return inputs;
+}
+
 Input::Initializer::Initializer(
     const std::initializer_list<Input::Initializer>& v) {
   if (v.size() < 1) {
diff --git a/tensorflow/cc/framework/ops.h b/tensorflow/cc/framework/ops.h
index 1737f043cb0..517598d9e86 100644
--- a/tensorflow/cc/framework/ops.h
+++ b/tensorflow/cc/framework/ops.h
@@ -27,17 +27,29 @@ limitations under the License.
 namespace tensorflow {
 namespace ops {
 
+class Output;
+
 // Represents a node in the computation graph.
 class Operation {
  public:
   Operation() : node_(nullptr) {}
-  explicit Operation(Node* n) : node_(n) {}
+  explicit Operation(Node* n);
+
+  int num_inputs() const { return node_->num_inputs(); }
+  DataType input_type(int o) const { return node_->input_type(o); }
+  Output input(int i) const;
 
   int num_outputs() const { return node_->num_outputs(); }
   DataType output_type(int o) const { return node_->output_type(o); }
+  Output output(int i) const;
+
   Node* node() const { return node_; }
 
  private:
+  typedef std::vector<std::pair<Node*, int64>> Inputs;
+  static Inputs GetInputs(Node* node);
+
+  Inputs inputs_;
   Node* node_;
 };
 
@@ -81,7 +93,7 @@ class Input {
       tensor = t;
     }
 
-    explicit Initializer(const Tensor& t) : tensor(t) {}
+    Initializer(const Tensor& t) : tensor(t) {}  // NOLINT(runtime/explicit)
 
     // Construct from a scalar value and an explicit shape
     template <typename T, typename = typename std::enable_if<
diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc
new file mode 100644
index 00000000000..85093015b7c
--- /dev/null
+++ b/tensorflow/cc/gradients/math_grad.cc
@@ -0,0 +1,91 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/ops/standard_ops.h"
+
+#include "tensorflow/cc/framework/grad_op_registry.h"
+
+namespace tensorflow {
+namespace ops {
+namespace {
+
+// TODO(andydavis) Move this to a more appropriate file.
+REGISTER_NO_GRADIENT_OP("Const");
+
+// MatMulGrad helper function used to compute two MatMul operations
+// based on input matrix transposition combinations.
+Status MatMulGradHelper(const Scope& scope, const Output& x0, const bool adj_x0,
+                        const Output& x1, const bool adj_x1, const Output& y0,
+                        const bool adj_y0, const Output& y1, const bool adj_y1,
+                        std::vector<Output>* grad_outputs) {
+  auto dx =
+      MatMul(scope, x0, x1, MatMul::TransposeA(adj_x0).TransposeB(adj_x1));
+  grad_outputs->push_back(dx);
+  auto dy =
+      MatMul(scope, y0, y1, MatMul::TransposeA(adj_y0).TransposeB(adj_y1));
+  grad_outputs->push_back(dy);
+  return Status::OK();
+}
+
+// MatMulGrad common used to read and check node attr state, and determine
+// proper MatMul products for gradients based on input matrix transposition
+// combinations.
+// TODO(andydavis) Re-use this function for BatchMatMulGrad.
+Status MatMulGradCommon(const Scope& scope, const Operation& op,
+                        const std::vector<Output>& grad_inputs,
+                        const string& attr_adj_x, const string& attr_adj_y,
+                        std::vector<Output>* grad_outputs) {
+  DataType dtype;
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.output(0).node()->def(), "T", &dtype));
+  if (dtype == DT_COMPLEX64 || dtype == DT_COMPLEX128) {
+    return errors::Unimplemented(
+        "MatMul gradient for complex data type is not supported yet.");
+  }
+
+  bool ta;
+  bool tb;
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.output(0).node()->def(), attr_adj_x, &ta));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.output(0).node()->def(), attr_adj_y, &tb));
+
+  if (!ta && !tb) {
+    return MatMulGradHelper(scope, grad_inputs[0], false, op.input(1), true,
+                            op.input(0), true, grad_inputs[0], false,
+                            grad_outputs);
+  } else if (!ta && tb) {
+    return MatMulGradHelper(scope, grad_inputs[0], false, op.input(1), false,
+                            grad_inputs[0], true, op.input(0), false,
+                            grad_outputs);
+  } else if (ta && !tb) {
+    return MatMulGradHelper(scope, op.input(1), false, grad_inputs[0], true,
+                            op.input(0), false, grad_inputs[0], false,
+                            grad_outputs);
+  }
+  return MatMulGradHelper(scope, op.input(1), true, grad_inputs[0], true,
+                          grad_inputs[0], true, op.input(0), true,
+                          grad_outputs);
+}
+
+Status MatMulGrad(const Scope& scope, const Operation& op,
+                  const std::vector<Output>& grad_inputs,
+                  std::vector<Output>* grad_outputs) {
+  return MatMulGradCommon(scope, op, grad_inputs, "transpose_a", "transpose_b",
+                          grad_outputs);
+}
+
+REGISTER_GRADIENT_OP("MatMul", MatMulGrad);
+
+}  // anonymous namespace
+}  // namespace ops
+}  // namespace tensorflow
diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc
new file mode 100644
index 00000000000..993316d7628
--- /dev/null
+++ b/tensorflow/cc/gradients/math_grad_test.cc
@@ -0,0 +1,183 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/framework/grad_op_registry.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/graph/default_device.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+using namespace ops;  // NOLINT(build/namespaces)
+
+namespace {
+
+// TODO(andydavis) Test gradient function against numeric gradients output.
+// TODO(andydavis) As more gradients are added move common test functions
+// to a testutil library.
+class MathGradTest : public ::testing::Test {
+ protected:
+  MathGradTest() : root_(Scope::NewRootScope()) {}
+
+  void ComputeMatMulGrad(const Output& x, const bool t_x, const Output& y,
+                         const bool t_y, const Output& dz,
+                         std::vector<Tensor>* out) {
+    // Compute forward MatMul: z = MatMul(x, y).
+    auto z = MatMul(root_, x, y, MatMul::TransposeA(t_x).TransposeB(t_y));
+    TF_EXPECT_OK(root_.status());
+    CHECK_NOTNULL(z.node());
+    std::vector<Output> grad_outputs;
+    // Call MatMulGrad which populates 'grad_outputs'.
+    CallGradFunction(Operation(z.node()), {dz}, &grad_outputs);
+    EXPECT_EQ(2, grad_outputs.size());
+    // Run graph and return MatMul gradient tensors for 'dx' and 'dy' in 'out'.
+    GetTensors(root_, {grad_outputs[0], grad_outputs[1]}, out);
+  }
+
+  void CallGradFunction(const Operation& op,
+                        const std::vector<Output>& grad_inputs,
+                        std::vector<Output>* grad_outputs) {
+    GradFunc grad_fn;
+    TF_EXPECT_OK(GradOpRegistry::Global()->Lookup(op.node()->name(), &grad_fn));
+    TF_EXPECT_OK(grad_fn(root_, op, grad_inputs, grad_outputs));
+    TF_EXPECT_OK(root_.status());
+  }
+
+  Tensor ComputeMatMul(const Output& x, const bool t_x, const Output& y,
+                       const bool t_y) {
+    auto z = MatMul(root_, x, y, MatMul::TransposeA(t_x).TransposeB(t_y));
+    TF_EXPECT_OK(root_.status());
+    Tensor out;
+    GetTensor(root_, z, &out);
+    return out;
+  }
+
+  void RandMatMulGradData(const bool tx, const bool ty,
+                          std::vector<Tensor>* data) {
+    // z = MatMul(x, y)
+    const int m = Rand();
+    const int k = Rand();
+    const int n = Rand();
+    // x.shape = [m, k]
+    const TensorShape x_shape = tx ? TensorShape({k, m}) : TensorShape({m, k});
+    data->emplace_back(DT_FLOAT, x_shape);
+    RandTensor(&data->back());
+    // y.shape = [k, n]
+    const TensorShape y_shape = ty ? TensorShape({n, k}) : TensorShape({k, n});
+    data->emplace_back(DT_FLOAT, y_shape);
+    RandTensor(&data->back());
+    // z.shape = [m, n]
+    data->emplace_back(DT_FLOAT, TensorShape({m, n}));
+    RandTensor(&data->back());
+  }
+
+  void RandTensor(Tensor* t) {
+    test::FillFn<float>(
+        t, [this](const int i) { return static_cast<float>(Rand()); });
+  }
+
+  int Rand() { return 1 + (random::New64() % 10); }
+
+  // TODO(andydavis) Move 'GetTensors/GetTensor' to some testutil class.
+  // Note: they should be moved to a general/non-grad specific testutil class.
+  void GetTensors(const Scope& scope, OutputList tensors,
+                  std::vector<Tensor>* out) {
+    SessionOptions options;
+    std::unique_ptr<Session> session(NewSession(options));
+    GraphDef def;
+    scope.graph()->ToGraphDef(&def);
+
+    graph::SetDefaultDevice("/cpu:0", &def);
+
+    TF_CHECK_OK(session->Create(def));
+    std::vector<string> names;
+    for (const auto& t : tensors) {
+      names.push_back(strings::StrCat(t.node()->name(), ":", t.index()));
+    }
+    TF_CHECK_OK(session->Run({}, names, {}, out));
+    TF_CHECK_OK(session->Close());
+  }
+
+  void GetTensor(const Scope& scope, Output tensor, Tensor* out) {
+    std::vector<Tensor> outputs;
+    GetTensors(scope, {tensor}, &outputs);
+    *out = outputs[0];
+  }
+
+  Scope root_;
+};
+
+TEST_F(MathGradTest, MatMulGrad_NoTranspose) {
+  std::vector<Tensor> data;
+  RandMatMulGradData(false, false, &data);
+  auto x = Const(root_, data[0]);
+  auto y = Const(root_, data[1]);
+  auto dz = Const(root_, data[2]);
+
+  std::vector<Tensor> grad_outputs;
+  ComputeMatMulGrad(x, false, y, false, dz, &grad_outputs);
+
+  test::ExpectClose(grad_outputs[0], ComputeMatMul(dz, false, y, true));
+  test::ExpectClose(grad_outputs[1], ComputeMatMul(x, true, dz, false));
+}
+
+TEST_F(MathGradTest, MatMulGrad_TransposeX) {
+  std::vector<Tensor> data;
+  RandMatMulGradData(true, false, &data);
+  auto x = Const(root_, data[0]);
+  auto y = Const(root_, data[1]);
+  auto dz = Const(root_, data[2]);
+
+  std::vector<Tensor> grad_outputs;
+  ComputeMatMulGrad(x, true, y, false, dz, &grad_outputs);
+
+  test::ExpectClose(grad_outputs[0], ComputeMatMul(y, false, dz, true));
+  test::ExpectClose(grad_outputs[1], ComputeMatMul(x, false, dz, false));
+}
+
+TEST_F(MathGradTest, MatMulGrad_TransposeY) {
+  std::vector<Tensor> data;
+  RandMatMulGradData(false, true, &data);
+  auto x = Const(root_, data[0]);
+  auto y = Const(root_, data[1]);
+  auto dz = Const(root_, data[2]);
+
+  std::vector<Tensor> grad_outputs;
+  ComputeMatMulGrad(x, false, y, true, dz, &grad_outputs);
+
+  test::ExpectClose(grad_outputs[0], ComputeMatMul(dz, false, y, false));
+  test::ExpectClose(grad_outputs[1], ComputeMatMul(dz, true, x, false));
+}
+
+TEST_F(MathGradTest, MatMulGrad_TransposeX_TransposeY) {
+  std::vector<Tensor> data;
+  RandMatMulGradData(true, true, &data);
+  auto x = Const(root_, data[0]);
+  auto y = Const(root_, data[1]);
+  auto dz = Const(root_, data[2]);
+
+  std::vector<Tensor> grad_outputs;
+  ComputeMatMulGrad(x, true, y, true, dz, &grad_outputs);
+
+  test::ExpectClose(grad_outputs[0], ComputeMatMul(y, true, dz, true));
+  test::ExpectClose(grad_outputs[1], ComputeMatMul(dz, true, x, true));
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 3fd428e1220..2d5a708bac6 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -99,7 +99,16 @@ cuda_py_tests(
     srcs = ["python/kernel_tests/beta_test.py"],
     additional_deps = [
         ":distributions_py",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_tests(
+    name = "binomial_test",
+    size = "small",
+    srcs = ["python/kernel_tests/binomial_test.py"],
+    additional_deps = [
+        ":distributions_py",
         "//tensorflow/python:platform_test",
     ],
     tags = ["notsan"],
@@ -179,9 +188,8 @@ cuda_py_tests(
 )
 
 cuda_py_tests(
-    name = "kullback_leibler_test",
-    size = "small",
-    srcs = ["python/kernel_tests/kullback_leibler_test.py"],
+    name = "laplace_test",
+    srcs = ["python/kernel_tests/laplace_test.py"],
     additional_deps = [
         ":distributions_py",
         "//tensorflow/python:framework_test_lib",
@@ -190,13 +198,14 @@ cuda_py_tests(
 )
 
 cuda_py_tests(
-    name = "laplace_test",
-    srcs = ["python/kernel_tests/laplace_test.py"],
+    name = "multinomial_test",
+    srcs = ["python/kernel_tests/multinomial_test.py"],
     additional_deps = [
         ":distributions_py",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
+    tags = ["notsan"],
 )
 
 cuda_py_tests(
@@ -239,6 +248,15 @@ cuda_py_tests(
     srcs = ["python/kernel_tests/uniform_test.py"],
     additional_deps = [
         ":distributions_py",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+cuda_py_tests(
+    name = "kullback_leibler_test",
+    size = "small",
+    srcs = ["python/kernel_tests/kullback_leibler_test.py"],
+    additional_deps = [
         "//tensorflow/python:platform_test",
     ],
 )
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 2b32556f3eb..83719157761 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -25,6 +25,7 @@ initialized with parameters that define the distributions.
 
 ### Univariate (scalar) distributions
 
+@@Binomial
 @@Bernoulli
 @@Beta
 @@Categorical
@@ -50,6 +51,7 @@ initialized with parameters that define the distributions.
 
 @@Dirichlet
 @@DirichletMultinomial
+@@Multinomial
 
 ### Transformed distributions
 
@@ -79,6 +81,7 @@ from __future__ import print_function
 
 from tensorflow.contrib.distributions.python.ops.bernoulli import *
 from tensorflow.contrib.distributions.python.ops.beta import *
+from tensorflow.contrib.distributions.python.ops.binomial import *
 from tensorflow.contrib.distributions.python.ops.categorical import *
 from tensorflow.contrib.distributions.python.ops.chi2 import *
 from tensorflow.contrib.distributions.python.ops.dirichlet import *
@@ -89,6 +92,7 @@ from tensorflow.contrib.distributions.python.ops.gamma import *
 from tensorflow.contrib.distributions.python.ops.inverse_gamma import *
 from tensorflow.contrib.distributions.python.ops.kullback_leibler import *
 from tensorflow.contrib.distributions.python.ops.laplace import *
+from tensorflow.contrib.distributions.python.ops.multinomial import *
 from tensorflow.contrib.distributions.python.ops.mvn import *
 from tensorflow.contrib.distributions.python.ops.normal import *
 from tensorflow.contrib.distributions.python.ops.normal_conjugate_posteriors import *
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py
index c636a4d060c..82f77fbfd1e 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py
@@ -57,10 +57,17 @@ class BernoulliTest(tf.test.TestCase):
       self.assertAllClose(scipy.special.logit(p), dist.logits.eval())
 
   def testInvalidP(self):
-    invalid_ps = [1.01, -0.01, 2., -3.]
+    invalid_ps = [1.01, 2.]
     for p in invalid_ps:
       with self.test_session():
-        with self.assertRaisesOpError("x <= y"):
+        with self.assertRaisesOpError("p has components greater than 1"):
+          dist = tf.contrib.distributions.Bernoulli(p=p)
+          dist.p.eval()
+
+    invalid_ps = [-0.01, -3.]
+    for p in invalid_ps:
+      with self.test_session():
+        with self.assertRaisesOpError("Condition x >= 0"):
           dist = tf.contrib.distributions.Bernoulli(p=p)
           dist.p.eval()
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/binomial_test.py b/tensorflow/contrib/distributions/python/kernel_tests/binomial_test.py
new file mode 100644
index 00000000000..8b2520f8368
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/binomial_test.py
@@ -0,0 +1,173 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from scipy import stats
+import tensorflow as tf
+
+
+class BinomialTest(tf.test.TestCase):
+
+  def testSimpleShapes(self):
+    with self.test_session():
+      p = np.float32(np.random.beta(1, 1))
+      binom = tf.contrib.distributions.Binomial(n=1., p=p)
+      self.assertAllEqual([], binom.event_shape().eval())
+      self.assertAllEqual([], binom.batch_shape().eval())
+      self.assertEqual(tf.TensorShape([]), binom.get_event_shape())
+      self.assertEqual(tf.TensorShape([]), binom.get_batch_shape())
+
+  def testComplexShapes(self):
+    with self.test_session():
+      p = np.random.beta(1, 1, size=(3, 2)).astype(np.float32)
+      n = [[3., 2], [4, 5], [6, 7]]
+      binom = tf.contrib.distributions.Binomial(n=n, p=p)
+      self.assertAllEqual([], binom.event_shape().eval())
+      self.assertAllEqual([3, 2], binom.batch_shape().eval())
+      self.assertEqual(tf.TensorShape([]), binom.get_event_shape())
+      self.assertEqual(tf.TensorShape([3, 2]), binom.get_batch_shape())
+
+  def testNProperty(self):
+    p = [[0.1, 0.2, 0.7], [0.2, 0.3, 0.5]]
+    n = [[3.], [4]]
+    with self.test_session():
+      binom = tf.contrib.distributions.Binomial(n=n, p=p)
+      self.assertEqual((2, 1), binom.n.get_shape())
+      self.assertAllClose(n, binom.n.eval())
+
+  def testPProperty(self):
+    p = [[0.1, 0.2, 0.7]]
+    with self.test_session():
+      binom = tf.contrib.distributions.Binomial(n=3., p=p)
+      self.assertEqual((1, 3), binom.p.get_shape())
+      self.assertEqual((1, 3), binom.logits.get_shape())
+      self.assertAllClose(p, binom.p.eval())
+
+  def testLogitsProperty(self):
+    logits = [[0., 9., -0.5]]
+    with self.test_session():
+      binom = tf.contrib.distributions.Binomial(n=3., logits=logits)
+      self.assertEqual((1, 3), binom.p.get_shape())
+      self.assertEqual((1, 3), binom.logits.get_shape())
+      self.assertAllClose(logits, binom.logits.eval())
+
+  def testPmfNandCountsAgree(self):
+    p = [[0.1, 0.2, 0.7]]
+    n = [[5.]]
+    with self.test_session():
+      binom = tf.contrib.distributions.Binomial(n=n, p=p)
+      binom.pmf([2., 3, 2]).eval()
+      binom.pmf([3., 1, 2]).eval()
+      with self.assertRaisesOpError('Condition x >= 0.*'):
+        binom.pmf([-1., 4, 2]).eval()
+      with self.assertRaisesOpError('Condition x <= y.*'):
+        binom.pmf([7., 3, 0]).eval()
+
+  def testPmf_non_integer_counts(self):
+    p = [[0.1, 0.2, 0.7]]
+    n = [[5.]]
+    with self.test_session():
+      # No errors with integer n.
+      binom = tf.contrib.distributions.Binomial(n=n, p=p)
+      binom.pmf([2., 3, 2]).eval()
+      binom.pmf([3., 1, 2]).eval()
+      # Both equality and integer checking fail.
+      with self.assertRaisesOpError('Condition x == y.*'):
+        binom.pmf([1.0, 2.5, 1.5]).eval()
+
+      binom = tf.contrib.distributions.Binomial(n=n, p=p, validate_args=False)
+      binom.pmf([1., 2., 3.]).eval()
+      # Non-integer arguments work.
+      binom.pmf([1.0, 2.5, 1.5]).eval()
+
+  def testPmfBothZeroBatches(self):
+    with self.test_session():
+      # Both zero-batches.  No broadcast
+      p = 0.5
+      counts = 1.
+      pmf = tf.contrib.distributions.Binomial(n=1., p=p).pmf(counts)
+      self.assertAllClose(0.5, pmf.eval())
+      self.assertEqual((), pmf.get_shape())
+
+  def testPmfBothZeroBatchesNontrivialN(self):
+    with self.test_session():
+      # Both zero-batches.  No broadcast
+      p = 0.1
+      counts = 3.
+      binom = tf.contrib.distributions.Binomial(n=5., p=p)
+      pmf = binom.pmf(counts)
+      self.assertAllClose(stats.binom.pmf(counts, n=5., p=p), pmf.eval())
+      self.assertEqual((), pmf.get_shape())
+
+  def testPmfPStretchedInBroadcastWhenSameRank(self):
+    with self.test_session():
+      p = [[0.1, 0.9]]
+      counts = [[1., 2.]]
+      pmf = tf.contrib.distributions.Binomial(n=3., p=p).pmf(counts)
+      self.assertAllClose(stats.binom.pmf(counts, n=3., p=p), pmf.eval())
+      self.assertEqual((1, 2), pmf.get_shape())
+
+  def testPmfPStretchedInBroadcastWhenLowerRank(self):
+    with self.test_session():
+      p = [0.1, 0.4]
+      counts = [[1.], [0.]]
+      pmf = tf.contrib.distributions.Binomial(n=1., p=p).pmf(counts)
+      self.assertAllClose([[0.1, 0.4], [0.9, 0.6]], pmf.eval())
+      self.assertEqual((2, 2), pmf.get_shape())
+
+  def testBinomialMean(self):
+    with self.test_session():
+      n = 5.
+      p = [0.1, 0.2, 0.7]
+      binom = tf.contrib.distributions.Binomial(n=n, p=p)
+      expected_means = stats.binom.mean(n, p)
+      self.assertEqual((3,), binom.mean().get_shape())
+      self.assertAllClose(expected_means, binom.mean().eval())
+
+  def testBinomialVariance(self):
+    with self.test_session():
+      n = 5.
+      p = [0.1, 0.2, 0.7]
+      binom = tf.contrib.distributions.Binomial(n=n, p=p)
+      expected_variances = stats.binom.var(n, p)
+      self.assertEqual((3,), binom.variance().get_shape())
+      self.assertAllClose(expected_variances, binom.variance().eval())
+
+  def testBinomialMode(self):
+    with self.test_session():
+      n = 5.
+      p = [0.1, 0.2, 0.7]
+      binom = tf.contrib.distributions.Binomial(n=n, p=p)
+      expected_modes = [0., 1, 4]
+      self.assertEqual((3,), binom.mode().get_shape())
+      self.assertAllClose(expected_modes, binom.mode().eval())
+
+  def testBinomialMultipleMode(self):
+    with self.test_session():
+      n = 9.
+      p = [0.1, 0.2, 0.7]
+      binom = tf.contrib.distributions.Binomial(n=n, p=p)
+      # For the case where (n + 1) * p is an integer, the modes are:
+      # (n + 1) * p and (n + 1) * p - 1. In this case, we get back
+      # the larger of the two modes.
+      expected_modes = [1., 2, 7]
+      self.assertEqual((3,), binom.mode().get_shape())
+      self.assertAllClose(expected_modes, binom.mode().eval())
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py b/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py
index 1a3f5eaf66c..23833a246b9 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py
@@ -65,7 +65,7 @@ class DirichletMultinomialTest(tf.test.TestCase):
       dist.pmf([3., 0, 2]).eval()
       with self.assertRaisesOpError('Condition x >= 0.*'):
         dist.pmf([-1., 4, 2]).eval()
-      with self.assertRaisesOpError('Condition x == y.*'):
+      with self.assertRaisesOpError('counts do not sum to n'):
         dist.pmf([3., 3, 0]).eval()
 
   def testPmf_non_integer_counts(self):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py b/tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py
new file mode 100644
index 00000000000..55c7825bf3e
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py
@@ -0,0 +1,226 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+
+
+class MultinomialTest(tf.test.TestCase):
+
+  def testSimpleShapes(self):
+    with self.test_session():
+      p = [.1, .3, .6]
+      dist = tf.contrib.distributions.Multinomial(n=1., p=p)
+      self.assertEqual(3, dist.event_shape().eval())
+      self.assertAllEqual([], dist.batch_shape().eval())
+      self.assertEqual(tf.TensorShape([3]), dist.get_event_shape())
+      self.assertEqual(tf.TensorShape([]), dist.get_batch_shape())
+
+  def testComplexShapes(self):
+    with self.test_session():
+      p = 0.5 * np.ones([3, 2, 2], dtype=np.float32)
+      n = [[3., 2], [4, 5], [6, 7]]
+      dist = tf.contrib.distributions.Multinomial(n=n, p=p)
+      self.assertEqual(2, dist.event_shape().eval())
+      self.assertAllEqual([3, 2], dist.batch_shape().eval())
+      self.assertEqual(tf.TensorShape([2]), dist.get_event_shape())
+      self.assertEqual(tf.TensorShape([3, 2]), dist.get_batch_shape())
+
+  def testNProperty(self):
+    p = [[0.1, 0.2, 0.7], [0.2, 0.3, 0.5]]
+    n = [[3.], [4]]
+    with self.test_session():
+      dist = tf.contrib.distributions.Multinomial(n=n, p=p)
+      self.assertEqual((2, 1), dist.n.get_shape())
+      self.assertAllClose(n, dist.n.eval())
+
+  def testPProperty(self):
+    p = [[0.1, 0.2, 0.7]]
+    with self.test_session():
+      dist = tf.contrib.distributions.Multinomial(n=3., p=p)
+      self.assertEqual((1, 3), dist.p.get_shape())
+      self.assertEqual((1, 3), dist.logits.get_shape())
+      self.assertAllClose(p, dist.p.eval())
+
+  def testLogitsProperty(self):
+    logits = [[0., 9., -0.5]]
+    with self.test_session():
+      multinom = tf.contrib.distributions.Multinomial(n=3., logits=logits)
+      self.assertEqual((1, 3), multinom.p.get_shape())
+      self.assertEqual((1, 3), multinom.logits.get_shape())
+      self.assertAllClose(logits, multinom.logits.eval())
+
+  def testPmfNandCountsAgree(self):
+    p = [[0.1, 0.2, 0.7]]
+    n = [[5.]]
+    with self.test_session():
+      dist = tf.contrib.distributions.Multinomial(n=n, p=p)
+      dist.pmf([2., 3, 0]).eval()
+      dist.pmf([3., 0, 2]).eval()
+      with self.assertRaisesOpError('Condition x >= 0.*'):
+        dist.pmf([-1., 4, 2]).eval()
+      with self.assertRaisesOpError('counts do not sum to n'):
+        dist.pmf([3., 3, 0]).eval()
+
+  def testPmf_non_integer_counts(self):
+    p = [[0.1, 0.2, 0.7]]
+    n = [[5.]]
+    with self.test_session():
+      # No errors with integer n.
+      multinom = tf.contrib.distributions.Multinomial(n=n, p=p)
+      multinom.pmf([2., 1, 2]).eval()
+      multinom.pmf([3., 0, 2]).eval()
+      # Counts don't sum to n.
+      with self.assertRaisesOpError('counts do not sum to n'):
+        multinom.pmf([2., 3, 2]).eval()
+      # Counts are non-integers.
+      with self.assertRaisesOpError('Condition x == y.*'):
+        multinom.pmf([1.0, 2.5, 1.5]).eval()
+
+      multinom = tf.contrib.distributions.Multinomial(
+          n=n, p=p, validate_args=False)
+      multinom.pmf([1., 2., 2.]).eval()
+      # Non-integer arguments work.
+      multinom.pmf([1.0, 2.5, 1.5]).eval()
+
+  def testPmfBothZeroBatches(self):
+    with self.test_session():
+      # Both zero-batches.  No broadcast
+      p = [0.5, 0.5]
+      counts = [1., 0]
+      pmf = tf.contrib.distributions.Multinomial(n=1., p=p).pmf(counts)
+      self.assertAllClose(0.5, pmf.eval())
+      self.assertEqual((), pmf.get_shape())
+
+  def testPmfBothZeroBatchesNontrivialN(self):
+    with self.test_session():
+      # Both zero-batches.  No broadcast
+      p = [0.1, 0.9]
+      counts = [3., 2]
+      dist = tf.contrib.distributions.Multinomial(n=5., p=p)
+      pmf = dist.pmf(counts)
+      # 5 choose 3 = 5 choose 2 = 10. 10 * (.9)^2 * (.1)^3 = 81/10000.
+      self.assertAllClose(81./10000, pmf.eval())
+      self.assertEqual((), pmf.get_shape())
+
+  def testPmfPStretchedInBroadcastWhenSameRank(self):
+    with self.test_session():
+      p = [[0.1, 0.9]]
+      counts = [[1., 0], [0, 1]]
+      pmf = tf.contrib.distributions.Multinomial(n=1., p=p).pmf(counts)
+      self.assertAllClose([0.1, 0.9], pmf.eval())
+      self.assertEqual((2), pmf.get_shape())
+
+  def testPmfPStretchedInBroadcastWhenLowerRank(self):
+    with self.test_session():
+      p = [0.1, 0.9]
+      counts = [[1., 0], [0, 1]]
+      pmf = tf.contrib.distributions.Multinomial(n=1., p=p).pmf(counts)
+      self.assertAllClose([0.1, 0.9], pmf.eval())
+      self.assertEqual((2), pmf.get_shape())
+
+  def testPmfCountsStretchedInBroadcastWhenSameRank(self):
+    with self.test_session():
+      p = [[0.1, 0.9], [0.7, 0.3]]
+      counts = [[1., 0]]
+      pmf = tf.contrib.distributions.Multinomial(n=1., p=p).pmf(counts)
+      self.assertAllClose(pmf.eval(), [0.1, 0.7])
+      self.assertEqual((2), pmf.get_shape())
+
+  def testPmfCountsStretchedInBroadcastWhenLowerRank(self):
+    with self.test_session():
+      p = [[0.1, 0.9], [0.7, 0.3]]
+      counts = [1., 0]
+      pmf = tf.contrib.distributions.Multinomial(n=1., p=p).pmf(counts)
+      self.assertAllClose(pmf.eval(), [0.1, 0.7])
+      self.assertEqual(pmf.get_shape(), (2))
+
+  def testPmfShapeCountsStretched_N(self):
+    with self.test_session():
+      # [2, 2, 2]
+      p = [[[0.1, 0.9], [0.1, 0.9]], [[0.7, 0.3], [0.7, 0.3]]]
+      # [2, 2]
+      n = [[3., 3], [3, 3]]
+      # [2]
+      counts = [2., 1]
+      pmf = tf.contrib.distributions.Multinomial(n=n, p=p).pmf(counts)
+      pmf.eval()
+      self.assertEqual(pmf.get_shape(), (2, 2))
+
+  def testPmfShapeCountsPStretched_N(self):
+    with self.test_session():
+      p = [0.1, 0.9]
+      counts = [3., 2]
+      n = np.full([4, 3], 5., dtype=np.float32)
+      pmf = tf.contrib.distributions.Multinomial(n=n, p=p).pmf(counts)
+      pmf.eval()
+      self.assertEqual((4, 3), pmf.get_shape())
+
+  def testMultinomialMean(self):
+    with self.test_session():
+      n = 5.
+      p = [0.1, 0.2, 0.7]
+      dist = tf.contrib.distributions.Multinomial(n=n, p=p)
+      expected_means = 5 * np.array(p, dtype=np.float32)
+      self.assertEqual((3,), dist.mean().get_shape())
+      self.assertAllClose(expected_means, dist.mean().eval())
+
+  def testMultinomialVariance(self):
+    with self.test_session():
+      n = 5.
+      p = [0.1, 0.2, 0.7]
+      dist = tf.contrib.distributions.Multinomial(n=n, p=p)
+      expected_variances = [
+          [9./20, -1/10, -7/20], [-1/10, 4/5, -7/10], [-7/20, -7/10, 21/20]]
+      self.assertEqual((3, 3), dist.variance().get_shape())
+      self.assertAllClose(expected_variances, dist.variance().eval())
+
+  def testMultinomialVariance_batch(self):
+    with self.test_session():
+      # Shape [2]
+      n = [5.] * 2
+      # Shape [4, 1, 2]
+      p = [[[0.1, 0.9]], [[0.1, 0.9]]] * 2
+      dist = tf.contrib.distributions.Multinomial(n=n, p=p)
+      # Shape [2, 2]
+      inner_var = [[9./20, -9/20], [-9/20, 9/20]]
+      # Shape [4, 2, 2, 2]
+      expected_variances = [[inner_var, inner_var]] * 4
+      self.assertEqual((4, 2, 2, 2), dist.variance().get_shape())
+      self.assertAllClose(expected_variances, dist.variance().eval())
+
+  def testVariance_multidimensional(self):
+    # Shape [3, 5, 4]
+    p = np.random.dirichlet([.25, .25, .25, .25], [3, 5]).astype(np.float32)
+    # Shape [6, 3, 3]
+    p2 = np.random.dirichlet([.3, .3, .4], [6, 3]).astype(np.float32)
+
+    ns = np.random.randint(low=1, high=11, size=[3, 5]).astype(np.float32)
+    ns2 = np.random.randint(low=1, high=11, size=[6, 1]).astype(np.float32)
+
+    with self.test_session():
+      dist = tf.contrib.distributions.Multinomial(ns, p)
+      dist2 = tf.contrib.distributions.Multinomial(ns2, p2)
+
+      variance = dist.variance()
+      variance2 = dist2.variance()
+      self.assertEqual((3, 5, 4, 4), variance.get_shape())
+      self.assertEqual((6, 3, 3, 3), variance2.get_shape())
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_test.py
index a985477242f..748439070c5 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_test.py
@@ -369,5 +369,87 @@ class MultivariateNormalCholeskyTest(tf.test.TestCase):
       self.assertEqual((3, 5), tuple(mvn.batch_shape().eval()))
 
 
+class MultivariateNormalFullTest(tf.test.TestCase):
+
+  def setUp(self):
+    self._rng = np.random.RandomState(42)
+
+  def _random_mu_and_sigma(self, batch_shape, event_shape):
+    # This ensures sigma is positive def.
+    mat_shape = batch_shape + event_shape + event_shape
+    mat = self._rng.randn(*mat_shape)
+    sigma = tf.batch_matmul(mat, mat, adj_y=True).eval()
+
+    mu_shape = batch_shape + event_shape
+    mu = self._rng.randn(*mu_shape)
+
+    return mu, sigma
+
+  def testKLNonBatch(self):
+    batch_shape = ()
+    event_shape = (2,)
+    with self.test_session():
+      mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape)
+      mu_b, sigma_b = self._random_mu_and_sigma(batch_shape, event_shape)
+      mvn_a = distributions.MultivariateNormalFull(mu_a, sigma_a)
+      mvn_b = distributions.MultivariateNormalFull(mu_b, sigma_b)
+
+      kl = distributions.kl(mvn_a, mvn_b)
+      self.assertEqual(batch_shape, kl.get_shape())
+
+      kl_v = kl.eval()
+      expected_kl = _compute_non_batch_kl(mu_a, sigma_a, mu_b, sigma_b)
+      self.assertAllClose(expected_kl, kl_v)
+
+  def testKLBatch(self):
+    batch_shape = (2,)
+    event_shape = (3,)
+    with self.test_session():
+      mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape)
+      mu_b, sigma_b = self._random_mu_and_sigma(batch_shape, event_shape)
+      mvn_a = distributions.MultivariateNormalFull(mu_a, sigma_a)
+      mvn_b = distributions.MultivariateNormalFull(mu_b, sigma_b)
+
+      kl = distributions.kl(mvn_a, mvn_b)
+      self.assertEqual(batch_shape, kl.get_shape())
+
+      kl_v = kl.eval()
+      expected_kl_0 = _compute_non_batch_kl(
+          mu_a[0, :], sigma_a[0, :, :], mu_b[0, :], sigma_b[0, :])
+      expected_kl_1 = _compute_non_batch_kl(
+          mu_a[1, :], sigma_a[1, :, :], mu_b[1, :], sigma_b[1, :])
+      self.assertAllClose(expected_kl_0, kl_v[0])
+      self.assertAllClose(expected_kl_1, kl_v[1])
+
+  def testKLTwoIdenticalDistributionsIsZero(self):
+    batch_shape = (2,)
+    event_shape = (3,)
+    with self.test_session():
+      mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape)
+      mvn_a = distributions.MultivariateNormalFull(mu_a, sigma_a)
+
+      # Should be zero since KL(p || p) = =.
+      kl = distributions.kl(mvn_a, mvn_a)
+      self.assertEqual(batch_shape, kl.get_shape())
+
+      kl_v = kl.eval()
+      self.assertAllClose(np.zeros(*batch_shape), kl_v)
+
+
+def _compute_non_batch_kl(mu_a, sigma_a, mu_b, sigma_b):
+  """Non-batch KL for N(mu_a, sigma_a), N(mu_b, sigma_b)."""
+  # Check using numpy operations
+  # This mostly repeats the tensorflow code _kl_mvn_mvn(), but in numpy.
+  # So it is important to also check that KL(mvn, mvn) = 0.
+  sigma_b_inv = np.linalg.inv(sigma_b)
+
+  t = np.trace(sigma_b_inv.dot(sigma_a))
+  q = (mu_b - mu_a).dot(sigma_b_inv).dot(mu_b - mu_a)
+  k = mu_a.shape[0]
+  l = np.log(np.linalg.det(sigma_b) / np.linalg.det(sigma_a))
+
+  return 0.5 * (t + q - k + l)
+
+
 if __name__ == "__main__":
   tf.test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/bernoulli.py b/tensorflow/contrib/distributions/python/ops/bernoulli.py
index fe5826e491f..1db599b3fea 100644
--- a/tensorflow/contrib/distributions/python/ops/bernoulli.py
+++ b/tensorflow/contrib/distributions/python/ops/bernoulli.py
@@ -19,15 +19,13 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.distributions.python.ops import distribution
+from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.contrib.distributions.python.ops import kullback_leibler  # pylint: disable=line-too-long
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
@@ -38,10 +36,6 @@ class Bernoulli(distribution.Distribution):
 
   The Bernoulli distribution is parameterized by p, the probability of a
   positive event.
-
-  Note, the following methods of the base class aren't implemented:
-    * cdf
-    * log_cdf
   """
 
   def __init__(self,
@@ -64,10 +58,10 @@ class Bernoulli(distribution.Distribution):
       dtype: dtype for samples.
       validate_args: Whether to assert that `0 <= p <= 1`. If not validate_args,
        `log_pmf` may return nans.
-      allow_nan_stats:  Boolean, default False.  If False, raise an exception if
-        a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-        If True, batch members with valid parameters leading to undefined
-        statistics will return NaN for this statistic.
+      allow_nan_stats:  Boolean, default `False`.  If `False`, raise an
+        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+        batch member.  If `True`, batch members with valid parameters leading to
+        undefined statistics will return NaN for this statistic.
       name: A name for this distribution.
 
     Raises:
@@ -77,27 +71,8 @@ class Bernoulli(distribution.Distribution):
     self._name = name
     self._dtype = dtype
     self._validate_args = validate_args
-    check_op = check_ops.assert_less_equal
-    if p is None and logits is None:
-      raise ValueError("Must pass p or logits.")
-    elif p is not None and logits is not None:
-      raise ValueError("Must pass either p or logits, not both.")
-    elif p is None:
-      with ops.op_scope([logits], name):
-        self._logits = array_ops.identity(logits, name="logits")
-      with ops.name_scope(name):
-        with ops.name_scope("p"):
-          self._p = math_ops.sigmoid(self._logits)
-    elif logits is None:
-      with ops.name_scope(name):
-        with ops.name_scope("p"):
-          p = array_ops.identity(p)
-          one = constant_op.constant(1., p.dtype)
-          zero = constant_op.constant(0., p.dtype)
-          self._p = control_flow_ops.with_dependencies(
-              [check_op(p, one), check_op(zero, p)] if validate_args else [], p)
-        with ops.name_scope("logits"):
-          self._logits = math_ops.log(self._p) - math_ops.log(1. - self._p)
+    self._logits, self._p = distribution_util.get_logits_and_prob(
+        name=name, logits=logits, p=p, validate_args=validate_args)
     with ops.name_scope(name):
       with ops.name_scope("q"):
         self._q = 1. - self._p
@@ -184,8 +159,12 @@ class Bernoulli(distribution.Distribution):
         event = ops.convert_to_tensor(event, name="event")
         event = math_ops.cast(event, self.logits.dtype)
         logits = self.logits
-        if ((event.get_shape().ndims is not None) or
-            (logits.get_shape().ndims is not None) or
+        # sigmoid_cross_entropy_with_logits doesn't broadcast shape,
+        # so we do this here.
+        # TODO(b/30637701): Check dynamic shape, and don't broadcast if the
+        # dynamic shapes are the same.
+        if (not event.get_shape().is_fully_defined() or
+            not logits.get_shape().is_fully_defined() or
             event.get_shape() != logits.get_shape()):
           logits = array_ops.ones_like(event) * logits
           event = array_ops.ones_like(logits) * event
@@ -206,8 +185,7 @@ class Bernoulli(distribution.Distribution):
     with ops.name_scope(self.name):
       with ops.op_scope([self.p, n], name):
         n = ops.convert_to_tensor(n, name="n")
-        new_shape = array_ops.concat(
-            0, [array_ops.expand_dims(n, 0), self.batch_shape()])
+        new_shape = array_ops.concat(0, ([n], self.batch_shape()))
         uniform = random_ops.random_uniform(
             new_shape, seed=seed, dtype=dtypes.float32)
         sample = math_ops.less(uniform, self.p)
diff --git a/tensorflow/contrib/distributions/python/ops/beta.py b/tensorflow/contrib/distributions/python/ops/beta.py
index 2bd64180682..fcf4a9056c3 100644
--- a/tensorflow/contrib/distributions/python/ops/beta.py
+++ b/tensorflow/contrib/distributions/python/ops/beta.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 """The Beta distribution class."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -95,6 +96,7 @@ class Beta(distribution.Distribution):
   x = [.2, .3, .9]
   dist.pdf(x)  # Shape [2]
   ```
+
   """
 
   def __init__(self, a, b, validate_args=True, allow_nan_stats=False,
@@ -102,20 +104,20 @@ class Beta(distribution.Distribution):
     """Initialize a batch of Beta distributions.
 
     Args:
-      a:  Positive `float` or `double` tensor with shape broadcastable to
+      a:  Positive floating point tensor with shape broadcastable to
         `[N1,..., Nm]` `m >= 0`.  Defines this as a batch of `N1 x ... x Nm`
          different Beta distributions. This also defines the
          dtype of the distribution.
-      b:  Positive `float` or `double` tensor with shape broadcastable to
+      b:  Positive floating point tensor with shape broadcastable to
         `[N1,..., Nm]` `m >= 0`.  Defines this as a batch of `N1 x ... x Nm`
          different Beta distributions.
       validate_args: Whether to assert valid values for parameters `a` and `b`,
-        and `x` in `prob` and `log_prob`.  If False, correct behavior is not
+        and `x` in `prob` and `log_prob`.  If `False`, correct behavior is not
         guaranteed.
-      allow_nan_stats:  Boolean, default False.  If False, raise an exception if
-        a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-        If True, batch members with valid parameters leading to undefined
-        statistics will return NaN for this statistic.
+      allow_nan_stats:  Boolean, default `False`.  If `False`, raise an
+        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+        batch member.  If `True`, batch members with valid parameters leading to
+        undefined statistics will return NaN for this statistic.
       name: The name to prefix Ops created by this distribution class.
 
     Examples:
@@ -127,6 +129,7 @@ class Beta(distribution.Distribution):
     # Define a 2-batch.
     dist = Beta([1.0, 2.0], [4.0, 5.0])
     ```
+
     """
     with ops.op_scope([a, b], name):
       with ops.control_dependencies([
@@ -276,8 +279,14 @@ class Beta(distribution.Distribution):
                array_ops.ones_like(a_b_sum, dtype=self.dtype)))
         else:
           return control_flow_ops.with_dependencies([
-              check_ops.assert_less(one, a),
-              check_ops.assert_less(one, b)], mode)
+              check_ops.assert_less(
+                  one, a,
+                  message="mode not defined for components of a <= 1"
+              ),
+              check_ops.assert_less(
+                  one, b,
+                  message="mode not defined for components of b <= 1"
+              )], mode)
 
   def entropy(self, name="entropy"):
     """Entropy of the distribution in nats."""
@@ -306,7 +315,7 @@ class Beta(distribution.Distribution):
     """`Log(P[counts])`, computed for every batch member.
 
     Args:
-      x:  Non-negative `float` or `double`, tensor whose shape can
+      x:  Non-negative floating point tensor whose shape can
         be broadcast with `self.a` and `self.b`.  For fixed leading
         dimensions, the last dimension represents counts for the corresponding
         Beta distribution in `self.a` and `self.b`. `x` is only legal if
@@ -334,7 +343,7 @@ class Beta(distribution.Distribution):
     """`P[x]`, computed for every batch member.
 
     Args:
-      x:  Non-negative `float`, `double` tensor whose shape can
+      x:  Non-negative floating point tensor whose shape can
         be broadcast with `self.a` and `self.b`.  For fixed leading
         dimensions, the last dimension represents x for the corresponding Beta
         distribution in `self.a` and `self.b`. `x` is only legal if is
diff --git a/tensorflow/contrib/distributions/python/ops/binomial.py b/tensorflow/contrib/distributions/python/ops/binomial.py
new file mode 100644
index 00000000000..9978d0ad613
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/binomial.py
@@ -0,0 +1,340 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The Binomial distribution class."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=line-too-long
+
+from tensorflow.contrib.distributions.python.ops import distribution
+from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+
+# pylint: enable=line-too-long
+
+
+class Binomial(distribution.Distribution):
+  """Binomial distribution.
+
+  This distribution is parameterized by a vector `p` of probabilities and `n`,
+  the total counts.
+
+  #### Mathematical details
+
+  The Binomial is a distribution over the number of successes in `n` independent
+  trials, with each trial having the same probability of success `p`.
+  The probability mass function (pmf):
+
+  ```pmf(k) = n! / (k! * (n - k)!) * (p)^k * (1 - p)^(n - k)```
+
+  #### Examples
+
+  Create a single distribution, corresponding to 5 coin flips.
+
+  ```python
+  dist = Binomial(n=5., p=.5)
+  ```
+
+  Create a single distribution (using logits), corresponding to 5 coin flips.
+
+  ```python
+  dist = Binomial(n=5., logits=0.)
+  ```
+
+  Creates 3 distributions with the third distribution most likely to have
+  successes.
+
+  ```python
+  p = [.2, .3, .8]
+  # n will be broadcast to [4., 4., 4.], to match p.
+  dist = Binomial(n=4., p=p)
+  ```
+
+  The distribution functions can be evaluated on counts.
+
+  ```python
+  # counts same shape as p.
+  counts = [1., 2, 3]
+  dist.prob(counts)  # Shape [3]
+
+  # p will be broadcast to [[.2, .3, .8], [.2, .3, .8]] to match counts.
+  counts = [[1., 2, 1], [2, 2, 4]]
+  dist.prob(counts)  # Shape [2, 3]
+
+  # p will be broadcast to shape [5, 7, 3] to match counts.
+  counts = [[...]]  # Shape [5, 7, 3]
+  dist.prob(counts)  # Shape [5, 7, 3]
+  ```
+  """
+
+  def __init__(self,
+               n,
+               logits=None,
+               p=None,
+               validate_args=True,
+               allow_nan_stats=False,
+               name="Binomial"):
+    """Initialize a batch of Binomial distributions.
+
+    Args:
+      n:  Non-negative floating point tensor with shape broadcastable to
+        `[N1,..., Nm]` with `m >= 0` and the same dtype as `p` or `logits`.
+        Defines this as a batch of `N1 x ... x Nm` different Binomial
+        distributions. Its components should be equal to integer values.
+      logits: Floating point tensor representing the log-odds of a
+        positive event with shape broadcastable to `[N1,..., Nm]` `m >= 0`, and
+        the same dtype as `n`. Each entry represents logits for the probability
+        of success for independent Binomial distributions.
+      p:  Positive floating point tensor with shape broadcastable to
+        `[N1,..., Nm]` `m >= 0`, `p in [0, 1]`. Each entry represents the
+        probability of success for independent Binomial distributions.
+      validate_args: Whether to assert valid values for parameters `n` and `p`,
+        and `x` in `prob` and `log_prob`.  If `False`, correct behavior is not
+        guaranteed.
+      allow_nan_stats:  Boolean, default `False`.  If `False`, raise an
+        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+        batch member.  If `True`, batch members with valid parameters leading to
+        undefined statistics will return NaN for this statistic.
+      name: The name to prefix Ops created by this distribution class.
+
+    Examples:
+
+    ```python
+    # Define 1-batch of a binomial distribution.
+    dist = Binomial(n=2., p=.9)
+
+    # Define a 2-batch.
+    dist = Binomial(n=[4., 5], p=[.1, .3])
+    ```
+
+    """
+
+    self._logits, self._p = distribution_util.get_logits_and_prob(
+        name=name, logits=logits, p=p, validate_args=validate_args)
+
+    with ops.op_scope([n], name):
+      with ops.control_dependencies([
+          check_ops.assert_non_negative(
+              n, message="n has negative components."),
+          distribution_util.assert_integer_form(
+              n, message="n has non-integer components."
+          )] if validate_args else []):
+        self._n = array_ops.identity(n, name="convert_n")
+
+        self._name = name
+        self._validate_args = validate_args
+        self._allow_nan_stats = allow_nan_stats
+
+        self._mean = self._n * self._p
+        self._get_batch_shape = self._mean.get_shape()
+        self._get_event_shape = tensor_shape.TensorShape([])
+
+  @property
+  def name(self):
+    """Name to prepend to all ops."""
+    return self._name
+
+  @property
+  def dtype(self):
+    """dtype of samples from this distribution."""
+    return self._p.dtype
+
+  @property
+  def validate_args(self):
+    """Boolean describing behavior on invalid input."""
+    return self._validate_args
+
+  @property
+  def allow_nan_stats(self):
+    """Boolean describing behavior when a stat is undefined for batch member."""
+    return self._allow_nan_stats
+
+  def batch_shape(self, name="batch_shape"):
+    """Batch dimensions of this instance as a 1-D int32 `Tensor`.
+
+    The product of the dimensions of the `batch_shape` is the number of
+    independent distributions of this kind the instance represents.
+
+    Args:
+      name: name to give to the op
+
+    Returns:
+      `Tensor` `batch_shape`
+    """
+    return array_ops.shape(self._mean)
+
+  def get_batch_shape(self):
+    """`TensorShape` available at graph construction time.
+
+    Same meaning as `batch_shape`. May be only partially defined.
+
+    Returns:
+      batch shape
+    """
+    return self._get_batch_shape
+
+  def event_shape(self, name="event_shape"):
+    """Shape of a sample from a single distribution as a 1-D int32 `Tensor`.
+
+    Args:
+      name: name to give to the op
+
+    Returns:
+      `Tensor` `event_shape`
+    """
+    with ops.name_scope(self.name):
+      with ops.op_scope([], name):
+        return constant_op.constant([], name=name, dtype=dtypes.int32)
+
+  def get_event_shape(self):
+    """`TensorShape` available at graph construction time.
+
+    Same meaning as `event_shape`. May be only partially defined.
+
+    Returns:
+      event shape
+    """
+    return self._get_event_shape
+
+  @property
+  def n(self):
+    """Number of trials."""
+    return self._n
+
+  @property
+  def logits(self):
+    """Log-odds."""
+    return self._logits
+
+  @property
+  def p(self):
+    """Probability of success."""
+    return self._p
+
+  def mean(self, name="mean"):
+    """Mean of the distribution."""
+    with ops.name_scope(self.name):
+      return array_ops.identity(self._mean, name=name)
+
+  def variance(self, name="variance"):
+    """Variance of the distribution."""
+    with ops.name_scope(self.name):
+      with ops.op_scope([self._n, self._p], name):
+        return self._n * self._p * (1 - self._p)
+
+  def std(self, name="std"):
+    """Standard deviation of the distribution."""
+    with ops.name_scope(self.name):
+      with ops.op_scope([self._n, self._p], name):
+        return math_ops.sqrt(self.variance())
+
+  def mode(self, name="mode"):
+    """Mode of the distribution.
+
+    Note that when `(n + 1) * p` is an integer, there are actually two modes.
+    Namely, `(n + 1) * p` and `(n + 1) * p - 1` are both modes. Here we return
+    only the larger of the two modes.
+
+    Args:
+      name: The name for this op.
+
+    Returns:
+      The mode of the Binomial distribution.
+    """
+    with ops.name_scope(self.name):
+      with ops.op_scope([self._n, self._p], name):
+        return math_ops.floor((self._n + 1) * self._p)
+
+  def log_prob(self, counts, name="log_prob"):
+    """`Log(P[counts])`, computed for every batch member.
+
+    For each batch member of counts `k`, `P[counts]` is the probability that
+    after sampling `n` draws from this Binomial distribution, the number of
+    successes is `k`.  Note that different sequences of draws can result in the
+    same counts, thus the probability includes a combinatorial coefficient.
+
+    Args:
+      counts:  Non-negative tensor with dtype `dtype` and whose shape can be
+        broadcast with `self.p` and `self.n`. `counts` is only legal if it is
+        less than or equal to `n` and its components are equal to integer
+        values.
+      name:  Name to give this Op, defaults to "log_prob".
+
+    Returns:
+      Log probabilities for each record, shape `[N1,...,Nm]`.
+    """
+    n = self._n
+    p = self._p
+    with ops.name_scope(self.name):
+      with ops.op_scope([self._n, self._p, counts], name):
+        counts = self._check_counts(counts)
+
+        prob_prob = counts * math_ops.log(p) + (
+            n - counts) * math_ops.log(1 - p)
+
+        combinations = math_ops.lgamma(n + 1) - math_ops.lgamma(
+            counts + 1) - math_ops.lgamma(n - counts + 1)
+        log_prob = prob_prob + combinations
+        return log_prob
+
+  def prob(self, counts, name="prob"):
+    """`P[counts]`, computed for every batch member.
+
+
+    For each batch member of counts `k`, `P[counts]` is the probability that
+    after sampling `n` draws from this Binomial distribution, the number of
+    successes is `k`.  Note that different sequences of draws can result in the
+    same counts, thus the probability includes a combinatorial coefficient.
+
+    Args:
+      counts:  Non-negative tensor with dtype `dtype` and whose shape can be
+        broadcast with `self.p` and `self.n`. `counts` is only legal if it is
+        less than or equal to `n` and its components are equal to integer
+        values.
+      name:  Name to give this Op, defaults to "prob".
+
+    Returns:
+      Probabilities for each record, shape `[N1,...,Nm]`.
+    """
+    return super(Binomial, self).prob(counts, name=name)
+
+  @property
+  def is_continuous(self):
+    return False
+
+  @property
+  def is_reparameterized(self):
+    return False
+
+  def _check_counts(self, counts):
+    """Check counts for proper shape, values, then return tensor version."""
+    counts = ops.convert_to_tensor(counts, name="counts_before_deps")
+    if not self.validate_args:
+      return counts
+    return control_flow_ops.with_dependencies([
+        check_ops.assert_non_negative(
+            counts, message="counts has negative components."),
+        check_ops.assert_less_equal(
+            counts, self._n, message="counts are not less than or equal to n."),
+        distribution_util.assert_integer_form(
+            counts, message="counts have non-integer components.")], counts)
diff --git a/tensorflow/contrib/distributions/python/ops/categorical.py b/tensorflow/contrib/distributions/python/ops/categorical.py
index 64572ed7885..e79a732a0c9 100644
--- a/tensorflow/contrib/distributions/python/ops/categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/categorical.py
@@ -34,11 +34,6 @@ class Categorical(distribution.Distribution):
 
   The categorical distribution is parameterized by the log-probabilities
   of a set of classes.
-
-  Note, the following methods of the base class aren't implemented:
-    * mean
-    * cdf
-    * log_cdf
   """
 
   def __init__(
@@ -57,10 +52,10 @@ class Categorical(distribution.Distribution):
           indexes into the classes.
       dtype: The type of the event samples (default: int32).
       validate_args: Unused in this distribution.
-      allow_nan_stats:  Boolean, default False.  If False, raise an exception if
-        a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-        If True, batch members with valid parameters leading to undefined
-        statistics will return NaN for this statistic.
+      allow_nan_stats:  Boolean, default `False`.  If `False`, raise an
+        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+        batch member.  If `True`, batch members with valid parameters leading to
+        undefined statistics will return NaN for this statistic.
       name: A name for this distribution (optional).
     """
     self._allow_nan_stats = allow_nan_stats
@@ -177,8 +172,7 @@ class Categorical(distribution.Distribution):
         samples = math_ops.cast(samples, self._dtype)
         ret = array_ops.reshape(
             array_ops.transpose(samples),
-            array_ops.concat(
-                0, [array_ops.expand_dims(n, 0), self.batch_shape()]))
+            array_ops.concat(0, ([n], self.batch_shape())))
         ret.set_shape(tensor_shape.vector(tensor_util.constant_value(n))
                       .concatenate(self.get_batch_shape()))
         return ret
diff --git a/tensorflow/contrib/distributions/python/ops/chi2.py b/tensorflow/contrib/distributions/python/ops/chi2.py
index 65840373f12..e09ef6324b8 100644
--- a/tensorflow/contrib/distributions/python/ops/chi2.py
+++ b/tensorflow/contrib/distributions/python/ops/chi2.py
@@ -42,15 +42,15 @@ class Chi2(gamma.Gamma):
     """Construct Chi2 distributions with parameter `df`.
 
     Args:
-      df: `float` or `double` tensor, the degrees of freedom of the
+      df: Floating point tensor, the degrees of freedom of the
         distribution(s).  `df` must contain only positive values.
       validate_args: Whether to assert that `df > 0`, and that `x > 0` in the
-        methods `prob(x)` and `log_prob(x)`. If `validate_args` is False
+        methods `prob(x)` and `log_prob(x)`. If `validate_args` is `False`
         and the inputs are invalid, correct behavior is not guaranteed.
-      allow_nan_stats:  Boolean, default False.  If False, raise an exception if
-        a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-        If True, batch members with valid parameters leading to undefined
-        statistics will return NaN for this statistic.
+      allow_nan_stats:  Boolean, default `False`.  If `False`, raise an
+        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+        batch member.  If `True`, batch members with valid parameters leading to
+        undefined statistics will return NaN for this statistic.
       name: The name to prepend to all ops created by this distribution.
     """
     # Even though all stats of chi2 are defined for valid parameters, this is
diff --git a/tensorflow/contrib/distributions/python/ops/dirichlet.py b/tensorflow/contrib/distributions/python/ops/dirichlet.py
index b4f59d5bd8c..25aee5cf03e 100644
--- a/tensorflow/contrib/distributions/python/ops/dirichlet.py
+++ b/tensorflow/contrib/distributions/python/ops/dirichlet.py
@@ -19,9 +19,8 @@ from __future__ import print_function
 
 # pylint: disable=line-too-long
 
-import numpy as np
-
 from tensorflow.contrib.distributions.python.ops import distribution
+from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -29,7 +28,6 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import special_math_ops
@@ -37,24 +35,6 @@ from tensorflow.python.ops import special_math_ops
 # pylint: enable=line-too-long
 
 
-def _assert_close(x, y, data=None, summarize=None, name=None):
-  if x.dtype.is_integer:
-    return check_ops.assert_equal(
-        x, y, data=data, summarize=summarize, name=name)
-
-  with ops.op_scope([x, y, data], name, "assert_close"):
-    x = ops.convert_to_tensor(x, name="x")
-    y = ops.convert_to_tensor(y, name="y")
-    tol = np.finfo(x.dtype.as_numpy_dtype).resolution
-    if data is None:
-      data = [
-          "Condition x ~= y did not hold element-wise: x = ", x.name, x, "y = ",
-          y.name, y
-      ]
-    condition = math_ops.reduce_all(math_ops.less_equal(math_ops.abs(x-y), tol))
-    return logging_ops.Assert(condition, data, summarize=summarize)
-
-
 class Dirichlet(distribution.Distribution):
   """Dirichlet distribution.
 
@@ -117,6 +97,7 @@ class Dirichlet(distribution.Distribution):
   x = [.2, .3, .5]
   dist.prob(x)  # Shape [2]
   ```
+
   """
 
   def __init__(self,
@@ -127,16 +108,16 @@ class Dirichlet(distribution.Distribution):
     """Initialize a batch of Dirichlet distributions.
 
     Args:
-      alpha:  Positive `float` or `double` tensor with shape broadcastable to
+      alpha:  Positive floating point tensor with shape broadcastable to
         `[N1,..., Nm, k]` `m >= 0`.  Defines this as a batch of `N1 x ... x Nm`
          different `k` class Dirichlet distributions.
       validate_args: Whether to assert valid values for parameters `alpha` and
-        `x` in `prob` and `log_prob`.  If False, correct behavior is not
+        `x` in `prob` and `log_prob`.  If `False`, correct behavior is not
         guaranteed.
-      allow_nan_stats:  Boolean, default False.  If False, raise an exception if
-        a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-        If True, batch members with valid parameters leading to undefined
-        statistics will return NaN for this statistic.
+      allow_nan_stats:  Boolean, default `False`.  If `False`, raise an
+        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+        batch member.  If `True`, batch members with valid parameters leading to
+        undefined statistics will return NaN for this statistic.
       name: The name to prefix Ops created by this distribution class.
 
     Examples:
@@ -149,6 +130,7 @@ class Dirichlet(distribution.Distribution):
     # Define a 2-batch of 3-class distributions.
     dist = Dirichlet([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
     ```
+
     """
     with ops.op_scope([alpha], name):
       alpha = ops.convert_to_tensor(alpha, name="alpha_before_deps")
@@ -302,7 +284,9 @@ class Dirichlet(distribution.Distribution):
                array_ops.ones_like(self._alpha, dtype=self.dtype)))
         else:
           return control_flow_ops.with_dependencies([
-              check_ops.assert_less(one, self._alpha)
+              check_ops.assert_less(
+                  one, self._alpha,
+                  message="mode not defined for components of alpha <= 1")
           ], mode)
 
   def entropy(self, name="entropy"):
@@ -334,7 +318,7 @@ class Dirichlet(distribution.Distribution):
     """`Log(P[counts])`, computed for every batch member.
 
     Args:
-      x:  Non-negative `float` or `double`, tensor whose shape can
+      x:  Non-negative tensor with dtype `dtype` and whose shape can
         be broadcast with `self.alpha`.  For fixed leading dimensions, the last
         dimension represents counts for the corresponding Dirichlet distribution
         in `self.alpha`. `x` is only legal if it sums up to one.
@@ -359,7 +343,7 @@ class Dirichlet(distribution.Distribution):
     """`P[x]`, computed for every batch member.
 
     Args:
-      x:  Non-negative `float`, `double` tensor whose shape can
+      x:  Non-negative tensor with dtype `dtype` and whose shape can
         be broadcast with `self.alpha`.  For fixed leading dimensions, the last
         dimension represents x for the corresponding Dirichlet distribution in
         `self.alpha` and `self.beta`. `x` is only legal if it sums up to one.
@@ -407,7 +391,8 @@ class Dirichlet(distribution.Distribution):
     x = ops.convert_to_tensor(x, name="x_before_deps")
     candidate_one = math_ops.reduce_sum(x, reduction_indices=[-1])
     one = constant_op.constant(1., self.dtype)
-    dependencies = [check_ops.assert_positive(x), check_ops.assert_less(x, one),
-                    _assert_close(one, candidate_one)
+    dependencies = [check_ops.assert_positive(x), check_ops.assert_less(
+        x, one, message="x has components greater than or equal to 1"),
+                    distribution_util.assert_close(one, candidate_one)
                    ] if self.validate_args else []
     return control_flow_ops.with_dependencies(dependencies, x)
diff --git a/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py b/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py
index 7c779fff065..67cdd566c67 100644
--- a/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py
+++ b/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py
@@ -13,13 +13,15 @@
 # limitations under the License.
 # ==============================================================================
 """The Dirichlet Multinomial distribution class."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=line-too-long
 
-from tensorflow.contrib.distributions.python.ops import distribution  # pylint: disable=line-too-long
+from tensorflow.contrib.distributions.python.ops import distribution
+from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -30,34 +32,6 @@ from tensorflow.python.ops import special_math_ops
 # pylint: enable=line-too-long
 
 
-def _assert_integer_form(x):
-  """Check x for integer components (or floats that are equal to integers)."""
-  x = ops.convert_to_tensor(x, name='x')
-  casted_x = math_ops.to_int64(x)
-  return check_ops.assert_equal(x, math_ops.cast(
-      math_ops.round(casted_x), x.dtype))
-
-
-def _log_combinations(n, counts, name='log_combinations'):
-  """Log number of ways counts could have come in."""
-  # First a bit about the number of ways counts could have come in:
-  # E.g. if counts = [1, 2], then this is 3 choose 2.
-  # In general, this is (sum counts)! / sum(counts!)
-  # The sum should be along the last dimension of counts.  This is the
-  # "distribution" dimension. Here n a priori represents the sum of counts.
-  with ops.op_scope([counts], name):
-    # To compute factorials, use the fact that Gamma(n + 1) = n!
-    # Compute two terms, each a sum over counts.  Compute each for each
-    # batch member.
-    # Log Gamma((sum counts) + 1) = Log((sum counts)!)
-    total_permutations = math_ops.lgamma(n + 1)
-    # sum(Log Gamma(counts + 1)) = Log sum(counts!)
-    counts_factorial = math_ops.lgamma(counts + 1)
-    redundant_permutations = math_ops.reduce_sum(counts_factorial,
-                                                 reduction_indices=[-1])
-    return total_permutations - redundant_permutations
-
-
 class DirichletMultinomial(distribution.Distribution):
   """DirichletMultinomial mixture distribution.
 
@@ -126,6 +100,7 @@ class DirichletMultinomial(distribution.Distribution):
   counts = [2, 1, 0]
   dist.pmf(counts)  # Shape [2]
   ```
+
   """
 
   # TODO(b/27419586) Change docstring for dtype of alpha once int allowed.
@@ -134,26 +109,26 @@ class DirichletMultinomial(distribution.Distribution):
                alpha,
                validate_args=True,
                allow_nan_stats=False,
-               name='DirichletMultinomial'):
+               name="DirichletMultinomial"):
     """Initialize a batch of DirichletMultinomial distributions.
 
     Args:
-      n:  Non-negative `float` or `double` tensor, whose dtype is the same as
+      n:  Non-negative floating point tensor, whose dtype is the same as
         `alpha`. The shape is broadcastable to `[N1,..., Nm]` with `m >= 0`.
         Defines this as a batch of `N1 x ... x Nm` different Dirichlet
-        multinomial distributions. Its components should be equal to integral
+        multinomial distributions. Its components should be equal to integer
         values.
-      alpha:  Positive `float` or `double` tensor, whose dtype is the same as
+      alpha: Positive floating point tensor, whose dtype is the same as
         `n` with shape broadcastable to `[N1,..., Nm, k]` `m >= 0`.  Defines
         this as a batch of `N1 x ... x Nm` different `k` class Dirichlet
         multinomial distributions.
       validate_args: Whether to assert valid values for parameters `alpha` and
-        `n`, and `x` in `prob` and `log_prob`.  If False, correct behavior is
+        `n`, and `x` in `prob` and `log_prob`.  If `False`, correct behavior is
         not guaranteed.
-      allow_nan_stats:  Boolean, default False.  If False, raise an exception if
-        a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-        If True, batch members with valid parameters leading to undefined
-        statistics will return NaN for this statistic.
+      allow_nan_stats:  Boolean, default `False`.  If `False`, raise an
+        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+        batch member.  If `True`, batch members with valid parameters leading to
+        undefined statistics will return NaN for this statistic.
       name: The name to prefix Ops created by this distribution class.
 
     Examples:
@@ -166,6 +141,7 @@ class DirichletMultinomial(distribution.Distribution):
     # Define a 2-batch of 3-class distributions.
     dist = DirichletMultinomial([3., 4], [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
     ```
+
     """
     self._allow_nan_stats = allow_nan_stats
     self._validate_args = validate_args
@@ -221,7 +197,7 @@ class DirichletMultinomial(distribution.Distribution):
     """dtype of samples from this distribution."""
     return self._alpha.dtype
 
-  def mean(self, name='mean'):
+  def mean(self, name="mean"):
     """Class means for every batch member."""
     alpha = self._alpha
     alpha_sum = self._alpha_sum
@@ -231,7 +207,7 @@ class DirichletMultinomial(distribution.Distribution):
         mean_no_n = alpha / array_ops.expand_dims(alpha_sum, -1)
         return array_ops.expand_dims(n, -1) * mean_no_n
 
-  def variance(self, name='mean'):
+  def variance(self, name="mean"):
     """Class variances for every batch member.
 
     The variance for each batch member is defined as the following:
@@ -273,7 +249,7 @@ class DirichletMultinomial(distribution.Distribution):
         variance *= array_ops.expand_dims(shared_factor, -1)
         return variance
 
-  def batch_shape(self, name='batch_shape'):
+  def batch_shape(self, name="batch_shape"):
     """Batch dimensions of this instance as a 1-D int32 `Tensor`.
 
     The product of the dimensions of the `batch_shape` is the number of
@@ -299,7 +275,7 @@ class DirichletMultinomial(distribution.Distribution):
     """
     return self._get_batch_shape
 
-  def event_shape(self, name='event_shape'):
+  def event_shape(self, name="event_shape"):
     """Shape of a sample from a single distribution as a 1-D int32 `Tensor`.
 
     Args:
@@ -322,15 +298,15 @@ class DirichletMultinomial(distribution.Distribution):
     """
     return self._get_event_shape
 
-  def cdf(self, x, name='cdf'):
+  def cdf(self, x, name="cdf"):
     raise NotImplementedError(
-        'DirichletMultinomial does not have a well-defined cdf.')
+        "DirichletMultinomial does not have a well-defined cdf.")
 
-  def log_cdf(self, x, name='log_cdf'):
+  def log_cdf(self, x, name="log_cdf"):
     raise NotImplementedError(
-        'DirichletMultinomial does not have a well-defined cdf.')
+        "DirichletMultinomial does not have a well-defined cdf.")
 
-  def log_prob(self, counts, name='log_prob'):
+  def log_prob(self, counts, name="log_prob"):
     """`Log(P[counts])`, computed for every batch member.
 
     For each batch of counts `[n_1,...,n_k]`, `P[counts]` is the probability
@@ -340,12 +316,11 @@ class DirichletMultinomial(distribution.Distribution):
     probability includes a combinatorial coefficient.
 
     Args:
-      counts:  Non-negative `float` or `double` tensor whose dtype is the same
-        `self` and whose shape can be broadcast with `self.alpha`.  For fixed
-        leading dimensions, the last dimension represents counts for the
-        corresponding Dirichlet Multinomial distribution in `self.alpha`.
-        `counts` is only legal if it sums up to `n` and its components are
-        equal to integral values.
+      counts:  Non-negative tensor with dtype `dtype` and whose shape can be
+        broadcast with `self.alpha`.  For fixed leading dimensions, the last
+        dimension represents counts for the corresponding Dirichlet Multinomial
+        distribution in `self.alpha`. `counts` is only legal if it sums up to
+        `n` and its components are equal to integer values.
       name:  Name to give this Op, defaults to "log_prob".
 
     Returns:
@@ -359,20 +334,11 @@ class DirichletMultinomial(distribution.Distribution):
 
         ordered_prob = (special_math_ops.lbeta(alpha + counts) -
                         special_math_ops.lbeta(alpha))
-        log_prob = ordered_prob + _log_combinations(n, counts)
-        # If alpha = counts = [[]], ordered_prob carries the right shape, which
-        # is [].  However, since reduce_sum([[]]) = [0], log_combinations = [0],
-        # which is not correct.  Luckily, [] + [0] = [], so the sum is fine, but
-        # shape must be inferred from ordered_prob. We must also make this
-        # broadcastable with n, so this is multiplied by n to ensure the shape
-        # is correctly inferred.
-        # Note also that tf.constant([]).get_shape() =
-        # TensorShape([Dimension(0)])
-        broadcasted_tensor = ordered_prob * n
-        log_prob.set_shape(broadcasted_tensor.get_shape())
+        log_prob = ordered_prob + distribution_util.log_combinations(
+            n, counts)
         return log_prob
 
-  def prob(self, counts, name='prob'):
+  def prob(self, counts, name="prob"):
     """`P[counts]`, computed for every batch member.
 
     For each batch of counts `[c_1,...,c_k]`, `P[counts]` is the probability
@@ -382,12 +348,11 @@ class DirichletMultinomial(distribution.Distribution):
     probability includes a combinatorial coefficient.
 
     Args:
-      counts:  Non-negative `float` or `double` tensor whose dtype is the same
-        `self` and whose shape can be broadcast with `self.alpha`.  For fixed
-        leading dimensions, the last dimension represents counts for the
-        corresponding Dirichlet Multinomial distribution in `self.alpha`.
-        `counts` is only legal if it sums up to `n` and its components are
-        equal to integral values.
+      counts:  Non-negative tensor with dtype `dtype` and whose shape can be
+        broadcast with `self.alpha`.  For fixed leading dimensions, the last
+        dimension represents counts for the corresponding Dirichlet Multinomial
+        distribution in `self.alpha`. `counts` is only legal if it sums up to
+        `n` and its components are equal to integer values.
       name:  Name to give this Op, defaults to "prob".
 
     Returns:
@@ -397,18 +362,21 @@ class DirichletMultinomial(distribution.Distribution):
 
   def _check_counts(self, counts):
     """Check counts for proper shape, values, then return tensor version."""
-    counts = ops.convert_to_tensor(counts, name='counts')
+    counts = ops.convert_to_tensor(counts, name="counts")
     if not self.validate_args:
       return counts
     candidate_n = math_ops.reduce_sum(counts, reduction_indices=[-1])
 
     return control_flow_ops.with_dependencies([
         check_ops.assert_non_negative(counts),
-        check_ops.assert_equal(self._n, candidate_n),
-        _assert_integer_form(counts)], counts)
+        check_ops.assert_equal(
+            self._n, candidate_n,
+            message="counts do not sum to n"
+        ),
+        distribution_util.assert_integer_form(counts)], counts)
 
   def _check_alpha(self, alpha):
-    alpha = ops.convert_to_tensor(alpha, name='alpha')
+    alpha = ops.convert_to_tensor(alpha, name="alpha")
     if not self.validate_args:
       return alpha
     return control_flow_ops.with_dependencies(
@@ -416,11 +384,12 @@ class DirichletMultinomial(distribution.Distribution):
          check_ops.assert_positive(alpha)], alpha)
 
   def _check_n(self, n):
-    n = ops.convert_to_tensor(n, name='n')
+    n = ops.convert_to_tensor(n, name="n")
     if not self.validate_args:
       return n
     return control_flow_ops.with_dependencies(
-        [check_ops.assert_non_negative(n), _assert_integer_form(n)], n)
+        [check_ops.assert_non_negative(n),
+         distribution_util.assert_integer_form(n)], n)
 
   @property
   def is_continuous(self):
diff --git a/tensorflow/contrib/distributions/python/ops/distribution_util.py b/tensorflow/contrib/distributions/python/ops/distribution_util.py
new file mode 100644
index 00000000000..9c751270032
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/distribution_util.py
@@ -0,0 +1,177 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for probability distributions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import math_ops
+
+
+def assert_close(
+    x, y, data=None, summarize=None, message=None, name="assert_close"):
+  """Assert that that x and y are within machine epsilon of each other.
+
+  Args:
+    x: Numeric `Tensor`
+    y: Numeric `Tensor`
+    data: The tensors to print out if the condition is `False`. Defaults to
+      error message and first few entries of `x` and `y`.
+    summarize: Print this many entries of each tensor.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional).
+
+  Returns:
+    Op raising `InvalidArgumentError` if |x - y| > machine epsilon.
+  """
+  message = message or ""
+  x = ops.convert_to_tensor(x, name="x")
+  y = ops.convert_to_tensor(y, name="y")
+
+  if x.dtype.is_integer:
+    return check_ops.assert_equal(
+        x, y, data=data, summarize=summarize, message=message, name=name)
+
+  with ops.op_scope([x, y, data], name, "assert_close"):
+    tol = np.finfo(x.dtype.as_numpy_dtype).resolution
+    if data is None:
+      data = [
+          message,
+          "Condition x ~= y did not hold element-wise: x = ", x.name, x, "y = ",
+          y.name, y
+      ]
+    condition = math_ops.reduce_all(math_ops.less_equal(math_ops.abs(x-y), tol))
+    return logging_ops.Assert(
+        condition, data, summarize=summarize)
+
+
+def assert_integer_form(
+    x, data=None, summarize=None, message=None, name="assert_integer_form"):
+  """Assert that x has integer components (or floats equal to integers).
+
+  Args:
+    x: Numeric `Tensor`
+    data: The tensors to print out if the condition is `False`. Defaults to
+      error message and first few entries of `x` and `y`.
+    summarize: Print this many entries of each tensor.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional).
+
+  Returns:
+    Op raising `InvalidArgumentError` if round(x) != x.
+  """
+
+  message = message or "x has non-integer components"
+  x = ops.convert_to_tensor(x, name="x")
+  casted_x = math_ops.to_int64(x)
+  return check_ops.assert_equal(
+      x, math_ops.cast(math_ops.round(casted_x), x.dtype),
+      data=data, summarize=summarize, message=message, name=name)
+
+
+def get_logits_and_prob(
+    logits=None, p=None, multidimensional=False, validate_args=True, name=None):
+  """Converts logits to probabilities and vice-versa, and returns both.
+
+  Args:
+    logits: Numeric `Tensor` representing log-odds.
+    p: Numeric `Tensor` representing probabilities.
+    multidimensional: Given `p` a [N1, N2, ... k] dimensional tensor,
+      whether the last dimension represents the probability between k classes.
+      This will additionally assert that the values in the last dimension
+      sum to one. If `False`, will instead assert that each value is in
+      `[0, 1]`.
+    validate_args: Whether to assert `0 <= p <= 1` if multidimensional is
+      `False`, otherwise that the last dimension of `p` sums to one.
+    name: A name for this operation (optional).
+
+  Returns:
+    Tuple with `logits` and `p`. If `p` has an entry that is `0` or `1`, then
+    the corresponding entry in the returned logits will be `-Inf` and `Inf`
+    respectively.
+
+  Raises:
+    ValueError: if neither `p` nor `logits` were passed in, or both were.
+  """
+  if p is None and logits is None:
+    raise ValueError("Must pass p or logits.")
+  elif p is not None and logits is not None:
+    raise ValueError("Must pass either p or logits, not both.")
+  elif p is None:
+    with ops.op_scope([logits], name):
+      logits = array_ops.identity(logits, name="logits")
+    with ops.name_scope(name):
+      with ops.name_scope("p"):
+        p = math_ops.sigmoid(logits)
+  elif logits is None:
+    with ops.name_scope(name):
+      with ops.name_scope("p"):
+        p = array_ops.identity(p)
+        if validate_args:
+          one = constant_op.constant(1., p.dtype)
+          dependencies = [check_ops.assert_non_negative(p)]
+          if multidimensional:
+            dependencies += [assert_close(
+                math_ops.reduce_sum(p, reduction_indices=[-1]),
+                one, message="p does not sum to 1.")]
+          else:
+            dependencies += [check_ops.assert_less_equal(
+                p, one, message="p has components greater than 1.")]
+          p = control_flow_ops.with_dependencies(dependencies, p)
+      with ops.name_scope("logits"):
+        logits = math_ops.log(p) - math_ops.log(1. - p)
+  return (logits, p)
+
+
+def log_combinations(n, counts, name="log_combinations"):
+  """Multinomial coefficient.
+
+  Given `n` and `counts`, where `counts` has last dimension `k`, we compute
+  the multinomial coefficient as:
+
+  ```n! / sum_i n_i!```
+
+  where `i` runs over all `k` classes.
+
+  Args:
+    n: Numeric `Tensor` broadcastable with `counts`. This represents `n`
+      outcomes.
+    counts: Numeric `Tensor` broadcastable with `n`. This represents counts
+      in `k` classes, where `k` is the last dimension of the tensor.
+    name: A name for this operation (optional).
+
+  Returns:
+    `Tensor` representing the multinomial coefficient between `n` and `counts`.
+  """
+  # First a bit about the number of ways counts could have come in:
+  # E.g. if counts = [1, 2], then this is 3 choose 2.
+  # In general, this is (sum counts)! / sum(counts!)
+  # The sum should be along the last dimension of counts.  This is the
+  # "distribution" dimension. Here n a priori represents the sum of counts.
+  with ops.op_scope([n, counts], name):
+    total_permutations = math_ops.lgamma(n + 1)
+    counts_factorial = math_ops.lgamma(counts + 1)
+    redundant_permutations = math_ops.reduce_sum(counts_factorial,
+                                                 reduction_indices=[-1])
+    return total_permutations - redundant_permutations
diff --git a/tensorflow/contrib/distributions/python/ops/exponential.py b/tensorflow/contrib/distributions/python/ops/exponential.py
index c49b3eeba8d..c1a7eb025ef 100644
--- a/tensorflow/contrib/distributions/python/ops/exponential.py
+++ b/tensorflow/contrib/distributions/python/ops/exponential.py
@@ -46,15 +46,15 @@ class Exponential(gamma.Gamma):
     """Construct Exponential distribution with parameter `lam`.
 
     Args:
-      lam: `float` or `double` tensor, the rate of the distribution(s).
+      lam: Floating point tensor, the rate of the distribution(s).
         `lam` must contain only positive values.
       validate_args: Whether to assert that `lam > 0`, and that `x > 0` in the
-        methods `prob(x)` and `log_prob(x)`.  If `validate_args` is False
+        methods `prob(x)` and `log_prob(x)`.  If `validate_args` is `False`
         and the inputs are invalid, correct behavior is not guaranteed.
-      allow_nan_stats:  Boolean, default False.  If False, raise an exception if
-        a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-        If True, batch members with valid parameters leading to undefined
-        statistics will return NaN for this statistic.
+      allow_nan_stats:  Boolean, default `False`.  If `False`, raise an
+        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+        batch member. If `True`, batch members with valid parameters leading to
+        undefined statistics will return NaN for this statistic.
       name: The name to prepend to all ops created by this distribution.
     """
     # Even though all statistics of are defined for valid inputs, this is not
@@ -95,8 +95,7 @@ class Exponential(gamma.Gamma):
     broadcast_shape = self._lam.get_shape()
     with ops.op_scope([self.lam, n], name, "ExponentialSample"):
       n = ops.convert_to_tensor(n, name="n")
-      shape = array_ops.concat(
-          0, [array_ops.pack([n]), array_ops.shape(self._lam)])
+      shape = array_ops.concat(0, ([n], array_ops.shape(self._lam)))
       # Sample uniformly-at-random from the open-interval (0, 1).
       sampled = random_ops.random_uniform(
           shape, minval=np.nextafter(
diff --git a/tensorflow/contrib/distributions/python/ops/gamma.py b/tensorflow/contrib/distributions/python/ops/gamma.py
index 1f733ceda16..6bd93877613 100644
--- a/tensorflow/contrib/distributions/python/ops/gamma.py
+++ b/tensorflow/contrib/distributions/python/ops/gamma.py
@@ -69,19 +69,19 @@ class Gamma(distribution.Distribution):
     broadcasting (e.g. `alpha + beta` is a valid operation).
 
     Args:
-      alpha: `float` or `double` tensor, the shape params of the
+      alpha: Floating point tensor, the shape params of the
         distribution(s).
         alpha must contain only positive values.
-      beta: `float` or `double` tensor, the inverse scale params of the
+      beta: Floating point tensor, the inverse scale params of the
         distribution(s).
         beta must contain only positive values.
       validate_args: Whether to assert that `a > 0, b > 0`, and that `x > 0` in
-        the methods `prob(x)` and `log_prob(x)`.  If `validate_args` is False
+        the methods `prob(x)` and `log_prob(x)`.  If `validate_args` is `False`
         and the inputs are invalid, correct behavior is not guaranteed.
-      allow_nan_stats:  Boolean, default False.  If False, raise an exception if
-        a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-        If True, batch members with valid parameters leading to undefined
-        statistics will return NaN for this statistic.
+      allow_nan_stats:  Boolean, default `False`.  If `False`, raise an
+        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+        batch member.  If `True`, batch members with valid parameters leading to
+        undefined statistics will return NaN for this statistic.
       name: The name to prepend to all ops created by this distribution.
 
     Raises:
@@ -213,9 +213,12 @@ class Gamma(distribution.Distribution):
           nan = np.nan * self._ones()
           return math_ops.select(alpha_ge_1, mode_if_defined, nan)
         else:
-          one = ops.convert_to_tensor(1.0, dtype=self.dtype)
+          one = constant_op.constant(1.0, dtype=self.dtype)
           return control_flow_ops.with_dependencies(
-              [check_ops.assert_less(one, alpha)], mode_if_defined)
+              [check_ops.assert_less(
+                  one, alpha,
+                  message="mode not defined for components of alpha <= 1"
+              )], mode_if_defined)
 
   def variance(self, name="variance"):
     """Variance of each batch member."""
diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
index a23f6df5717..d78e82a7524 100644
--- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
+++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
@@ -69,18 +69,18 @@ class InverseGamma(distribution.Distribution):
     broadcasting (e.g. `alpha + beta` is a valid operation).
 
     Args:
-      alpha: `float` or `double` tensor, the shape params of the
+      alpha: Floating point tensor, the shape params of the
         distribution(s).
         alpha must contain only positive values.
-      beta: `float` or `double` tensor, the scale params of the distribution(s).
+      beta: Floating point tensor, the scale params of the distribution(s).
         beta must contain only positive values.
       validate_args: Whether to assert that `a > 0, b > 0`, and that `x > 0` in
-        the methods `prob(x)` and `log_prob(x)`.  If `validate_args` is False
+        the methods `prob(x)` and `log_prob(x)`.  If `validate_args` is `False`
         and the inputs are invalid, correct behavior is not guaranteed.
-      allow_nan_stats:  Boolean, default False.  If False, raise an exception if
-        a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-        If True, batch members with valid parameters leading to undefined
-        statistics will return NaN for this statistic.
+      allow_nan_stats:  Boolean, default `False`.  If `False`, raise an
+        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+        batch member.  If `True`, batch members with valid parameters leading to
+        undefined statistics will return NaN for this statistic.
       name: The name to prepend to all ops created by this distribution.
 
     Raises:
@@ -206,9 +206,12 @@ class InverseGamma(distribution.Distribution):
           nan = np.nan * self._ones()
           return math_ops.select(alpha_gt_1, mean_if_defined, nan)
         else:
-          one = ops.convert_to_tensor(1.0, dtype=self.dtype)
+          one = constant_op.constant(1.0, dtype=self.dtype)
           return control_flow_ops.with_dependencies(
-              [check_ops.assert_less(one, alpha)], mean_if_defined)
+              [check_ops.assert_less(
+                  one, alpha,
+                  message="mean not defined for components of alpha <= 1")],
+              mean_if_defined)
 
   def mode(self, name="mode"):
     """Mode of each batch member.
@@ -250,9 +253,12 @@ class InverseGamma(distribution.Distribution):
           nan = np.nan * self._ones()
           return math_ops.select(alpha_gt_2, var_if_defined, nan)
         else:
-          two = ops.convert_to_tensor(2.0, dtype=self.dtype)
+          two = constant_op.constant(2.0, dtype=self.dtype)
           return control_flow_ops.with_dependencies(
-              [check_ops.assert_less(two, alpha)], var_if_defined)
+              [check_ops.assert_less(
+                  two, alpha,
+                  message="variance not defined for components of alpha <= 2")],
+              var_if_defined)
 
   def log_prob(self, x, name="log_prob"):
     """Log prob of observations in `x` under these InverseGamma distribution(s).
diff --git a/tensorflow/contrib/distributions/python/ops/kullback_leibler.py b/tensorflow/contrib/distributions/python/ops/kullback_leibler.py
index c134ca2cbfd..c1e0b2d2398 100644
--- a/tensorflow/contrib/distributions/python/ops/kullback_leibler.py
+++ b/tensorflow/contrib/distributions/python/ops/kullback_leibler.py
@@ -34,9 +34,9 @@ def kl(dist_a, dist_b, allow_nan=False, name=None):
   Args:
     dist_a: instance of distributions.Distribution.
     dist_b: instance of distributions.Distribution.
-    allow_nan: If False (default), a runtime error is raised
+    allow_nan: If `False` (default), a runtime error is raised
       if the KL returns NaN values for any batch entry of the given
-      distributions.  If True, the KL may return a NaN for the given entry.
+      distributions.  If `True`, the KL may return a NaN for the given entry.
     name: (optional) Name scope to use for created operations.
 
   Returns:
diff --git a/tensorflow/contrib/distributions/python/ops/laplace.py b/tensorflow/contrib/distributions/python/ops/laplace.py
index ee6aa81c0f4..a03a80d4ece 100644
--- a/tensorflow/contrib/distributions/python/ops/laplace.py
+++ b/tensorflow/contrib/distributions/python/ops/laplace.py
@@ -60,17 +60,17 @@ class Laplace(distribution.Distribution):
     broadcasting (e.g., `loc / scale` is a valid operation).
 
     Args:
-      loc: `float` or `double` tensor which characterizes the location (center)
+      loc: Floating point tensor which characterizes the location (center)
         of the distribution.
-      scale: `float` or `double`, positive-valued tensor which characterzes the
-        spread of the distribution.
+      scale: Positive floating point tensor which characterizes the spread of
+        the distribution.
       validate_args: Whether to validate input with asserts.  If `validate_args`
         is `False`, and the inputs are invalid, correct behavior is not
         guaranteed.
-      allow_nan_stats:  Boolean, default False.  If False, raise an exception if
-        a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-        If True, batch members with valid parameters leading to undefined
-        statistics will return NaN for this statistic.
+      allow_nan_stats:  Boolean, default `False`.  If `False`, raise an
+        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+        batch member.  If `True`, batch members with valid parameters leading to
+        undefined statistics will return NaN for this statistic.
       name: The name to give Ops created by the initializer.
 
     Raises:
@@ -294,8 +294,7 @@ class Laplace(distribution.Distribution):
       with ops.op_scope([self._loc, self._scale, n], name):
         n = ops.convert_to_tensor(n)
         n_val = tensor_util.constant_value(n)
-        shape = array_ops.concat(
-            0, [array_ops.pack([n]), self.batch_shape()])
+        shape = array_ops.concat(0, ([n], self.batch_shape()))
         # Sample uniformly-at-random from the open-interval (-1, 1).
         uniform_samples = random_ops.random_uniform(
             shape=shape,
diff --git a/tensorflow/contrib/distributions/python/ops/multinomial.py b/tensorflow/contrib/distributions/python/ops/multinomial.py
new file mode 100644
index 00000000000..477dd06673e
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/multinomial.py
@@ -0,0 +1,343 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The Multinomial distribution class."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=line-too-long
+
+from tensorflow.contrib.distributions.python.ops import distribution
+from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+
+# pylint: enable=line-too-long
+
+
+class Multinomial(distribution.Distribution):
+  """Multinomial distribution.
+
+  This distribution is parameterized by a vector `p` of probability
+  parameters for `k` classes and `n`, the counts per each class..
+
+  #### Mathematical details
+
+  The Multinomial is a distribution over k-class count data, meaning
+  for each k-tuple of non-negative integer `counts = [n_1,...,n_k]`, we have a
+  probability of these draws being made from the distribution.  The distribution
+  has hyperparameters `p = (p_1,...,p_k)`, and probability mass
+  function (pmf):
+
+  ```pmf(counts) = n! / (n_1!...n_k!) * (p_1)^n_1*(p_2)^n_2*...(p_k)^n_k```
+
+  where above `n = sum_j n_j`, `n!` is `n` factorial.
+
+  #### Examples
+
+  Create a 3-class distribution, with the 3rd class is most likely to be drawn,
+  using logits..
+
+  ```python
+  logits = [-50., -43, 0]
+  dist = Multinomial(n=4., logits=logits)
+  ```
+
+  Create a 3-class distribution, with the 3rd class is most likely to be drawn.
+
+  ```python
+  p = [.2, .3, .5]
+  dist = Multinomial(n=4., p=p)
+  ```
+
+  The distribution functions can be evaluated on counts.
+
+  ```python
+  # counts same shape as p.
+  counts = [1., 0, 3]
+  dist.prob(counts)  # Shape []
+
+  # p will be broadcast to [[.2, .3, .5], [.2, .3, .5]] to match counts.
+  counts = [[1., 2, 1], [2, 2, 0]]
+  dist.prob(counts)  # Shape [2]
+
+  # p will be broadcast to shape [5, 7, 3] to match counts.
+  counts = [[...]]  # Shape [5, 7, 3]
+  dist.prob(counts)  # Shape [5, 7]
+  ```
+
+  Create a 2-batch of 3-class distributions.
+
+  ```python
+  p = [[.1, .2, .7], [.3, .3, .4]]  # Shape [2, 3]
+  dist = Multinomial(n=[4., 5], p=p)
+
+  counts = [[2., 1, 1], [3, 1, 1]]
+  dist.prob(counts)  # Shape [2]
+  ```
+  """
+
+  def __init__(self,
+               n,
+               logits=None,
+               p=None,
+               validate_args=True,
+               allow_nan_stats=False,
+               name="Multinomial"):
+    """Initialize a batch of Multinomial distributions.
+
+    Args:
+      n:  Non-negative floating point tensor with shape broadcastable to
+        `[N1,..., Nm]` with `m >= 0`. Defines this as a batch of
+        `N1 x ... x Nm` different Multinomial distributions.  Its components
+        should be equal to integer values.
+      logits: Floating point tensor representing the log-odds of a
+        positive event with shape broadcastable to `[N1,..., Nm, k], m >= 0`,
+        and the same dtype as `n`. Defines this as a batch of `N1 x ... x Nm`
+        different `k` class Multinomial distributions.
+      p:  Positive floating point tensor with shape broadcastable to
+        `[N1,..., Nm, k]` `m >= 0` and same dtype as `n`.  Defines this as
+        a batch of `N1 x ... x Nm` different `k` class Multinomial
+        distributions. `p`'s components in the last portion of its shape should
+        sum up to 1.
+      validate_args: Whether to assert valid values for parameters `n` and `p`,
+        and `x` in `prob` and `log_prob`.  If `False`, correct behavior is not
+        guaranteed.
+      allow_nan_stats:  Boolean, default `False`.  If `False`, raise an
+        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+        batch member.  If `True`, batch members with valid parameters leading to
+        undefined statistics will return NaN for this statistic.
+      name: The name to prefix Ops created by this distribution class.
+
+    Examples:
+
+    ```python
+    # Define 1-batch of 2-class multinomial distribution,
+    # also known as a Binomial distribution.
+    dist = Multinomial(n=2., p=[.1, .9])
+
+    # Define a 2-batch of 3-class distributions.
+    dist = Multinomial(n=[4., 5], p=[[.1, .3, .6], [.4, .05, .55]])
+    ```
+
+    """
+
+    self._logits, self._p = distribution_util.get_logits_and_prob(
+        name=name, logits=logits, p=p, validate_args=validate_args,
+        multidimensional=True)
+    with ops.op_scope([n, self._p], name):
+      with ops.control_dependencies([
+          check_ops.assert_non_negative(
+              n, message="n has negative components."),
+          distribution_util.assert_integer_form(
+              n, message="n has non-integer components."
+          )] if validate_args else []):
+        self._n = array_ops.identity(n, name="convert_n")
+        self._name = name
+
+        self._validate_args = validate_args
+        self._allow_nan_stats = allow_nan_stats
+
+        self._mean = array_ops.expand_dims(n, -1) * self._p
+        # Only used for inferring shape.
+        self._broadcast_shape = math_ops.reduce_sum(self._mean,
+                                                    reduction_indices=[-1],
+                                                    keep_dims=False)
+
+        self._get_batch_shape = self._broadcast_shape.get_shape()
+        self._get_event_shape = (
+            self._mean.get_shape().with_rank_at_least(1)[-1:])
+
+  @property
+  def n(self):
+    """Number of trials."""
+    return self._n
+
+  @property
+  def p(self):
+    """Event probabilities."""
+    return self._p
+
+  @property
+  def logits(self):
+    """Log-odds."""
+    return self._logits
+
+  @property
+  def name(self):
+    """Name to prepend to all ops."""
+    return self._name
+
+  @property
+  def dtype(self):
+    """dtype of samples from this distribution."""
+    return self._p.dtype
+
+  @property
+  def validate_args(self):
+    """Boolean describing behavior on invalid input."""
+    return self._validate_args
+
+  @property
+  def allow_nan_stats(self):
+    """Boolean describing behavior when a stat is undefined for batch member."""
+    return self._allow_nan_stats
+
+  def batch_shape(self, name="batch_shape"):
+    """Batch dimensions of this instance as a 1-D int32 `Tensor`.
+
+    The product of the dimensions of the `batch_shape` is the number of
+    independent distributions of this kind the instance represents.
+
+    Args:
+      name: name to give to the op
+
+    Returns:
+      `Tensor` `batch_shape`
+    """
+    with ops.name_scope(self.name):
+      with ops.op_scope([self._broadcast_shape], name):
+        return array_ops.shape(self._broadcast_shape)
+
+  def get_batch_shape(self):
+    """`TensorShape` available at graph construction time.
+
+    Same meaning as `batch_shape`. May be only partially defined.
+
+    Returns:
+      batch shape
+    """
+    return self._get_batch_shape
+
+  def event_shape(self, name="event_shape"):
+    """Shape of a sample from a single distribution as a 1-D int32 `Tensor`.
+
+    Args:
+      name: name to give to the op
+
+    Returns:
+      `Tensor` `event_shape`
+    """
+    with ops.name_scope(self.name):
+      with ops.op_scope([self._mean], name):
+        return array_ops.gather(array_ops.shape(self._mean),
+                                [array_ops.rank(self._mean) - 1])
+
+  def get_event_shape(self):
+    """`TensorShape` available at graph construction time.
+
+    Same meaning as `event_shape`. May be only partially defined.
+
+    Returns:
+      event shape
+    """
+    return self._get_event_shape
+
+  def mean(self, name="mean"):
+    """Mean of the distribution."""
+    with ops.name_scope(self.name):
+      return array_ops.identity(self._mean, name=name)
+
+  def variance(self, name="variance"):
+    """Variance of the distribution."""
+    with ops.name_scope(self.name):
+      with ops.op_scope([self._n, self._p, self._mean], name):
+        p = array_ops.expand_dims(
+            self._p * array_ops.expand_dims(
+                array_ops.ones_like(self._n), -1), -1)
+        variance = -math_ops.batch_matmul(
+            array_ops.expand_dims(self._mean, -1), p, adj_y=True)
+        variance += array_ops.batch_matrix_diag(self._mean)
+        return variance
+
+  def log_prob(self, counts, name="log_prob"):
+    """`Log(P[counts])`, computed for every batch member.
+
+    For each batch of counts `[n_1,...,n_k]`, `P[counts]` is the probability
+    that after sampling `n` draws from this Multinomial distribution, the
+    number of draws falling in class `j` is `n_j`.  Note that different
+    sequences of draws can result in the same counts, thus the probability
+    includes a combinatorial coefficient.
+
+    Args:
+      counts:  Non-negative tensor with dtype `dtype` and whose shape can
+        be broadcast with `self.p` and `self.n`.  For fixed leading dimensions,
+        the last dimension represents counts for the corresponding Multinomial
+        distribution in `self.p`. `counts` is only legal if it sums up to `n`
+        and its components are equal to integer values.
+      name:  Name to give this Op, defaults to "log_prob".
+
+    Returns:
+      Log probabilities for each record, shape `[N1,...,Nm]`.
+    """
+    n = self._n
+    p = self._p
+    with ops.name_scope(self.name):
+      with ops.op_scope([n, p, counts], name):
+        counts = self._check_counts(counts)
+
+        prob_prob = math_ops.reduce_sum(counts * math_ops.log(self._p),
+                                        reduction_indices=[-1])
+        log_prob = prob_prob + distribution_util.log_combinations(
+            n, counts)
+        return log_prob
+
+  def prob(self, counts, name="prob"):
+    """`P[counts]`, computed for every batch member.
+
+    For each batch of counts `[n_1,...,n_k]`, `P[counts]` is the probability
+    that after sampling `n` draws from this Multinomial distribution, the
+    number of draws falling in class `j` is `n_j`.  Note that different
+    sequences of draws can result in the same counts, thus the probability
+    includes a combinatorial coefficient.
+
+    Args:
+      counts:  Non-negative tensor with dtype `dtype` and whose shape can
+        be broadcast with `self.p` and `self.n`.  For fixed leading dimensions,
+        the last dimension represents counts for the corresponding Multinomial
+        distribution in `self.p`. `counts` is only legal if it sums up to `n`
+        and its components are equal to integer values.
+      name:  Name to give this Op, defaults to "prob".
+
+    Returns:
+      Probabilities for each record, shape `[N1,...,Nm]`.
+    """
+    return super(Multinomial, self).prob(counts, name=name)
+
+  @property
+  def is_continuous(self):
+    return False
+
+  @property
+  def is_reparameterized(self):
+    return False
+
+  def _check_counts(self, counts):
+    """Check counts for proper shape, values, then return tensor version."""
+    counts = ops.convert_to_tensor(counts, name="counts_before_deps")
+    candidate_n = math_ops.reduce_sum(counts, reduction_indices=[-1])
+    if not self.validate_args:
+      return counts
+
+    return control_flow_ops.with_dependencies([
+        check_ops.assert_non_negative(
+            counts, message="counts has negative components."),
+        check_ops.assert_equal(
+            self._n, candidate_n, message="counts do not sum to n."),
+        distribution_util.assert_integer_form(
+            counts, message="counts have non-integer components.")], counts)
diff --git a/tensorflow/contrib/distributions/python/ops/mvn.py b/tensorflow/contrib/distributions/python/ops/mvn.py
index a3b1baeba52..8936594dfac 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import math
 
 from tensorflow.contrib.distributions.python.ops import distribution
+from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.contrib.distributions.python.ops import operator_pd_cholesky
 from tensorflow.contrib.distributions.python.ops import operator_pd_diag
 from tensorflow.contrib.distributions.python.ops import operator_pd_full
@@ -104,9 +105,9 @@ class MultivariateNormalOperatorPD(distribution.Distribution):
     which determines the covariance.
 
     Args:
-      mu: `float` or `double` tensor with shape `[N1,...,Nb, k]`, `b >= 0`.
-      cov: `float` or `double` instance of `OperatorPDBase` with same `dtype`
-        as `mu` and shape `[N1,...,Nb, k, k]`.
+      mu: Floating point tensor with shape `[N1,...,Nb, k]`, `b >= 0`.
+      cov: Instance of `OperatorPDBase` with same `dtype` as `mu` and shape
+        `[N1,...,Nb, k, k]`.
       validate_args: Whether to validate input with asserts.  If `validate_args`
         is `False`, and the inputs are invalid, correct behavior is not
         guaranteed.
@@ -149,7 +150,7 @@ class MultivariateNormalOperatorPD(distribution.Distribution):
       else:
         return mu
 
-    # Static checks could not be run, so possibly do dyamic checks.
+    # Static checks could not be run, so possibly do dynamic checks.
     if not self.validate_args:
       return mu
     else:
@@ -465,7 +466,7 @@ class MultivariateNormalDiag(MultivariateNormalOperatorPD):
     The mean of `X_i` is `mu[i]`, and the standard deviation is `diag_stdev[i]`.
 
     Args:
-      mu:  Rank `N + 1` `float` or `double` tensor with shape `[N1,...,Nb, k]`,
+      mu:  Rank `N + 1` floating point tensor with shape `[N1,...,Nb, k]`,
         `b >= 0`.
       diag_stdev: Rank `N + 1` `Tensor` with same `dtype` and shape as `mu`,
         representing the standard deviations.  Must be positive.
@@ -580,13 +581,13 @@ class MultivariateNormalDiagPlusVDVT(MultivariateNormalOperatorPD):
     ```
 
     Args:
-      mu:  Rank `n + 1` `float` or `double` tensor with shape `[N1,...,Nn, k]`,
+      mu:  Rank `n + 1` floating point tensor with shape `[N1,...,Nn, k]`,
         `n >= 0`.  The means.
-      diag_large:  Optional rank `n + 1` `float` or `double` tensor, shape
+      diag_large:  Optional rank `n + 1` floating point tensor, shape
         `[N1,...,Nn, k]` `n >= 0`.  Defines the diagonal matrix `M`.
-      v:  Rank `n + 1` `float` or `double` tensor, shape `[N1,...,Nn, k, r]`
+      v:  Rank `n + 1` floating point tensor, shape `[N1,...,Nn, k, r]`
         `n >= 0`.  Defines the matrix `V`.
-      diag_small:  Rank `n + 1` `float` or `double` tensor, shape
+      diag_small:  Rank `n + 1` floating point tensor, shape
         `[N1,...,Nn, k]` `n >= 0`.  Defines the diagonal matrix `D`.  Default
         is `None`, which means `D` will be the identity matrix.
       validate_args: Whether to validate input with asserts.  If `validate_args`
@@ -669,7 +670,7 @@ class MultivariateNormalCholesky(MultivariateNormalOperatorPD):
     factors, such that the covariance of each batch member is `chol chol^T`.
 
     Args:
-      mu: `(N+1)-D`  `float` or `double` tensor with shape `[N1,...,Nb, k]`,
+      mu: `(N+1)-D` floating point tensor with shape `[N1,...,Nb, k]`,
         `b >= 0`.
       chol: `(N+2)-D` `Tensor` with same `dtype` as `mu` and shape
         `[N1,...,Nb, k, k]`.  The upper triangular part is ignored (treated as
@@ -749,7 +750,7 @@ class MultivariateNormalFull(MultivariateNormalOperatorPD):
     User must provide means `mu` and `sigma`, the mean and covariance.
 
     Args:
-      mu: `(N+1)-D`  `float` or `double` tensor with shape `[N1,...,Nb, k]`,
+      mu: `(N+1)-D` floating point tensor with shape `[N1,...,Nb, k]`,
         `b >= 0`.
       sigma: `(N+2)-D` `Tensor` with same `dtype` as `mu` and shape
         `[N1,...,Nb, k, k]`.  Each batch member must be positive definite.
@@ -772,3 +773,72 @@ class MultivariateNormalFull(MultivariateNormalOperatorPD):
         allow_nan_stats=allow_nan_stats,
         validate_args=validate_args,
         name=name)
+
+
+def _kl_mvn_mvn_brute_force(mvn_a, mvn_b, name=None):
+  """Batched KL divergence `KL(mvn_a || mvn_b)` for multivariate normals.
+
+  With `X`, `Y` both multivariate normals in `R^k` with means `mu_x`, `mu_y` and
+  covariance `C_x`, `C_y` respectively,
+
+  ```
+  KL(X || Y) = 0.5 * ( T + Q + - k + L ),
+  T := trace(C_b^{-1} C_a),
+  Q := (mu_b - mu_a)^T C_b^{-1} (mu_b - mu_a),
+  L := Log[Det(C_b)] - Log[Det(C_a)]
+  ```
+
+  This `Op` computes the trace by solving `C_b^{-1} C_a`.  Although efficient
+  methods for solving systems with `C_b` may be available, a dense version of
+  (the square root of) `C_a` is used, so performance is `O(B s k^2)` where `B`
+  is the batch size, and `s` is the cost of solving `C_b x = y` for vectors `x`
+  and `y`.
+
+  Args:
+    mvn_a:  Instance of subclass of `MultivariateNormalOperatorPD`.
+    mvn_b:  Instance of subclass of `MultivariateNormalOperatorPD`.
+    name:  (optional) name to use for created ops.  Default "kl_mvn_mvn".
+
+  Returns:
+    Batchwise `KL(mvn_a || mvn_b)`.
+  """
+  # Access the "private" OperatorPD that each mvn is built from.
+  cov_a = mvn_a._cov  # pylint: disable=protected-access
+  cov_b = mvn_b._cov  # pylint: disable=protected-access
+  mu_a = mvn_a.mu
+  mu_b = mvn_b.mu
+  inputs = [mu_a, mu_b] + cov_a.inputs + cov_b.inputs
+
+  with ops.op_scope(inputs, name, "kl_mvn_mvn"):
+    # If Ca = AA', Cb = BB', then
+    # tr[inv(Cb) Ca] = tr[inv(B)' inv(B) A A']
+    #                = tr[inv(B) A A' inv(B)']
+    #                = tr[(inv(B) A) (inv(B) A)']
+    #                = sum_{ik} (inv(B) A)_{ik}^2
+    # The second equality follows from the cyclic permutation property.
+    b_inv_a = cov_b.sqrt_solve(cov_a.sqrt_to_dense())
+    t = math_ops.reduce_sum(
+        math_ops.square(b_inv_a),
+        reduction_indices=[-1, -2])
+    q = cov_b.inv_quadratic_form_on_vectors(mu_b - mu_a)
+    k = math_ops.cast(cov_a.vector_space_dimension(), mvn_a.dtype)
+    one_half_l = cov_b.sqrt_log_det() - cov_a.sqrt_log_det()
+    return 0.5 * (t + q - k) + one_half_l
+
+
+# Register KL divergences.
+kl_classes = [
+    MultivariateNormalFull,
+    MultivariateNormalCholesky,
+    MultivariateNormalDiag,
+    MultivariateNormalDiagPlusVDVT,
+]
+
+
+for mvn_aa in kl_classes:
+  # Register when they are the same here, and do not register when they are the
+  # same below because that would result in a repeated registration.
+  kullback_leibler.RegisterKL(mvn_aa, mvn_aa)(_kl_mvn_mvn_brute_force)
+  for mvn_bb in kl_classes:
+    if mvn_bb != mvn_aa:
+      kullback_leibler.RegisterKL(mvn_aa, mvn_bb)(_kl_mvn_mvn_brute_force)
diff --git a/tensorflow/contrib/distributions/python/ops/normal.py b/tensorflow/contrib/distributions/python/ops/normal.py
index dff8c7fdbbe..182afa31f7f 100644
--- a/tensorflow/contrib/distributions/python/ops/normal.py
+++ b/tensorflow/contrib/distributions/python/ops/normal.py
@@ -92,15 +92,15 @@ class Normal(distribution.Distribution):
     broadcasting (e.g. `mu + sigma` is a valid operation).
 
     Args:
-      mu: `float` or `double` tensor, the means of the distribution(s).
-      sigma: `float` or `double` tensor, the stddevs of the distribution(s).
+      mu: Floating point tensor, the means of the distribution(s).
+      sigma: Floating point tensor, the stddevs of the distribution(s).
         sigma must contain only positive values.
       validate_args: Whether to assert that `sigma > 0`. If `validate_args` is
-        False, correct output is not guaranteed when input is invalid.
-      allow_nan_stats:  Boolean, default False.  If False, raise an exception if
-        a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-        If True, batch members with valid parameters leading to undefined
-        statistics will return NaN for this statistic.
+        `False`, correct output is not guaranteed when input is invalid.
+      allow_nan_stats:  Boolean, default `False`.  If `False`, raise an
+        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+        batch member.  If `True`, batch members with valid parameters leading to
+        undefined statistics will return NaN for this statistic.
       name: The name to give Ops created by the initializer.
 
     Raises:
@@ -321,8 +321,7 @@ class Normal(distribution.Distribution):
       with ops.op_scope([self._mu, self._sigma, n], name):
         broadcast_shape = (self._mu + self._sigma).get_shape()
         n = ops.convert_to_tensor(n)
-        shape = array_ops.concat(
-            0, [array_ops.pack([n]), array_ops.shape(self.mean())])
+        shape = array_ops.concat(0, ([n], array_ops.shape(self.mean())))
         sampled = random_ops.random_normal(
             shape=shape, mean=0, stddev=1, dtype=self._mu.dtype, seed=seed)
 
diff --git a/tensorflow/contrib/distributions/python/ops/student_t.py b/tensorflow/contrib/distributions/python/ops/student_t.py
index e5fa624ddc4..8e43c95b6db 100644
--- a/tensorflow/contrib/distributions/python/ops/student_t.py
+++ b/tensorflow/contrib/distributions/python/ops/student_t.py
@@ -82,6 +82,7 @@ class StudentT(distribution.Distribution):
   # returning a length 2 tensor.
   dist.pdf(3.0)
   ```
+
   """
 
   def __init__(self,
@@ -99,19 +100,19 @@ class StudentT(distribution.Distribution):
     broadcasting (e.g. `df + mu + sigma` is a valid operation).
 
     Args:
-      df: `float` or `double` tensor, the degrees of freedom of the
+      df: Floating point tensor, the degrees of freedom of the
         distribution(s). `df` must contain only positive values.
-      mu: `float` or `double` tensor, the means of the distribution(s).
-      sigma: `float` or `double` tensor, the scaling factor for the
+      mu: Floating point tensor, the means of the distribution(s).
+      sigma: Floating point tensor, the scaling factor for the
         distribution(s). `sigma` must contain only positive values.
         Note that `sigma` is not the standard deviation of this distribution.
       validate_args: Whether to assert that `df > 0, sigma > 0`. If
-        `validate_args` is False and inputs are invalid, correct behavior is not
-        guaranteed.
-      allow_nan_stats:  Boolean, default False.  If False, raise an exception if
-        a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-        If True, batch members with valid parameters leading to undefined
-        statistics will return NaN for this statistic.
+        `validate_args` is `False` and inputs are invalid, correct behavior is
+        not guaranteed.
+      allow_nan_stats:  Boolean, default `False`.  If `False`, raise an
+        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+        batch member.  If `True`, batch members with valid parameters leading to
+        undefined statistics will return NaN for this statistic.
       name: The name to give Ops created by the initializer.
 
     Raises:
@@ -185,9 +186,12 @@ class StudentT(distribution.Distribution):
           nan = np.nan + self._zeros()
           return math_ops.select(df_gt_1, result_if_defined, nan)
         else:
-          one = ops.convert_to_tensor(1.0, dtype=self.dtype)
+          one = constant_op.constant(1.0, dtype=self.dtype)
           return control_flow_ops.with_dependencies(
-              [check_ops.assert_less(one, self._df)], result_if_defined)
+              [check_ops.assert_less(
+                  one, self._df,
+                  message="mean not defined for components of df <= 1"
+              )], result_if_defined)
 
   def mode(self, name="mode"):
     with ops.name_scope(self.name):
@@ -232,9 +236,12 @@ class StudentT(distribution.Distribution):
               result_where_defined,
               self._zeros() + np.nan)
         else:
-          one = ops.convert_to_tensor(1.0, self.dtype)
+          one = constant_op.constant(1.0, dtype=self.dtype)
           return control_flow_ops.with_dependencies(
-              [check_ops.assert_less(one, self._df)], result_where_defined)
+              [check_ops.assert_less(
+                  one, self._df,
+                  message="variance not defined for components of df <= 1"
+              )], result_where_defined)
 
   def std(self, name="std"):
     with ops.name_scope(self.name):
@@ -348,8 +355,7 @@ class StudentT(distribution.Distribution):
         # Let X = R*cos(theta), and let Y = R*sin(theta).
         # Then X ~ t_df and Y ~ t_df.
         # The variates X and Y are not independent.
-        shape = array_ops.concat(0, [array_ops.pack([2, n]),
-                                     self.batch_shape()])
+        shape = array_ops.concat(0, ([2, n], self.batch_shape()))
         uniform = random_ops.random_uniform(shape=shape,
                                             dtype=self.dtype,
                                             seed=seed)
diff --git a/tensorflow/contrib/distributions/python/ops/transformed_distribution.py b/tensorflow/contrib/distributions/python/ops/transformed_distribution.py
index 185741b2176..82971301560 100644
--- a/tensorflow/contrib/distributions/python/ops/transformed_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/transformed_distribution.py
@@ -57,6 +57,7 @@ class TransformedDistribution(distribution.Distribution):
     name="LogitNormalTransformedDistribution"
   )
   ```
+
   """
 
   def __init__(self,
diff --git a/tensorflow/contrib/distributions/python/ops/uniform.py b/tensorflow/contrib/distributions/python/ops/uniform.py
index eb196a3ea91..09437d36d16 100644
--- a/tensorflow/contrib/distributions/python/ops/uniform.py
+++ b/tensorflow/contrib/distributions/python/ops/uniform.py
@@ -67,14 +67,14 @@ class Uniform(distribution.Distribution):
     ```
 
     Args:
-      a: `float` or `double` tensor, the minimum endpoint.
-      b: `float` or `double` tensor, the maximum endpoint. Must be > `a`.
-      validate_args: Whether to assert that `a > b`. If `validate_args` is False
-        and inputs are invalid, correct behavior is not guaranteed.
-      allow_nan_stats:  Boolean, default False.  If False, raise an exception if
-        a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-        If True, batch members with valid parameters leading to undefined
-        statistics will return NaN for this statistic.
+      a: Floating point tensor, the minimum endpoint.
+      b: Floating point tensor, the maximum endpoint. Must be > `a`.
+      validate_args: Whether to assert that `a > b`. If `validate_args` is
+        `False` and inputs are invalid, correct behavior is not guaranteed.
+      allow_nan_stats:  Boolean, default `False`.  If `False`, raise an
+        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+        batch member.  If `True`, batch members with valid parameters leading to
+        undefined statistics will return NaN for this statistic.
       name: The name to prefix Ops created by this distribution class.
 
     Raises:
@@ -83,8 +83,9 @@ class Uniform(distribution.Distribution):
     self._allow_nan_stats = allow_nan_stats
     self._validate_args = validate_args
     with ops.op_scope([a, b], name):
-      with ops.control_dependencies([check_ops.assert_less(a, b)] if
-                                    validate_args else []):
+      with ops.control_dependencies([check_ops.assert_less(
+          a, b, message="uniform not defined when a > b.")] if validate_args
+                                    else []):
         a = array_ops.identity(a, name="a")
         b = array_ops.identity(b, name="b")
 
@@ -228,7 +229,7 @@ class Uniform(distribution.Distribution):
         n = ops.convert_to_tensor(n, name="n")
         n_val = tensor_util.constant_value(n)
 
-        shape = array_ops.concat(0, [array_ops.pack([n]), self.batch_shape()])
+        shape = array_ops.concat(0, ([n], self.batch_shape()))
         samples = random_ops.random_uniform(shape=shape,
                                             dtype=self.dtype,
                                             seed=seed)
diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index 2e7b547b308..14c7258c4a4 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -94,6 +94,30 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "gmm_test",
+    srcs = [
+        "python/ops/gmm_test.py",
+    ],
+    additional_deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+tf_py_test(
+    name = "gmm_ops_test",
+    srcs = [
+        "python/ops/gmm_ops_test.py",
+    ],
+    additional_deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_py_test(
     name = "factorization_ops_test",
     srcs = ["python/ops/factorization_ops_test.py"],
diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py b/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py
index 5a6bbec4b0d..655fb57a3ec 100644
--- a/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py
+++ b/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py
@@ -304,7 +304,7 @@ class WalsModelTest(tf.test.TestCase):
       col_factors2 = [x.eval() for x in wals_model.col_factors]
 
       for c1, c2 in zip(col_factors1, col_factors2):
-        self.assertAllClose(c1, c2, atol=1e-3)
+        self.assertAllClose(c1, c2, rtol=5e-3, atol=1e-2)
 
   def test_als_transposed(self):
     with self.test_session():
@@ -383,7 +383,7 @@ class WalsModelTest(tf.test.TestCase):
                                           regularization=1e-5,
                                           row_weights=None,
                                           col_weights=None)
-      self.simple_train(model, inp, 15)
+      self.simple_train(model, inp, 25)
       row_factor = model.row_factors[0].eval()
       col_factor = model.col_factors[0].eval()
       self.assertAllClose(data,
@@ -407,7 +407,7 @@ class WalsModelTest(tf.test.TestCase):
                                           regularization=1e-5,
                                           row_weights=[0] * rows,
                                           col_weights=[0] * cols)
-      self.simple_train(model, inp, 15)
+      self.simple_train(model, inp, 25)
       row_factor = model.row_factors[0].eval()
       col_factor = model.col_factors[0].eval()
       self.assertAllClose(data,
@@ -438,7 +438,7 @@ class WalsModelTest(tf.test.TestCase):
                                           regularization=0.001,
                                           row_weights=row_wts,
                                           col_weights=col_wts)
-      self.simple_train(model, inp, 10)
+      self.simple_train(model, inp, 25)
       row_factor = model.row_factors[0].eval()
       col_factor = model.col_factors[0].eval()
       out = np.dot(row_factor, np.transpose(col_factor))
@@ -446,7 +446,7 @@ class WalsModelTest(tf.test.TestCase):
         for j in xrange(cols):
           if keep_index([i, j]):
             self.assertNear(data[i][j], out[i][j],
-                            err=0.2, msg="%d, %d" % (i, j))
+                            err=0.4, msg="%d, %d" % (i, j))
           else:
             self.assertNear(0, out[i][j], err=0.5, msg="%d, %d" % (i, j))
 
diff --git a/tensorflow/contrib/factorization/python/ops/gmm.py b/tensorflow/contrib/factorization/python/ops/gmm.py
new file mode 100644
index 00000000000..c23a8cb30ed
--- /dev/null
+++ b/tensorflow/contrib/factorization/python/ops/gmm.py
@@ -0,0 +1,211 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Implementation of Gaussian mixture model (GMM) clustering.
+
+This goes on top of skflow API.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+import tensorflow as tf
+
+from tensorflow.contrib.factorization.python.ops import gmm_ops
+from tensorflow.contrib.learn.python.learn.estimators import estimator
+from tensorflow.contrib.learn.python.learn.estimators._sklearn import TransformerMixin
+from tensorflow.contrib.learn.python.learn.learn_io import data_feeder
+from tensorflow.contrib.learn.python.learn.utils import checkpoints
+from tensorflow.python.ops.control_flow_ops import with_dependencies
+
+
+class GMM(estimator.Estimator, TransformerMixin):
+  """GMM clustering."""
+  SCORES = 'scores'
+  ASSIGNMENTS = 'assignments'
+  ALL_SCORES = 'all_scores'
+
+  def __init__(self,
+               num_clusters,
+               model_dir=None,
+               random_seed=0,
+               params='wmc',
+               initial_clusters='random',
+               covariance_type='full',
+               batch_size=128,
+               steps=10,
+               continue_training=False,
+               config=None,
+               verbose=1):
+    """Creates a model for running GMM training and inference.
+
+    Args:
+      num_clusters: number of clusters to train.
+      model_dir: the directory to save the model results and log files.
+      random_seed: Python integer. Seed for PRNG used to initialize centers.
+      params: Controls which parameters are updated in the training process.
+        Can contain any combination of "w" for weights, "m" for means,
+        and "c" for covars.
+      initial_clusters: specifies how to initialize the clusters for training.
+        See gmm_ops.gmm for the possible values.
+      covariance_type: one of "full", "diag".
+      batch_size: See TensorFlowEstimator
+      steps: See TensorFlowEstimator
+      continue_training: See TensorFlowEstimator
+      config: See TensorFlowEstimator
+      verbose: See TensorFlowEstimator
+    """
+    super(GMM, self).__init__(
+        model_dir=model_dir,
+        config=config)
+    self.batch_size = batch_size
+    self.steps = steps
+    self.continue_training = continue_training
+    self.verbose = verbose
+    self._num_clusters = num_clusters
+    self._params = params
+    self._training_initial_clusters = initial_clusters
+    self._covariance_type = covariance_type
+    self._training_graph = None
+    self._random_seed = random_seed
+
+  def fit(self, x, y=None, monitors=None, logdir=None, steps=None):
+    """Trains a GMM clustering on x.
+
+    Note: See TensorFlowEstimator for logic for continuous training and graph
+      construction across multiple calls to fit.
+
+    Args:
+      x: training input matrix of shape [n_samples, n_features].
+      y: labels. Should be None.
+      monitors: List of `Monitor` objects to print training progress and
+        invoke early stopping.
+      logdir: the directory to save the log file that can be used for optional
+        visualization.
+      steps: number of training steps. If not None, overrides the value passed
+        in constructor.
+
+    Returns:
+      Returns self.
+    """
+    if logdir is not None:
+      self._model_dir = logdir
+    self._data_feeder = data_feeder.setup_train_data_feeder(
+        x, None, self._num_clusters, self.batch_size)
+    self._train_model(input_fn=self._data_feeder.input_builder,
+                      feed_fn=self._data_feeder.get_feed_dict_fn(),
+                      steps=steps or self.steps,
+                      monitors=monitors,
+                      init_feed_fn=self._data_feeder.get_feed_dict_fn())
+    return self
+
+  def predict(self, x, batch_size=None):
+    """Predict cluster id for each element in x.
+
+    Args:
+      x: 2-D matrix or iterator.
+      batch_size: size to use for batching up x for querying the model.
+
+    Returns:
+      Array with same number of rows as x, containing cluster ids.
+    """
+    return super(GMM, self).predict(x=x, batch_size=batch_size)[GMM.ASSIGNMENTS]
+
+  def score(self, x, batch_size=None):
+    """Predict total sum of distances to nearest clusters.
+
+    Args:
+      x: 2-D matrix or iterator.
+      batch_size: size to use for batching up x for querying the model.
+
+    Returns:
+      Total score.
+    """
+    return np.sum(self.evaluate(x=x, batch_size=batch_size)[GMM.SCORES])
+
+  def transform(self, x, batch_size=None):
+    """Transforms each element in x to distances to cluster centers.
+
+    Args:
+      x: 2-D matrix or iterator.
+      batch_size: size to use for batching up x for querying the model.
+
+    Returns:
+      Array with same number of rows as x, and num_clusters columns, containing
+      distances to the cluster centers.
+    """
+    return super(GMM, self).predict(x=x, batch_size=batch_size)[GMM.ALL_SCORES]
+
+  def clusters(self):
+    """Returns cluster centers."""
+    clusters = checkpoints.load_variable(self.model_dir,
+                                         gmm_ops.GmmAlgorithm.CLUSTERS_VARIABLE)
+    return np.squeeze(clusters, 1)
+
+  def covariances(self):
+    """Returns the covariances."""
+    return checkpoints.load_variable(
+        self.model_dir,
+        gmm_ops.GmmAlgorithm.CLUSTERS_COVS_VARIABLE)
+
+  def _get_train_ops(self, features, _):
+    (_,
+     _,
+     losses,
+     training_op) = gmm_ops.gmm(
+         features,
+         self._training_initial_clusters,
+         self._num_clusters,
+         self._random_seed,
+         self._covariance_type,
+         self._params)
+    incr_step = tf.assign_add(tf.contrib.framework.get_global_step(), 1)
+    loss = tf.reduce_sum(losses)
+    training_op = with_dependencies([training_op, incr_step], loss)
+    return training_op, loss
+
+  def _get_predict_ops(self, features):
+    (all_scores,
+     model_predictions,
+     _,
+     _) = gmm_ops.gmm(
+         features,
+         self._training_initial_clusters,
+         self._num_clusters,
+         self._random_seed,
+         self._covariance_type,
+         self._params)
+    return {
+        GMM.ALL_SCORES: all_scores[0],
+        GMM.ASSIGNMENTS: model_predictions[0]
+    }
+
+  def _get_eval_ops(self, features, _, unused_metrics):
+    (_,
+     _,
+     losses,
+     _) = gmm_ops.gmm(
+         features,
+         self._training_initial_clusters,
+         self._num_clusters,
+         self._random_seed,
+         self._covariance_type,
+         self._params)
+    return {
+        GMM.SCORES: tf.reduce_sum(losses),
+    }
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
new file mode 100644
index 00000000000..e9a64efe2a5
--- /dev/null
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -0,0 +1,461 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Gaussian mixture models Operations."""
+# TODO(xavigonzalvo): Factor out covariance matrix operations to make
+# code reusable for different types (e.g. diag).
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+from tensorflow.python.ops.embedding_ops import embedding_lookup
+
+# Machine epsilon.
+MEPS = np.finfo(float).eps
+FULL_COVARIANCE = 'full'
+DIAG_COVARIANCE = 'diag'
+
+
+def _covariance(x, diag):
+  """Defines the covariance operation of a matrix.
+
+  Args:
+    x: a matrix Tensor. Dimension 0 should contain the number of examples.
+    diag: if True, it computes the diagonal covariance.
+
+  Returns:
+    A Tensor representing the covariance of x. In the case of
+  diagonal matrix just the diagonal is returned.
+  """
+  num_points = tf.to_float(tf.shape(x)[0])
+  x -= tf.reduce_mean(x, 0, keep_dims=True)
+  if diag:
+    cov = tf.reduce_sum(
+        tf.square(x), 0, keep_dims=True) / (num_points - 1)
+  else:
+    cov = tf.matmul(x, x, transpose_a=True)  / (num_points - 1)
+  return cov
+
+
+def _init_clusters_random(data, num_clusters, random_seed):
+  """Does random initialization of clusters.
+
+  Args:
+    data: a list of Tensors with a matrix of data, each row is an example.
+    num_clusters: an integer with the number of clusters.
+    random_seed: Seed for PRNG used to initialize seeds.
+
+  Returns:
+    A Tensor with num_clusters random rows of data.
+  """
+  assert isinstance(data, list)
+  num_data = tf.add_n([tf.shape(inp)[0] for inp in data])
+  with tf.control_dependencies([tf.assert_less_equal(num_clusters, num_data)]):
+    indices = tf.random_uniform([num_clusters],
+                                minval=0,
+                                maxval=tf.cast(num_data, tf.int64),
+                                seed=random_seed,
+                                dtype=tf.int64)
+  indices = tf.cast(indices, tf.int32) % num_data
+  clusters_init = embedding_lookup(data, indices, partition_strategy='div')
+  return clusters_init
+
+
+class GmmAlgorithm(object):
+  """Tensorflow Gaussian mixture model clustering class."""
+  CLUSTERS_VARIABLE = 'clusters'
+  CLUSTERS_COVS_VARIABLE = 'clusters_covs'
+
+  def __init__(self, data, num_classes, initial_means=None, params='wmc',
+               covariance_type=FULL_COVARIANCE, random_seed=0):
+    """Constructor.
+
+    Args:
+      data: a list of Tensors with data, each row is a new example.
+      num_classes: number of clusters.
+      initial_means: a Tensor with a matrix of means. If None, means are
+        computed by sampling randomly.
+      params: Controls which parameters are updated in the training
+        process. Can contain any combination of "w" for weights, "m" for
+        means, and "c" for covariances.
+      covariance_type: one of "full", "diag".
+      random_seed: Seed for PRNG used to initialize seeds.
+
+    Raises:
+      Exception if covariance type is unknown.
+    """
+    self._params = params
+    self._random_seed = random_seed
+    self._covariance_type = covariance_type
+    if self._covariance_type not in [DIAG_COVARIANCE, FULL_COVARIANCE]:
+      raise Exception(  # pylint: disable=g-doc-exception
+          'programmer error: Invalid covariance type: %s' %
+          self._covariance_type)
+    # Create sharded variables for multiple shards. The following
+    # lists are indexed by shard.
+    # Probability per example in a class.
+    num_shards = len(data)
+    self._probs = [None] * num_shards
+    # Prior probability.
+    self._prior_probs = [None] * num_shards
+    # Membership weights w_{ik} where "i" is the i-th example and "k"
+    # is the k-th mixture.
+    self._w = [None] * num_shards
+    # Number of examples in a class.
+    self._points_in_k = [None] * num_shards
+    first_shard = data[0]
+    self._dimensions = tf.shape(first_shard)[1]
+    self._num_classes = num_classes
+    # Small value to guarantee that covariances are invertible.
+    self._min_var = tf.diag(tf.ones(tf.pack([self._dimensions]))) * 1e-3
+    self._create_variables(data, initial_means)
+    # Operations of partial statistics for the computation of the means.
+    self._w_mul_x = []
+    # Operations of partial statistics for the computation of the covariances.
+    self._w_mul_x2 = []
+    self._define_graph(data)
+
+  def _create_variables(self, data, initial_means=None):
+    """Initializes GMM algorithm.
+
+    Args:
+      data: a list of Tensors with data, each row is a new example.
+      initial_means: a Tensor with a matrix of means.
+    """
+    first_shard = data[0]
+    # Initialize means: num_classes X 1 X dimensions.
+    if initial_means is not None:
+      self._means = tf.Variable(tf.expand_dims(initial_means, 1),
+                                name=self.CLUSTERS_VARIABLE,
+                                validate_shape=False, dtype=tf.float32)
+    else:
+      # Sample data randomly
+      self._means = tf.Variable(tf.expand_dims(
+          _init_clusters_random(data, self._num_classes, self._random_seed), 1),
+                                name=self.CLUSTERS_VARIABLE,
+                                validate_shape=False)
+
+    # Initialize covariances.
+    if self._covariance_type == FULL_COVARIANCE:
+      cov = _covariance(first_shard, False) + self._min_var
+      # A matrix per class, num_classes X dimensions X dimensions
+      covs = tf.tile(
+          tf.expand_dims(cov, 0), [self._num_classes, 1, 1])
+    elif self._covariance_type == DIAG_COVARIANCE:
+      cov = _covariance(first_shard, True) + self._min_var
+      # A diagonal per row, num_classes X dimensions.
+      covs = tf.tile(tf.expand_dims(tf.diag_part(cov), 0),
+                     [self._num_classes, 1])
+    self._covs = tf.Variable(covs, name='clusters_covs', validate_shape=False)
+    # Mixture weights, representing the probability that a randomly
+    # selected unobservable data (in EM terms) was generated by component k.
+    self._alpha = tf.Variable(tf.tile([1.0 / self._num_classes],
+                                      [self._num_classes]))
+
+  def training_ops(self):
+    """Returns the training operation."""
+    return self._train_ops
+
+  def alphas(self):
+    return self._alpha
+
+  def clusters(self):
+    """Returns the clusters with dimensions num_classes X 1 X num_dimensions."""
+    return self._means
+
+  def covariances(self):
+    """Returns the covariances matrices."""
+    return self._covs
+
+  def assignments(self):
+    """Returns a list of Tensors with the matrix of assignments per shard."""
+    ret = []
+    for w in self._w:
+      ret.append(tf.argmax(w, 1))
+    return ret
+
+  def scores(self):
+    """Returns the distances to each class.
+
+    Returns:
+      A tuple with two Tensors. The first contains the distance to
+    each class. The second contains the distance to the assigned
+    class.
+    """
+    return (self._all_scores, self._scores)
+
+  def _define_graph(self, data):
+    """Define graph for a single iteration.
+
+    Args:
+      data: a list of Tensors defining the training data.
+    """
+    for shard_id, shard in enumerate(data):
+      self._num_examples = tf.shape(shard)[0]
+      shard = tf.expand_dims(shard, 0)
+      self._define_log_prob_operation(shard_id, shard)
+      self._define_prior_log_prob_operation(shard_id)
+      self._define_expectation_operation(shard_id)
+      self._define_partial_maximization_operation(shard_id, shard)
+    self._define_maximization_operation(len(data))
+    self._define_distance_to_clusters(data)
+
+  def _define_full_covariance_probs(self, shard_id, shard):
+    """Defines the full covariance probabilties per example in a class.
+
+    Updates a matrix with dimension num_examples X num_classes.
+
+    Args:
+      shard_id: id of the current shard.
+      shard: current data shard, 1 X num_examples X dimensions.
+    """
+    diff = shard - self._means
+    cholesky = tf.batch_cholesky(self._covs + self._min_var)
+    log_det_covs = 2.0 * tf.reduce_sum(tf.log(
+        tf.batch_matrix_diag_part(cholesky)), 1)
+    x_mu_cov = tf.square(tf.batch_matrix_triangular_solve(
+        cholesky, tf.transpose(diff, perm=[0, 2, 1]),
+        lower=True))
+    diag_m = tf.transpose(tf.reduce_sum(x_mu_cov, 1))
+    self._probs[shard_id] = -0.5 * (
+        diag_m + tf.to_float(self._dimensions) * tf.log(2 * np.pi) +
+        log_det_covs)
+
+  def _define_diag_covariance_probs(self, shard_id, shard):
+    """Defines the diagonal covariance probabilities per example in a class.
+
+    Args:
+      shard_id: id of the current shard.
+      shard: current data shard, 1 X num_examples X dimensions.
+
+    Returns a matrix num_examples * num_classes.
+    """
+    # num_classes X 1
+    # TODO(xavigonzalvo): look into alternatives to log for
+    # reparametrization of variance parameters.
+    det_expanded = tf.reduce_sum(tf.log(self._covs + 1e-3),
+                                 1, keep_dims=True)
+    diff = shard - self._means
+    x2 = tf.square(diff)
+    cov_expanded = tf.expand_dims(1.0 / (self._covs + 1e-3), 2)
+    # num_classes X num_examples
+    x2_cov = tf.batch_matmul(x2, cov_expanded)
+    x2_cov = tf.transpose(tf.squeeze(x2_cov, [2]))
+    self._probs[shard_id] = -0.5 * (
+        tf.to_float(self._dimensions) * tf.log(2.0 * np.pi) +
+        tf.transpose(det_expanded) + x2_cov)
+
+  def _define_log_prob_operation(self, shard_id, shard):
+    """Probability per example in a class.
+
+    Updates a matrix with dimension num_examples X num_classes.
+
+    Args:
+      shard_id: id of the current shard.
+      shard: current data shard, 1 X num_examples X dimensions.
+    """
+    # TODO(xavigonzalvo): Use the pdf defined in
+    # third_party/tensorflow/contrib/distributions/python/ops/gaussian.py
+    if self._covariance_type == FULL_COVARIANCE:
+      self._define_full_covariance_probs(shard_id, shard)
+    elif self._covariance_type == DIAG_COVARIANCE:
+      self._define_diag_covariance_probs(shard_id, shard)
+    self._probs[shard_id] += tf.log(self._alpha)
+
+  def _define_prior_log_prob_operation(self, shard_id):
+    """Computes the prior probability of all samples.
+
+    Updates a vector where each item is the prior probabibility of an
+    input example.
+
+    Args:
+      shard_id: id of current shard_id.
+    """
+    self._prior_probs[shard_id] = tf.log(
+        tf.reduce_sum(tf.exp(self._probs[shard_id]), 1, keep_dims=True))
+
+  def _define_expectation_operation(self, shard_id):
+    # Shape broadcasting.
+    probs = tf.expand_dims(self._probs[shard_id], 0)
+    # Membership weights are computed as:
+    # w_{ik} = \frac{\alpha_k f(\mathbf{y_i}|\mathbf{\theta}_k)}
+    #               {\sum_{m=1}^{K}\alpha_mf(\mathbf{y_i}|\mathbf{\theta}_m)}
+    # where "i" is the i-th example, "k" is the k-th mixture, theta are
+    # the model parameters and y_i the observations.
+    # These are defined for each shard.
+    self._w[shard_id] = tf.reshape(
+        tf.exp(probs - self._prior_probs[shard_id]),
+        tf.pack([self._num_examples, self._num_classes]))
+
+  def _define_partial_maximization_operation(self, shard_id, shard):
+    """Computes the partial statistics of the means and covariances.
+
+    Args:
+      shard_id: current shard id.
+      shard: current data shard, 1 X num_examples X dimensions.
+    """
+    # Soft assignment of each data point to each of the two clusters.
+    self._points_in_k[shard_id] = tf.reduce_sum(self._w[shard_id], 0,
+                                                keep_dims=True)
+    # Partial means.
+    w_mul_x = tf.expand_dims(
+        tf.matmul(self._w[shard_id],
+                  tf.squeeze(shard, [0]), transpose_a=True), 1)
+    self._w_mul_x.append(w_mul_x)
+    # Partial covariances.
+    x = tf.concat(0, [shard for _ in range(self._num_classes)])
+    x_trans = tf.transpose(x, perm=[0, 2, 1])
+    x_mul_w = tf.concat(0, [
+        tf.expand_dims(x_trans[k, :, :] * self._w[shard_id][:, k], 0)
+        for k in range(self._num_classes)])
+    self._w_mul_x2.append(tf.batch_matmul(x_mul_w, x))
+
+  def _define_maximization_operation(self, num_batches):
+    """Maximization operations."""
+    # TODO(xavigonzalvo): some of these operations could be moved to C++.
+    # Compute the effective number of data points assigned to component k.
+    with tf.control_dependencies(self._w):
+      points_in_k = tf.squeeze(tf.add_n(self._points_in_k), squeeze_dims=[0])
+      # Update alpha.
+      if 'w' in self._params:
+        final_points_in_k = points_in_k / num_batches
+        num_examples = tf.to_float(tf.reduce_sum(final_points_in_k))
+        self._alpha_op = self._alpha.assign(
+            final_points_in_k / (num_examples + MEPS))
+      else:
+        self._alpha_op = tf.no_op()
+      self._train_ops = [self._alpha_op]
+
+      # Update means.
+      points_in_k_expanded = tf.reshape(points_in_k,
+                                        [self._num_classes, 1, 1])
+      if 'm' in self._params:
+        self._means_op = self._means.assign(
+            tf.div(tf.add_n(self._w_mul_x), points_in_k_expanded + MEPS))
+      else:
+        self._means_op = tf.no_op()
+      # means are (num_classes x 1 x dims)
+
+      # Update covariances.
+      with tf.control_dependencies([self._means_op]):
+        b = tf.add_n(self._w_mul_x2) / (points_in_k_expanded + MEPS)
+        new_covs = []
+        for k in range(self._num_classes):
+          mean = self._means.ref()[k, :, :]
+          square_mean = tf.matmul(mean, mean, transpose_a=True)
+          new_cov = b[k, :, :] - square_mean + self._min_var
+          if self._covariance_type == FULL_COVARIANCE:
+            new_covs.append(tf.expand_dims(new_cov, 0))
+          elif self._covariance_type == DIAG_COVARIANCE:
+            new_covs.append(tf.expand_dims(tf.diag_part(new_cov), 0))
+        new_covs = tf.concat(0, new_covs)
+        if 'c' in self._params:
+          # Train operations don't need to take care of the means
+          # because covariances already depend on it.
+          with tf.control_dependencies([self._means_op, new_covs]):
+            self._train_ops.append(
+                tf.assign(self._covs, new_covs, validate_shape=False))
+
+  def _define_distance_to_clusters(self, data):
+    """Defines the Mahalanobis distance to the assigned Gaussian."""
+    # TODO(xavigonzalvo): reuse (input - mean) * cov^-1 * (input -
+    # mean) from log probability function.
+    self._all_scores = []
+    for shard in data:
+      all_scores = []
+      shard = tf.expand_dims(shard, 0)
+      for c in xrange(self._num_classes):
+        if self._covariance_type == FULL_COVARIANCE:
+          cov = self._covs[c, :, :]
+        elif self._covariance_type == DIAG_COVARIANCE:
+          cov = tf.diag(self._covs[c, :])
+        inverse = tf.matrix_inverse(cov + self._min_var)
+        inv_cov = tf.tile(
+            tf.expand_dims(inverse, 0),
+            tf.pack([self._num_examples, 1, 1]))
+        diff = tf.transpose(shard - self._means[c, :, :], perm=[1, 0, 2])
+        m_left = tf.batch_matmul(diff, inv_cov)
+        all_scores.append(tf.sqrt(tf.batch_matmul(
+            m_left, tf.transpose(diff, perm=[0, 2, 1])
+        )))
+      self._all_scores.append(tf.reshape(
+          tf.concat(1, all_scores),
+          tf.pack([self._num_examples, self._num_classes])))
+
+    # Distance to the associated class.
+    self._all_scores = tf.concat(0, self._all_scores)
+    assignments = tf.concat(0, self.assignments())
+    rows = tf.to_int64(tf.range(0, self._num_examples))
+    indices = tf.concat(1, [tf.expand_dims(rows, 1),
+                            tf.expand_dims(assignments, 1)])
+    self._scores = tf.gather_nd(self._all_scores, indices)
+
+  def _define_loglikelihood_operation(self):
+    """Defines the total log-likelihood of current iteration."""
+    self._ll_op = []
+    for prior_probs in self._prior_probs:
+      self._ll_op.append(tf.reduce_sum(tf.log(prior_probs)))
+    tf.scalar_summary('ll', tf.reduce_sum(self._ll_op))
+
+
+def gmm(inp, initial_clusters, num_clusters, random_seed,
+        covariance_type=FULL_COVARIANCE, params='wmc'):
+  """Creates the graph for Gaussian mixture model (GMM) clustering.
+
+  Args:
+    inp: An input tensor or list of input tensors
+    initial_clusters: Specifies the clusters used during
+      initialization. Can be a tensor or numpy array, or a function
+      that generates the clusters. Can also be "random" to specify
+      that clusters should be chosen randomly from input data. Note: type
+      is diverse to be consistent with skflow.
+    num_clusters: number of clusters.
+    random_seed: Python integer. Seed for PRNG used to initialize centers.
+    covariance_type: one of "diag", "full".
+    params: Controls which parameters are updated in the training
+      process. Can contain any combination of "w" for weights, "m" for
+      means, and "c" for covars.
+
+  Returns:
+    Note: tuple of lists returned to be consistent with skflow
+    A tuple consisting of:
+    all_scores: A matrix (or list of matrices) of dimensions (num_input,
+      num_clusters) where the value is the distance of an input vector and a
+      cluster center.
+    assignments: A vector (or list of vectors). Each element in the vector
+      corresponds to an input row in 'inp' and specifies the cluster id
+      corresponding to the input.
+    scores: Similar to assignments but specifies the distance to the
+      assigned cluster instead.
+    training_op: an op that runs an iteration of training.
+  """
+  initial_means = None
+  if initial_clusters != 'random' and not isinstance(
+      initial_clusters, tf.Tensor):
+    initial_means = tf.constant(initial_clusters, dtype=tf.float32)
+
+  # Implementation of GMM.
+  inp = inp if isinstance(inp, list) else [inp]
+  gmm_tool = GmmAlgorithm(inp, num_clusters, initial_means, params,
+                          covariance_type, random_seed)
+  training_ops = gmm_tool.training_ops()
+  assignments = gmm_tool.assignments()
+  all_scores, scores = gmm_tool.scores()
+  return [all_scores], [assignments], [scores], tf.group(*training_ops)
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops_test.py b/tensorflow/contrib/factorization/python/ops/gmm_ops_test.py
new file mode 100644
index 00000000000..a1bc0dca7ba
--- /dev/null
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops_test.py
@@ -0,0 +1,198 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for gmm_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+from tensorflow.contrib.factorization.python.ops import gmm_ops
+from tensorflow.python.platform import tf_logging as logging
+
+
+class GmmOpsTest(tf.test.TestCase):
+
+  def setUp(self):
+    self.num_examples = 1000
+    self.iterations = 40
+    self.seed = 4
+    tf.set_random_seed(self.seed)
+    np.random.seed(self.seed * 2)
+    self.data, self.true_assignments = self.make_data(self.num_examples)
+    # Generate more complicated data.
+    self.centers = [[1, 1], [-1, 0.5], [2, 1]]
+    self.more_data, self.more_true_assignments = self.make_data_from_centers(
+        self.num_examples, self.centers)
+
+  @staticmethod
+  def make_data(num_vectors):
+    """Generates 2-dimensional data centered on (2,2), (-1,-1).
+
+    Args:
+      num_vectors: number of training examples.
+
+    Returns:
+      A tuple containing the data as a numpy array and the cluster ids.
+    """
+    vectors = []
+    classes = []
+    for _ in xrange(num_vectors):
+      if np.random.random() > 0.5:
+        vectors.append([np.random.normal(2.0, 0.6),
+                        np.random.normal(2.0, 0.9)])
+        classes.append(0)
+      else:
+        vectors.append([np.random.normal(-1.0, 0.4),
+                        np.random.normal(-1.0, 0.5)])
+        classes.append(1)
+    return np.asarray(vectors), classes
+
+  @staticmethod
+  def make_data_from_centers(num_vectors, centers):
+    """Generates 2-dimensional data with random centers.
+
+    Args:
+      num_vectors: number of training examples.
+      centers: a list of random 2-dimensional centers.
+
+    Returns:
+      A tuple containing the data as a numpy array and the cluster ids.
+    """
+    vectors = []
+    classes = []
+    for _ in xrange(num_vectors):
+      current_class = np.random.random_integers(0, len(centers) - 1)
+      vectors.append([np.random.normal(centers[current_class][0],
+                                       np.random.random_sample()),
+                      np.random.normal(centers[current_class][1],
+                                       np.random.random_sample())])
+      classes.append(current_class)
+    return np.asarray(vectors), len(centers)
+
+  def test_covariance(self):
+    start_time = time.time()
+    data = self.data.T
+    np_cov = np.cov(data)
+    logging.info('Numpy took %f', time.time() - start_time)
+
+    start_time = time.time()
+    with self.test_session() as sess:
+      op = gmm_ops._covariance(
+          tf.constant(data.T, dtype=tf.float32),
+          False)
+      op_diag = gmm_ops._covariance(
+          tf.constant(data.T, dtype=tf.float32),
+          True)
+      tf.initialize_all_variables().run()
+      tf_cov = sess.run(op)
+      np.testing.assert_array_almost_equal(np_cov, tf_cov)
+      logging.info('Tensorflow took %f', time.time() - start_time)
+      tf_cov = sess.run(op_diag)
+      np.testing.assert_array_almost_equal(
+          np.diag(np_cov), np.ravel(tf_cov), decimal=5)
+
+  def test_simple_cluster(self):
+    """Tests that the clusters are correct."""
+    num_classes = 2
+    graph = tf.Graph()
+    with graph.as_default() as g:
+      g.seed = 5
+      with self.test_session() as sess:
+        data = tf.constant(self.data, dtype=tf.float32)
+        _, assignments, _, training_op = gmm_ops.gmm(data, 'random',
+                                                     num_classes,
+                                                     random_seed=self.seed)
+
+        tf.initialize_all_variables().run()
+        for _ in xrange(self.iterations):
+          sess.run(training_op)
+        assignments = sess.run(assignments)
+        accuracy = np.mean(
+            np.asarray(self.true_assignments) == np.squeeze(assignments))
+        logging.info('Accuracy: %f', accuracy)
+        self.assertGreater(accuracy, 0.98)
+
+  def testParams(self):
+    """Tests that the params work as intended."""
+    num_classes = 2
+    with self.test_session() as sess:
+      # Experiment 1. Update weights only.
+      data = tf.constant(self.data, dtype=tf.float32)
+      gmm_tool = gmm_ops.GmmAlgorithm([data], num_classes,
+                                      [[3.0, 3.0], [0.0, 0.0]], 'w')
+      training_ops = gmm_tool.training_ops()
+      tf.initialize_all_variables().run()
+      for _ in xrange(self.iterations):
+        sess.run(training_ops)
+
+      # Only the probability to each class is updated.
+      alphas = sess.run(gmm_tool.alphas())
+      self.assertGreater(alphas[1], 0.6)
+      means = sess.run(gmm_tool.clusters())
+      np.testing.assert_almost_equal(
+          np.expand_dims([[3.0, 3.0], [0.0, 0.0]], 1), means)
+      covs = sess.run(gmm_tool.covariances())
+      np.testing.assert_almost_equal(covs[0], covs[1])
+
+      # Experiment 2. Update means and covariances.
+      gmm_tool = gmm_ops.GmmAlgorithm([data], num_classes,
+                                      [[3.0, 3.0], [0.0, 0.0]], 'mc')
+      training_ops = gmm_tool.training_ops()
+      tf.initialize_all_variables().run()
+      for _ in xrange(self.iterations):
+        sess.run(training_ops)
+      alphas = sess.run(gmm_tool.alphas())
+      self.assertAlmostEqual(alphas[0], alphas[1])
+      means = sess.run(gmm_tool.clusters())
+      np.testing.assert_almost_equal(
+          np.expand_dims([[2.0, 2.0], [-1.0, -1.0]], 1), means, decimal=1)
+      covs = sess.run(gmm_tool.covariances())
+      np.testing.assert_almost_equal(
+          [[0.371111, -0.0050774], [-0.0050774, 0.8651744]],
+          covs[0], decimal=4)
+      np.testing.assert_almost_equal(
+          [[0.146976, 0.0259463], [0.0259463, 0.2543971]],
+          covs[1], decimal=4)
+
+      # Experiment 3. Update covariances only.
+      gmm_tool = gmm_ops.GmmAlgorithm([data], num_classes,
+                                      [[-1.0, -1.0], [1.0, 1.0]], 'c')
+      training_ops = gmm_tool.training_ops()
+      tf.initialize_all_variables().run()
+      for _ in xrange(self.iterations):
+        sess.run(training_ops)
+      alphas = sess.run(gmm_tool.alphas())
+      self.assertAlmostEqual(alphas[0], alphas[1])
+      means = sess.run(gmm_tool.clusters())
+      np.testing.assert_almost_equal(
+          np.expand_dims([[-1.0, -1.0], [1.0, 1.0]], 1), means)
+      covs = sess.run(gmm_tool.covariances())
+      np.testing.assert_almost_equal(
+          [[0.1299582, 0.0435872], [0.0435872, 0.2558578]],
+          covs[0], decimal=5)
+      np.testing.assert_almost_equal(
+          [[3.195385, 2.6989155], [2.6989155, 3.3881593]],
+          covs[1], decimal=5)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_test.py b/tensorflow/contrib/factorization/python/ops/gmm_test.py
new file mode 100644
index 00000000000..323133e0dff
--- /dev/null
+++ b/tensorflow/contrib/factorization/python/ops/gmm_test.py
@@ -0,0 +1,172 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for ops.gmm."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+from tensorflow.contrib.factorization.python.ops.gmm import GMM
+from tensorflow.contrib.factorization.python.ops.kmeans import KMeansClustering as KMeans
+from tensorflow.contrib.learn.python.learn.estimators import run_config
+
+FLAGS = tf.app.flags.FLAGS
+
+
+class GMMTest(tf.test.TestCase):
+
+  def setUp(self):
+    np.random.seed(3)
+    tf.set_random_seed(2)
+    self.num_centers = 2
+    self.num_dims = 2
+    self.num_points = 4000
+    self.batch_size = 100
+    self.true_centers = self.make_random_centers(self.num_centers,
+                                                 self.num_dims)
+    self.points, self.assignments, self.scores = self.make_random_points(
+        self.true_centers,
+        self.num_points)
+    self.true_score = np.add.reduce(self.scores)
+
+    # Use initial means from kmeans (just like scikit-learn does).
+    clusterer = KMeans(num_clusters=self.num_centers)
+    clusterer.fit(self.points, steps=30)
+    self.initial_means = clusterer.clusters()
+
+  @staticmethod
+  def make_random_centers(num_centers, num_dims):
+    return np.round(np.random.rand(num_centers,
+                                   num_dims).astype(np.float32) * 500)
+
+  @staticmethod
+  def make_random_points(centers, num_points):
+    num_centers, num_dims = centers.shape
+    assignments = np.random.choice(num_centers, num_points)
+    offsets = np.round(np.random.randn(num_points,
+                                       num_dims).astype(np.float32) * 20)
+    points = centers[assignments] + offsets
+    means = [np.mean(points[assignments == center], axis=0)
+             for center in xrange(num_centers)]
+    covs = [np.cov(points[assignments == center].T)
+            for center in xrange(num_centers)]
+    scores = []
+    for r in xrange(num_points):
+      scores.append(np.sqrt(np.dot(
+          np.dot(points[r, :] - means[assignments[r]],
+                 np.linalg.inv(covs[assignments[r]])),
+          points[r, :] - means[assignments[r]])))
+    return (points, assignments, scores)
+
+  def test_clusters(self):
+    """Tests the shape of the clusters."""
+    gmm = GMM(self.num_centers,
+              initial_clusters=self.initial_means,
+              batch_size=self.batch_size,
+              steps=40,
+              continue_training=True,
+              random_seed=4,
+              config=run_config.RunConfig(tf_random_seed=2))
+    gmm.fit(x=self.points, steps=0)
+    clusters = gmm.clusters()
+    self.assertAllEqual(list(clusters.shape),
+                        [self.num_centers, self.num_dims])
+
+  def test_fit(self):
+    gmm = GMM(self.num_centers,
+              initial_clusters='random',
+              batch_size=self.batch_size,
+              random_seed=4,
+              config=run_config.RunConfig(tf_random_seed=2))
+    gmm.fit(x=self.points, steps=1)
+    score1 = gmm.score(x=self.points)
+    gmm = GMM(self.num_centers,
+              initial_clusters='random',
+              batch_size=self.batch_size,
+              random_seed=4,
+              config=run_config.RunConfig(tf_random_seed=2))
+    gmm.fit(x=self.points, steps=10)
+    score2 = gmm.score(x=self.points)
+    self.assertGreater(score1, score2)
+    self.assertNear(self.true_score, score2, self.true_score * 0.15)
+
+  def test_infer(self):
+    gmm = GMM(self.num_centers,
+              initial_clusters=self.initial_means,
+              batch_size=self.batch_size,
+              steps=40,
+              continue_training=True,
+              random_seed=4,
+              config=run_config.RunConfig(tf_random_seed=2))
+    gmm.fit(x=self.points, steps=60)
+    clusters = gmm.clusters()
+
+    # Make a small test set
+    points, true_assignments, true_offsets = (
+        self.make_random_points(clusters, 40))
+
+    assignments = np.ravel(gmm.predict(points))
+    self.assertAllEqual(true_assignments, assignments)
+
+    # Test score
+    score = gmm.score(points)
+    self.assertNear(score, np.sum(true_offsets), 4.05)
+
+  def _compare_with_sklearn(self, cov_type):
+    # sklearn version.
+    iterations = 40
+    np.random.seed(5)
+    sklearn_assignments = np.asarray([0, 0, 1, 0, 0, 0, 1, 0, 0, 1])
+    sklearn_means = np.asarray([[144.83417719, 254.20130341],
+                                [274.38754816, 353.16074346]])
+    sklearn_covs = np.asarray([[[395.0081194, -4.50389512],
+                                [-4.50389512, 408.27543989]],
+                               [[385.17484203, -31.27834935],
+                                [-31.27834935, 391.74249925]]])
+
+    # skflow version.
+    gmm = GMM(self.num_centers,
+              initial_clusters=self.initial_means,
+              covariance_type=cov_type,
+              batch_size=self.num_points,
+              steps=iterations,
+              continue_training=True,
+              config=run_config.RunConfig(tf_random_seed=2))
+    gmm.fit(self.points)
+    skflow_assignments = gmm.predict(self.points[:10, :]).astype(int)
+    self.assertAllClose(sklearn_assignments,
+                        np.ravel(skflow_assignments))
+    self.assertAllClose(sklearn_means, gmm.clusters())
+    if cov_type == 'full':
+      self.assertAllClose(sklearn_covs, gmm.covariances(), rtol=0.01)
+    else:
+      for d in [0, 1]:
+        self.assertAllClose(np.diag(sklearn_covs[d]),
+                            gmm.covariances()[d, :], rtol=0.01)
+
+  def test_compare_full(self):
+    self._compare_with_sklearn('full')
+
+  def test_compare_diag(self):
+    self._compare_with_sklearn('diag')
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans_test.py b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
index bc706453c13..4fc2bea515a 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans_test.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
@@ -153,9 +153,11 @@ class KMeansTest(tf.test.TestCase):
   def test_fit_with_cosine_distance(self):
     # Create points on y=x and y=1.5x lines to check the cosine similarity.
     # Note that euclidean distance will give different results in this case.
-    points = np.array([[9, 9], [0.5, 0.5], [10, 15], [0.4, 0.6]])
+    points = np.array(
+        [[9, 9], [0.5, 0.5], [10, 15], [0.4, 0.6]], dtype=np.float32)
     # true centers are the unit vectors on lines y=x and y=1.5x
-    true_centers = np.array([[0.70710678, 0.70710678], [0.5547002, 0.83205029]])
+    true_centers = np.array(
+        [[0.70710678, 0.70710678], [0.5547002, 0.83205029]], dtype=np.float32)
     kmeans = KMeans(2,
                     initial_clusters=kmeans_ops.RANDOM_INIT,
                     distance_metric=kmeans_ops.COSINE_DISTANCE,
@@ -168,8 +170,9 @@ class KMeansTest(tf.test.TestCase):
                         np.sort(true_centers, axis=0))
 
   def test_transform_with_cosine_distance(self):
-    points = np.array([[2.5, 3.5], [2, 8], [3, 1], [3, 18],
-                       [-2.5, -3.5], [-2, -8], [-3, -1], [-3, -18]])
+    points = np.array(
+        [[2.5, 0.1], [2, 0.2], [3, 0.1], [4, 0.2],
+         [0.1, 2.5], [0.2, 2], [0.1, 3], [0.2, 4]], dtype=np.float32)
 
     true_centers = [normalize(np.mean(normalize(points)[4:, :], axis=0,
                                       keepdims=True))[0],
@@ -180,8 +183,8 @@ class KMeansTest(tf.test.TestCase):
                     initial_clusters=kmeans_ops.RANDOM_INIT,
                     distance_metric=kmeans_ops.COSINE_DISTANCE,
                     use_mini_batch=self.use_mini_batch,
-                    config=self.config(3))
-    kmeans.fit(x=points, steps=30, batch_size=8)
+                    config=self.config(5))
+    kmeans.fit(x=points, steps=50, batch_size=8)
 
     centers = normalize(kmeans.clusters())
     self.assertAllClose(np.sort(centers, axis=0),
@@ -193,16 +196,16 @@ class KMeansTest(tf.test.TestCase):
     self.assertAllClose(transform, true_transform, atol=1e-3)
 
   def test_predict_with_cosine_distance(self):
-    points = np.array([[2.5, 3.5], [2, 8], [3, 1], [3, 18],
-                       [-2.5, -3.5], [-2, -8], [-3, -1], [-3, -18]]).astype(
-                           np.float32)
+    points = np.array(
+        [[2.5, 0.1], [2, 0.2], [3, 0.1], [4, 0.2],
+         [0.1, 2.5], [0.2, 2], [0.1, 3], [0.2, 4]], dtype=np.float32)
     true_centers = np.array(
         [normalize(np.mean(normalize(points)[0:4, :],
                            axis=0,
                            keepdims=True))[0],
          normalize(np.mean(normalize(points)[4:, :],
                            axis=0,
-                           keepdims=True))[0]])
+                           keepdims=True))[0]], dtype=np.float32)
     true_assignments = [0] * 4 + [1] * 4
     true_score = len(points) - np.tensordot(normalize(points),
                                             true_centers[true_assignments])
@@ -230,14 +233,14 @@ class KMeansTest(tf.test.TestCase):
     # the less populated centers.
     points = np.array([[2.5, 3.5], [2.5, 3.5], [-2, 3], [-2, 3], [-3, -3],
                        [-3.1, -3.2], [-2.8, -3.], [-2.9, -3.1], [-3., -3.1],
-                       [-3., -3.1], [-3.2, -3.], [-3., -3.]]).astype(np.float32)
+                       [-3., -3.1], [-3.2, -3.], [-3., -3.]], dtype=np.float32)
     true_centers = np.array(
         [normalize(np.mean(normalize(points)[0:2, :], axis=0,
                            keepdims=True))[0],
          normalize(np.mean(normalize(points)[2:4, :], axis=0,
                            keepdims=True))[0],
          normalize(np.mean(normalize(points)[4:, :], axis=0,
-                           keepdims=True))[0]])
+                           keepdims=True))[0]], dtype=np.float32)
     true_assignments = [0] * 2 + [1] * 2 + [2] * 8
     true_score = len(points) - np.tensordot(normalize(points),
                                             true_centers[true_assignments])
@@ -262,7 +265,7 @@ class KMeansTest(tf.test.TestCase):
     self.assertAllClose(score, true_score, atol=1e-2)
 
   def test_fit_raise_if_num_clusters_larger_than_num_points_random_init(self):
-    points = np.array([[2.0, 3.0], [1.6, 8.2]])
+    points = np.array([[2.0, 3.0], [1.6, 8.2]], dtype=np.float32)
 
     with self.assertRaisesOpError('less'):
       kmeans = KMeans(num_clusters=3, initial_clusters=kmeans_ops.RANDOM_INIT)
@@ -270,7 +273,7 @@ class KMeansTest(tf.test.TestCase):
 
   def test_fit_raise_if_num_clusters_larger_than_num_points_kmeans_plus_plus(
       self):
-    points = np.array([[2.0, 3.0], [1.6, 8.2]])
+    points = np.array([[2.0, 3.0], [1.6, 8.2]], dtype=np.float32)
 
     with self.assertRaisesOpError(AssertionError):
       kmeans = KMeans(num_clusters=3,
diff --git a/tensorflow/contrib/ffmpeg/decode_audio_op.cc b/tensorflow/contrib/ffmpeg/decode_audio_op.cc
index fd958e24a2c..10e35e165b2 100644
--- a/tensorflow/contrib/ffmpeg/decode_audio_op.cc
+++ b/tensorflow/contrib/ffmpeg/decode_audio_op.cc
@@ -21,6 +21,7 @@
 #include "tensorflow/contrib/ffmpeg/ffmpeg_lib.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -63,13 +64,11 @@ class FileDeleter {
 
 class DecodeAudioOp : public OpKernel {
  public:
-  explicit DecodeAudioOp(OpKernelConstruction* context)
-      : OpKernel(context) {
+  explicit DecodeAudioOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("file_format", &file_format_));
     file_format_ = str_util::Lowercase(file_format_);
     const std::set<string> valid_file_formats(
-        kValidFileFormats,
-        kValidFileFormats + TF_ARRAYSIZE(kValidFileFormats));
+        kValidFileFormats, kValidFileFormats + TF_ARRAYSIZE(kValidFileFormats));
     OP_REQUIRES(context, valid_file_formats.count(file_format_) == 1,
                 errors::InvalidArgument(
                     "file_format arg must be in {",
@@ -80,8 +79,7 @@ class DecodeAudioOp : public OpKernel {
     OP_REQUIRES(context, samples_per_second_ > 0,
                 errors::InvalidArgument("samples_per_second must be > 0."));
 
-    OP_REQUIRES_OK(
-        context, context->GetAttr("channel_count", &channel_count_));
+    OP_REQUIRES_OK(context, context->GetAttr("channel_count", &channel_count_));
     OP_REQUIRES(context, channel_count_ > 0,
                 errors::InvalidArgument("channel_count must be > 0."));
   }
@@ -117,15 +115,14 @@ class DecodeAudioOp : public OpKernel {
       LOG(ERROR) << "Ffmpeg failed with error '" << result.error_message()
                  << "'. Returning empty tensor.";
       Tensor* output = nullptr;
-      OP_REQUIRES_OK(
-          context, context->allocate_output(0, TensorShape({0, 0}), &output));
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(0, TensorShape({0, 0}), &output));
       return;
     } else {
       OP_REQUIRES_OK(context, result);
     }
-    OP_REQUIRES(
-        context, !output_samples.empty(),
-        errors::Unknown("No output created by FFmpeg."));
+    OP_REQUIRES(context, !output_samples.empty(),
+                errors::Unknown("No output created by FFmpeg."));
     OP_REQUIRES(
         context, output_samples.size() % channel_count_ == 0,
         errors::Unknown("FFmpeg created non-integer number of audio frames."));
@@ -133,9 +130,9 @@ class DecodeAudioOp : public OpKernel {
     // Copy the output data to the output Tensor.
     Tensor* output = nullptr;
     const int64 frame_count = output_samples.size() / channel_count_;
-    OP_REQUIRES_OK(
-        context, context->allocate_output(
-            0, TensorShape({frame_count, channel_count_}), &output));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(
+                       0, TensorShape({frame_count, channel_count_}), &output));
     auto matrix = output->tensor<float, 2>();
     for (int32 frame = 0; frame < frame_count; ++frame) {
       for (int32 channel = 0; channel < channel_count_; ++channel) {
@@ -159,6 +156,15 @@ REGISTER_OP("DecodeAudio")
     .Attr("file_format: string")
     .Attr("samples_per_second: int")
     .Attr("channel_count: int")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      int64 channels;
+      if (c->GetAttr("channel_count", &channels).ok()) {
+        c->set_output(0, c->Matrix(c->UnknownDim(), channels));
+      } else {
+        c->set_output(0, c->Matrix(c->UnknownDim(), c->UnknownDim()));
+      }
+      return Status::OK();
+    })
     .Doc(R"doc(
 Processes the contents of an audio file into a tensor using FFmpeg to decode
 the file.
diff --git a/tensorflow/contrib/ffmpeg/encode_audio_op.cc b/tensorflow/contrib/ffmpeg/encode_audio_op.cc
index 818285be5c1..bd3d6ae6998 100644
--- a/tensorflow/contrib/ffmpeg/encode_audio_op.cc
+++ b/tensorflow/contrib/ffmpeg/encode_audio_op.cc
@@ -16,6 +16,7 @@
 #include <limits>
 
 #include "tensorflow/contrib/ffmpeg/ffmpeg_lib.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
@@ -24,8 +25,7 @@ namespace ffmpeg {
 
 class EncodeAudioOp : public OpKernel {
  public:
-  explicit EncodeAudioOp(OpKernelConstruction* context)
-      : OpKernel(context) {
+  explicit EncodeAudioOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("file_format", &file_format_));
     file_format_ = str_util::Lowercase(file_format_);
     OP_REQUIRES(context, file_format_ == "wav",
@@ -35,15 +35,15 @@ class EncodeAudioOp : public OpKernel {
         context, context->GetAttr("samples_per_second", &samples_per_second_));
     OP_REQUIRES(context, samples_per_second_ > 0,
                 errors::InvalidArgument("samples_per_second must be > 0."));
-    OP_REQUIRES_OK(
-        context, context->GetAttr("bits_per_second", &bits_per_second_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("bits_per_second", &bits_per_second_));
   }
 
   void Compute(OpKernelContext* context) override {
     // Get and verify the input data.
-    OP_REQUIRES(context, context->num_inputs() == 1,
-                errors::InvalidArgument(
-                    "EncodeAudio requires exactly one input."));
+    OP_REQUIRES(
+        context, context->num_inputs() == 1,
+        errors::InvalidArgument("EncodeAudio requires exactly one input."));
     const Tensor& contents = context->input(0);
     OP_REQUIRES(context, TensorShapeUtils::IsMatrix(contents.shape()),
                 errors::InvalidArgument(
@@ -88,6 +88,7 @@ REGISTER_OP("EncodeAudio")
     .Attr("file_format: string")
     .Attr("samples_per_second: int")
     .Attr("bits_per_second: int = 192000")
+    .SetShapeFn(shape_inference::ScalarShape)
     .Doc(R"doc(
 Processes a `Tensor` containing sampled audio with the number of channels
 and length of the audio specified by the dimensions of the `Tensor`. The
diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index 9dd59319848..b77fe259f84 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -91,6 +91,15 @@ py_test(
     deps = ["//tensorflow:tensorflow_py"],
 )
 
+py_test(
+    name = "sampling_ops_threading_test",
+    size = "small",
+    srcs = ["python/ops/sampling_ops_threading_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["notsan"],
+    deps = ["//tensorflow:tensorflow_py"],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index c8cca813bbd..1510683b365 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -30,6 +30,7 @@
 
 ## Deprecation
 @@deprecated
+@@deprecated_arg_values
 
 ## Arg_Scope
 @@arg_scope
diff --git a/tensorflow/contrib/framework/python/framework/__init__.py b/tensorflow/contrib/framework/python/framework/__init__.py
index 407a03761dd..033faa6757f 100644
--- a/tensorflow/contrib/framework/python/framework/__init__.py
+++ b/tensorflow/contrib/framework/python/framework/__init__.py
@@ -21,4 +21,5 @@ from __future__ import print_function
 # pylint: disable=wildcard-import
 from tensorflow.contrib.framework.python.framework.checkpoint_utils import *
 from tensorflow.contrib.framework.python.framework.deprecation import deprecated
+from tensorflow.contrib.framework.python.framework.deprecation import deprecated_arg_values
 from tensorflow.contrib.framework.python.framework.tensor_util import *
diff --git a/tensorflow/contrib/framework/python/framework/deprecation.py b/tensorflow/contrib/framework/python/framework/deprecation.py
index 7e83b6cdfa5..10d8f26c837 100644
--- a/tensorflow/contrib/framework/python/framework/deprecation.py
+++ b/tensorflow/contrib/framework/python/framework/deprecation.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+import inspect
 import re
 
 from tensorflow.python.platform import tf_logging as logging
@@ -34,45 +36,77 @@ def _get_qualified_name(function):
   return function.__name__
 
 
-def _add_deprecation_to_docstring(doc, date, instructions):
+def _add_deprecation_to_docstring(
+    doc, instructions, no_doc_str, suffix_str, notice):
   """Adds a deprecation notice to a docstring."""
   if not doc:
-    lines = ['DEPRECATED FUNCTION']
+    lines = [no_doc_str]
   else:
     lines = doc.splitlines()
-    lines[0] += ' (deprecated)'
+    lines[0] += ' ' + suffix_str
 
-  notice = [
-      '',
-      'THIS FUNCTION IS DEPRECATED. It will be removed after %s.' % date,
-      'Instructions for updating:',
-      '%s' % instructions,
-  ]
+  notice = [''] + notice + [instructions]
 
   if len(lines) > 1:
     # Make sure that we keep our distance from the main body
     if lines[1].strip():
-      notice += ['']
+      notice.append('')
 
-    lines = [lines[0]] + notice + lines[1:]
+    lines[1:1] = notice
   else:
     lines += notice
 
   return '\n'.join(lines)
 
 
+def _add_deprecated_function_notice_to_docstring(doc, date, instructions):
+  """Adds a deprecation notice to a docstring for deprecated functions."""
+  return _add_deprecation_to_docstring(
+      doc, instructions,
+      'DEPRECATED FUNCTION',
+      '(deprecated)', [
+          'THIS FUNCTION IS DEPRECATED. It will be removed after %s.' % date,
+          'Instructions for updating:'])
+
+
+def _add_deprecated_arg_notice_to_docstring(doc, date, instructions):
+  """Adds a deprecation notice to a docstring for deprecated arguments."""
+  return _add_deprecation_to_docstring(
+      doc, instructions,
+      'DEPRECATED FUNCTION ARGUMENTS',
+      '(deprecated arguments)', [
+          'SOME ARGUMENTS ARE DEPRECATED. '
+          'They will be removed after %s.' % date,
+          'Instructions for updating:'])
+
+
+def _validate_deprecation_args(date, instructions):
+  if not date:
+    raise ValueError('Tell us what date this will be deprecated!')
+  if not re.match(r'20\d\d-[01]\d-[0123]\d', date):
+    raise ValueError('Date must be YYYY-MM-DD.')
+  if not instructions:
+    raise ValueError('Don\'t deprecate things without conversion instructions!')
+
+
+def _validate_callable(func, decorator_name):
+  if not hasattr(func, '__call__'):
+    raise ValueError(
+        '%s is not a function. If this is a property, '
+        'apply @%s after @property.' % (func, decorator_name))
+
+
 def deprecated(date, instructions):
   """Decorator for marking functions or methods deprecated.
 
-  This decorator adds a deprecation warning to a function's docstring. It has
-  the following format:
+  This decorator logs a deprecation warning whenever the decorated function is
+  called. It has the following format:
 
     <function> (from <module>) is deprecated and will be removed after <date>.
     Instructions for updating:
     <instructions>
 
-  whenever the decorated function is called. <function> will include the class
-  name if it is a method.
+  <function> will include the class name if it is a method.
 
   It also edits the docstring of the function: ' (deprecated)' is appended
   to the first line of the docstring and a deprecation notice is prepended
@@ -90,28 +124,73 @@ def deprecated(date, instructions):
   Raises:
     ValueError: If date is not in ISO 8601 format, or instructions are empty.
   """
-  if not date:
-    raise ValueError('Tell us what date this will be deprecated!')
-  if not re.match(r'20\d\d-[01]\d-[0123]\d', date):
-    raise ValueError('Date must be YYYY-MM-DD.')
-  if not instructions:
-    raise ValueError('Don\'t deprecate things without conversion instructions!')
+  _validate_deprecation_args(date, instructions)
 
   def deprecated_wrapper(func):
     """Deprecation wrapper."""
-    if not hasattr(func, '__call__'):
-      raise ValueError(
-          '%s is not a function.'
-          'If this is a property, apply @deprecated after @property.' % func)
+    _validate_callable(func, 'deprecated')
+    @functools.wraps(func)
     def new_func(*args, **kwargs):
-      logging.warning('%s (from %s) is deprecated and will be removed after %s.'
-                      '\nInstructions for updating:\n%s',
-                      _get_qualified_name(func), func.__module__,
-                      date, instructions)
+      logging.warning(
+          '%s (from %s) is deprecated and will be removed after %s.\n'
+          'Instructions for updating:\n%s',
+          _get_qualified_name(func), func.__module__, date, instructions)
       return func(*args, **kwargs)
-    new_func.__name__ = func.__name__
-    new_func.__doc__ = _add_deprecation_to_docstring(func.__doc__, date,
-                                                     instructions)
-    new_func.__dict__.update(func.__dict__)
+    new_func.__doc__ = _add_deprecated_function_notice_to_docstring(
+        func.__doc__, date, instructions)
+    return new_func
+  return deprecated_wrapper
+
+
+def deprecated_arg_values(date, instructions, **deprecated_kwargs):
+  """Decorator for marking specific function argument values as deprecated.
+
+  This decorator logs a deprecation warning whenever the decorated function is
+  called with the deprecated argument values. It has the following format:
+
+    Calling <function> (from <module>) with <arg>=<value> is deprecated and
+    will be removed after <date>. Instructions for updating:
+      <instructions>
+
+  <function> will include the class name if it is a method.
+
+  It also edits the docstring of the function: ' (deprecated arguments)' is
+  appended to the first line of the docstring and a deprecation notice is
+  prepended to the rest of the docstring.
+
+  Args:
+    date: String. The date the function is scheduled to be removed. Must be
+      ISO 8601 (YYYY-MM-DD).
+    instructions: String. Instructions on how to update code using the
+      deprecated function.
+    **deprecated_kwargs: The deprecated argument values.
+
+  Returns:
+    Decorated function or method.
+
+  Raises:
+    ValueError: If date is not in ISO 8601 format, or instructions are empty.
+  """
+  _validate_deprecation_args(date, instructions)
+  if not deprecated_kwargs:
+    raise ValueError('Specify which argument values are deprecated.')
+
+  def deprecated_wrapper(func):
+    """Deprecation decorator."""
+    _validate_callable(func, 'deprecated_arg_values')
+    @functools.wraps(func)
+    def new_func(*args, **kwargs):
+      """Deprecation wrapper."""
+      named_args = inspect.getcallargs(func, *args, **kwargs)
+      for arg_name, arg_value in deprecated_kwargs.items():
+        if arg_name in named_args and named_args[arg_name] == arg_value:
+          logging.warning(
+              'Calling %s (from %s) with %s=%s is deprecated and will be '
+              'removed after %s.\nInstructions for updating:\n%s',
+              _get_qualified_name(func), func.__module__,
+              arg_name, arg_value, date, instructions)
+      return func(*args, **kwargs)
+    new_func.__doc__ = _add_deprecated_arg_notice_to_docstring(
+        func.__doc__, date, instructions)
     return new_func
   return deprecated_wrapper
diff --git a/tensorflow/contrib/framework/python/framework/deprecation_test.py b/tensorflow/contrib/framework/python/framework/deprecation_test.py
index 914ab04053e..b9572d626a6 100644
--- a/tensorflow/contrib/framework/python/framework/deprecation_test.py
+++ b/tensorflow/contrib/framework/python/framework/deprecation_test.py
@@ -56,7 +56,7 @@ class DeprecationTest(tf.test.TestCase):
 
       Args:
         arg0: Arg 0.
-        arg1: Arg 0.
+        arg1: Arg 1.
 
       Returns:
         Sum of args.
@@ -73,13 +73,38 @@ class DeprecationTest(tf.test.TestCase):
         "\n"
         "\n      Args:"
         "\n        arg0: Arg 0."
-        "\n        arg1: Arg 0."
+        "\n        arg1: Arg 1."
         "\n"
         "\n      Returns:"
         "\n        Sum of args."
         "\n      " % (date, instructions),
         _fn.__doc__)
-    self.assertEqual({}, _fn.__dict__)
+
+    # Assert calling new fn issues log warning.
+    self.assertEqual(3, _fn(1, 2))
+    self.assertEqual(1, mock_warning.call_count)
+    (args, _) = mock_warning.call_args
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
+    self._assert_subset(set([date, instructions]), set(args[1:]))
+
+  @tf.test.mock.patch.object(logging, "warning", autospec=True)
+  def test_static_fn_with_one_line_doc(self, mock_warning):
+    date = "2016-07-04"
+    instructions = "This is how you update..."
+
+    @deprecation.deprecated(date, instructions)
+    def _fn(arg0, arg1):
+      """fn doc."""
+      return arg0 + arg1
+
+    # Assert function docs are properly updated.
+    self.assertEqual("_fn", _fn.__name__)
+    self.assertEqual(
+        "fn doc. (deprecated)"
+        "\n"
+        "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s."
+        "\nInstructions for updating:\n%s" % (date, instructions),
+        _fn.__doc__)
 
     # Assert calling new fn issues log warning.
     self.assertEqual(3, _fn(1, 2))
@@ -106,7 +131,6 @@ class DeprecationTest(tf.test.TestCase):
         "\nInstructions for updating:"
         "\n%s" % (date, instructions),
         _fn.__doc__)
-    self.assertEqual({}, _fn.__dict__)
 
     # Assert calling new fn issues log warning.
     self.assertEqual(3, _fn(1, 2))
@@ -131,7 +155,7 @@ class DeprecationTest(tf.test.TestCase):
 
         Args:
           arg0: Arg 0.
-          arg1: Arg 0.
+          arg1: Arg 1.
 
         Returns:
           Sum of args.
@@ -147,7 +171,7 @@ class DeprecationTest(tf.test.TestCase):
         "\n"
         "\n        Args:"
         "\n          arg0: Arg 0."
-        "\n          arg1: Arg 0."
+        "\n          arg1: Arg 1."
         "\n"
         "\n        Returns:"
         "\n          Sum of args."
@@ -161,6 +185,36 @@ class DeprecationTest(tf.test.TestCase):
     self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
     self._assert_subset(set([date, instructions]), set(args[1:]))
 
+  @tf.test.mock.patch.object(logging, "warning", autospec=True)
+  def test_instance_fn_with_one_line_doc(self, mock_warning):
+    date = "2016-07-04"
+    instructions = "This is how you update..."
+
+    class _Object(object):
+
+      def __init(self):
+        pass
+
+      @deprecation.deprecated(date, instructions)
+      def _fn(self, arg0, arg1):
+        """fn doc."""
+        return arg0 + arg1
+
+    # Assert function docs are properly updated.
+    self.assertEqual(
+        "fn doc. (deprecated)"
+        "\n"
+        "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s."
+        "\nInstructions for updating:\n%s" % (date, instructions),
+        getattr(_Object, "_fn").__doc__)
+
+    # Assert calling new fn issues log warning.
+    self.assertEqual(3, _Object()._fn(1, 2))
+    self.assertEqual(1, mock_warning.call_count)
+    (args, _) = mock_warning.call_args
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
+    self._assert_subset(set([date, instructions]), set(args[1:]))
+
   @tf.test.mock.patch.object(logging, "warning", autospec=True)
   def test_instance_fn_no_doc(self, mock_warning):
     date = "2016-07-04"
@@ -280,5 +334,155 @@ class DeprecationTest(tf.test.TestCase):
     self._assert_subset(set([date, instructions]), set(args[1:]))
 
 
+class DeprecatedArgsTest(tf.test.TestCase):
+
+  def _assert_subset(self, expected_subset, actual_set):
+    self.assertTrue(
+        actual_set.issuperset(expected_subset),
+        msg="%s is not a superset of %s." % (actual_set, expected_subset))
+
+  def test_deprecated_illegal_args(self):
+    instructions = "This is how you update..."
+    with self.assertRaisesRegexp(ValueError, "date"):
+      deprecation.deprecated_arg_values(
+          None, instructions, deprecated=True)
+    with self.assertRaisesRegexp(ValueError, "date"):
+      deprecation.deprecated_arg_values(
+          "", instructions, deprecated=True)
+    with self.assertRaisesRegexp(ValueError, "YYYY-MM-DD"):
+      deprecation.deprecated_arg_values(
+          "07-04-2016", instructions, deprecated=True)
+    date = "2016-07-04"
+    with self.assertRaisesRegexp(ValueError, "instructions"):
+      deprecation.deprecated_arg_values(
+          date, None, deprecated=True)
+    with self.assertRaisesRegexp(ValueError, "instructions"):
+      deprecation.deprecated_arg_values(
+          date, "", deprecated=True)
+    with self.assertRaisesRegexp(ValueError, "argument", deprecated=True):
+      deprecation.deprecated_arg_values(
+          date, instructions)
+
+  @tf.test.mock.patch.object(logging, "warning", autospec=True)
+  def test_static_fn_with_doc(self, mock_warning):
+    date = "2016-07-04"
+    instructions = "This is how you update..."
+
+    @deprecation.deprecated_arg_values(date, instructions, deprecated=True)
+    def _fn(arg0, arg1, deprecated=True):
+      """fn doc.
+
+      Args:
+        arg0: Arg 0.
+        arg1: Arg 1.
+        deprecated: Deprecated!
+
+      Returns:
+        Sum of args.
+      """
+      return arg0 + arg1 if deprecated else arg1 + arg0
+
+    # Assert function docs are properly updated.
+    self.assertEqual("_fn", _fn.__name__)
+    self.assertEqual(
+        "fn doc. (deprecated arguments)"
+        "\n"
+        "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s."
+        "\nInstructions for updating:\n%s"
+        "\n"
+        "\n      Args:"
+        "\n        arg0: Arg 0."
+        "\n        arg1: Arg 1."
+        "\n        deprecated: Deprecated!"
+        "\n"
+        "\n      Returns:"
+        "\n        Sum of args."
+        "\n      " % (date, instructions),
+        _fn.__doc__)
+
+    # Assert calling new fn with non-deprecated value logs nothing.
+    self.assertEqual(3, _fn(1, 2, deprecated=False))
+    self.assertEqual(0, mock_warning.call_count)
+
+    # Assert calling new fn with deprecated value issues log warning.
+    self.assertEqual(3, _fn(1, 2, deprecated=True))
+    self.assertEqual(1, mock_warning.call_count)
+    (args, _) = mock_warning.call_args
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
+    self._assert_subset(set([date, instructions]), set(args[1:]))
+
+    # Assert calling new fn with default deprecated value issues log warning.
+    self.assertEqual(3, _fn(1, 2))
+    self.assertEqual(2, mock_warning.call_count)
+
+  @tf.test.mock.patch.object(logging, "warning", autospec=True)
+  def test_static_fn_with_one_line_doc(self, mock_warning):
+    date = "2016-07-04"
+    instructions = "This is how you update..."
+
+    @deprecation.deprecated_arg_values(date, instructions, deprecated=True)
+    def _fn(arg0, arg1, deprecated=True):
+      """fn doc."""
+      return arg0 + arg1 if deprecated else arg1 + arg0
+
+    # Assert function docs are properly updated.
+    self.assertEqual("_fn", _fn.__name__)
+    self.assertEqual(
+        "fn doc. (deprecated arguments)"
+        "\n"
+        "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s."
+        "\nInstructions for updating:\n%s" % (date, instructions),
+        _fn.__doc__)
+
+    # Assert calling new fn with non-deprecated value logs nothing.
+    self.assertEqual(3, _fn(1, 2, deprecated=False))
+    self.assertEqual(0, mock_warning.call_count)
+
+    # Assert calling new fn with deprecated value issues log warning.
+    self.assertEqual(3, _fn(1, 2, deprecated=True))
+    self.assertEqual(1, mock_warning.call_count)
+    (args, _) = mock_warning.call_args
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
+    self._assert_subset(set([date, instructions]), set(args[1:]))
+
+    # Assert calling new fn with default deprecated value issues log warning.
+    self.assertEqual(3, _fn(1, 2))
+    self.assertEqual(2, mock_warning.call_count)
+
+  @tf.test.mock.patch.object(logging, "warning", autospec=True)
+  def test_static_fn_no_doc(self, mock_warning):
+    date = "2016-07-04"
+    instructions = "This is how you update..."
+
+    @deprecation.deprecated_arg_values(date, instructions, deprecated=True)
+    def _fn(arg0, arg1, deprecated=True):
+      return arg0 + arg1 if deprecated else arg1 + arg0
+
+    # Assert function docs are properly updated.
+    self.assertEqual("_fn", _fn.__name__)
+    self.assertEqual(
+        "DEPRECATED FUNCTION ARGUMENTS"
+        "\n"
+        "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s."
+        "\nInstructions for updating:"
+        "\n%s" % (date, instructions),
+        _fn.__doc__)
+
+    # Assert calling new fn with non-deprecated value logs nothing.
+    self.assertEqual(3, _fn(1, 2, deprecated=False))
+    self.assertEqual(0, mock_warning.call_count)
+
+    # Assert calling new fn issues log warning.
+    self.assertEqual(3, _fn(1, 2, deprecated=True))
+    self.assertEqual(1, mock_warning.call_count)
+    (args, _) = mock_warning.call_args
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
+    self._assert_subset(set([date, instructions]), set(args[1:]))
+
+    # Assert calling new fn with default deprecated value issues log warning.
+    self.assertEqual(3, _fn(1, 2))
+    self.assertEqual(2, mock_warning.call_count)
+
+
 if __name__ == "__main__":
   tf.test.main()
diff --git a/tensorflow/contrib/framework/python/ops/sampling_ops.py b/tensorflow/contrib/framework/python/ops/sampling_ops.py
index d44fe3b3f6c..1d4fed9bd41 100644
--- a/tensorflow/contrib/framework/python/ops/sampling_ops.py
+++ b/tensorflow/contrib/framework/python/ops/sampling_ops.py
@@ -27,6 +27,7 @@ from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.training import input as input_ops
 from tensorflow.python.training import queue_runner
 
@@ -34,10 +35,8 @@ __all__ = ['stratified_sample',
            'stratified_sample_unknown_dist',]
 
 
-# TODO(joelshor): Use an exponential-moving-average to estimate the initial
-# class distribution and remove the requirement that it be provided.
-def stratified_sample(tensors, labels, init_probs, target_probs, batch_size,
-                      enqueue_many=False, queue_capacity=16,
+def stratified_sample(tensors, labels, target_probs, batch_size,
+                      init_probs=None, enqueue_many=False, queue_capacity=16,
                       threads_per_queue=1, name=None):
   """Stochastically creates batches based on per-class probabilities.
 
@@ -52,11 +51,12 @@ def stratified_sample(tensors, labels, init_probs, target_probs, batch_size,
         batch, according to enqueue_many.
     labels: Tensor for label of data. Label is a single integer or a batch,
         depending on enqueue_many. It is not a one-hot vector.
-    init_probs: Class proportions in the data. An object whose type has a
-        registered Tensor conversion function.
     target_probs: Target class proportions in batch. An object whose type has a
         registered Tensor conversion function.
     batch_size: Size of batch to be returned.
+    init_probs: Class proportions in the data. An object whose type has a
+        registered Tensor conversion function, or `None` for estimating the
+        initial distribution.
     enqueue_many: Bool. If true, interpret input tensors as having a batch
         dimension.
     queue_capacity: Capacity of the large queue that holds input examples.
@@ -81,10 +81,9 @@ def stratified_sample(tensors, labels, init_probs, target_probs, batch_size,
     data, label = data_provider.Get(['data', 'label'])
 
     # Get stratified batch according to per-class probabilities.
-    init_probs = [1.0/NUM_CLASSES for _ in range(NUM_CLASSES)]
     target_probs = [...distribution you want...]
     [data_batch], labels = tf.contrib.framework.sampling_ops.stratified_sample(
-        [data], label, init_probs, target_probs)
+        [data], label, target_probs)
 
     # Run batch through network.
     ...
@@ -92,22 +91,34 @@ def stratified_sample(tensors, labels, init_probs, target_probs, batch_size,
   with ops.op_scope(tensors + [labels], name, 'stratified_sample'):
     tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors)
     labels = ops.convert_to_tensor(labels)
-    init_probs = ops.convert_to_tensor(init_probs, dtype=dtypes.float32)
     target_probs = ops.convert_to_tensor(target_probs, dtype=dtypes.float32)
     # Reduce the case of a single example to that of a batch of size 1.
     if not enqueue_many:
       tensor_list = [array_ops.expand_dims(tensor, 0) for tensor in tensor_list]
       labels = array_ops.expand_dims(labels, 0)
 
+    # If `init_probs` is `None`, set up online estimation of data distribution.
+    if init_probs is None:
+      # We use `target_probs` to get the number of classes, so its shape must be
+      # fully defined at graph construction time.
+      target_probs.get_shape().assert_is_fully_defined()
+      init_probs = _estimate_data_distribution(
+          labels, target_probs.get_shape().num_elements())
+    else:
+      init_probs = ops.convert_to_tensor(init_probs, dtype=dtypes.float32)
+
     # Validate that input is consistent.
     tensor_list, labels, [init_probs, target_probs] = _verify_input(
         tensor_list, labels, [init_probs, target_probs])
 
     # Check that all zero initial probabilities also have zero target
     # probabilities.
-    assert_op = logging_ops.Assert(math_ops.reduce_all(math_ops.logical_or(
-        math_ops.not_equal(init_probs, 0),
-        math_ops.equal(target_probs, 0))), [init_probs, target_probs])
+    assert_op = logging_ops.Assert(
+        math_ops.reduce_all(math_ops.logical_or(
+            math_ops.not_equal(init_probs, 0),
+            math_ops.equal(target_probs, 0))),
+        ['All classes with zero initial probability must also have zero target '
+         'probability: ', init_probs, target_probs])
     init_probs = control_flow_ops.with_dependencies([assert_op], init_probs)
 
     # Calculate acceptance sampling probabilities.
@@ -212,6 +223,40 @@ def stratified_sample_unknown_dist(tensors, labels, probs, batch_size,
         per_class_queues, probs, batch_size)
 
 
+def _estimate_data_distribution(labels, num_classes):
+  """Estimate data distribution as labels are seen."""
+  # Variable to track running count of classes. Add 1 to avoid division-by-zero,
+  # and to guarantee that calculation of acceptance probabilities is (mostly)
+  # correct.
+  num_examples_per_class_seen = variables.Variable(
+      initial_value=[1] * num_classes, trainable=False, name='class_count',
+      dtype=dtypes.int64)
+
+  # Update the class-count based on what labels are seen in batch.
+  num_examples_per_class_seen = num_examples_per_class_seen.assign_add(
+      math_ops.reduce_sum(array_ops.one_hot(labels, num_classes,
+                                            dtype=dtypes.int64), 0))
+
+  # Normalize count into a probability.
+  # NOTE: Without the `+= 0` line below, the test
+  # `testMultiThreadedEstimateDataDistribution` fails. The reason is that
+  # before this line, `num_examples_per_class_seen` is a Tensor that shares a
+  # buffer with an underlying `ref` object. When the `ref` is changed by another
+  # thread, `num_examples_per_class_seen` changes as well. Since this can happen
+  # in the middle of the normalization computation, we get probabilities that
+  # are very far from summing to one. Adding `+= 0` copies the contents of the
+  # tensor to a new buffer, which will be consistent from the start to the end
+  # of the normalization computation.
+  num_examples_per_class_seen += 0
+  init_prob_estimate = math_ops.truediv(
+      num_examples_per_class_seen,
+      math_ops.reduce_sum(num_examples_per_class_seen))
+
+  # Must return float32 (not float64) to agree with downstream `_verify_input`
+  # checks.
+  return math_ops.cast(init_prob_estimate, dtypes.float32)
+
+
 def _verify_input(tensor_list, labels, probs_list):
   """Verify that batched inputs are well-formed."""
   checked_probs_list = []
diff --git a/tensorflow/contrib/framework/python/ops/sampling_ops_test.py b/tensorflow/contrib/framework/python/ops/sampling_ops_test.py
index 4ec7d86ec82..35b56bdfa1a 100644
--- a/tensorflow/contrib/framework/python/ops/sampling_ops_test.py
+++ b/tensorflow/contrib/framework/python/ops/sampling_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 import tensorflow as tf
+from tensorflow.python.platform import tf_logging as logging
 
 
 class SamplingOpsTest(tf.test.TestCase):
@@ -33,15 +34,22 @@ class SamplingOpsTest(tf.test.TestCase):
 
     # Curry the rejection sampler so we can easily run the same tests on both
     # stratified_sample and stratified_sample_unknown_dist.
-    def curried_sampler(val, lbls, probs, batch, enqueue_many=True):
+    def curried_sampler(tensors, labels, probs, batch_size, enqueue_many=True):
       return tf.contrib.framework.sampling_ops.stratified_sample(
-          val, lbls, initial_p, probs, batch, enqueue_many=enqueue_many)
+          tensors=tensors,
+          labels=labels,
+          target_probs=probs,
+          batch_size=batch_size,
+          init_probs=initial_p,
+          enqueue_many=enqueue_many)
+
     samplers = [
         tf.contrib.framework.sampling_ops.stratified_sample_unknown_dist,
         curried_sampler,
     ]
 
     for sampler in samplers:
+      logging.info('Now testing `%s`', sampler.__class__.__name__)
       # Label must have only batch dimension if enqueue_many is True.
       with self.assertRaises(ValueError):
         sampler(val, tf.zeros([]), probs, batch_size, enqueue_many=True)
@@ -70,20 +78,21 @@ class SamplingOpsTest(tf.test.TestCase):
 
       # Probabilities shape must be fully defined.
       with self.assertRaises(ValueError):
-        sampler(val, label, tf.placeholder(tf.float32, shape=[None]),
-                batch_size)
+        sampler(
+            val, label, tf.placeholder(
+                tf.float32, shape=[None]), batch_size)
 
     # In the rejection sampling case, make sure that probability lengths are
     # the same.
     with self.assertRaises(ValueError):
       tf.contrib.framework.sampling_ops.stratified_sample(
-          val, label, [.2] * 5, [.1] * 10, batch_size)
+          val, label, [.1] * 10, batch_size, init_probs=[.2] * 5)
 
     # In the rejection sampling case, make sure that zero initial probability
     # classes also have zero target probability.
     with self.assertRaises(ValueError):
       tf.contrib.framework.sampling_ops.stratified_sample(
-          val, label, [0, .5, .5], [.2, .4, .4], batch_size)
+          val, label, [.2, .4, .4], batch_size, init_probs=[0, .5, .5])
 
     # Probabilities must be 1D.
     with self.assertRaises(ValueError):
@@ -116,15 +125,17 @@ class SamplingOpsTest(tf.test.TestCase):
       # Run session that should fail.
       with self.test_session() as sess:
         with self.assertRaises(tf.errors.InvalidArgumentError):
-          sess.run([val_tf, lbl_tf], feed_dict={label_ph: illegal_label,
-                                                probs_ph: valid_probs})
+          sess.run([val_tf, lbl_tf],
+                   feed_dict={label_ph: illegal_label,
+                              probs_ph: valid_probs})
 
     for illegal_prob in illegal_probs:
       # Run session that should fail.
       with self.test_session() as sess:
         with self.assertRaises(tf.errors.InvalidArgumentError):
-          sess.run([prob_tf], feed_dict={label_ph: valid_labels,
-                                         probs_ph: illegal_prob})
+          sess.run([prob_tf],
+                   feed_dict={label_ph: valid_labels,
+                              probs_ph: illegal_prob})
 
   def batchingBehaviorHelper(self, sampler):
     batch_size = 20
@@ -152,15 +163,14 @@ class SamplingOpsTest(tf.test.TestCase):
     lbl_input_batch = tf.ones([], dtype=tf.int32)
     probs = np.array([0, 1, 0, 0, 0])
     batches = tf.contrib.framework.sampling_ops.stratified_sample(
-        val_input_batch, lbl_input_batch, probs, probs, batch_size)
+        val_input_batch, lbl_input_batch, probs, batch_size, init_probs=probs)
     batches += tf.contrib.framework.sampling_ops.stratified_sample(
-        val_input_batch, lbl_input_batch, probs, probs, batch_size)
+        val_input_batch, lbl_input_batch, probs, batch_size, init_probs=probs)
     batches += tf.contrib.framework.sampling_ops.stratified_sample_unknown_dist(
         val_input_batch, lbl_input_batch, probs, batch_size)
     batches += tf.contrib.framework.sampling_ops.stratified_sample_unknown_dist(
         val_input_batch, lbl_input_batch, probs, batch_size)
-    summary_op = tf.merge_summary(tf.get_collection(
-        tf.GraphKeys.SUMMARIES))
+    summary_op = tf.merge_summary(tf.get_collection(tf.GraphKeys.SUMMARIES))
 
     with self.test_session() as sess:
       coord = tf.train.Coordinator()
@@ -177,9 +187,15 @@ class SamplingOpsTest(tf.test.TestCase):
 
   def testRejectionBatchingBehavior(self):
     initial_p = [0, .3, 0, .7, 0]
+
     def curried_sampler(val, lbls, probs, batch, enqueue_many=True):
       return tf.contrib.framework.sampling_ops.stratified_sample(
-          val, lbls, initial_p, probs, batch, enqueue_many=enqueue_many)
+          val,
+          lbls,
+          probs,
+          batch,
+          init_probs=initial_p,
+          enqueue_many=enqueue_many)
 
     self.batchingBehaviorHelper(curried_sampler)
 
@@ -190,8 +206,7 @@ class SamplingOpsTest(tf.test.TestCase):
     lbl2 = 3
     # This cond allows the necessary class queues to be populated.
     label = tf.cond(
-        tf.greater(.5, tf.random_uniform([])),
-        lambda: tf.constant(lbl1),
+        tf.greater(.5, tf.random_uniform([])), lambda: tf.constant(lbl1),
         lambda: tf.constant(lbl2))
     val = [np.array([1, 4]) * label]
     probs = tf.placeholder(tf.float32, shape=[5])
@@ -225,7 +240,7 @@ class SamplingOpsTest(tf.test.TestCase):
   def testBatchDimensionNotRequired(self):
     classes = 5
     # Probs must be a tensor, since we pass it directly to _verify_input.
-    probs = tf.constant([1.0/classes] * classes)
+    probs = tf.constant([1.0 / classes] * classes)
 
     # Make sure that these vals/labels pairs don't throw any runtime exceptions.
     legal_input_pairs = [
@@ -243,16 +258,17 @@ class SamplingOpsTest(tf.test.TestCase):
     # Run graph to make sure there are no shape-related runtime errors.
     for vals, labels in legal_input_pairs:
       with self.test_session() as sess:
-        sess.run([val_tf, labels_tf], feed_dict={vals_ph: vals,
-                                                 labels_ph: labels})
+        sess.run([val_tf, labels_tf],
+                 feed_dict={vals_ph: vals,
+                            labels_ph: labels})
 
   def dataListHelper(self, sampler):
     batch_size = 20
     val_input_batch = [tf.zeros([2, 3, 4]), tf.ones([2, 4]), tf.ones(2) * 3]
     lbl_input_batch = tf.ones([], dtype=tf.int32)
     probs = np.array([0, 1, 0, 0, 0])
-    val_list, lbls = sampler(
-        val_input_batch, lbl_input_batch, probs, batch_size)
+    val_list, lbls = sampler(val_input_batch, lbl_input_batch, probs,
+                             batch_size)
 
     # Check output shapes.
     self.assertTrue(isinstance(val_list, list))
@@ -277,9 +293,16 @@ class SamplingOpsTest(tf.test.TestCase):
 
   def testRejectionDataListInput(self):
     initial_p = [0, 1, 0, 0, 0]
+
     def curried_sampler(val, lbls, probs, batch, enqueue_many=False):
       return tf.contrib.framework.sampling_ops.stratified_sample(
-          val, lbls, initial_p, probs, batch, enqueue_many=enqueue_many)
+          val,
+          lbls,
+          probs,
+          batch,
+          init_probs=initial_p,
+          enqueue_many=enqueue_many)
+
     self.dataListHelper(curried_sampler)
 
   def normalBehaviorHelper(self, sampler):
@@ -289,8 +312,7 @@ class SamplingOpsTest(tf.test.TestCase):
     lbl2 = 3
     # This cond allows the necessary class queues to be populated.
     label = tf.cond(
-        tf.greater(.5, tf.random_uniform([])),
-        lambda: tf.constant(lbl1),
+        tf.greater(.5, tf.random_uniform([])), lambda: tf.constant(lbl1),
         lambda: tf.constant(lbl2))
     val = [np.array([1, 4]) * label]
     probs = np.array([.8, 0, 0, .2, 0])
@@ -302,6 +324,9 @@ class SamplingOpsTest(tf.test.TestCase):
     data_l = []
     label_l = []
     with self.test_session() as sess:
+      # Need to initialize variables that keep running total of classes seen.
+      tf.initialize_all_variables().run()
+
       coord = tf.train.Coordinator()
       threads = tf.train.start_queue_runners(coord=coord)
 
@@ -329,7 +354,7 @@ class SamplingOpsTest(tf.test.TestCase):
     # is fixed, for a given implementation, this test will pass or fail 100% of
     # the time. This use of assertNear is to cover cases where someone changes
     # an implementation detail, which would cause the random behavior to differ.
-    self.assertNear(actual_lbl, expected_label, 3*lbl_std_dev_of_mean)
+    self.assertNear(actual_lbl, expected_label, 3 * lbl_std_dev_of_mean)
 
   def testNormalBehavior(self):
     self.normalBehaviorHelper(
@@ -337,10 +362,26 @@ class SamplingOpsTest(tf.test.TestCase):
 
   def testRejectionNormalBehavior(self):
     initial_p = [.7, 0, 0, .3, 0]
+
     def curried_sampler(val, lbls, probs, batch, enqueue_many=False):
       return tf.contrib.framework.sampling_ops.stratified_sample(
-          val, lbls, initial_p, probs, batch, enqueue_many=enqueue_many)
+          val,
+          lbls,
+          probs,
+          batch,
+          init_probs=initial_p,
+          enqueue_many=enqueue_many)
+
     self.normalBehaviorHelper(curried_sampler)
 
+  def testRejectionNormalBehaviorWithOnlineInitPEstimate(self):
+
+    def curried_sampler(val, lbls, probs, batch, enqueue_many=False):
+      return tf.contrib.framework.sampling_ops.stratified_sample(
+          val, lbls, probs, batch, init_probs=None, enqueue_many=enqueue_many)
+
+    self.normalBehaviorHelper(curried_sampler)
+
+
 if __name__ == '__main__':
   tf.test.main()
diff --git a/tensorflow/contrib/framework/python/ops/sampling_ops_threading_test.py b/tensorflow/contrib/framework/python/ops/sampling_ops_threading_test.py
new file mode 100644
index 00000000000..3812c3348c4
--- /dev/null
+++ b/tensorflow/contrib/framework/python/ops/sampling_ops_threading_test.py
@@ -0,0 +1,65 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# pylint: disable=unused-import
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+
+class SamplingOpsThreadingTest(tf.test.TestCase):
+
+  def testMultiThreadedEstimateDataDistribution(self):
+    num_classes = 10
+
+    # Set up graph.
+    tf.set_random_seed(1234)
+    label = tf.cast(tf.round(tf.random_uniform([1]) * num_classes), tf.int32)
+
+    prob_estimate = tf.contrib.framework.sampling_ops._estimate_data_distribution(  # pylint: disable=line-too-long
+        label, num_classes)
+    # Check that prob_estimate is well-behaved in a multithreaded context.
+    _, _, [prob_estimate] = tf.contrib.framework.sampling_ops._verify_input(
+        [], label, [prob_estimate])
+
+    # Use queues to run multiple threads over the graph, each of which
+    # fetches `prob_estimate`.
+    queue = tf.FIFOQueue(
+        capacity=25,
+        dtypes=[prob_estimate.dtype],
+        shapes=[prob_estimate.get_shape()])
+    enqueue_op = queue.enqueue([prob_estimate])
+    tf.train.add_queue_runner(tf.train.QueueRunner(queue, [enqueue_op] * 25))
+    out_tensor = queue.dequeue()
+
+    # Run the multi-threaded session.
+    with self.test_session() as sess:
+      # Need to initialize variables that keep running total of classes seen.
+      tf.initialize_all_variables().run()
+
+      coord = tf.train.Coordinator()
+      threads = tf.train.start_queue_runners(coord=coord)
+
+      for _ in range(25):
+        sess.run([out_tensor])
+
+      coord.request_stop()
+      coord.join(threads)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc b/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc
index b3addf5746c..e854292f9da 100644
--- a/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc
+++ b/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
 
 namespace tensorflow {
 REGISTER_OP("SparseFeatureCross")
@@ -31,6 +32,12 @@ REGISTER_OP("SparseFeatureCross")
     .Attr("dense_types: list({int64, string}) >= 0")
     .Attr("out_type: {int64, string}")
     .Attr("internal_type: {int64, string}")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Matrix(c->UnknownDim(), 2));
+      c->set_output(1, c->Vector(c->UnknownDim()));
+      c->set_output(2, c->Vector(2));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Generates sparse cross form a list of sparse tensors.
 
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index 3e31ac02f9b..95c55a03dd3 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -193,35 +193,36 @@ class _SparseColumn(_FeatureColumn,
               combiner="sum",
               dtype=dtypes.string):
     if is_integerized and bucket_size is None:
-      raise ValueError("bucket_size should be set if is_integerized=True. "
+      raise ValueError("bucket_size must be set if is_integerized is True. "
                        "column_name: {}".format(column_name))
 
     if is_integerized and not dtype.is_integer:
-      raise ValueError("dtype should be an integer if is_integerized is True. "
-                       "Column {}.".format(column_name))
+      raise ValueError("dtype must be an integer if is_integerized is True. "
+                       "dtype: {}, column_name: {}.".format(dtype, column_name))
 
     if bucket_size is None and lookup_config is None:
-      raise ValueError("one of bucket_size or lookup_config should be "
-                       "set. column_name: {}".format(column_name))
+      raise ValueError("one of bucket_size or lookup_config must be set. "
+                       "column_name: {}".format(column_name))
 
     if bucket_size is not None and lookup_config:
       raise ValueError("one and only one of bucket_size or lookup_config "
-                       "should be set. column_name: {}".format(column_name))
+                       "must be set. column_name: {}".format(column_name))
 
     if bucket_size is not None and bucket_size < 2:
-      raise ValueError("bucket_size should be at least 2. "
-                       "column_name: {}".format(column_name))
+      raise ValueError("bucket_size must be at least 2. "
+                       "bucket_size: {}, column_name: {}".format(bucket_size,
+                                                                 column_name))
 
     if ((lookup_config) and
         (not isinstance(lookup_config, _SparseIdLookupConfig))):
       raise TypeError(
-          "lookup_config should be an instance of _SparseIdLookupConfig. "
+          "lookup_config must be an instance of _SparseIdLookupConfig. "
           "Given one is in type {} for column_name {}".format(
               type(lookup_config), column_name))
 
     if (lookup_config and lookup_config.vocabulary_file and
         lookup_config.vocab_size is None):
-      raise ValueError("vocab_size should be defined. "
+      raise ValueError("vocab_size must be defined. "
                        "column_name: {}".format(column_name))
 
     return super(_SparseColumn, cls).__new__(cls, column_name, is_integerized,
@@ -262,8 +263,8 @@ class _SparseColumn(_FeatureColumn,
                          input_tensor,
                          weight_collections=None,
                          trainable=True):
-    raise ValueError("Column {} is not supported in DNN. "
-                     "Please use embedding_column.".format(self))
+    raise ValueError("SparseColumn is not supported in DNN. "
+                     "Please use embedding_column. column: {}".format(self))
 
   def to_weighted_sum(self,
                       input_tensor,
@@ -279,7 +280,7 @@ class _SparseColumn(_FeatureColumn,
         initializer=init_ops.zeros_initializer,
         combiner=self.combiner,
         trainable=trainable,
-        name=self.name + "_weights")
+        name=self.name)
 
 
 class _SparseColumnIntegerized(_SparseColumn):
@@ -291,8 +292,8 @@ class _SparseColumnIntegerized(_SparseColumn):
               combiner="sum",
               dtype=dtypes.int64):
     if not dtype.is_integer:
-      raise ValueError("dtype should be an integer. Given {}".format(
-          column_name))
+      raise ValueError("dtype must be an integer. "
+                       "dtype: {}, column_name: {}".format(dtype, column_name))
 
     return super(_SparseColumnIntegerized, cls).__new__(cls,
                                                         column_name,
@@ -507,8 +508,8 @@ class _WeightedSparseColumn(_FeatureColumn, collections.namedtuple(
                          input_tensor,
                          weight_collections=None,
                          trainable=True):
-    raise ValueError("Column {} is not supported in DNN. "
-                     "Please use embedding_column.".format(self))
+    raise ValueError("WeightedSparseColumn is not supported in DNN. "
+                     "Please use embedding_column. column: {}".format(self))
 
   def to_weighted_sum(self,
                       input_tensor,
@@ -524,7 +525,7 @@ class _WeightedSparseColumn(_FeatureColumn, collections.namedtuple(
         initializer=init_ops.zeros_initializer,
         combiner=self.sparse_id_column.combiner,
         trainable=trainable,
-        name=self.name + "_weights")
+        name=self.name)
 
 
 def weighted_sparse_column(sparse_id_column,
@@ -609,7 +610,9 @@ class _EmbeddingColumn(_FeatureColumn, collections.namedtuple(
               ckpt_to_load_from=None,
               tensor_name_in_ckpt=None):
     if initializer is not None and not callable(initializer):
-      raise ValueError("initializer must be callable if specified.")
+      raise ValueError("initializer must be callable if specified. "
+                       "Embedding of column_name: {}".format(
+                           sparse_id_column.name))
 
     if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
       raise ValueError("Must specify both `ckpt_to_load_from` and "
@@ -674,7 +677,7 @@ class _EmbeddingColumn(_FeatureColumn, collections.namedtuple(
         initializer=self.initializer,
         combiner=self.combiner,
         trainable=trainable,
-        name=self.name + "_weights")
+        name=self.name)
     if self.ckpt_to_load_from is not None:
       weights_to_restore = embedding_weights
       if len(embedding_weights) == 1:
@@ -690,8 +693,8 @@ class _EmbeddingColumn(_FeatureColumn, collections.namedtuple(
                       num_outputs=1,
                       weight_collections=None,
                       trainable=True):
-    raise ValueError("Column {} is not supported in linear models. "
-                     "Please use sparse_column.".format(self))
+    raise ValueError("EmbeddingColumn is not supported in linear models. "
+                     "Please use sparse_column. column: {}".format(self))
 
 
 def embedding_column(sparse_id_column,
@@ -744,7 +747,8 @@ class _HashedEmbeddingColumn(collections.namedtuple(
               combiner="mean",
               initializer=None):
     if initializer is not None and not callable(initializer):
-      raise ValueError("initializer must be callable if specified.")
+      raise ValueError("initializer must be callable if specified. "
+                       "column_name: {}".format(column_name))
     if initializer is None:
       stddev = 0.1
       # TODO(b/25671353): Better initial value?
@@ -770,7 +774,7 @@ class _HashedEmbeddingColumn(collections.namedtuple(
                          weight_collections=None,
                          trainable=True):
     embeddings = _create_embeddings(
-        name=self.name + "_weights",
+        name=self.name,
         shape=[self.size],
         initializer=self.initializer,
         dtype=dtypes.float32,
@@ -815,10 +819,14 @@ def hashed_embedding_column(column_name,
 
   """
   if (dimension < 1) or (size < 1):
-    raise ValueError("Dimension and size must be greater than 0.")
+    raise ValueError("Dimension and size must be greater than 0. "
+                     "dimension: {}, size: {}, column_name: {}".format(
+                         dimension, size, column_name))
 
   if combiner not in ("mean", "sqrtn", "sum"):
-    raise ValueError("Combiner must be one of 'mean', 'sqrtn' or 'sum'.")
+    raise ValueError("Combiner must be one of 'mean', 'sqrtn' or 'sum'. "
+                     "combiner: {}, column_name: {}".format(
+                         combiner, column_name))
 
   return _HashedEmbeddingColumn(column_name, size, dimension, combiner,
                                 initializer)
@@ -929,14 +937,18 @@ def real_valued_column(column_name,
   """
 
   if not isinstance(dimension, int):
-    raise TypeError("dimension must be an integer")
+    raise TypeError("dimension must be an integer. "
+                    "dimension: {}, column_name: {}".format(dimension,
+                                                            column_name))
 
   if dimension < 1:
-    raise ValueError("dimension must be greater than 0")
+    raise ValueError("dimension must be greater than 0. "
+                     "dimension: {}, column_name: {}".format(dimension,
+                                                             column_name))
 
   if not (dtype.is_integer or dtype.is_floating):
-    raise ValueError("dtype is not convertible to tf.float32. Given {}".format(
-        dtype))
+    raise ValueError("dtype must be convertible to float. "
+                     "dtype: {}, column_name: {}".format(dtype, column_name))
 
   if default_value is None:
     return _RealValuedColumn(column_name, dimension, default_value, dtype)
@@ -957,9 +969,10 @@ def real_valued_column(column_name,
 
   if isinstance(default_value, list):
     if len(default_value) != dimension:
-      raise ValueError("The length of default_value is not equal to the "
-                       "value of dimension. default_value is {}.".format(
-                           default_value))
+      raise ValueError(
+          "The length of default_value must be equal to dimension. "
+          "default_value: {}, dimension: {}, column_name: {}".format(
+              default_value, dimension, column_name))
     # Check if the values in the list are all integers or are convertible to
     # floats.
     is_list_all_int = True
@@ -980,8 +993,9 @@ def real_valued_column(column_name,
         default_value = [float(v) for v in default_value]
         return _RealValuedColumn(column_name, dimension, default_value, dtype)
 
-  raise TypeError("default_value is not compatible with dtype. "
-                  "default_value is {}.".format(default_value))
+  raise TypeError("default_value must be compatible with dtype. "
+                  "default_value: {}, dtype: {}, column_name: {}".format(
+                      default_value, dtype, column_name))
 
 
 class _BucketizedColumn(_FeatureColumn, collections.namedtuple(
@@ -1008,10 +1022,12 @@ class _BucketizedColumn(_FeatureColumn, collections.namedtuple(
   def __new__(cls, source_column, boundaries):
     if not isinstance(source_column, _RealValuedColumn):
       raise TypeError(
-          "source_column should be an instance of _RealValuedColumn.")
+          "source_column must be an instance of _RealValuedColumn. "
+          "source_column: {}".format(source_column))
 
     if not isinstance(boundaries, list) or not boundaries:
-      raise ValueError("boundaries must be a list and it should not be empty.")
+      raise ValueError("boundaries must be a non-empty list. "
+                       "boundaries: {}".format(boundaries))
 
     # We allow bucket boundaries to be monotonically increasing
     # (ie a[i+1] >= a[i]). When two bucket boundaries are the same, we
@@ -1023,7 +1039,8 @@ class _BucketizedColumn(_FeatureColumn, collections.namedtuple(
       elif boundaries[i] < boundaries[i + 1]:
         sanitized_boundaries.append(boundaries[i])
       else:
-        raise ValueError("boundaries must be a sorted list")
+        raise ValueError("boundaries must be a sorted list. "
+                         "boundaries: {}".format(boundaries))
     sanitized_boundaries.append(boundaries[len(boundaries) - 1])
 
     return super(_BucketizedColumn, cls).__new__(cls, source_column,
@@ -1104,7 +1121,7 @@ class _BucketizedColumn(_FeatureColumn, collections.namedtuple(
         initializer=init_ops.zeros_initializer,
         combiner="sum",
         trainable=trainable,
-        name=self.name + "_weights")
+        name=self.name)
 
 
 def bucketized_column(source_column, boundaries):
@@ -1186,18 +1203,21 @@ class _CrossedColumn(_FeatureColumn, collections.namedtuple(
               ckpt_to_load_from=None, tensor_name_in_ckpt=None):
     for column in columns:
       if not _CrossedColumn._is_crossable(column):
-        raise TypeError("columns should be a set of "
-                        "_SparseColumn, _CrossedColumn, or _BucketizedColumn. "
-                        "Column is {}".format(column))
+        raise TypeError("columns must be a set of _SparseColumn, "
+                        "_CrossedColumn, or _BucketizedColumn instances. "
+                        "column: {}".format(column))
 
     if len(columns) < 2:
-      raise ValueError("columns should contain at least 2 elements.")
+      raise ValueError("columns must contain at least 2 elements. "
+                       "columns: {}".format(columns))
 
     if not isinstance(hash_bucket_size, int):
-      raise TypeError("hash_bucket_size should be an int.")
+      raise TypeError("hash_bucket_size must be an int. "
+                      "hash_bucket_size: {}".format(hash_bucket_size))
 
     if hash_bucket_size < 2:
-      raise ValueError("hash_bucket_size should be at least 2.")
+      raise ValueError("hash_bucket_size must be at least 2. "
+                       "hash_bucket_size: {}".format(hash_bucket_size))
 
     if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
       raise ValueError("Must specify both `ckpt_to_load_from` and "
@@ -1275,8 +1295,8 @@ class _CrossedColumn(_FeatureColumn, collections.namedtuple(
                          input_tensor,
                          weight_collections=None,
                          trainable=True):
-    raise ValueError("Column {} is not supported in DNN. "
-                     "Please use embedding_column.".format(self))
+    raise ValueError("CrossedColumn is not supported in DNN. "
+                     "Please use embedding_column. column: {}".format(self))
 
   def to_weighted_sum(self,
                       input_tensor,
@@ -1292,7 +1312,7 @@ class _CrossedColumn(_FeatureColumn, collections.namedtuple(
         initializer=init_ops.zeros_initializer,
         combiner=self.combiner,
         trainable=trainable,
-        name=self.name + "_weights")
+        name=self.name)
     if self.ckpt_to_load_from is not None:
       weights_to_restore = embedding_weights
       if len(embedding_weights) == 1:
@@ -1337,7 +1357,7 @@ def crossed_column(columns, hash_bucket_size, combiner="sum",
 
 class DataFrameColumn(_FeatureColumn,
                       collections.namedtuple("DataFrameColumn",
-                                             ["name", "series"])):
+                                             ["column_name", "series"])):
   """Represents a feature column produced from a `DataFrame`.
 
   Instances of this class are immutable.  A `DataFrame` column may be dense or
@@ -1345,13 +1365,17 @@ class DataFrameColumn(_FeatureColumn,
   batch_size.
 
   Args:
-    name: a name for this column
+    column_name: a name for this column
     series: a `Series` to be wrapped, which has already had its base features
       substituted with `PredefinedSeries`.
   """
 
-  def __new__(cls, name, series):
-    return super(DataFrameColumn, cls).__new__(cls, name, series)
+  def __new__(cls, column_name, series):
+    return super(DataFrameColumn, cls).__new__(cls, column_name, series)
+
+  @property
+  def name(self):
+    return self.column_name
 
   @property
   def config(self):
@@ -1379,7 +1403,17 @@ class DataFrameColumn(_FeatureColumn,
                          input_tensor,
                          weight_collections=None,
                          trainable=True):
-    return input_tensor
+    # DataFrame typically provides Tensors of shape [batch_size],
+    # but Estimator requires shape [batch_size, 1]
+    dims = input_tensor.get_shape().ndims
+    if dims == 0:
+      raise ValueError(
+          "Can't build input layer from tensor of shape (): {}".format(
+              self.column_name))
+    elif dims == 1:
+      return array_ops.expand_dims(input_tensor, 1)
+    else:
+      return input_tensor
 
   # TODO(soergel): This mirrors RealValuedColumn for now, but should become
   # better abstracted with less code duplication when we add other kinds.
@@ -1547,7 +1581,7 @@ def _create_embeddings(name, shape, dtype, initializer, trainable,
   with just one variable.
 
   Args:
-    name: A string specifying the name of the embedding variable.
+    name: A string. The name of the embedding variable will be name + _weights.
     shape: shape of the embeddding. Note this is not the shape of partitioned
       variables.
     dtype: type of the embedding. Also the shape of each partitioned variable.
@@ -1609,7 +1643,7 @@ def _create_embedding_lookup(input_tensor, weight_tensor, vocab_size, dimension,
     A Tensor with shape [batch_size, dimension] and embedding Variable.
   """
 
-  embeddings = _create_embeddings(name=name,
+  embeddings = _create_embeddings(name=name + "_weights",
                                   shape=[vocab_size, dimension],
                                   dtype=dtypes.float32,
                                   initializer=initializer,
@@ -1621,4 +1655,4 @@ def _create_embedding_lookup(input_tensor, weight_tensor, vocab_size, dimension,
       sparse_weights=weight_tensor,
       default_id=0,
       combiner=combiner,
-      name=name), embeddings
+      name=name + "_weights"), embeddings
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py
index 86d522dedf2..6f1393da4d4 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py
@@ -60,14 +60,17 @@ class FeatureColumnTest(tf.test.TestCase):
     self.assertEqual(b.dimension, 10)
     self.assertTrue(b.default_value is None)
 
-    # dimension is an integer
-    with self.assertRaises(TypeError):
+    with self.assertRaisesRegexp(TypeError, "dimension must be an integer"):
       tf.contrib.layers.real_valued_column("d3", dimension=1.0)
 
-    # dimension is a positive integer
-    with self.assertRaises(ValueError):
+    with self.assertRaisesRegexp(ValueError,
+                                 "dimension must be greater than 0"):
       tf.contrib.layers.real_valued_column("d3", dimension=0)
 
+    with self.assertRaisesRegexp(ValueError,
+                                 "dtype must be convertible to float"):
+      tf.contrib.layers.real_valued_column("d3", dtype=tf.string)
+
     # default_value is an integer.
     c1 = tf.contrib.layers.real_valued_column("c1", default_value=2)
     self.assertListEqual(list(c1.default_value), [2.])
@@ -92,15 +95,18 @@ class FeatureColumnTest(tf.test.TestCase):
                                               dimension=4,
                                               default_value=2.)
     self.assertListEqual(list(d2.default_value), [2., 2., 2., 2.])
-    with self.assertRaises(TypeError):
+    with self.assertRaisesRegexp(TypeError,
+                                 "default_value must be compatible with dtype"):
       tf.contrib.layers.real_valued_column("d3",
                                            default_value=2.,
                                            dtype=tf.int32)
 
-    # default_value is neither interger nor float.
-    with self.assertRaises(TypeError):
+    # default_value is neither integer nor float.
+    with self.assertRaisesRegexp(
+        TypeError, "default_value must be compatible with dtype"):
       tf.contrib.layers.real_valued_column("e1", default_value="string")
-    with self.assertRaises(TypeError):
+    with self.assertRaisesRegexp(
+        TypeError, "default_value must be compatible with dtype"):
       tf.contrib.layers.real_valued_column("e1",
                                            dimension=3,
                                            default_value=[1, 3., "string"])
@@ -125,11 +131,13 @@ class FeatureColumnTest(tf.test.TestCase):
                                               dimension=3,
                                               default_value=[2., 2, 2])
     self.assertListEqual(list(g2.default_value), [2., 2., 2.])
-    with self.assertRaises(TypeError):
+    with self.assertRaisesRegexp(
+        TypeError, "default_value must be compatible with dtype"):
       tf.contrib.layers.real_valued_column("g3",
                                            default_value=[2.],
                                            dtype=tf.int32)
-    with self.assertRaises(ValueError):
+    with self.assertRaisesRegexp(
+        ValueError, "The length of default_value must be equal to dimension"):
       tf.contrib.layers.real_valued_column("g4",
                                            dimension=3,
                                            default_value=[2.])
@@ -140,11 +148,19 @@ class FeatureColumnTest(tf.test.TestCase):
     self.assertEqual(a.name, "aaa_BUCKETIZED")
 
   def testBucketizedColumnRequiresRealValuedColumn(self):
-    with self.assertRaises(TypeError):
+    with self.assertRaisesRegexp(
+        TypeError, "source_column must be an instance of _RealValuedColumn"):
       tf.contrib.layers.bucketized_column("bbb", [0])
+    with self.assertRaisesRegexp(
+        TypeError, "source_column must be an instance of _RealValuedColumn"):
+      tf.contrib.layers.bucketized_column(
+          tf.contrib.layers.sparse_column_with_integerized_feature(
+              column_name="bbb", bucket_size=10),
+          [0])
 
   def testBucketizedColumnRequiresSortedBuckets(self):
-    with self.assertRaises(ValueError):
+    with self.assertRaisesRegexp(
+        ValueError, "boundaries must be a sorted list"):
       tf.contrib.layers.bucketized_column(
           tf.contrib.layers.real_valued_column("ccc"), [5, 0, 4])
 
@@ -173,7 +189,10 @@ class FeatureColumnTest(tf.test.TestCase):
   def testCrossedColumnNotSupportRealValuedColumn(self):
     b = tf.contrib.layers.sparse_column_with_hash_bucket("bbb",
                                                          hash_bucket_size=100)
-    with self.assertRaises(TypeError):
+    with self.assertRaisesRegexp(
+        TypeError,
+        "columns must be a set of _SparseColumn, _CrossedColumn, "
+        "or _BucketizedColumn instances"):
       tf.contrib.layers.crossed_column(
           set([b, tf.contrib.layers.real_valued_column("real")]),
           hash_bucket_size=10000)
@@ -194,7 +213,8 @@ class FeatureColumnTest(tf.test.TestCase):
          "weights": tf.VarLenFeature(tf.int32)},
         weighted_ids.config)
 
-    with self.assertRaises(ValueError):
+    with self.assertRaisesRegexp(ValueError,
+                                 "dtype is not convertible to float"):
       weighted_ids = tf.contrib.layers.weighted_sparse_column(ids, "weights",
                                                               dtype=tf.string)
 
@@ -211,7 +231,8 @@ class FeatureColumnTest(tf.test.TestCase):
             [1], dtype=tf.int32)},
         rvc.config)
 
-    with self.assertRaises(ValueError):
+    with self.assertRaisesRegexp(ValueError,
+                                 "dtype must be convertible to float"):
       tf.contrib.layers.real_valued_column("rvc", dtype=tf.string)
 
   def testSparseColumnDtypes(self):
@@ -222,7 +243,8 @@ class FeatureColumnTest(tf.test.TestCase):
         "sc", 10, dtype=tf.int32)
     self.assertDictEqual({"sc": tf.VarLenFeature(dtype=tf.int32)}, sc.config)
 
-    with self.assertRaises(ValueError):
+    with self.assertRaisesRegexp(ValueError,
+                                 "dtype must be an integer"):
       tf.contrib.layers.sparse_column_with_integerized_feature("sc",
                                                                10,
                                                                dtype=tf.float32)
diff --git a/tensorflow/contrib/layers/python/layers/target_column.py b/tensorflow/contrib/layers/python/layers/target_column.py
index 08280446723..78178816f35 100644
--- a/tensorflow/contrib/layers/python/layers/target_column.py
+++ b/tensorflow/contrib/layers/python/layers/target_column.py
@@ -70,7 +70,7 @@ def multi_class_target(n_classes, label_name=None, weight_column_name=None):
       will be multiplied by the loss of the example.
 
   Returns:
-    An instance of _TargetColumn
+    An instance of _MultiClassTargetColumn.
 
   Raises:
     ValueError: if n_classes is < 2
diff --git a/tensorflow/contrib/learn/python/learn/__init__.py b/tensorflow/contrib/learn/python/learn/__init__.py
index 9b7a31ede42..50089e18a03 100644
--- a/tensorflow/contrib/learn/python/learn/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/__init__.py
@@ -33,6 +33,7 @@ from tensorflow.contrib.learn.python.learn import preprocessing
 from tensorflow.contrib.learn.python.learn import utils
 from tensorflow.contrib.learn.python.learn.dataframe import *
 from tensorflow.contrib.learn.python.learn.estimators import *
+from tensorflow.contrib.learn.python.learn.evaluable import Evaluable
 from tensorflow.contrib.learn.python.learn.experiment import Experiment
 from tensorflow.contrib.learn.python.learn.graph_actions import evaluate
 from tensorflow.contrib.learn.python.learn.graph_actions import infer
@@ -41,4 +42,5 @@ from tensorflow.contrib.learn.python.learn.graph_actions import run_feeds
 from tensorflow.contrib.learn.python.learn.graph_actions import run_n
 from tensorflow.contrib.learn.python.learn.graph_actions import train
 from tensorflow.contrib.learn.python.learn.learn_io import *
+from tensorflow.contrib.learn.python.learn.trainable import Trainable
 # pylint: enable=wildcard-import
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/__init__.py b/tensorflow/contrib/learn/python/learn/dataframe/__init__.py
index 10f35fa129d..8fba9b65136 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/__init__.py
@@ -30,6 +30,7 @@ from tensorflow.contrib.learn.python.learn.dataframe.transform import Transform
 # Transforms
 from tensorflow.contrib.learn.python.learn.dataframe.transforms.boolean_mask import BooleanMask
 from tensorflow.contrib.learn.python.learn.dataframe.transforms.difference import Difference
+from tensorflow.contrib.learn.python.learn.dataframe.transforms.hashes import HashFast
 from tensorflow.contrib.learn.python.learn.dataframe.transforms.in_memory_source import NumpySource
 from tensorflow.contrib.learn.python.learn.dataframe.transforms.in_memory_source import PandasSource
 from tensorflow.contrib.learn.python.learn.dataframe.transforms.reader_source import ReaderSource
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/dataframe.py b/tensorflow/contrib/learn/python/learn/dataframe/dataframe.py
index 31093b9937a..6e03f086425 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/dataframe.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/dataframe.py
@@ -117,10 +117,11 @@ class DataFrame(object):
       value = [value]
     self.assign(**dict(zip(key, value)))
 
-  def build(self):
+  def build(self, **kwargs):
     # We do not allow passing a cache here, because that would encourage
     # working around the rule that DataFrames cannot be expected to be
     # synced with each other (e.g., they shuffle independently).
     cache = {}
-    tensors = {name: c.build(cache) for name, c in self._columns.items()}
+    tensors = {name: c.build(cache, **kwargs)
+               for name, c in self._columns.items()}
     return tensors
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/estimator_utils.py b/tensorflow/contrib/learn/python/learn/dataframe/estimator_utils.py
index bff0c4e4af0..313ae41cfe8 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/estimator_utils.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/estimator_utils.py
@@ -91,7 +91,8 @@ def _build_alternate_universe(
 def to_feature_columns_and_input_fn(dataframe,
                                     base_input_keys_with_defaults,
                                     feature_keys,
-                                    target_keys=None):
+                                    target_keys=None,
+                                    **kwargs):
   """Build a list of FeatureColumns and an input_fn for use with Estimator.
 
   Args:
@@ -103,6 +104,7 @@ def to_feature_columns_and_input_fn(dataframe,
       These may include base features and/or derived features.
     target_keys: the names of columns to be used as targets.  None is
       acceptable for unsupervised learning.
+    **kwargs: Additional keyword arguments, unused here.
 
   Returns:
     A tuple of two elements:
@@ -155,10 +157,11 @@ def to_feature_columns_and_input_fn(dataframe,
 
   # Build an input_fn suitable for use with Estimator.
   def input_fn():
+    """An input_fn() for feeding the given set of DataFrameColumns."""
     # It's important to build all the tensors together in one DataFrame.
     # If we did df.select() for both key sets and then build those, the two
     # resulting DataFrames would be shuffled independently.
-    tensors = limited_dataframe.build()
+    tensors = limited_dataframe.build(**kwargs)
 
     base_input_features = {key: tensors[key] for key in base_input_keys}
     targets = {key: tensors[key] for key in target_keys}
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/series.py b/tensorflow/contrib/learn/python/learn/dataframe/series.py
index 12daa7d7cb8..5893db3aad2 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/series.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/series.py
@@ -98,7 +98,7 @@ class Series(object):
       return transform_cls
     return register
 
-  def build(self, cache):
+  def build(self, cache, **kwargs):
     """Returns a Tensor."""
     raise NotImplementedError()
 
@@ -122,7 +122,7 @@ class PredefinedSeries(Series):
   def required_base_features(self):
     return {self.name: self.feature_spec}
 
-  def build(self, cache):
+  def build(self, cache, **kwargs):
     try:
       return cache[self.name]
     except KeyError:
@@ -171,10 +171,11 @@ class TransformedSeries(Series):
       result.update(s.required_base_features)
     return result
 
-  def build(self, cache=None):
+  def build(self, cache=None, **kwargs):
     if cache is None:
       cache = {}
-    all_outputs = self._transform.build_transitive(self._input_series, cache)
+    all_outputs = self._transform.build_transitive(
+        self._input_series, cache, **kwargs)
     return getattr(all_outputs, self._output_name)
 
   def __repr__(self):
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/tensorflow_dataframe.py b/tensorflow/contrib/learn/python/learn/dataframe/tensorflow_dataframe.py
index 45df3ac16d5..ddd2b8bfb6e 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/tensorflow_dataframe.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/tensorflow_dataframe.py
@@ -28,6 +28,7 @@ from tensorflow.contrib.learn.python.learn.dataframe import dataframe as df
 from tensorflow.contrib.learn.python.learn.dataframe.transforms import batch
 from tensorflow.contrib.learn.python.learn.dataframe.transforms import csv_parser
 from tensorflow.contrib.learn.python.learn.dataframe.transforms import example_parser
+from tensorflow.contrib.learn.python.learn.dataframe.transforms import hashes
 from tensorflow.contrib.learn.python.learn.dataframe.transforms import in_memory_source
 from tensorflow.contrib.learn.python.learn.dataframe.transforms import reader_source
 from tensorflow.contrib.learn.python.learn.dataframe.transforms import sparsify
@@ -83,7 +84,8 @@ class TensorFlowDataFrame(df.DataFrame):
           graph=None,
           session=None,
           start_queues=True,
-          initialize_variables=True):
+          initialize_variables=True,
+          **kwargs):
     """Builds and runs the columns of the `DataFrame` and yields batches.
 
     This is a generator that yields a dictionary mapping column names to
@@ -97,6 +99,7 @@ class TensorFlowDataFrame(df.DataFrame):
       start_queues: if true, queues will be started before running and halted
         after producting `n` batches.
       initialize_variables: if true, variables will be initialized.
+      **kwargs: Additional keyword arguments e.g. `num_epochs`.
 
     Yields:
       A dictionary, mapping column names to the values resulting from running
@@ -107,7 +110,7 @@ class TensorFlowDataFrame(df.DataFrame):
     with graph.as_default():
       if session is None:
         session = sess.Session()
-      self_built = self.build()
+      self_built = self.build(**kwargs)
       keys = list(self_built.keys())
       cols = list(self_built.values())
       if initialize_variables:
@@ -157,6 +160,52 @@ class TensorFlowDataFrame(df.DataFrame):
             "Original error: {}").format(type(col), e))
     return result
 
+  def split(self, index_series, proportion, batch_size=None):
+    """Deterministically split a `DataFrame` into two `DataFrame`s.
+
+    Note this split is only as deterministic as the underlying hash function;
+    see `tf.string_to_hash_bucket_fast`.  The hash function is deterministic
+    for a given binary, but may change occasionally.  The only way to achieve
+    an absolute guarantee that the split `DataFrame`s do not change across runs
+    is to materialize them.
+
+    Note too that the allocation of a row to one partition or the
+    other is evaluated independently for each row, so the exact number of rows
+    in each partition is binomially distributed.
+
+    Args:
+      index_series: a `Series` of unique strings, whose hash will determine the
+        partitioning; or the name in this `DataFrame` of such a `Series`.
+        (This `Series` must contain strings because TensorFlow provides hash
+        ops only for strings, and there are no number-to-string converter ops.)
+      proportion: The proportion of the rows to select for the 'left'
+        partition; the remaining (1 - proportion) rows form the 'right'
+        partition.
+      batch_size: the batch size to use when rebatching the left and right
+        `DataFrame`s.  If None (default), the `DataFrame`s are not rebatched;
+        thus their batches will have variable sizes, according to which rows
+        are selected from each batch of the original `DataFrame`.
+
+    Returns:
+      Two `DataFrame`s containing the partitioned rows.
+    """
+    # TODO(soergel): allow seed?
+    if isinstance(index_series, str):
+      index_series = self[index_series]
+    num_buckets = 1000000  # close enough for simple splits
+    hashed_input, = hashes.HashFast(num_buckets)(index_series)
+    threshold = int(num_buckets * proportion)
+    left = hashed_input < threshold
+    right = ~left
+    left_rows = self.select_rows(left)
+    right_rows = self.select_rows(right)
+
+    if batch_size:
+      left_rows = left_rows.batch(batch_size=batch_size, shuffle=False)
+      right_rows = right_rows.batch(batch_size=batch_size, shuffle=False)
+
+    return left_rows, right_rows
+
   def run_once(self):
     """Creates a new 'Graph` and `Session` and runs a single batch.
 
@@ -208,7 +257,7 @@ class TensorFlowDataFrame(df.DataFrame):
 
   @classmethod
   def _from_csv_base(cls, filepatterns, get_default_values, has_header,
-                     column_names, num_epochs, num_threads, enqueue_size,
+                     column_names, num_threads, enqueue_size,
                      batch_size, queue_capacity, min_after_dequeue, shuffle,
                      seed):
     """Create a `DataFrame` from CSV files.
@@ -223,9 +272,6 @@ class TensorFlowDataFrame(df.DataFrame):
         each column, given the column names.
       has_header: whether or not the CSV files have headers.
       column_names: a list of names for the columns in the CSV files.
-      num_epochs: the number of times that the reader should loop through all
-        the file names. If set to `None`, then the reader will continue
-        indefinitely.
       num_threads: the number of readers that will work in parallel.
       enqueue_size: block size for each read operation.
       batch_size: desired batch size.
@@ -265,7 +311,6 @@ class TensorFlowDataFrame(df.DataFrame):
         reader_kwargs=reader_kwargs,
         enqueue_size=enqueue_size,
         batch_size=batch_size,
-        num_epochs=num_epochs,
         queue_capacity=queue_capacity,
         shuffle=shuffle,
         min_after_dequeue=min_after_dequeue,
@@ -287,7 +332,6 @@ class TensorFlowDataFrame(df.DataFrame):
                default_values,
                has_header=True,
                column_names=None,
-               num_epochs=None,
                num_threads=1,
                enqueue_size=None,
                batch_size=32,
@@ -306,9 +350,6 @@ class TensorFlowDataFrame(df.DataFrame):
       default_values: a list of default values for each column.
       has_header: whether or not the CSV files have headers.
       column_names: a list of names for the columns in the CSV files.
-      num_epochs: the number of times that the reader should loop through all
-        the file names. If set to `None`, then the reader will continue
-        indefinitely.
       num_threads: the number of readers that will work in parallel.
       enqueue_size: block size for each read operation.
       batch_size: desired batch size.
@@ -332,7 +373,7 @@ class TensorFlowDataFrame(df.DataFrame):
       return default_values
 
     return cls._from_csv_base(filepatterns, get_default_values, has_header,
-                              column_names, num_epochs, num_threads,
+                              column_names, num_threads,
                               enqueue_size, batch_size, queue_capacity,
                               min_after_dequeue, shuffle, seed)
 
@@ -342,7 +383,6 @@ class TensorFlowDataFrame(df.DataFrame):
                                  feature_spec,
                                  has_header=True,
                                  column_names=None,
-                                 num_epochs=None,
                                  num_threads=1,
                                  enqueue_size=None,
                                  batch_size=32,
@@ -362,9 +402,6 @@ class TensorFlowDataFrame(df.DataFrame):
           `VarLenFeature`.
       has_header: whether or not the CSV files have headers.
       column_names: a list of names for the columns in the CSV files.
-      num_epochs: the number of times that the reader should loop through all
-        the file names. If set to `None`, then the reader will continue
-        indefinitely.
       num_threads: the number of readers that will work in parallel.
       enqueue_size: block size for each read operation.
       batch_size: desired batch size.
@@ -387,7 +424,7 @@ class TensorFlowDataFrame(df.DataFrame):
       return [_get_default_value(feature_spec[name]) for name in column_names]
 
     dataframe = cls._from_csv_base(filepatterns, get_default_values, has_header,
-                                   column_names, num_epochs, num_threads,
+                                   column_names, num_threads,
                                    enqueue_size, batch_size, queue_capacity,
                                    min_after_dequeue, shuffle, seed)
 
@@ -405,7 +442,6 @@ class TensorFlowDataFrame(df.DataFrame):
                     filepatterns,
                     features,
                     reader_cls=io_ops.TFRecordReader,
-                    num_epochs=None,
                     num_threads=1,
                     enqueue_size=None,
                     batch_size=32,
@@ -421,9 +457,6 @@ class TensorFlowDataFrame(df.DataFrame):
         `FixedLenFeature`.
       reader_cls: a subclass of `tensorflow.ReaderBase` that will be used to
         read the `Example`s.
-      num_epochs: the number of times that the reader should loop through all
-        the file names. If set to `None`, then the reader will continue
-        indefinitely.
       num_threads: the number of readers that will work in parallel.
       enqueue_size: block size for each read operation.
       batch_size: desired batch size.
@@ -454,7 +487,6 @@ class TensorFlowDataFrame(df.DataFrame):
         filenames,
         enqueue_size=enqueue_size,
         batch_size=batch_size,
-        num_epochs=num_epochs,
         queue_capacity=queue_capacity,
         shuffle=shuffle,
         min_after_dequeue=min_after_dequeue,
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transform.py b/tensorflow/contrib/learn/python/learn/dataframe/transform.py
index 745d556f929..bbb97d2f290 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/transform.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transform.py
@@ -223,13 +223,14 @@ class Transform(object):
     # pylint: disable=not-callable
     return self.return_type(*output_series)
 
-  def build_transitive(self, input_series, cache=None):
+  def build_transitive(self, input_series, cache=None, **kwargs):
     """Apply this `Transform` to the provided `Series`, producing 'Tensor's.
 
     Args:
       input_series: None, a `Series`, or a list of input `Series`, acting as
          positional arguments.
       cache: a dict from Series reprs to Tensors.
+      **kwargs: Additional keyword arguments, unused here.
 
     Returns:
       A namedtuple of the output Tensors.
@@ -244,7 +245,7 @@ class Transform(object):
     if len(input_series) != self.input_valency:
       raise ValueError("Expected %s input Series but received %s." %
                        (self.input_valency, len(input_series)))
-    input_tensors = [series.build(cache) for series in input_series]
+    input_tensors = [series.build(cache, **kwargs) for series in input_series]
 
     # Note we cache each output individually, not just the entire output
     # tuple.  This allows using the graph as the cache, since it can sensibly
@@ -254,7 +255,7 @@ class Transform(object):
     output_tensors = [cache.get(output_repr) for output_repr in output_reprs]
 
     if None in output_tensors:
-      result = self._apply_transform(input_tensors)
+      result = self._apply_transform(input_tensors, **kwargs)
       for output_name, output_repr in zip(self.output_names, output_reprs):
         cache[output_repr] = getattr(result, output_name)
     else:
@@ -264,12 +265,13 @@ class Transform(object):
     return result
 
   @abstractmethod
-  def _apply_transform(self, input_tensors):
+  def _apply_transform(self, input_tensors, **kwargs):
     """Applies the transformation to the `transform_input`.
 
     Args:
-        input_tensors: a list of Tensors representing the input to
+      input_tensors: a list of Tensors representing the input to
         the Transform.
+      **kwargs: Additional keyword arguments, unused here.
 
     Returns:
         A namedtuple of Tensors representing the transformed output.
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/batch.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/batch.py
index 352a028ee33..cf1585634ca 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/batch.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/batch.py
@@ -72,7 +72,7 @@ class Batch(AbstractBatchTransform):
   def name(self):
     return "Batch"
 
-  def _apply_transform(self, transform_input):
+  def _apply_transform(self, transform_input, **kwargs):
     batched = input_ops.batch(transform_input,
                               batch_size=self.batch_size,
                               num_threads=self.num_threads,
@@ -121,7 +121,7 @@ class ShuffleBatch(AbstractBatchTransform):
   def seed(self):
     return self._seed
 
-  def _apply_transform(self, transform_input):
+  def _apply_transform(self, transform_input, **kwargs):
     batched = input_ops.shuffle_batch(transform_input,
                                       batch_size=self.batch_size,
                                       capacity=self.queue_capacity,
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/binary_transforms.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/binary_transforms.py
index 7d46fb6d05e..78a21250c9c 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/binary_transforms.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/binary_transforms.py
@@ -1,4 +1,4 @@
-# Copyright 2016 Google Inc. All Rights Reserved.
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -53,7 +53,7 @@ class SeriesBinaryTransform(transform.Transform):
   def _output_names(self):
     return "output",
 
-  def _apply_transform(self, input_tensors):
+  def _apply_transform(self, input_tensors, **kwargs):
     # TODO(jamieas): consider supporting sparse inputs.
     if isinstance(input_tensors[0], ops.SparseTensor) or isinstance(
         input_tensors[1], ops.SparseTensor):
@@ -87,7 +87,7 @@ class ScalarBinaryTransform(transform.Transform):
   def _output_names(self):
     return "output",
 
-  def _apply_transform(self, input_tensors):
+  def _apply_transform(self, input_tensors, **kwargs):
     input_tensor = input_tensors[0]
     if isinstance(input_tensor, ops.SparseTensor):
       result = ops.SparseTensor(input_tensor.indices,
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/boolean_mask.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/boolean_mask.py
index f572cf137f7..eb5a8edbfb6 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/boolean_mask.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/boolean_mask.py
@@ -77,18 +77,21 @@ class BooleanMask(transform.Transform):
   def _output_names(self):
     return "output",
 
-  def _apply_transform(self, input_tensors):
+  def _apply_transform(self, input_tensors, **kwargs):
     """Applies the transformation to the `transform_input`.
 
     Args:
-        input_tensors: a list of Tensors representing the input to
+      input_tensors: a list of Tensors representing the input to
         the Transform.
+      **kwargs: Additional keyword arguments, unused here.
 
     Returns:
         A namedtuple of Tensors representing the transformed output.
     """
     input_tensor = input_tensors[0]
     mask = input_tensors[1]
+    if mask.get_shape().ndims > 1:
+      mask = array_ops.squeeze(mask)
 
     if isinstance(input_tensor, ops.SparseTensor):
       mask_fn = sparse_boolean_mask
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/csv_parser.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/csv_parser.py
index caa83f5a966..d78b5652d6e 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/csv_parser.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/csv_parser.py
@@ -58,7 +58,7 @@ class CSVParser(transform.Transform):
   def default_values(self):
     return self._default_values
 
-  def _apply_transform(self, input_tensors):
+  def _apply_transform(self, input_tensors, **kwargs):
     default_consts = [constant_op.constant(d, shape=[1])
                       for d in self._default_values]
     parsed_values = parsing_ops.decode_csv(input_tensors[0],
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/densify.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/densify.py
index 2f389153178..0f0c1a08911 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/densify.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/densify.py
@@ -47,12 +47,13 @@ class Densify(transform.Transform):
   def _output_names(self):
     return "output",
 
-  def _apply_transform(self, input_tensors):
+  def _apply_transform(self, input_tensors, **kwargs):
     """Applies the transformation to the `transform_input`.
 
     Args:
-        input_tensors: a list of Tensors representing the input to
+      input_tensors: a list of Tensors representing the input to
         the Transform.
+      **kwargs: Additional keyword arguments, unused here.
 
     Returns:
         A namedtuple of Tensors representing the transformed output.
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/difference.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/difference.py
index d4e6c10094b..b585fceeb63 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/difference.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/difference.py
@@ -1,4 +1,4 @@
-# Copyright 2016 Google Inc. All Rights Reserved.
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -50,7 +50,7 @@ class Difference(transform.Transform):
   def _output_names(self):
     return "output",
 
-  def _apply_transform(self, input_tensors):
+  def _apply_transform(self, input_tensors, **kwargs):
     pair_sparsity = (isinstance(input_tensors[0], ops.SparseTensor),
                      isinstance(input_tensors[1], ops.SparseTensor))
 
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/example_parser.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/example_parser.py
index e22ef740ed9..c2c5e0cbed5 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/example_parser.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/example_parser.py
@@ -61,7 +61,7 @@ class ExampleParser(transform.Transform):
   def feature_definitions(self):
     return self._ordered_features
 
-  def _apply_transform(self, input_tensors):
+  def _apply_transform(self, input_tensors, **kwargs):
     parsed_values = parsing_ops.parse_example(input_tensors[0],
                                               features=self._ordered_features)
     # pylint: disable=not-callable
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/hashes.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/hashes.py
new file mode 100644
index 00000000000..325e7827ce2
--- /dev/null
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/hashes.py
@@ -0,0 +1,68 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Masks one `Series` based on the content of another `Series`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.learn.python.learn.dataframe import transform
+from tensorflow.python.ops import string_ops
+
+
+class HashFast(transform.Transform):
+  """Perform a fast hash of a `Series`."""
+
+  def __init__(self, num_buckets):
+    """Initialize `CSVParser`.
+
+    Args:
+      num_buckets: The number of hash buckets to use.
+    """
+    # TODO(soergel): allow seed?
+    super(HashFast, self).__init__()
+    self._num_buckets = num_buckets
+
+  @property
+  def name(self):
+    return "HashFast"
+
+  @property
+  def input_valency(self):
+    return 1
+
+  @property
+  def _output_names(self):
+    return "output",
+
+  def _apply_transform(self, input_tensors, **kwargs):
+    """Applies the transformation to the `transform_input`.
+
+    Args:
+      input_tensors: a list of Tensors representing the input to
+        the Transform.
+      **kwargs: additional keyword arguments, unused here.
+
+    Returns:
+        A namedtuple of Tensors representing the transformed output.
+    """
+    result = string_ops.string_to_hash_bucket_fast(input_tensors[0],
+                                                   self._num_buckets,
+                                                   name=None)
+    # pylint: disable=not-callable
+    return self.return_type(result)
+
+
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/in_memory_source.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/in_memory_source.py
index 97453c30325..d96d53468a5 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/in_memory_source.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/in_memory_source.py
@@ -89,7 +89,7 @@ class BaseInMemorySource(transform.Transform):
   def input_valency(self):
     return 0
 
-  def _apply_transform(self, transform_input):
+  def _apply_transform(self, transform_input, **kwargs):
     queue = feeding_functions.enqueue_data(self.data,
                                            self.queue_capacity,
                                            self.shuffle,
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/reader_source.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/reader_source.py
index 23556c40657..ddb2d321d1c 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/reader_source.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/reader_source.py
@@ -32,7 +32,6 @@ class ReaderSource(transform.Transform):
                reader_kwargs=None,
                enqueue_size=None,
                batch_size=1,
-               num_epochs=None,
                queue_capacity=None,
                shuffle=False,
                min_after_dequeue=None,
@@ -49,9 +48,6 @@ class ReaderSource(transform.Transform):
         is constructed.
       enqueue_size: block size for each read operation.
       batch_size: The desired batch size of output. Defaults to 1.
-      num_epochs: the number of times that the reader should loop through all
-        the file names. If set to `None`, then the reader will continue
-        indefinitely.
       queue_capacity: Capacity of the queue. Defaults to 10 * `batch_size`.
       shuffle: Whether records will be shuffled before returning. Defaults to
         false.
@@ -73,7 +69,6 @@ class ReaderSource(transform.Transform):
     self._batch_size = batch_size
     self._queue_capacity = (batch_size * 10 if queue_capacity is None else
                             queue_capacity)
-    self._num_epochs = num_epochs
     self._shuffle = shuffle
     self._min_after_dequeue = int(self.queue_capacity / 4 if min_after_dequeue
                                   is None else min_after_dequeue)
@@ -100,10 +95,6 @@ class ReaderSource(transform.Transform):
   def batch_size(self):
     return self._batch_size
 
-  @transform.parameter
-  def num_epochs(self):
-    return self._num_epochs
-
   @transform.parameter
   def queue_capacity(self):
     return self._queue_capacity
@@ -136,11 +127,12 @@ class ReaderSource(transform.Transform):
   def _output_names(self):
     return ("index", "value")
 
-  def _apply_transform(self, transform_input):
-    filename_queue = input_ops.string_input_producer(self.work_units,
-                                                     num_epochs=self.num_epochs,
-                                                     shuffle=self.shuffle,
-                                                     seed=self.seed)
+  def _apply_transform(self, transform_input, **kwargs):
+    filename_queue = input_ops.string_input_producer(
+        self.work_units,
+        num_epochs=kwargs.get("num_epochs"),
+        shuffle=self.shuffle,
+        seed=self.seed)
     reader_ops = []
     for _ in range(self.num_threads):
       reader = self._reader_cls(**self._reader_kwargs)
@@ -174,7 +166,6 @@ def TextFileSource(file_names,
                    reader_kwargs=None,
                    enqueue_size=1,
                    batch_size=1,
-                   num_epochs=None,
                    queue_capacity=None,
                    shuffle=False,
                    min_after_dequeue=None,
@@ -185,7 +176,6 @@ def TextFileSource(file_names,
                       reader_kwargs=reader_kwargs,
                       enqueue_size=enqueue_size,
                       batch_size=batch_size,
-                      num_epochs=num_epochs,
                       queue_capacity=queue_capacity,
                       shuffle=shuffle,
                       min_after_dequeue=min_after_dequeue,
@@ -197,7 +187,6 @@ def TFRecordSource(file_names,
                    reader_kwargs=None,
                    enqueue_size=1,
                    batch_size=1,
-                   num_epochs=None,
                    queue_capacity=None,
                    shuffle=False,
                    min_after_dequeue=None,
@@ -208,7 +197,6 @@ def TFRecordSource(file_names,
                       reader_kwargs=reader_kwargs,
                       enqueue_size=enqueue_size,
                       batch_size=batch_size,
-                      num_epochs=num_epochs,
                       queue_capacity=queue_capacity,
                       shuffle=shuffle,
                       min_after_dequeue=min_after_dequeue,
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/sparsify.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/sparsify.py
index 552012ea330..f3447c5d940 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/sparsify.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/sparsify.py
@@ -52,12 +52,13 @@ class Sparsify(transform.Transform):
   def _output_names(self):
     return "output",
 
-  def _apply_transform(self, input_tensors):
+  def _apply_transform(self, input_tensors, **kwargs):
     """Applies the transformation to the `transform_input`.
 
     Args:
-        input_tensors: a list of Tensors representing the input to
+      input_tensors: a list of Tensors representing the input to
         the Transform.
+      **kwargs: Additional keyword arguments, unused here.
 
     Returns:
         A namedtuple of Tensors representing the transformed output.
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/sum.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/sum.py
index 6b04166e09c..878b08f4b0a 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/sum.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/sum.py
@@ -1,4 +1,4 @@
-# Copyright 2016 Google Inc. All Rights Reserved.
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -44,7 +44,7 @@ class Sum(transform.Transform):
   def _output_names(self):
     return "output",
 
-  def _apply_transform(self, input_tensors):
+  def _apply_transform(self, input_tensors, **kwargs):
     pair_sparsity = (isinstance(input_tensors[0], ops.SparseTensor),
                      isinstance(input_tensors[1], ops.SparseTensor))
 
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/unary_transforms.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/unary_transforms.py
index 3fd8c2a6a90..7f9eb7ce1da 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/unary_transforms.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/unary_transforms.py
@@ -1,4 +1,4 @@
-# Copyright 2016 Google Inc. All Rights Reserved.
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -43,7 +43,8 @@ UNARY_TRANSFORMS = [("__neg__", math_ops.neg),
                     ("lgamma", math_ops.lgamma),
                     ("digamma", math_ops.digamma),
                     ("erf", math_ops.erf),
-                    ("erfc", math_ops.erfc)]
+                    ("erfc", math_ops.erfc),
+                    ("__invert__", math_ops.logical_not, bool)]
 
 DOC_FORMAT_STRING = (
     "A `Transform` that wraps the `{0}` operation. "
@@ -52,7 +53,7 @@ DOC_FORMAT_STRING = (
 
 
 # pylint: disable=unused-argument
-def register_unary_op(registered_name, operation):
+def register_unary_op(registered_name, operation, ignore_dtype=None):
   """Creates a `Transform` that wraps a unary tensorflow operation.
 
   If `registered_name` is specified, the `Transform` is registered as a member
@@ -62,6 +63,8 @@ def register_unary_op(registered_name, operation):
     registered_name: the name of the member function of `Series` corresponding
       to the returned `Transform`.
     operation: a unary TensorFlow operation.
+    ignore_dtype: an optional dtype, not used here but needed for symmetry with
+      test.
   """
 
   doc = DOC_FORMAT_STRING.format(operation.__name__, operation.__doc__)
@@ -78,7 +81,7 @@ def register_unary_op(registered_name, operation):
   def _output_names(self):
     return "output"
 
-  def _apply_transform(self, input_tensors):
+  def _apply_transform(self, input_tensors, **kwargs):
     input_tensor = input_tensors[0]
     if isinstance(input_tensor, ops.SparseTensor):
       result = ops.SparseTensor(input_tensor.indices,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/classifier_test.py b/tensorflow/contrib/learn/python/learn/estimators/classifier_test.py
index d278c9e0af0..5ce6c4878b0 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/classifier_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/classifier_test.py
@@ -29,14 +29,10 @@ from tensorflow.contrib.learn.python.learn.estimators import _sklearn
 
 def iris_input_fn(num_epochs=None):
   iris = tf.contrib.learn.datasets.load_iris()
-  features = tf.cast(
-      tf.reshape(
-          tf.constant(iris.data), [-1, 4]), tf.float32)
+  features = tf.reshape(tf.constant(iris.data), [-1, 4])
   if num_epochs:
     features = tf.train.limit_epochs(features, num_epochs=num_epochs)
-  target = tf.cast(
-      tf.reshape(
-          tf.constant(iris.target), [-1]), tf.int64)
+  target = tf.reshape(tf.constant(iris.target), [-1])
   return features, target
 
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/composable_model.py b/tensorflow/contrib/learn/python/learn/estimators/composable_model.py
index 20cafc4cb4a..f47ae184558 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/composable_model.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/composable_model.py
@@ -20,11 +20,13 @@ from __future__ import division
 from __future__ import print_function
 
 import math
+import re
 
 import six
 
 from tensorflow.contrib import layers
 from tensorflow.contrib.layers.python.layers import feature_column_ops
+from tensorflow.contrib.learn.python.learn.utils import checkpoints
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gradients
@@ -47,31 +49,31 @@ class _ComposableModel(object):
   def __init__(self,
                num_label_columns,
                optimizer,
-               weight_collection_name,
                gradient_clip_norm,
-               num_ps_replicas):
+               num_ps_replicas,
+               scope):
     """Common initialization for all _ComposableModel objects.
 
     Args:
       num_label_columns: The number of label/target columns.
       optimizer: An instance of `tf.Optimizer` used to apply gradients to
         the model. If `None`, will use a FTRL optimizer.
-      weight_collection_name: A string defining the name to use for the
-        collection of weights (e.g. 'dnn').
       gradient_clip_norm: A float > 0. If provided, gradients are clipped
         to their global norm with this clipping ratio. See
         tf.clip_by_global_norm for more details.
       num_ps_replicas: The number of parameter server replicas.
+      scope: Scope for variables created in this model.
     """
     self._num_label_columns = num_label_columns
     self._optimizer = optimizer
-    self._weight_collection_name = weight_collection_name
     self._gradient_clip_norm = gradient_clip_norm
     self._num_ps_replicas = num_ps_replicas
+    self._scope = scope
     self._feature_columns = None
 
-  def get_weight_collection_name(self):
-    return self._weight_collection_name
+  def get_scope_name(self):
+    """Returns the scope name used by this model for variables."""
+    return self._scope
 
   def build_model(self, features, feature_columns, is_training):
     """Builds the model that can calculate the logits.
@@ -114,7 +116,7 @@ class _ComposableModel(object):
 
   def _get_vars(self):
     if self._get_feature_columns():
-      return ops.get_collection(self._weight_collection_name)
+      return ops.get_collection(self._scope)
     return []
 
   def _get_optimizer(self):
@@ -142,7 +144,8 @@ class LinearComposableModel(_ComposableModel):
                num_label_columns,
                optimizer=None,
                gradient_clip_norm=None,
-               num_ps_replicas=0):
+               num_ps_replicas=0,
+               scope=None):
     """Initializes LinearComposableModel objects.
 
     Args:
@@ -153,13 +156,49 @@ class LinearComposableModel(_ComposableModel):
         to their global norm with this clipping ratio. See
         tf.clip_by_global_norm for more details.
       num_ps_replicas: The number of parameter server replicas.
+      scope: Optional scope for variables created in this model. If scope
+        is not supplied, it will default to 'linear'.
     """
+    scope = "linear" if not scope else scope
     super(LinearComposableModel, self).__init__(
         num_label_columns=num_label_columns,
         optimizer=optimizer,
-        weight_collection_name="linear",
         gradient_clip_norm=gradient_clip_norm,
-        num_ps_replicas=num_ps_replicas)
+        num_ps_replicas=num_ps_replicas,
+        scope=scope)
+
+  def get_weights(self, model_dir):
+    """Returns weights per feature of the linear part.
+
+    Args:
+      model_dir: Directory where model parameters, graph and etc. are saved.
+
+    Returns:
+      The weights created by this model (without the optimizer weights).
+    """
+    all_variables = [name for name, _ in checkpoints.list_variables(model_dir)]
+    values = {}
+    optimizer_regex = r".*/" + self._get_optimizer().get_name() + r"(_\d)?$"
+    for name in all_variables:
+      if (name.startswith(self._scope + "/") and
+          name != self._scope + "/bias_weight" and
+          not re.match(optimizer_regex, name)):
+        values[name] = checkpoints.load_variable(model_dir, name)
+    if len(values) == 1:
+      return values[list(values.keys())[0]]
+    return values
+
+  def get_bias(self, model_dir):
+    """Returns bias of the model.
+
+    Args:
+      model_dir: Directory where model parameters, graph and etc. are saved.
+
+    Returns:
+      The bias weights created by this model.
+    """
+    return checkpoints.load_variable(model_dir,
+                                     name=(self._scope+"/bias_weight"))
 
   def build_model(self, features, feature_columns, is_training):
     """See base class."""
@@ -168,12 +207,12 @@ class LinearComposableModel(_ComposableModel):
         max_partitions=self._num_ps_replicas,
         min_slice_size=64 << 20)
     with variable_scope.variable_op_scope(
-        features.values(), "linear", partitioner=partitioner) as scope:
+        features.values(), self._scope, partitioner=partitioner) as scope:
       logits, _, _ = layers.weighted_sum_from_feature_columns(
           columns_to_tensors=features,
           feature_columns=self._get_feature_columns(),
           num_outputs=self._num_label_columns,
-          weight_collections=[self._weight_collection_name],
+          weight_collections=[self._scope],
           scope=scope)
     return logits
 
@@ -200,7 +239,8 @@ class DNNComposableModel(_ComposableModel):
                activation_fn=nn.relu,
                dropout=None,
                gradient_clip_norm=None,
-               num_ps_replicas=0):
+               num_ps_replicas=0,
+               scope=None):
     """Initializes DNNComposableModel objects.
 
     Args:
@@ -217,17 +257,50 @@ class DNNComposableModel(_ComposableModel):
         to their global norm with this clipping ratio. See
         tf.clip_by_global_norm for more details.
       num_ps_replicas: The number of parameter server replicas.
+      scope: Optional scope for variables created in this model. If not scope
+        is supplied, one is generated.
     """
+    scope = "dnn" if not scope else scope
     super(DNNComposableModel, self).__init__(
         num_label_columns=num_label_columns,
         optimizer=optimizer,
-        weight_collection_name="DNN",
         gradient_clip_norm=gradient_clip_norm,
-        num_ps_replicas=num_ps_replicas)
+        num_ps_replicas=num_ps_replicas,
+        scope=scope)
     self._hidden_units = hidden_units
     self._activation_fn = activation_fn
     self._dropout = dropout
 
+  def get_weights(self, model_dir):
+    """Returns the weights of the model.
+
+    Args:
+      model_dir: Directory where model parameters, graph and etc. are saved.
+
+    Returns:
+      The weights created by this model.
+    """
+    return [checkpoints.load_variable(
+        model_dir, name=(self._scope+"/hiddenlayer_%d/weights" % i))
+            for i, _ in enumerate(self._hidden_units)] + [
+                checkpoints.load_variable(
+                    model_dir, name=(self._scope+"/logits/weights"))]
+
+  def get_bias(self, model_dir):
+    """Returns the bias of the model.
+
+    Args:
+      model_dir: Directory where model parameters, graph and etc. are saved.
+
+    Returns:
+      The bias weights created by this model.
+    """
+    return [checkpoints.load_variable(
+        model_dir, name=(self._scope+"/hiddenlayer_%d/biases" % i))
+            for i, _ in enumerate(self._hidden_units)] + [
+                checkpoints.load_variable(
+                    model_dir, name=(self._scope+"/logits/biases"))]
+
   def _add_hidden_layer_summary(self, value, tag):
     # TODO(zakaria): Move this code to tf.learn and add test.
     logging_ops.scalar_summary("%s:fraction_of_zero_values" % tag,
@@ -244,12 +317,12 @@ class DNNComposableModel(_ComposableModel):
             min_slice_size=64 << 20))
     with variable_scope.variable_op_scope(
         features.values(),
-        "input_from_feature_columns",
+        self._scope + "/input_from_feature_columns",
         partitioner=input_layer_partitioner) as scope:
       net = layers.input_from_feature_columns(
           features,
           self._get_feature_columns(),
-          weight_collections=[self._weight_collection_name],
+          weight_collections=[self._scope],
           scope=scope)
 
     hidden_layer_partitioner = (
@@ -257,13 +330,13 @@ class DNNComposableModel(_ComposableModel):
             max_partitions=self._num_ps_replicas))
     for layer_id, num_hidden_units in enumerate(self._hidden_units):
       with variable_scope.variable_op_scope(
-          [net], "hiddenlayer_%d" % layer_id,
+          [net], self._scope + "/hiddenlayer_%d" % layer_id,
           partitioner=hidden_layer_partitioner) as scope:
         net = layers.fully_connected(
             net,
             num_hidden_units,
             activation_fn=self._activation_fn,
-            variables_collections=[self._weight_collection_name],
+            variables_collections=[self._scope],
             scope=scope)
         if self._dropout is not None and is_training:
           net = layers.dropout(
@@ -272,15 +345,15 @@ class DNNComposableModel(_ComposableModel):
       self._add_hidden_layer_summary(net, scope.name)
 
     with variable_scope.variable_op_scope(
-        [net], "dnn_logits",
+        [net], self._scope + "/logits",
         partitioner=hidden_layer_partitioner) as scope:
       logits = layers.fully_connected(
           net,
           self._num_label_columns,
           activation_fn=None,
-          variables_collections=[self._weight_collection_name],
+          variables_collections=[self._scope],
           scope=scope)
-    self._add_hidden_layer_summary(logits, "dnn_logits")
+    self._add_hidden_layer_summary(logits, "logits")
     return logits
 
   def _get_default_optimizer(self, optimizer_name=None):
diff --git a/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py b/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py
index df7be73a164..57e6a455852 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py
@@ -19,6 +19,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import tempfile
+
 import tensorflow as tf
 
 from tensorflow.contrib import layers
@@ -42,7 +44,7 @@ class _BaseEstimatorForTest(estimator.BaseEstimator):
   def __init__(self,
                target_column,
                feature_columns):
-    super(_BaseEstimatorForTest, self).__init__()
+    super(_BaseEstimatorForTest, self).__init__(model_dir=tempfile.mkdtemp())
     self._target_column = target_column
     self._feature_columns = feature_columns
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
index 3d7ae1e380b..79a45161e7b 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
@@ -71,9 +71,9 @@ class _DNNLinearCombinedBaseEstimator(estimator.BaseEstimator):
 
     Args:
       target_column: A _TargetColumn object.
-      model_dir: Directory to save model parameters, graph and etc. This can also
-        be used to load checkpoints from the directory into a estimator to continue
-        training a previously saved model.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
       linear_feature_columns: An iterable containing all the feature columns
         used by linear part of the model. All items in the set should be
         instances of classes derived from `FeatureColumn`.
@@ -102,8 +102,8 @@ class _DNNLinearCombinedBaseEstimator(estimator.BaseEstimator):
       ValueError: If both linear_feature_columns and dnn_features_columns are
         empty at the same time.
     """
-    super(_DNNLinearCombinedBaseEstimator, self).__init__(model_dir=model_dir,
-                                                          config=config)
+    super(_DNNLinearCombinedBaseEstimator, self).__init__(
+        model_dir=model_dir, config=config)
 
     num_ps_replicas = config.num_ps_replicas if config else 0
 
@@ -124,8 +124,6 @@ class _DNNLinearCombinedBaseEstimator(estimator.BaseEstimator):
 
     self._linear_feature_columns = linear_feature_columns
     self._linear_optimizer = linear_optimizer
-    self._linear_weight_collection = (
-        self._linear_model.get_weight_collection_name())
     self._dnn_feature_columns = dnn_feature_columns
     self._dnn_hidden_units = dnn_hidden_units
     self._centered_bias_weight_collection = "centered_bias"
@@ -135,38 +133,24 @@ class _DNNLinearCombinedBaseEstimator(estimator.BaseEstimator):
   @property
   def linear_weights_(self):
     """Returns weights per feature of the linear part."""
-    all_variables = self.get_variable_names()
-    # TODO(ispir): Figure out a better way to retrieve variables for features.
-    # for example using feature info / columns.
-    values = {}
-    for name in all_variables:
-      if (name.startswith("linear/") and name.rfind("/") == 6 and
-          name != "linear/bias_weight"):
-        values[name] = self.get_variable_value(name)
-    if len(values) == 1:
-      return values[list(values.keys())[0]]
-    return values
+    return self._linear_model.get_weights(model_dir=self._model_dir)
 
   @property
   def linear_bias_(self):
     """Returns bias of the linear part."""
-    return (self.get_variable_value("linear/bias_weight") +
+    return (self._linear_model.get_bias(model_dir=self._model_dir) +
             self.get_variable_value("centered_bias_weight"))
 
   @property
   def dnn_weights_(self):
     """Returns weights of deep neural network part."""
-    return [self.get_variable_value("hiddenlayer_%d/weights" % i)
-            for i, _ in enumerate(self._dnn_hidden_units)] + [
-                self.get_variable_value("dnn_logits/weights")]
+    return self._dnn_model.get_weights(model_dir=self._model_dir)
 
   @property
   def dnn_bias_(self):
     """Returns bias of deep neural network part."""
-    return [self.get_variable_value("hiddenlayer_%d/biases" % i)
-            for i, _ in enumerate(self._dnn_hidden_units)] + [
-                self.get_variable_value("dnn_logits/biases"),
-                self.get_variable_value("centered_bias_weight")]
+    return (self._dnn_model.get_bias(model_dir=self._model_dir) +
+            [self.get_variable_value("centered_bias_weight")])
 
   def _get_feature_dict(self, features):
     if isinstance(features, dict):
@@ -347,9 +331,9 @@ class DNNLinearCombinedClassifier(_DNNLinearCombinedBaseEstimator):
     """Constructs a DNNLinearCombinedClassifier instance.
 
     Args:
-      model_dir: Directory to save model parameters, graph and etc. This can also
-        be used to load checkpoints from the directory into a estimator to continue
-        training a previously saved model.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
       n_classes: number of target classes. Default is binary classification.
       weight_column_name: A string defining feature column name representing
         weights. It is used to down weight or boost examples during training.
@@ -532,9 +516,9 @@ class DNNLinearCombinedRegressor(_DNNLinearCombinedBaseEstimator):
     """Initializes a DNNLinearCombinedRegressor instance.
 
     Args:
-      model_dir: Directory to save model parameters, graph and etc. This can also
-        be used to load checkpoints from the directory into a estimator to continue
-        training a previously saved model.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
       weight_column_name: A string defining feature column name representing
         weights. It is used to down weight or boost examples during training. It
         will be multiplied by the loss of the example.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
index 7cfb2b68b67..9ea6de7751e 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
@@ -23,6 +23,7 @@ import tempfile
 
 import numpy as np
 import tensorflow as tf
+
 from tensorflow.contrib.learn.python.learn.estimators import _sklearn
 
 
@@ -458,10 +459,39 @@ class DNNLinearCombinedClassifierTest(tf.test.TestCase):
     self.assertLess(loss2, 0.01)
     self.assertTrue('centered_bias_weight' in classifier.get_variable_names())
 
-    self.assertNotIn('dnn_logits/biases', classifier.get_variable_names())
-    self.assertNotIn('dnn_logits/weights', classifier.get_variable_names())
+    self.assertNotIn('dnn/logits/biases', classifier.get_variable_names())
+    self.assertNotIn('dnn/logits/weights', classifier.get_variable_names())
     self.assertEquals(1, len(classifier.linear_bias_))
-    self.assertEquals(100, len(classifier.linear_weights_))
+    self.assertEquals(2, len(classifier.linear_weights_))
+    self.assertEquals(1, len(classifier.linear_weights_['linear/age/weight']))
+    self.assertEquals(
+        100, len(classifier.linear_weights_['linear/language_weights']))
+
+  def testLinearOnlyOneFeature(self):
+    """Tests that linear-only instantiation works for one feature only."""
+    def input_fn():
+      return {
+          'language': tf.SparseTensor(values=['english'],
+                                      indices=[[0, 0]],
+                                      shape=[1, 1])
+      }, tf.constant([[1]])
+
+    language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 99)
+
+    classifier = tf.contrib.learn.DNNLinearCombinedClassifier(
+        linear_feature_columns=[language])
+    classifier.fit(input_fn=input_fn, steps=100)
+    loss1 = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
+    classifier.fit(input_fn=input_fn, steps=200)
+    loss2 = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
+    self.assertLess(loss2, loss1)
+    self.assertLess(loss2, 0.01)
+    self.assertTrue('centered_bias_weight' in classifier.get_variable_names())
+
+    self.assertNotIn('dnn/logits/biases', classifier.get_variable_names())
+    self.assertNotIn('dnn/logits/weights', classifier.get_variable_names())
+    self.assertEquals(1, len(classifier.linear_bias_))
+    self.assertEquals(99, len(classifier.linear_weights_))
 
   def testDNNOnly(self):
     """Tests that DNN-only instantiation works."""
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index 0e73e316a5f..1dd8baa94e4 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -31,7 +31,9 @@ import six
 
 from tensorflow.contrib import framework as contrib_framework
 from tensorflow.contrib import layers
+from tensorflow.contrib.learn.python.learn import evaluable
 from tensorflow.contrib.learn.python.learn import graph_actions
+from tensorflow.contrib.learn.python.learn import trainable
 from tensorflow.contrib.learn.python.learn.estimators import _sklearn as sklearn
 from tensorflow.contrib.learn.python.learn.estimators import run_config
 from tensorflow.contrib.learn.python.learn.estimators import tensor_signature
@@ -138,7 +140,8 @@ def _get_arguments(func):
     return _get_arguments(func.func)
 
 
-class BaseEstimator(sklearn.BaseEstimator):
+class BaseEstimator(
+    sklearn.BaseEstimator, evaluable.Evaluable, trainable.Trainable):
   """Abstract BaseEstimator class to train and evaluate TensorFlow models.
 
   Concrete implementation of this class should provide the following functions:
@@ -158,9 +161,9 @@ class BaseEstimator(sklearn.BaseEstimator):
     """Initializes a BaseEstimator instance.
 
     Args:
-      model_dir: Directory to save model parameters, graph and etc. This can also
-        be used to load checkpoints from the directory into a estimator to continue
-        training a previously saved model.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator to
+        continue training a previously saved model.
       config: A RunConfig instance.
     """
     # Model directory.
@@ -196,34 +199,8 @@ class BaseEstimator(sklearn.BaseEstimator):
 
   def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None,
           monitors=None, max_steps=None):
-    """Trains a model given training data `x` predictions and `y` targets.
-
-    Args:
-      x: Matrix of shape [n_samples, n_features...]. Can be iterator that
-         returns arrays of features. The training input samples for fitting the
-         model. If set, `input_fn` must be `None`.
-      y: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-         iterator that returns array of targets. The training target values
-         (class labels in classification, real numbers in regression). If set,
-         `input_fn` must be `None`.
-      input_fn: Input function. If set, `x`, `y`, and `batch_size` must be
-        `None`.
-      steps: Number of steps for which to train model. If `None`, train forever.
-        If set, `max_steps` must be `None`.
-      batch_size: minibatch size to use on the input, defaults to first
-        dimension of `x`. Must be `None` if `input_fn` is provided.
-      monitors: List of `BaseMonitor` subclass instances. Used for callbacks
-        inside the training loop.
-      max_steps: Number of total steps for which to train model. If `None`,
-        train forever. If set, `steps` must be `None`.
-
-        Two calls to `fit(steps=100)` means 200 training
-        iterations. On the other hand, two calls to `fit(max_steps=100)` means
-        that the second call will not do any iteration since first call did
-        all 100 steps.
-
-    Returns:
-      `self`, for chaining.
+    # pylint: disable=g-doc-args,g-doc-return-or-yield
+    """See `Trainable`.
 
     Raises:
       ValueError: If `x` or `y` are not `None` while `input_fn` is not `None`.
@@ -284,61 +261,11 @@ class BaseEstimator(sklearn.BaseEstimator):
     return self.fit(x=x, y=y, input_fn=input_fn, steps=steps,
                     batch_size=batch_size, monitors=monitors)
 
-  def evaluate(self,
-               x=None,
-               y=None,
-               input_fn=None,
-               feed_fn=None,
-               batch_size=None,
-               steps=None,
-               metrics=None,
-               name=None):
-    """Evaluates given model with provided evaluation data.
-
-    Evaluates on the given input data. If `input_fn` is provided, that
-    input function should raise an end-of-input exception (`OutOfRangeError` or
-    `StopIteration`) after one epoch of the training data has been provided.
-
-    By default, the whole evaluation dataset is used. If `steps` is provided,
-    only `steps` batches of size `batch_size` are processed.
-
-    The return value is a dict containing the metrics specified in `metrics`, as
-    well as an entry `global_step` which contains the value of the global step
-    for which this evaluation was performed.
-
-    Args:
-      x: Matrix of shape [n_samples, n_features...]. Can be iterator that
-         returns arrays of features. The training input samples for fitting the
-         model. If set, `input_fn` must be `None`.
-      y: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-         iterator that returns array of targets. The training target values
-         (class labels in classification, real numbers in regression). If set,
-         `input_fn` must be `None`.
-      input_fn: Input function. If set, `x`, `y`, and `batch_size` must be
-        `None`.
-      feed_fn: Function creating a feed dict every time it is called. Called
-        once per iteration.
-      batch_size: minibatch size to use on the input, defaults to first
-        dimension of `x`, if specified. Must be `None` if `input_fn` is
-        provided.
-      steps: Number of steps for which to evaluate model. If `None`, evaluate
-        until running tensors generated by `metrics` raises an exception.
-      metrics: Dict of metric ops to run. If `None`, the default metric
-        functions are used; if `{}`, no metrics are used. If model has one
-        output (i.e., returning single predction), keys are `str`, e.g.
-        `'accuracy'` - just a name of the metric that will show up in
-        the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-        `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-        the predictions to run this metric on.
-
-        Metric ops should support streaming, e.g., returning
-        update_op and value tensors. See more details in
-        ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-      name: Name of the evaluation if user needs to run multiple evaluations on
-        different data sets, such as on training data vs test data.
-
-    Returns:
-      Returns `dict` with evaluation results.
+  def evaluate(
+      self, x=None, y=None, input_fn=None, feed_fn=None, batch_size=None,
+      steps=None, metrics=None, name=None):
+    # pylint: disable=g-doc-args,g-doc-return-or-yield
+    """See `Evaluable`.
 
     Raises:
       ValueError: If at least one of `x` or `y` is provided, and at least one of
@@ -571,7 +498,7 @@ class BaseEstimator(sklearn.BaseEstimator):
           log_every_steps=log_every_steps,
           supervisor_is_chief=(self._config.task == 0),
           supervisor_master=self._config.master,
-          supervisor_save_model_steps=self._config.save_checkpoints_steps,
+          supervisor_save_model_secs=self._config.save_checkpoints_secs,
           keep_checkpoint_max=self._config.keep_checkpoint_max,
           feed_fn=feed_fn,
           steps=steps,
@@ -770,9 +697,9 @@ class Estimator(BaseEstimator):
                  is passed to Estimator in `params` parameter. This allows
                  to configure Estimators from hyper parameter tunning.
 
-      model_dir: Directory to save model parameters, graph and etc. This can also
-        be used to load checkpoints from the directory into a estimator to continue
-        training a previously saved model.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator to
+        continue training a previously saved model.
       config: Configuration object.
       params: `dict` of hyper parameters that will be passed into `model_fn`.
               Keys are names of parameters, values are basic python types.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
index dbb1b40a4bc..c3dca0451dc 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
@@ -36,32 +36,26 @@ _IRIS_INPUT_DIM = 4
 
 def boston_input_fn(num_epochs=None):
   boston = tf.contrib.learn.datasets.load_boston()
-  features = tf.cast(
-      tf.reshape(tf.constant(boston.data), [-1, _BOSTON_INPUT_DIM]), tf.float32)
+  features = tf.reshape(tf.constant(boston.data), [-1, _BOSTON_INPUT_DIM])
   if num_epochs:
     features = tf.train.limit_epochs(features, num_epochs=num_epochs)
-  target = tf.cast(
-      tf.reshape(tf.constant(boston.target), [-1, 1]), tf.float32)
+  target = tf.reshape(tf.constant(boston.target), [-1, 1])
   return features, target
 
 
 def iris_input_fn():
   iris = tf.contrib.learn.datasets.load_iris()
-  features = tf.cast(
-      tf.reshape(tf.constant(iris.data), [-1, _IRIS_INPUT_DIM]), tf.float32)
-  target = tf.cast(
-      tf.reshape(tf.constant(iris.target), [-1]), tf.int32)
+  features = tf.reshape(tf.constant(iris.data), [-1, _IRIS_INPUT_DIM])
+  target = tf.reshape(tf.constant(iris.target), [-1])
   return features, target
 
 
 def boston_eval_fn():
   boston = tf.contrib.learn.datasets.load_boston()
   n_examples = len(boston.target)
-  features = tf.cast(
-      tf.reshape(tf.constant(boston.data), [n_examples, _BOSTON_INPUT_DIM]),
-      tf.float32)
-  target = tf.cast(
-      tf.reshape(tf.constant(boston.target), [n_examples, 1]), tf.float32)
+  features = tf.reshape(
+      tf.constant(boston.data), [n_examples, _BOSTON_INPUT_DIM])
+  target = tf.reshape(tf.constant(boston.target), [n_examples, 1])
   return tf.concat(0, [features, features]), tf.concat(0, [target, target])
 
 
@@ -188,7 +182,7 @@ class EstimatorTest(tf.test.TestCase):
     with self.assertRaises(tf.contrib.learn.NotFittedError):
       _ = est.evaluate(
           x=boston.data,
-          y=boston.target.astype(np.float32))
+          y=boston.target.astype(np.float64))
     with self.assertRaises(tf.contrib.learn.NotFittedError):
       est.predict(x=boston.data)
 
@@ -197,10 +191,11 @@ class EstimatorTest(tf.test.TestCase):
     output_dir = tempfile.mkdtemp()
     est = tf.contrib.learn.Estimator(model_fn=linear_model_fn,
                                      model_dir=output_dir)
-    est.fit(x=boston.data, y=boston.target.astype(np.float32), steps=50)
+    float64_target = boston.target.astype(np.float64)
+    est.fit(x=boston.data, y=float64_target, steps=50)
     scores = est.evaluate(
         x=boston.data,
-        y=boston.target.astype(np.float32),
+        y=float64_target,
         metrics={'MSE': tf.contrib.metrics.streaming_mean_squared_error})
     del est
     # Create another estimator object with the same output dir.
@@ -210,19 +205,19 @@ class EstimatorTest(tf.test.TestCase):
     # Check we can evaluate and predict.
     scores2 = est2.evaluate(
         x=boston.data,
-        y=boston.target.astype(np.float32),
+        y=float64_target,
         metrics={'MSE': tf.contrib.metrics.streaming_mean_squared_error})
     self.assertAllClose(scores2['MSE'],
                         scores['MSE'])
     predictions = est2.predict(x=boston.data)
-    other_score = _sklearn.mean_squared_error(predictions, boston.target)
+    other_score = _sklearn.mean_squared_error(predictions, float64_target)
     self.assertAllClose(other_score, scores['MSE'])
 
     # Check we can keep training.
-    est2.fit(x=boston.data, y=boston.target.astype(np.float32), steps=100)
+    est2.fit(x=boston.data, y=float64_target, steps=100)
     scores3 = est2.evaluate(
         x=boston.data,
-        y=boston.target.astype(np.float32),
+        y=float64_target,
         metrics={'MSE': tf.contrib.metrics.streaming_mean_squared_error})
     self.assertLess(scores3['MSE'], scores['MSE'])
 
@@ -230,15 +225,16 @@ class EstimatorTest(tf.test.TestCase):
     boston = tf.contrib.learn.datasets.load_boston()
     est = tf.contrib.learn.Estimator(model_fn=linear_model_params_fn,
                                      params={'learning_rate': 0.01})
-    est.fit(x=boston.data, y=boston.target.astype(np.float32), steps=100)
+    est.fit(x=boston.data, y=boston.target, steps=100)
 
   def testBostonAll(self):
     boston = tf.contrib.learn.datasets.load_boston()
     est = tf.contrib.learn.Estimator(model_fn=linear_model_fn)
-    est.fit(x=boston.data, y=boston.target.astype(np.float32), steps=100)
+    float64_target = boston.target.astype(np.float64)
+    est.fit(x=boston.data, y=float64_target, steps=100)
     scores = est.evaluate(
         x=boston.data,
-        y=boston.target.astype(np.float32),
+        y=float64_target,
         metrics={'MSE': tf.contrib.metrics.streaming_mean_squared_error})
     predictions = est.predict(x=boston.data)
     other_score = _sklearn.mean_squared_error(predictions, boston.target)
@@ -277,7 +273,7 @@ class EstimatorTest(tf.test.TestCase):
     iris = tf.contrib.learn.datasets.load_iris()
     est = tf.contrib.learn.Estimator(model_fn=logistic_model_no_mode_fn)
     x_iter = itertools.islice(iris.data, 100)
-    y_iter = itertools.islice(np.int32(iris.target), 100)
+    y_iter = itertools.islice(iris.target, 100)
     est.fit(x_iter, y_iter, steps=100)
     _ = est.evaluate(input_fn=iris_input_fn, steps=1)
     predictions = est.predict(x=iris.data)['class']
@@ -374,19 +370,16 @@ class InferRealValuedColumnsTest(tf.test.TestCase):
         '': tf.FixedLenFeature(shape=expected_shape, dtype=expected_dtype)
     }, feature_column.config)
 
-  # Note: See tf.contrib.learn.io.data_feeder for why int32 converts to float32.
   def testInt32Input(self):
     feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
         np.ones(shape=[7, 8], dtype=np.int32))
-    self._assert_single_feature_column([8], tf.float32, feature_columns)
+    self._assert_single_feature_column([8], tf.int32, feature_columns)
 
   def testInt32InputFn(self):
     feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input_fn(
         lambda: (tf.ones(shape=[7, 8], dtype=tf.int32), None))
     self._assert_single_feature_column([8], tf.int32, feature_columns)
 
-  # Note: See tf.contrib.learn.io.data_feeder for why int64 doesn't convert to
-  # float64.
   def testInt64Input(self):
     feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
         np.ones(shape=[7, 8], dtype=np.int64))
@@ -407,12 +400,10 @@ class InferRealValuedColumnsTest(tf.test.TestCase):
         lambda: (tf.ones(shape=[7, 8], dtype=tf.float32), None))
     self._assert_single_feature_column([8], tf.float32, feature_columns)
 
-  # Note: See tf.contrib.learn.io.data_feeder for why float64 converts to
-  # float32.
   def testFloat64Input(self):
     feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
         np.ones(shape=[7, 8], dtype=np.float64))
-    self._assert_single_feature_column([8], tf.float32, feature_columns)
+    self._assert_single_feature_column([8], tf.float64, feature_columns)
 
   def testFloat64InputFn(self):
     feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input_fn(
@@ -420,9 +411,10 @@ class InferRealValuedColumnsTest(tf.test.TestCase):
     self._assert_single_feature_column([8], tf.float64, feature_columns)
 
   def testBoolInput(self):
-    feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
-        np.array([[False for _ in xrange(8)] for _ in xrange(7)]))
-    self._assert_single_feature_column([8], tf.float32, feature_columns)
+    with self.assertRaisesRegexp(
+        ValueError, 'on integer or non floating types are not supported'):
+      tf.contrib.learn.infer_real_valued_columns_from_input(
+          np.array([[False for _ in xrange(8)] for _ in xrange(7)]))
 
   def testBoolInputFn(self):
     with self.assertRaisesRegexp(
@@ -431,18 +423,12 @@ class InferRealValuedColumnsTest(tf.test.TestCase):
       tf.contrib.learn.infer_real_valued_columns_from_input_fn(
           lambda: (tf.constant(False, shape=[7, 8], dtype=tf.bool), None))
 
-  def testInvalidStringInput(self):
-    # pylint: disable=g-long-lambda
-    with self.assertRaisesRegexp(
-        ValueError, 'could not convert string to float'):
-      tf.contrib.learn.infer_real_valued_columns_from_input(
-          np.array([['foo%d' % i for i in xrange(8)] for _ in xrange(7)]))
-
   def testStringInput(self):
-    # pylint: disable=g-long-lambda
-    feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
-        np.array([['%d.0' % i for i in xrange(8)] for _ in xrange(7)]))
-    self._assert_single_feature_column([8], tf.float32, feature_columns)
+    with self.assertRaisesRegexp(
+        ValueError, 'on integer or non floating types are not supported'):
+      # pylint: disable=g-long-lambda
+      tf.contrib.learn.infer_real_valued_columns_from_input(
+          np.array([['%d.0' % i for i in xrange(8)] for _ in xrange(7)]))
 
   def testStringInputFn(self):
     with self.assertRaisesRegexp(
@@ -457,13 +443,13 @@ class InferRealValuedColumnsTest(tf.test.TestCase):
     feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input_fn(
         boston_input_fn)
     self._assert_single_feature_column(
-        [_BOSTON_INPUT_DIM], tf.float32, feature_columns)
+        [_BOSTON_INPUT_DIM], tf.float64, feature_columns)
 
   def testIrisInputFn(self):
     feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input_fn(
         iris_input_fn)
     self._assert_single_feature_column(
-        [_IRIS_INPUT_DIM], tf.float32, feature_columns)
+        [_IRIS_INPUT_DIM], tf.float64, feature_columns)
 
 if __name__ == '__main__':
   tf.test.main()
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py
index f025fc0941e..beb4dd5aa86 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py
@@ -122,9 +122,9 @@ class LinearClassifier(dnn_linear_combined.DNNLinearCombinedClassifier):
       feature_columns: An iterable containing all the feature columns used by
         the model. All items in the set should be instances of classes derived
         from `FeatureColumn`.
-      model_dir: Directory to save model parameters, graph and etc. This can also
-        be used to load checkpoints from the directory into a estimator to continue
-        training a previously saved model.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
       n_classes: number of target classes. Default is binary classification.
       weight_column_name: A string defining feature column name representing
         weights. It is used to down weight or boost examples during training. It
@@ -186,8 +186,8 @@ class LinearClassifier(dnn_linear_combined.DNNLinearCombinedClassifier):
         columns_to_tensors=features,
         feature_columns=self._linear_feature_columns,
         num_outputs=self._target_column.num_label_columns,
-        weight_collections=[self._linear_weight_collection],
-        scope="linear")
+        weight_collections=[self._linear_model.get_scope_name()],
+        scope=self._linear_model.get_scope_name())
     with ops.control_dependencies([self._centered_bias()]):
       loss = self._target_column.loss(logits, targets, features)
     logging_ops.scalar_summary("loss", loss)
@@ -282,9 +282,9 @@ class LinearRegressor(dnn_linear_combined.DNNLinearCombinedRegressor):
       feature_columns: An iterable containing all the feature columns used by
         the model. All items in the set should be instances of classes derived
         from `FeatureColumn`.
-      model_dir: Directory to save model parameters, graph, etc. This can also
-        be used to load checkpoints from the directory into a estimator to continue
-        training a previously saved model.
+      model_dir: Directory to save model parameters, graph, etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
       weight_column_name: A string defining feature column name representing
         weights. It is used to down weight or boost examples during training. It
         will be multiplied by the loss of the example.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py
index cafdb980c55..e3f784cf415 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py
@@ -56,7 +56,7 @@ class LogisticRegressor(estimator.Estimator):
       model_fn: Model function. See superclass Estimator for more details. This
         expects the returned predictions to be probabilities in [0.0, 1.0].
       thresholds: List of floating point thresholds to use for accuracy,
-        precision, and recall metrics. If None, defaults to [0.5].
+        precision, and recall metrics. If `None`, defaults to `[0.5]`.
       model_dir: Directory to save model parameters, graphs, etc. This can also
         be used to load checkpoints from the directory into a estimator to continue
         training a previously saved model.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/random_forest.py b/tensorflow/contrib/learn/python/learn/estimators/random_forest.py
index ec704531638..5d82b2c4a5e 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/random_forest.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/random_forest.py
@@ -17,8 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
 import numpy as np
 import six
 
@@ -26,18 +24,36 @@ from tensorflow.contrib import framework as contrib_framework
 from tensorflow.contrib.learn.python.learn import monitors as mon
 
 from tensorflow.contrib.learn.python.learn.estimators import estimator
-from tensorflow.contrib.learn.python.learn.estimators import run_config
 
 from tensorflow.contrib.tensor_forest.client import eval_metrics
 from tensorflow.contrib.tensor_forest.data import data_ops
 from tensorflow.contrib.tensor_forest.python import tensor_forest
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 
 
+def _assert_float32(tensors):
+  """Assert all tensors are float32.
+
+  Args:
+    tensors: `Tensor` or `dict` of `Tensor` objects.
+
+  Raises:
+    TypeError: if any tensor is not float32.
+  """
+  if not isinstance(tensors, dict):
+    tensors = [tensors]
+  else:
+    tensors = tensors.values()
+  for tensor in tensors:
+    if tensor.dtype.base_dtype != dtypes.float32:
+      raise TypeError('Expected dtype=float32, %s.' % tensor)
+
+
 class LossMonitor(mon.EveryN):
   """Terminates training when training loss stops decreasing."""
 
@@ -146,6 +162,8 @@ class TensorForestEstimator(estimator.BaseEstimator):
     Returns:
       Tuple of train `Operation` and loss `Tensor`.
     """
+    _assert_float32(features)
+    _assert_float32(targets)
     features, spec = data_ops.ParseDataTensorOrDict(features)
     labels = data_ops.ParseLabelTensorOrDict(targets)
 
@@ -168,6 +186,7 @@ class TensorForestEstimator(estimator.BaseEstimator):
     return train, self.training_loss
 
   def _get_predict_ops(self, features):
+    _assert_float32(features)
     graph_builder = self.graph_builder_class(
         self.params, device_assigner=self.device_assigner, training=False,
         **self.construction_args)
@@ -175,6 +194,8 @@ class TensorForestEstimator(estimator.BaseEstimator):
     return graph_builder.inference_graph(features, data_spec=spec)
 
   def _get_eval_ops(self, features, targets, metrics):
+    _assert_float32(features)
+    _assert_float32(targets)
     features, spec = data_ops.ParseDataTensorOrDict(features)
     labels = data_ops.ParseLabelTensorOrDict(targets)
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/random_forest_test.py b/tensorflow/contrib/learn/python/learn/estimators/random_forest_test.py
index 81754064d6b..640167a70bf 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/random_forest_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/random_forest_test.py
@@ -19,11 +19,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
 import tensorflow as tf
 
 
 class TensorForestTrainerTests(tf.test.TestCase):
 
+  def testFloat64(self):
+    hparams = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams(
+        num_trees=3, max_nodes=1000, num_classes=3, num_features=4)
+    classifier = tf.contrib.learn.TensorForestEstimator(hparams)
+    iris = tf.contrib.learn.datasets.load_iris()
+    with self.assertRaisesRegexp(TypeError, 'float32'):
+      classifier.fit(x=iris.data, y=iris.target, steps=100)
+
   def testClassification(self):
     """Tests multi-class classification using matrix data as input."""
     hparams = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams(
@@ -31,9 +40,11 @@ class TensorForestTrainerTests(tf.test.TestCase):
     classifier = tf.contrib.learn.TensorForestEstimator(hparams)
 
     iris = tf.contrib.learn.datasets.load_iris()
+    data = iris.data.astype(np.float32)
+    target = iris.target.astype(np.float32)
 
-    classifier.fit(x=iris.data, y=iris.target, steps=100)
-    classifier.evaluate(x=iris.data, y=iris.target, steps=10)
+    classifier.fit(x=data, y=target, steps=100)
+    classifier.evaluate(x=data, y=target, steps=10)
 
   def testRegression(self):
     """Tests multi-class classification using matrix data as input."""
@@ -45,9 +56,11 @@ class TensorForestTrainerTests(tf.test.TestCase):
     regressor = tf.contrib.learn.TensorForestEstimator(hparams)
 
     boston = tf.contrib.learn.datasets.load_boston()
+    data = boston.data.astype(np.float32)
+    target = boston.target.astype(np.float32)
 
-    regressor.fit(x=boston.data, y=boston.target, steps=100)
-    regressor.evaluate(x=boston.data, y=boston.target, steps=10)
+    regressor.fit(x=data, y=target, steps=100)
+    regressor.evaluate(x=data, y=target, steps=10)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index fcd4389c072..bfcf0d3e1f5 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -38,8 +38,7 @@ class RunConfig(object):
                save_summary_steps=100,
                save_checkpoints_secs=60,
                keep_checkpoint_max=5,
-               keep_checkpoint_every_n_hours=10000,
-               save_checkpoints_steps=1000):
+               keep_checkpoint_every_n_hours=10000):
     """Constructor.
 
     Args:
@@ -61,7 +60,6 @@ class RunConfig(object):
       keep_checkpoint_every_n_hours: Number of hours between each checkpoint
         to be saved. The default value of 10,000 hours effectively disables
         the feature.
-      save_checkpoints_steps: Number of steps between each checkpoint saving.
     """
     self.master = master
     self.task = task
@@ -77,4 +75,3 @@ class RunConfig(object):
     self.save_checkpoints_secs = save_checkpoints_secs
     self.keep_checkpoint_max = keep_checkpoint_max
     self.keep_checkpoint_every_n_hours = keep_checkpoint_every_n_hours
-    self.save_checkpoints_steps = save_checkpoints_steps
diff --git a/tensorflow/contrib/learn/python/learn/evaluable.py b/tensorflow/contrib/learn/python/learn/evaluable.py
new file mode 100644
index 00000000000..1ff14193939
--- /dev/null
+++ b/tensorflow/contrib/learn/python/learn/evaluable.py
@@ -0,0 +1,81 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""`Evaluable` interface."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+
+class Evaluable(object):
+  """Interface for objects that are evaluatable by, e.g., `Experiment`.
+  """
+  __metaclass__ = abc.ABCMeta
+
+  @abc.abstractmethod
+  def evaluate(
+      self, x=None, y=None, input_fn=None, feed_fn=None, batch_size=None,
+      steps=None, metrics=None, name=None):
+    """Evaluates given model with provided evaluation data.
+
+    Evaluates on the given input data. If `input_fn` is provided, that
+    input function should raise an end-of-input exception (`OutOfRangeError` or
+    `StopIteration`) after one epoch of the training data has been provided.
+
+    By default, the whole evaluation dataset is used. If `steps` is provided,
+    only `steps` batches of size `batch_size` are processed.
+
+    The return value is a dict containing the metrics specified in `metrics`, as
+    well as an entry `global_step` which contains the value of the global step
+    for which this evaluation was performed.
+
+    Args:
+      x: Matrix of shape [n_samples, n_features...]. Can be iterator that
+         returns arrays of features. The training input samples for fitting the
+         model. If set, `input_fn` must be `None`.
+      y: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
+         iterator that returns array of targets. The training target values
+         (class labels in classification, real numbers in regression). If set,
+         `input_fn` must be `None`.
+      input_fn: Input function. If set, `x`, `y`, and `batch_size` must be
+        `None`.
+      feed_fn: Function creating a feed dict every time it is called. Called
+        once per iteration. Must be `None` if `input_fn` is provided.
+      batch_size: minibatch size to use on the input, defaults to first
+        dimension of `x`, if specified. Must be `None` if `input_fn` is
+        provided.
+      steps: Number of steps for which to evaluate model. If `None`, evaluate
+        until running tensors generated by `metrics` raises an exception.
+      metrics: Dict of metric ops to run. If `None`, the default metric
+        functions are used; if `{}`, no metrics are used. If model has one
+        output (i.e., returning single predction), keys are `str`, e.g.
+        `'accuracy'` - just a name of the metric that will show up in
+        the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
+        `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
+        the predictions to run this metric on.
+
+        Metric ops should support streaming, e.g., returning
+        update_op and value tensors. See more details in
+        ../../../metrics/python/metrics/ops/streaming_metrics.py.
+      name: Name of the evaluation if user needs to run multiple evaluations on
+        different data sets, such as on training data vs test data.
+
+    Returns:
+      Returns `dict` with evaluation results.
+    """
+    raise NotImplementedError
diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py
index 2271e5161ed..0f96b70fae1 100644
--- a/tensorflow/contrib/learn/python/learn/experiment.py
+++ b/tensorflow/contrib/learn/python/learn/experiment.py
@@ -21,7 +21,9 @@ from __future__ import print_function
 
 import time
 
+from tensorflow.contrib.learn.python.learn import evaluable
 from tensorflow.contrib.learn.python.learn import monitors
+from tensorflow.contrib.learn.python.learn import trainable
 from tensorflow.contrib.learn.python.learn.estimators._sklearn import NotFittedError
 from tensorflow.python.platform import flags
 from tensorflow.python.platform import tf_logging as logging
@@ -47,7 +49,7 @@ class Experiment(object):
     """Constructor for `Experiment`.
 
     Args:
-      estimator: `Estimator` object.
+      estimator: Object implementing `Trainable` and `Evaluable`.
       train_input_fn: function, returns features and targets for training.
       eval_input_fn: function, returns features and targets for evaluation. If
         `eval_steps` is `None`, this should be configured only to produce for a
@@ -67,7 +69,14 @@ class Experiment(object):
       continuous_eval_throttle_secs: Do not re-evaluate unless the last
         evaluation was started at least this many seconds ago for
         continuous_eval().
+
+    Raises:
+      ValueError: if `estimator` does not implement `Evaluable` and `Trainable`.
     """
+    if not isinstance(estimator, evaluable.Evaluable):
+      raise ValueError("`estimator` must implement `Evaluable`.")
+    if not isinstance(estimator, trainable.Trainable):
+      raise ValueError("`estimator` must implement `Trainable`.")
     super(Experiment, self).__init__()
     self._estimator = estimator
     self._train_input_fn = train_input_fn
diff --git a/tensorflow/contrib/learn/python/learn/graph_actions.py b/tensorflow/contrib/learn/python/learn/graph_actions.py
index 2b448752d8b..6da6bee1ec0 100644
--- a/tensorflow/contrib/learn/python/learn/graph_actions.py
+++ b/tensorflow/contrib/learn/python/learn/graph_actions.py
@@ -130,7 +130,7 @@ def _supervised_train(graph,
                       log_every_steps=10,
                       supervisor_is_chief=True,
                       supervisor_master='',
-                      supervisor_save_model_steps=1000,
+                      supervisor_save_model_secs=600,
                       keep_checkpoint_max=5,
                       supervisor_save_summaries_steps=100,
                       feed_fn=None,
@@ -171,8 +171,8 @@ def _supervised_train(graph,
     supervisor_is_chief: Whether the current process is the chief supervisor in
       charge of restoring the model and running standard services.
     supervisor_master: The master string to use when preparing the session.
-    supervisor_save_model_steps: Save a checkpoint every
-      `supervisor_save_model_steps` steps when training.
+    supervisor_save_model_secs: Save model every
+      `supervisor_save_model_secs` seconds when training.
     keep_checkpoint_max: The maximum number of recent checkpoint files to
       keep. As new files are created, older files are deleted. If None or 0,
       all checkpoint files are kept. This is simply passed as the max_to_keep
@@ -251,15 +251,18 @@ def _supervised_train(graph,
         init_fn=init_fn,
         keep_checkpoint_max=keep_checkpoint_max)
     if supervisor_is_chief:
-      if scaffold.summary_op is not None:
-        monitors.append(monitors_lib.SummarySaver(
-            scaffold.summary_op,
-            save_steps=supervisor_save_summaries_steps,
-            summary_writer=summary_writer))
-      if supervisor_save_model_steps > 0:
+      monitors.append(
+          monitors_lib.SummarySaver(
+              summary_op=None,
+              save_steps=supervisor_save_summaries_steps,
+              summary_writer=summary_writer,
+              scaffold=scaffold))
+      if supervisor_save_model_secs > 0:
         monitors.append(
-            monitors_lib.CheckpointSaver(supervisor_save_model_steps,
-                                         scaffold.saver, output_dir))
+            monitors_lib.CheckpointSaver(
+                output_dir,
+                save_secs=supervisor_save_model_secs,
+                scaffold=scaffold))
 
     if steps is not None or max_steps is not None:
       monitors.append(monitors_lib.StopAtStep(steps, max_steps))
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
index 8c9790b6a6a..d0e9b61f42f 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
@@ -30,6 +30,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import tf_logging as logging
 
 # pylint: disable=g-multiple-import,g-bad-import-order
 from .pandas_io import HAS_PANDAS, extract_pandas_data, extract_pandas_matrix, extract_pandas_labels
@@ -206,6 +207,13 @@ def _access(data, iloc):
   return data[iloc]
 
 
+def _check_dtype(dtype):
+  if dtypes.as_dtype(dtype) == dtypes.float64:
+    logging.warn(
+        'float64 is not supported by many models, consider casting to float32.')
+  return dtype
+
+
 class DataFeeder(object):
   """Data feeder is an example class to sample data for TF trainer."""
 
@@ -215,60 +223,82 @@ class DataFeeder(object):
     """Initializes a DataFeeder instance.
 
     Args:
-      x: feature Nd numpy matrix of shape [n_samples, n_features, ...].
-      y: target vector, either floats for regression or class id for
+      x: Feature Nd numpy matrix of shape `[n_samples, n_features, ...]`.
+      y: Target vector, either floats for regression or class id for
         classification. If matrix, will consider as a sequence
-        of targets. Can be None for unsupervised setting.
-      n_classes: number of classes, 0 and 1 are considered regression, None will
-        pass through the input labels without one-hot conversion.
-      batch_size: mini batch size to accumulate.
-      random_state: numpy RandomState object to reproduce sampling.
+        of targets. Can be `None` for unsupervised setting.
+      n_classes: Number of classes, 0 and 1 are considered regression, `None`
+        will pass through the input labels without one-hot conversion.
+      batch_size: Mini-batch size to accumulate.
+      shuffle: Whether to shuffle `x`.
+      random_state: Numpy `RandomState` object to reproduce sampling.
+      epochs: Number of times to iterate over input data before raising
+        `StopIteration` exception.
 
     Attributes:
-      x: input features.
-      y: input target.
-      n_classes: number of classes (if None, pass through indices without
+      x: Input features.
+      y: Input target.
+      n_classes: Number of classes (if `None`, pass through indices without
         one-hot conversion).
-      batch_size: mini batch size to accumulate.
-      input_shape: shape of the input.
-      output_shape: shape of the output.
-      input_dtype: dtype of input.
-      output_dtype: dtype of output.
+      batch_size: Mini-batch size to accumulate.
+      input_shape: Shape of the input.
+      output_shape: Shape of the output.
+      input_dtype: DType of input.
+      output_dtype: DType of output.
     """
-    x_dtype = np.int64 if x.dtype == np.int64 else np.float32
+    self._x = check_array(x, dtype=x.dtype)
+    # self.n_classes is None means we're passing in raw target indices.
     y_dtype = (
         np.int64 if n_classes is not None and n_classes > 1 else np.float32)
-    self.x = check_array(x, dtype=x_dtype)
-    # self.n_classes is None means we're passing in raw target indices
     if n_classes is not None:
-      self.y = (None if y is None else check_array(y, dtype=y_dtype))
+      self._y = (None if y is None else check_array(y, dtype=y_dtype))
+    elif isinstance(y, list):
+      self._y = np.array(y)
     else:
-      self.y = y
-      if isinstance(self.y, list):
-        self.y = np.array(y)
+      self._y = y
     self.n_classes = n_classes
     self.max_epochs = epochs
     self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape(
-        self.x.shape, None if self.y is None else self.y.shape, n_classes,
+        self._x.shape, None if self._y is None else self._y.shape, n_classes,
         batch_size)
     # Input dtype matches dtype of x.
-    self.input_dtype = x_dtype
+    self._input_dtype = _check_dtype(self._x.dtype)
     # self.n_classes is None means we're passing in raw target indices
-    if n_classes is not None or y is None:
-      self.output_dtype = np.float32
+    if n_classes is not None or self._y is None:
+      self._output_dtype = np.float32
     else:
-      self.output_dtype = self.y.dtype
-    self.shuffle = shuffle
+      self._output_dtype = _check_dtype(self._y.dtype)
+    self._shuffle = shuffle
     self.random_state = np.random.RandomState(
         42) if random_state is None else random_state
-    if self.shuffle:
-      self.indices = self.random_state.permutation(self.x.shape[0])
+    if self._shuffle:
+      self.indices = self.random_state.permutation(self._x.shape[0])
     else:
-      self.indices = np.array(range(self.x.shape[0]))
+      self.indices = np.array(range(self._x.shape[0]))
     self.offset = 0
     self.epoch = 0
     self._epoch_placeholder = None
 
+  @property
+  def x(self):
+    return self._x
+
+  @property
+  def y(self):
+    return self._y
+
+  @property
+  def shuffle(self):
+    return self._shuffle
+
+  @property
+  def input_dtype(self):
+    return self._input_dtype
+
+  @property
+  def output_dtype(self):
+    return self._output_dtype
+
   @property
   def batch_size(self):
     return self._batch_size
@@ -291,7 +321,7 @@ class DataFeeder(object):
     """
     input_shape = [None] + self.input_shape[1:]
     self._input_placeholder = array_ops.placeholder(
-        dtypes.as_dtype(self.input_dtype),
+        dtypes.as_dtype(self._input_dtype),
         input_shape,
         name='input')
     if self.output_shape is None:
@@ -299,7 +329,7 @@ class DataFeeder(object):
     else:
       output_shape = [None] + self.output_shape[1:]
       self._output_placeholder = array_ops.placeholder(
-          dtypes.as_dtype(self.output_dtype),
+          dtypes.as_dtype(self._output_dtype),
           output_shape,
           name='output')
     return self._input_placeholder, self._output_placeholder
@@ -345,20 +375,20 @@ class DataFeeder(object):
         feed_dict[self._epoch_placeholder.name] = [self.epoch]
 
       # Take next batch of indices.
-      end = min(self.x.shape[0], self.offset + self._batch_size)
+      end = min(self._x.shape[0], self.offset + self._batch_size)
       batch_indices = self.indices[self.offset:end]
 
       # Assign input features from random indices.
       inp = (
-          np.array(_access(self.x, batch_indices)).reshape(
+          np.array(_access(self._x, batch_indices)).reshape(
               (batch_indices.shape[0], 1))
-          if len(self.x.shape) == 1 else _access(self.x, batch_indices))
+          if len(self._x.shape) == 1 else _access(self._x, batch_indices))
       feed_dict[self._input_placeholder.name] = inp
 
       # move offset and reset it if necessary
       self.offset += self._batch_size
-      if self.offset >= self.x.shape[0]:
-        self.indices = self.random_state.permutation(self.x.shape[0])
+      if self.offset >= self._x.shape[0]:
+        self.indices = self.random_state.permutation(self._x.shape[0])
         self.offset = 0
         self.epoch += 1
 
@@ -368,21 +398,21 @@ class DataFeeder(object):
 
       # assign labels from random indices
       self.output_shape[0] = batch_indices.shape[0]
-      out = np.zeros(self.output_shape, dtype=self.output_dtype)
+      out = np.zeros(self.output_shape, dtype=self._output_dtype)
       for i in xrange(out.shape[0]):
         sample = batch_indices[i]
         # self.n_classes is None means we're passing in raw target indices
         if self.n_classes is None:
-          out[i] = _access(self.y, sample)
+          out[i] = _access(self._y, sample)
         else:
           if self.n_classes > 1:
             if len(self.output_shape) == 2:
-              out.itemset((i, int(_access(self.y, sample))), 1.0)
+              out.itemset((i, int(_access(self._y, sample))), 1.0)
             else:
-              for idx, value in enumerate(_access(self.y, sample)):
+              for idx, value in enumerate(_access(self._y, sample)):
                 out.itemset(tuple([i, idx, value]), 1.0)
           else:
-            out[i] = _access(self.y, sample)
+            out[i] = _access(self._y, sample)
       feed_dict[self._output_placeholder.name] = out
 
       return feed_dict
@@ -420,32 +450,28 @@ class StreamingDataFeeder(DataFeeder):
     """
     # pylint: disable=invalid-name,super-init-not-called
     x_first_el = six.next(x)
-    self.x = itertools.chain([x_first_el], x)
+    self._x = itertools.chain([x_first_el], x)
     if y is not None:
       y_first_el = six.next(y)
-      self.y = itertools.chain([y_first_el], y)
+      self._y = itertools.chain([y_first_el], y)
     else:
       y_first_el = None
-      self.y = None
+      self._y = None
     self.n_classes = n_classes
     self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape(
         [1] + list(x_first_el.shape),
         [1] + list(y_first_el.shape) if y is not None else None,
         n_classes,
         batch_size)
-    self.input_dtype = x_first_el.dtype
-    # Convert float64 to float32, as all the parameters in the model are
-    # floats32 and there is a lot of benefits in using it in NNs.
-    if self.input_dtype == np.float64:
-      self.input_dtype = np.float32
+    self._input_dtype = _check_dtype(x_first_el.dtype)
     # Output types are floats, due to both softmaxes and regression req.
     if n_classes is not None and n_classes > 0:
-      self.output_dtype = np.float32
+      self._output_dtype = np.float32
     elif y is not None:
       if isinstance(y_first_el, list) or isinstance(y_first_el, np.ndarray):
-        self.output_dtype = np.dtype(type(y_first_el[0]))
+        self._output_dtype = _check_dtype(np.dtype(type(y_first_el[0])))
       else:
-        self.output_dtype = np.dtype(type(y_first_el))
+        self._output_dtype = _check_dtype(np.dtype(type(y_first_el)))
 
   def get_feed_params(self):
     """Function returns a dict with data feed params while training.
@@ -472,22 +498,22 @@ class StreamingDataFeeder(DataFeeder):
       """
       if self.stopped:
         raise StopIteration
-      inp = np.zeros(self.input_shape, dtype=self.input_dtype)
-      if self.y is not None:
-        out = np.zeros(self.output_shape, dtype=self.output_dtype)
+      inp = np.zeros(self.input_shape, dtype=self._input_dtype)
+      if self._y is not None:
+        out = np.zeros(self.output_shape, dtype=self._output_dtype)
       for i in xrange(self._batch_size):
         # Add handling when queue ends.
         try:
-          inp[i, :] = six.next(self.x)
+          inp[i, :] = six.next(self._x)
         except StopIteration:
           self.stopped = True
           inp = inp[:i, :]
-          if self.y is not None:
+          if self._y is not None:
             out = out[:i]
           break
 
-        if self.y is not None:
-          y = six.next(self.y)
+        if self._y is not None:
+          y = six.next(self._y)
           if self.n_classes is not None and self.n_classes > 1:
             if len(self.output_shape) == 2:
               out.itemset((i, y), 1.0)
@@ -496,7 +522,7 @@ class StreamingDataFeeder(DataFeeder):
                 out.itemset(tuple([i, idx, value]), 1.0)
           else:
             out[i] = y
-      if self.y is None:
+      if self._y is None:
         return {self._input_placeholder.name: inp}
       return {self._input_placeholder.name: inp,
               self._output_placeholder.name: out}
@@ -511,6 +537,7 @@ class DaskDataFeeder(object):
   into them. DaskDataFeeder will remove requirement to have full dataset in the
   memory and still do random seeks for sampling of batches.
   """
+
   def __init__(self, x, y, n_classes, batch_size, shuffle=True,
                random_state=None, epochs=None):
     """Initializes a DaskDataFeeder instance.
@@ -521,8 +548,10 @@ class DaskDataFeeder(object):
         regression values.
       n_classes: indicator of how many classes the target has.
       batch_size: Mini batch size to accumulate.
+      shuffle: Whether to shuffle the inputs.
       random_state: random state for RNG. Note that it will mutate so use a
         int value for this if you want consistent sized batches.
+      epochs: Number of epochs to run.
 
     Attributes:
       x: input features.
@@ -537,35 +566,33 @@ class DaskDataFeeder(object):
     # pylint: disable=invalid-name,super-init-not-called
     import dask.dataframe as dd  # pylint: disable=g-import-not-at-top
     # TODO(terrytangyuan): check x and y dtypes in dask_io like pandas
-    self.x = x
-    self.y = y
+    self._x = x
+    self._y = y
     # save column names
-    self.x_columns = list(x.columns)
+    self._x_columns = list(x.columns)
     if isinstance(y.columns[0], str):
-      self.y_columns = list(y.columns)
+      self._y_columns = list(y.columns)
     else:
       # deal with cases where two DFs have overlapped default numeric colnames
-      self.y_columns = len(self.x_columns) + 1
-      self.y = self.y.rename(columns={y.columns[0]: self.y_columns})
+      self._y_columns = len(self._x_columns) + 1
+      self._y = self._y.rename(columns={y.columns[0]: self._y_columns})
 
     # TODO(terrytangyuan): deal with unsupervised cases
     # combine into a data frame
-    self.df = dd.multi.concat([self.x, self.y], axis=1)
+    self.df = dd.multi.concat([self._x, self._y], axis=1)
     self.n_classes = n_classes
 
     x_count = x.count().compute()[0]
-    x_shape = (x_count, len(self.x.columns))
-    y_shape = (x_count, len(self.y.columns))
+    x_shape = (x_count, len(self._x.columns))
+    y_shape = (x_count, len(self._y.columns))
     # TODO(terrytangyuan): Add support for shuffle and epochs.
-    self.shuffle = shuffle
+    self._shuffle = shuffle
     self.epochs = epochs
     self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape(
         x_shape, y_shape, n_classes, batch_size)
     self.sample_fraction = self._batch_size / float(x_count)
-    # TODO(ptucker,ipolosukhin): Remove this?
-    # TODO(ipolosukhin): remove or restore.
-    # self.x.dtypes[0], self.y.dtypes[self.y_columns]
-    self.input_dtype, self.output_dtype = np.float32, np.float32
+    self._input_dtype = _check_dtype(self._x.dtypes[0])
+    self._output_dtype = _check_dtype(self._y.dtypes[self._y_columns])
     if random_state is None:
       self.random_state = 66
     else:
@@ -597,17 +624,17 @@ class DaskDataFeeder(object):
       sample = self.df.random_split(
           [self.sample_fraction, 1 - self.sample_fraction],
           random_state=self.random_state)
-      inp = extract_pandas_matrix(sample[0][self.x_columns].compute()).tolist()
-      out = extract_pandas_matrix(sample[0][self.y_columns].compute())
+      inp = extract_pandas_matrix(sample[0][self._x_columns].compute()).tolist()
+      out = extract_pandas_matrix(sample[0][self._y_columns].compute())
       # convert to correct dtype
-      inp = np.array(inp, dtype=self.input_dtype)
+      inp = np.array(inp, dtype=self._input_dtype)
       # one-hot encode out for each class for cross entropy loss
       if HAS_PANDAS:
         import pandas as pd  # pylint: disable=g-import-not-at-top
         if not isinstance(out, pd.Series):
           out = out.flatten()
-      out_max = self.y.max().compute().values[0]
-      encoded_out = np.zeros((out.size, out_max + 1), dtype=self.output_dtype)
+      out_max = self._y.max().compute().values[0]
+      encoded_out = np.zeros((out.size, out_max + 1), dtype=self._output_dtype)
       encoded_out[np.arange(out.size), out] = 1
       return {input_placeholder.name: inp,
               output_placeholder.name: encoded_out}
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
index 1709e428fc2..bf5e62cb4c0 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
@@ -20,12 +20,17 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import io_ops
+from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import gfile
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import input as input_ops
-
+from tensorflow.python.training import queue_runner
 
 # Default name for key in the feature dict.
 KEY_FEATURE_NAME = '__key__'
@@ -219,11 +224,18 @@ def read_keyed_batch_examples(
     return queued_examples_with_keys
 
 
-def read_keyed_batch_features(
-    file_pattern, batch_size, features, reader,
-    randomize_input=True, num_epochs=None,
-    queue_capacity=10000, reader_num_threads=1,
-    parser_num_threads=1, name=None):
+def read_keyed_batch_features(file_pattern,
+                              batch_size,
+                              features,
+                              reader,
+                              randomize_input=True,
+                              num_epochs=None,
+                              queue_capacity=10000,
+                              reader_num_threads=1,
+                              feature_queue_capacity=100,
+                              num_queue_runners=2,
+                              parser_num_threads=None,
+                              name=None):
   """Adds operations to read, queue, batch and parse `Example` protos.
 
   Given file pattern (or list of files), will setup a queue for file names,
@@ -251,7 +263,12 @@ def read_keyed_batch_features(
       tf.initialize_local_variables() as shown in the tests.
     queue_capacity: Capacity for input queue.
     reader_num_threads: The number of threads to read examples.
-    parser_num_threads: The number of threads to parse examples.
+    feature_queue_capacity: Capacity of the parsed features queue.
+    num_queue_runners: Number of queue runners to start for the feature queue,
+      Adding multiple queue runners for the parsed example queue helps maintain
+      a full queue when the subsequent computations overall are cheaper than
+      parsing.
+    parser_num_threads: (Deprecated) The number of threads to parse examples.
     name: Name of resulting op.
 
   Returns:
@@ -261,6 +278,11 @@ def read_keyed_batch_features(
   Raises:
     ValueError: for invalid inputs.
   """
+
+  if parser_num_threads:
+    # TODO(sibyl-Aix6ihai): Remove on Sept 3 2016.
+    logging.warning('parser_num_threads is deprecated, it will be removed on'
+                    'Sept 3 2016')
   with ops.op_scope([file_pattern], name, 'read_batch_features') as scope:
     keys, examples = read_keyed_batch_examples(
         file_pattern, batch_size, reader, randomize_input=randomize_input,
@@ -268,24 +290,66 @@ def read_keyed_batch_features(
         num_threads=reader_num_threads, read_batch_size=batch_size,
         name=scope)
 
-    if parser_num_threads == 1:
-      # Avoid queue overhead for single thread
-      return keys, parsing_ops.parse_example(examples, features)
+    # Parse the example.
+    feature_map = parsing_ops.parse_example(examples, features)
 
-    # Parse features into tensors in many threads and put on the queue.
-    features_list = []
-    for _ in range(parser_num_threads):
-      feature_dict = parsing_ops.parse_example(examples, features)
-      feature_dict[KEY_FEATURE_NAME] = keys
-      features_list.append(feature_dict)
-    queued_features = input_ops.batch_join(
-        features_list,
-        batch_size=batch_size,
-        capacity=queue_capacity,
-        enqueue_many=True,
-        name='parse_example_batch_join')
-    queued_keys = queued_features.pop(KEY_FEATURE_NAME)
-    return queued_keys, queued_features
+    # Lets also add preprocessed tensors into the queue types for each item of
+    # the queue.
+    tensors_to_enqueue = []
+    # Each entry contains the key, and a boolean which indicates whether the
+    # tensor was a sparse tensor.
+    tensors_mapping = []
+    # TODO(sibyl-Aix6ihai): Most of the functionality here is about pushing sparse
+    # tensors into a queue. This could be taken care in somewhere else so others
+    # can reuse it. Also, QueueBase maybe extended to handle sparse tensors
+    # directly.
+    for key, tensor in feature_map.iteritems():
+      if isinstance(tensor, ops.SparseTensor):
+        tensors_mapping.append((key, True))
+        tensors_to_enqueue.extend([tensor.indices, tensor.values, tensor.shape])
+      else:
+        tensors_mapping.append((key, False))
+        tensors_to_enqueue.append(tensor)
+    tensors_to_enqueue.append(keys)
+
+    queue_dtypes = [x.dtype for x in tensors_to_enqueue]
+    input_queue = data_flow_ops.FIFOQueue(feature_queue_capacity, queue_dtypes)
+
+    # Add a summary op to debug if our feature queue is full or not.
+    logging_ops.scalar_summary('queue/parsed_features/%s/fraction_of_%d_full' %
+                               (input_queue.name, feature_queue_capacity),
+                               math_ops.cast(input_queue.size(), dtypes.float32)
+                               * (1. / feature_queue_capacity))
+
+    # Add multiple queue runners so that the queue is always full. Adding more
+    # than two queue-runners may hog the cpu on the worker to fill up the queue.
+    for _ in range(num_queue_runners):
+      queue_runner.add_queue_runner(
+          queue_runner.QueueRunner(input_queue, [input_queue.enqueue(
+              tensors_to_enqueue)]))
+
+    dequeued_tensors = input_queue.dequeue()
+
+    # Reset shapes on dequeued tensors.
+    for i in range(len(tensors_to_enqueue)):
+      dequeued_tensors[i].set_shape(tensors_to_enqueue[i].get_shape())
+
+    # Recreate feature mapping according to the original dictionary.
+    dequeued_feature_map = {}
+    index = 0
+    for key, is_sparse_tensor in tensors_mapping:
+      if is_sparse_tensor:
+        # Three tensors are (indices, values, shape).
+        dequeued_feature_map[key] = ops.SparseTensor(
+            dequeued_tensors[index], dequeued_tensors[index + 1],
+            dequeued_tensors[index + 2])
+        index += 3
+      else:
+        dequeued_feature_map[key] = dequeued_tensors[index]
+        index += 1
+    dequeued_keys = dequeued_tensors[-1]
+
+    return dequeued_keys, dequeued_feature_map
 
 
 def read_batch_features(file_pattern, batch_size, features, reader,
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
index f11f0a841f1..d15ef13d7eb 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
@@ -124,18 +124,18 @@ class GraphIOTest(tf.test.TestCase):
           _VALID_FILE_PATTERN, batch_size, features, randomize_input=False,
           queue_capacity=queue_capacity, reader_num_threads=2,
           parser_num_threads=2, name=name)
-      self.assertEqual("%s/parse_example_batch_join:1" % name,
+      self.assertEqual("%s/fifo_queue_1_Dequeue:0" % name,
                        features["feature"].name)
       file_name_queue_name = "%s/file_name_queue" % name
       file_names_name = "%s/input" % file_name_queue_name
       example_queue_name = "%s/fifo_queue" % name
-      parse_example_queue_name = "%s/parse_example_batch_join" % name
+      parse_example_queue_name = "%s/fifo_queue" % name
       op_nodes = test_util.assert_ops_in_graph({
           file_names_name: "Const",
           file_name_queue_name: "FIFOQueue",
           "%s/read/TFRecordReader" % name: "TFRecordReader",
           example_queue_name: "FIFOQueue",
-          parse_example_queue_name: "QueueDequeueMany",
+          parse_example_queue_name: "FIFOQueue",
           name: "QueueDequeueMany"
       }, g)
       self.assertAllEqual(_FILE_NAMES, sess.run(["%s:0" % file_names_name])[0])
diff --git a/tensorflow/contrib/learn/python/learn/models.py b/tensorflow/contrib/learn/python/learn/models.py
index d48fa20fb4a..3d41e4907b3 100644
--- a/tensorflow/contrib/learn/python/learn/models.py
+++ b/tensorflow/contrib/learn/python/learn/models.py
@@ -19,10 +19,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib import rnn as contrib_rnn
 from tensorflow.contrib.learn.python.learn.ops import autoencoder_ops
 from tensorflow.contrib.learn.python.learn.ops import dnn_ops
 from tensorflow.contrib.learn.python.learn.ops import losses_ops
-from tensorflow.contrib import rnn as contrib_rnn
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops as array_ops_
@@ -81,6 +81,7 @@ def linear_regression(x, y, init_mean=None, init_stddev=1.0):
   with vs.variable_scope('linear_regression'):
     logging_ops.histogram_summary('linear_regression.x', x)
     logging_ops.histogram_summary('linear_regression.y', y)
+    dtype = x.dtype.base_dtype
     y_shape = y.get_shape()
     if len(y_shape) == 1:
       output_shape = 1
@@ -88,15 +89,18 @@ def linear_regression(x, y, init_mean=None, init_stddev=1.0):
       output_shape = y_shape[1]
     # Set up the requested initialization.
     if init_mean is None:
-      weights = vs.get_variable('weights', [x.get_shape()[1], output_shape])
-      bias = vs.get_variable('bias', [output_shape])
+      weights = vs.get_variable(
+          'weights', [x.get_shape()[1], output_shape], dtype=dtype)
+      bias = vs.get_variable('bias', [output_shape], dtype=dtype)
     else:
       weights = vs.get_variable('weights', [x.get_shape()[1], output_shape],
                                 initializer=init_ops.random_normal_initializer(
-                                    init_mean, init_stddev))
+                                    init_mean, init_stddev, dtype=dtype),
+                                dtype=dtype)
       bias = vs.get_variable('bias', [output_shape],
                              initializer=init_ops.random_normal_initializer(
-                                 init_mean, init_stddev))
+                                 init_mean, init_stddev, dtype=dtype),
+                             dtype=dtype)
     logging_ops.histogram_summary('linear_regression.weights', weights)
     logging_ops.histogram_summary('linear_regression.bias', bias)
     return losses_ops.mean_squared_error_regressor(x, y, weights, bias)
@@ -135,19 +139,22 @@ def logistic_regression(x,
   with vs.variable_scope('logistic_regression'):
     logging_ops.histogram_summary('%s.x' % vs.get_variable_scope().name, x)
     logging_ops.histogram_summary('%s.y' % vs.get_variable_scope().name, y)
+    dtype = x.dtype.base_dtype
     # Set up the requested initialization.
     if init_mean is None:
-      weights = vs.get_variable('weights',
-                                [x.get_shape()[1], y.get_shape()[-1]])
-      bias = vs.get_variable('bias', [y.get_shape()[-1]])
+      weights = vs.get_variable(
+          'weights', [x.get_shape()[1], y.get_shape()[-1]], dtype=dtype)
+      bias = vs.get_variable('bias', [y.get_shape()[-1]], dtype=dtype)
     else:
       weights = vs.get_variable('weights',
                                 [x.get_shape()[1], y.get_shape()[-1]],
                                 initializer=init_ops.random_normal_initializer(
-                                    init_mean, init_stddev))
+                                    init_mean, init_stddev, dtype=dtype),
+                                dtype=dtype)
       bias = vs.get_variable('bias', [y.get_shape()[-1]],
                              initializer=init_ops.random_normal_initializer(
-                                 init_mean, init_stddev))
+                                 init_mean, init_stddev, dtype=dtype),
+                             dtype=dtype)
     logging_ops.histogram_summary('%s.weights' % vs.get_variable_scope().name,
                                   weights)
     logging_ops.histogram_summary('%s.bias' % vs.get_variable_scope().name,
diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py
index dca39386bed..ddf97437423 100644
--- a/tensorflow/contrib/learn/python/learn/monitors.py
+++ b/tensorflow/contrib/learn/python/learn/monitors.py
@@ -535,8 +535,12 @@ class LoggingTrainable(EveryN):
 class SummarySaver(EveryN):
   """Saves summaries every N steps."""
 
-  def __init__(self, summary_op, save_steps=100, output_dir=None,
-               summary_writer=None):
+  def __init__(self,
+               summary_op,
+               save_steps=100,
+               output_dir=None,
+               summary_writer=None,
+               scaffold=None):
     """Initializes a `SummarySaver` monitor.
 
     Args:
@@ -548,6 +552,7 @@ class SummarySaver(EveryN):
           if no `summary_writer` is supplied.
       summary_writer: `SummaryWriter`. If `None` and an `output_dir` was passed,
           one will be created accordingly.
+      scaffold: `Scaffold` to get summary_op if it's not provided.
     """
     # TODO(ipolosukhin): Implement every N seconds.
     super(SummarySaver, self).__init__(every_n_steps=save_steps)
@@ -555,6 +560,7 @@ class SummarySaver(EveryN):
     self._summary_writer = summary_writer
     if summary_writer is None and output_dir:
       self._summary_writer = summary_io.SummaryWriter(output_dir)
+    self._scaffold = scaffold
     # TODO(mdan): Throw an error if output_dir and summary_writer are None.
 
   def set_estimator(self, estimator):
@@ -565,15 +571,18 @@ class SummarySaver(EveryN):
 
   def every_n_step_begin(self, step):
     super(SummarySaver, self).every_n_step_begin(step)
+    if self._summary_op is None and self._scaffold is not None:
+      self._summary_op = self._scaffold.summary_op
     if self._summary_op is not None:
       return [self._summary_op]
     return []
 
   def every_n_step_end(self, step, outputs):
     super(SummarySaver, self).every_n_step_end(step, outputs)
-    summary_strs = _extract_output(outputs, self._summary_op)
-    if self._summary_writer and self._summary_op is not None:
-      self._summary_writer.add_summary(summary_strs, step)
+    if self._summary_op is not None:
+      summary_strs = _extract_output(outputs, self._summary_op)
+      if self._summary_writer:
+        self._summary_writer.add_summary(summary_strs, step)
     return False
 
   def end(self, session=None):
@@ -923,37 +932,89 @@ class ExportMonitor(EveryN):
                             default_batch_size=self._default_batch_size)
 
 
-class CheckpointSaver(EveryN):
+class CheckpointSaver(BaseMonitor):
   """Saves checkpoints every N steps."""
 
-  def __init__(self, every_n_steps, saver, checkpoint_dir,
+  def __init__(self,
+               checkpoint_dir,
+               save_secs=None,
+               save_steps=None,
+               saver=None,
                checkpoint_basename="model.ckpt",
-               first_n_steps=-1):
+               scaffold=None):
     """Initialize CheckpointSaver monitor.
 
     Args:
-      every_n_steps: `int`, save every N steps.
-      saver: `Saver` object, used for saving.
       checkpoint_dir: `str`, base directory for the checkpoint files.
+      save_secs: `int`, save every N secs.
+      save_steps: `int`, save every N steps.
+      saver: `Saver` object, used for saving.
       checkpoint_basename: `str`, base name for the checkpoint files.
-      first_n_steps: `int`, if positive, save every step during the
-        first `first_n_steps` steps.
+      scaffold: `Scaffold`, use to get saver object.
+
+    Raises:
+      ValueError: If both `save_steps` and `save_secs` are not `None`.
+      ValueError: If both `save_steps` and `save_secs` are `None`.
     """
     logging.info("Create CheckpointSaver")
-    super(CheckpointSaver, self).__init__(every_n_steps=every_n_steps,
-                                          first_n_steps=first_n_steps)
+    super(CheckpointSaver, self).__init__()
     self._saver = saver
     self._summary_writer = SummaryWriterCache.get(checkpoint_dir)
     self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
+    self._scaffold = scaffold
+    self._save_secs = save_secs
+    self._save_steps = save_steps
+    self._last_saved_time = None
+    self._last_begin_step = None
+    self._last_saved_step = None
 
-  def every_n_post_step(self, step, session):
+    if save_steps is None and save_secs is None:
+      raise ValueError("Either save_steps or save_secs should be provided")
+    if (save_steps is not None) and (save_secs is not None):
+      raise ValueError("Can not provide both save_steps and save_secs.")
+
+  def begin(self, max_steps=None):
+    super(CheckpointSaver, self).begin(max_steps)
+    self._last_saved_time = None
+    self._last_begin_step = None
+    self._last_saved_step = None
+
+  def step_begin(self, step):
+    super(CheckpointSaver, self).step_begin(step)
+    self._last_begin_step = step
+
+  def post_step(self, step, session):
+    super(CheckpointSaver, self).post_step(step, session)
+    if self._last_saved_time is None:
+      self._save(step, session)
+
+    if self._save_steps is not None:
+      if step >= self._last_saved_step + self._save_steps:
+        self._save(step, session)
+
+    if self._save_secs is not None:
+      if time.time() >= self._last_saved_time + self._save_secs:
+        self._save(step, session)
+
+  def end(self, session=None):
+    super(CheckpointSaver, self).end(session)
+    self._save(self._last_begin_step, session)
+
+  def _save(self, step, session):
+    """Saves the latest checkpoint."""
+    if step == self._last_saved_step:
+      return
     logging.info("Saving checkpoints for %d into %s.", step, self._save_path)
-    self._saver.save(session, self._save_path, global_step=step)
-    if self._summary_writer:
-      self._summary_writer.add_session_log(
-          SessionLog(status=SessionLog.CHECKPOINT,
-                     checkpoint_path=self._save_path),
-          step)
+    self._last_saved_time = time.time()
+    self._last_saved_step = step
+    if self._saver is None:
+      self._scaffold.saver.save(session, self._save_path, global_step=step)
+    else:
+      self._saver.save(session, self._save_path, global_step=step)
+    self._summary_writer.add_session_log(
+        SessionLog(
+            status=SessionLog.CHECKPOINT, checkpoint_path=self._save_path),
+        step)
 
 
 class StepCounter(EveryN):
diff --git a/tensorflow/contrib/learn/python/learn/supervised_session.py b/tensorflow/contrib/learn/python/learn/supervised_session.py
index 982f49dd70a..07d100fefc7 100644
--- a/tensorflow/contrib/learn/python/learn/supervised_session.py
+++ b/tensorflow/contrib/learn/python/learn/supervised_session.py
@@ -119,47 +119,85 @@ class Scaffold(object):
       keep_checkpoint_max: Optional parameter to use to construct a saver if
         none is already there in the graph.
     """
-    if global_step_tensor is None:
-      global_step_tensor = contrib_variables.get_or_create_global_step()
-    self.global_step_tensor = global_step_tensor
-    if init_op is None:
-      init_op = Scaffold._get_or_default('init_op', ops.GraphKeys.INIT_OP,
-                                         variables.initialize_all_variables)
-    self.init_op = init_op
-    self.init_feed_dict = init_feed_dict
+
     # NOTE(touts): modifying the init function to be passed the scaffold is a
     # hack to make it easy to find the saver.  Is there a better way?
     if init_fn:
-      self.init_fn = lambda sess: init_fn(self, sess)
+      self._init_fn = lambda sess: init_fn(self, sess)
     else:
-      self.init_fn = None
-    if ready_op is None:
-      ready_op = Scaffold._get_or_default(
+      self._init_fn = None
+
+    self._global_step_tensor = global_step_tensor
+    self._init_op = init_op
+    self._ready_op = ready_op
+    self._local_init_op = local_init_op
+    self._summary_op = summary_op
+    self._saver = saver
+    self._keep_checkpoint_max = keep_checkpoint_max
+    self._init_feed_dict = init_feed_dict
+
+  def finalize(self):
+    """Creates operations if needed and finalizes the graph."""
+    if self._global_step_tensor is None:
+      self._global_step_tensor = contrib_variables.get_or_create_global_step()
+    if self._init_op is None:
+      self._init_op = Scaffold._get_or_default(
+          'init_op', ops.GraphKeys.INIT_OP, variables.initialize_all_variables)
+    if self._ready_op is None:
+      self._ready_op = Scaffold._get_or_default(
           'ready_op', ops.GraphKeys.READY_OP,
           variables.report_uninitialized_variables)
-    self.ready_op = ready_op
-    if local_init_op is None:
-      local_init_op = Scaffold._get_or_default('local_init_op',
-                                               ops.GraphKeys.LOCAL_INIT_OP,
-                                               Scaffold._default_local_init_op)
-    self.local_init_op = local_init_op
-    if summary_op is None:
-      summary_op = Scaffold._get_or_default('summary_op',
-                                            ops.GraphKeys.SUMMARY_OP,
-                                            logging_ops.merge_all_summaries)
-    self.summary_op = summary_op
+    if self._local_init_op is None:
+      self._local_init_op = Scaffold._get_or_default(
+          'local_init_op', ops.GraphKeys.LOCAL_INIT_OP,
+          Scaffold._default_local_init_op)
+    if self._summary_op is None:
+      self._summary_op = Scaffold._get_or_default(
+          'summary_op', ops.GraphKeys.SUMMARY_OP,
+          logging_ops.merge_all_summaries)
     # pylint: disable=g-long-lambda
-    if saver is None:
-      saver = Scaffold._get_or_default(
+    if self._saver is None:
+      self._saver = Scaffold._get_or_default(
           'saver',
           ops.GraphKeys.SAVERS,
           lambda: training_saver.Saver(sharded=True,
-                                       max_to_keep=keep_checkpoint_max))
+                                       max_to_keep=self._keep_checkpoint_max))
     # pylint: enable=g-long-lambda
-    self.saver = saver
 
     ops.get_default_graph().finalize()
 
+  @property
+  def global_step_tensor(self):
+    return self._global_step_tensor
+
+  @property
+  def init_fn(self):
+    return self._init_fn
+
+  @property
+  def init_op(self):
+    return self._init_op
+
+  @property
+  def ready_op(self):
+    return self._ready_op
+
+  @property
+  def local_init_op(self):
+    return self._local_init_op
+
+  @property
+  def summary_op(self):
+    return self._summary_op
+
+  @property
+  def saver(self):
+    return self._saver
+
+  @property
+  def init_feed_dict(self):
+    return self._init_feed_dict
+
   @staticmethod
   def _get_or_default(arg_name, collection_key, default_constructor):
     """Get from cache or create a default operation."""
@@ -213,9 +251,10 @@ class SupervisedSession(object):
     self._config = config
     self._monitors = monitors or []
     self._scaffold = scaffold or Scaffold()
-    # Finalize and write the graph.
-    self._graph.finalize()
+    for monitor in self._monitors:
+      monitor.begin(max_steps=None)
     # Create the session.
+    self._scaffold.finalize()
     self._session_manager = sm.SessionManager(
         local_init_op=self._scaffold.local_init_op,
         ready_op=self._scaffold.ready_op,
@@ -223,8 +262,6 @@ class SupervisedSession(object):
     self._sess = recoverable_session.RecoverableSession(self._create_session)
     # Call the begin() method of monitors.
     self._init_step = self._tf_sess.run(self._scaffold.global_step_tensor)
-    for monitor in self._monitors:
-      monitor.begin(max_steps=None)
     # Write the graph out, note: this uses self._init_step.
     self.write_graph()
 
diff --git a/tensorflow/contrib/learn/python/learn/tests/coordinated_session_test.py b/tensorflow/contrib/learn/python/learn/tests/coordinated_session_test.py
index aad9b71d453..72ff75fbfde 100644
--- a/tensorflow/contrib/learn/python/learn/tests/coordinated_session_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/coordinated_session_test.py
@@ -76,9 +76,8 @@ class CoordinatedSessionTest(tf.test.TestCase):
       self.assertFalse(coord_sess.should_stop())
       self.assertEqual(0, coord_sess.run(c))
       self.assertEqual(1, coord_sess.run(v, feed_dict={c: 1}))
-      with self.assertRaisesRegexp(tf.errors.InvalidArgumentError,
-                                   'both fed and fetched'):
-        coord_sess.run(c, feed_dict={c: 2})
+      with self.assertRaisesRegexp(TypeError, 'None has invalid type'):
+        coord_sess.run([None], feed_dict={c: 2})
       self.assertTrue(coord.should_stop())
       self.assertTrue(coord_sess.should_stop())
 
@@ -101,9 +100,8 @@ class CoordinatedSessionTest(tf.test.TestCase):
       self.assertEqual(1, coord_sess.run(v, feed_dict={c: 1}))
       for t in threads:
         self.assertTrue(t.is_alive())
-      with self.assertRaisesRegexp(tf.errors.InvalidArgumentError,
-                                   'both fed and fetched'):
-        coord_sess.run(c, feed_dict={c: 2})
+      with self.assertRaisesRegexp(TypeError, 'None has invalid type'):
+        coord_sess.run([None], feed_dict={c: 2})
       for t in threads:
         self.assertFalse(t.is_alive())
       self.assertTrue(coord.should_stop())
diff --git a/tensorflow/contrib/learn/python/learn/tests/data_feeder_test.py b/tensorflow/contrib/learn/python/learn/tests/data_feeder_test.py
index 89e4186e253..fe675e31229 100644
--- a/tensorflow/contrib/learn/python/learn/tests/data_feeder_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/data_feeder_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 import six
+from six.moves import xrange  # pylint: disable=redefined-builtin
 
 import tensorflow as tf
 # pylint: disable=wildcard-import
@@ -31,6 +32,68 @@ class DataFeederTest(tf.test.TestCase):
   # pylint: disable=undefined-variable
   """Tests for `DataFeeder`."""
 
+  def _assert_raises(self, input_data):
+    with self.assertRaisesRegexp(TypeError, 'annot convert'):
+      data_feeder.DataFeeder(input_data, None, n_classes=0, batch_size=1)
+
+  def test_input_uint32(self):
+    self._assert_raises(np.matrix([[1, 2], [3, 4]], dtype=np.uint32))
+
+  def test_input_uint64(self):
+    self._assert_raises(np.matrix([[1, 2], [3, 4]], dtype=np.uint64))
+
+  def _assert_dtype(self, expected_np_dtype, expected_tf_dtype, input_data):
+    feeder = data_feeder.DataFeeder(input_data, None, n_classes=0, batch_size=1)
+    self.assertEqual(expected_np_dtype, feeder.input_dtype)
+    with tf.Graph().as_default() as g, self.test_session(g):
+      inp, _ = feeder.input_builder()
+      self.assertEqual(expected_tf_dtype, inp.dtype)
+
+  def test_input_int8(self):
+    self._assert_dtype(
+        np.int8, tf.int8, np.matrix([[1, 2], [3, 4]], dtype=np.int8))
+
+  def test_input_int16(self):
+    self._assert_dtype(
+        np.int16, tf.int16, np.matrix([[1, 2], [3, 4]], dtype=np.int16))
+
+  def test_input_int32(self):
+    self._assert_dtype(
+        np.int32, tf.int32, np.matrix([[1, 2], [3, 4]], dtype=np.int32))
+
+  def test_input_int64(self):
+    self._assert_dtype(
+        np.int64, tf.int64, np.matrix([[1, 2], [3, 4]], dtype=np.int64))
+
+  def test_input_uint8(self):
+    self._assert_dtype(
+        np.uint8, tf.uint8, np.matrix([[1, 2], [3, 4]], dtype=np.uint8))
+
+  def test_input_uint16(self):
+    self._assert_dtype(
+        np.uint16, tf.uint16, np.matrix([[1, 2], [3, 4]], dtype=np.uint16))
+
+  def test_input_float16(self):
+    self._assert_dtype(
+        np.float16, tf.float16, np.matrix([[1, 2], [3, 4]], dtype=np.float16))
+
+  def test_input_float32(self):
+    self._assert_dtype(
+        np.float32, tf.float32, np.matrix([[1, 2], [3, 4]], dtype=np.float32))
+
+  def test_input_float64(self):
+    self._assert_dtype(
+        np.float64, tf.float64, np.matrix([[1, 2], [3, 4]], dtype=np.float64))
+
+  def test_input_bool(self):
+    self._assert_dtype(
+        np.bool, tf.bool,
+        np.array([[False for _ in xrange(2)] for _ in xrange(2)]))
+
+  def test_input_string(self):
+    input_data = np.array([['str%d' % i for i in xrange(2)] for _ in xrange(2)])
+    self._assert_dtype(input_data.dtype, tf.string, input_data)
+
   def test_unsupervised(self):
     data = np.matrix([[1, 2], [2, 3], [3, 4]])
     feeder = data_feeder.DataFeeder(data, None, n_classes=0, batch_size=1)
diff --git a/tensorflow/contrib/learn/python/learn/tests/dataframe/boolean_mask_test.py b/tensorflow/contrib/learn/python/learn/tests/dataframe/boolean_mask_test.py
index b3af36b52cf..1e3a069b6da 100644
--- a/tensorflow/contrib/learn/python/learn/tests/dataframe/boolean_mask_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/dataframe/boolean_mask_test.py
@@ -1,4 +1,4 @@
-# Copyright 2016 Google Inc. All Rights Reserved.
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/learn/python/learn/tests/dataframe/estimator_utils_test.py b/tensorflow/contrib/learn/python/learn/tests/dataframe/estimator_utils_test.py
index 9fc1360ca32..0aeecc50158 100644
--- a/tensorflow/contrib/learn/python/learn/tests/dataframe/estimator_utils_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/dataframe/estimator_utils_test.py
@@ -1,4 +1,3 @@
-# pylint: disable=g-bad-file-header
 # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py b/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py
index 14e283cb791..d7e2fe684b8 100644
--- a/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py
@@ -208,12 +208,17 @@ class TensorFlowDataFrameTestCase(tf.test.TestCase):
     tensorflow_df = df.TensorFlowDataFrame.from_csv(
         [data_path],
         batch_size=batch_size,
-        num_epochs=num_epochs,
         shuffle=False,
         default_values=default_values)
-    actual_num_batches = len(list(tensorflow_df.run()))
+    result_batches = list(tensorflow_df.run(num_epochs=num_epochs))
+    actual_num_batches = len(result_batches)
     self.assertEqual(expected_num_batches, actual_num_batches)
 
+    # TODO(soergel): figure out how to dequeue the final small batch
+    expected_rows = 1696  # num_epochs * 100
+    actual_rows = sum([len(x["int"]) for x in result_batches])
+    self.assertEqual(expected_rows, actual_rows)
+
   def testFromCSVWithFeatureSpec(self):
     if not HAS_PANDAS:
       return
@@ -297,6 +302,53 @@ class TensorFlowDataFrameTestCase(tf.test.TestCase):
         expected_value = expected_row[ind[1]]
         np.testing.assert_array_equal(expected_value, val)
 
+  def testSplitString(self):
+    batch_size = 8
+    num_epochs = 17
+    expected_num_batches = (num_epochs * 100) // batch_size
+
+    data_path = _make_test_csv()
+    default_values = [0, 0.0, 0, ""]
+
+    tensorflow_df = df.TensorFlowDataFrame.from_csv(
+        [data_path],
+        batch_size=batch_size,
+        shuffle=False,
+        default_values=default_values)
+
+    a, b = tensorflow_df.split("string", 0.7)  # no rebatching
+
+    total_result_batches = list(tensorflow_df.run(num_epochs=num_epochs))
+    a_result_batches = list(a.run(num_epochs=num_epochs))
+    b_result_batches = list(b.run(num_epochs=num_epochs))
+
+    self.assertEqual(expected_num_batches, len(total_result_batches))
+    self.assertEqual(expected_num_batches, len(a_result_batches))
+    self.assertEqual(expected_num_batches, len(b_result_batches))
+
+    total_rows = sum([len(x["int"]) for x in total_result_batches])
+    a_total_rows = sum([len(x["int"]) for x in a_result_batches])
+    b_total_rows = sum([len(x["int"]) for x in b_result_batches])
+
+    print("Split rows: %s => %s, %s" % (total_rows, a_total_rows, b_total_rows))
+
+    # TODO(soergel): figure out how to dequeue the final small batch
+    expected_total_rows = 1696  # (num_epochs * 100)
+
+    self.assertEqual(expected_total_rows, total_rows)
+    self.assertEqual(1087, a_total_rows)  # stochastic but deterministic
+    # self.assertEqual(int(total_rows * 0.7), a_total_rows)
+    self.assertEqual(609, b_total_rows)  # stochastic but deterministic
+    # self.assertEqual(int(total_rows * 0.3), b_total_rows)
+
+    # The strings used for hashing were all unique in the original data, but
+    # we ran 17 epochs, so each one should appear 17 times.  Each copy should
+    # be hashed into the same partition, so there should be no overlap of the
+    # keys.
+    a_strings = set([s for x in a_result_batches for s in x["string"]])
+    b_strings = set([s for x in b_result_batches for s in x["string"]])
+    self.assertEqual(frozenset(), a_strings & b_strings)
+
 
 if __name__ == "__main__":
   tf.test.main()
diff --git a/tensorflow/contrib/learn/python/learn/tests/dataframe/unary_transform_test.py b/tensorflow/contrib/learn/python/learn/tests/dataframe/unary_transform_test.py
index 05b625ee05d..0c317966af3 100644
--- a/tensorflow/contrib/learn/python/learn/tests/dataframe/unary_transform_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/dataframe/unary_transform_test.py
@@ -1,4 +1,4 @@
-# Copyright 2016 Google Inc. All Rights Reserved.
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -34,7 +34,12 @@ class UnaryTestCase(tf.test.TestCase):
   @classmethod
   def add_test_case(cls, name, op, np_dtype=float):
     def _test(self):
-      arr = np.arange(NUMPY_ARRAY_SIZE, dtype=np_dtype)
+      if np_dtype == bool:
+        arr = np.array([True] * int(NUMPY_ARRAY_SIZE/2) +
+                       [False] * int(NUMPY_ARRAY_SIZE/2))
+        np.random.shuffle(arr)
+      else:
+        arr = np.arange(NUMPY_ARRAY_SIZE, dtype=np_dtype)
       frame = df.TensorFlowDataFrame.from_numpy(arr,
                                                 batch_size=NUMPY_ARRAY_SIZE,
                                                 shuffle=False)
diff --git a/tensorflow/contrib/learn/python/learn/tests/experiment_test.py b/tensorflow/contrib/learn/python/learn/tests/experiment_test.py
index d44ace1be4a..0ccb7b03ed7 100644
--- a/tensorflow/contrib/learn/python/learn/tests/experiment_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/experiment_test.py
@@ -23,7 +23,7 @@ import tensorflow as tf
 from tensorflow.contrib.learn.python.learn import runner_flags  # pylint: disable=unused-import
 
 
-class TestEstimator(object):
+class TestEstimator(tf.contrib.learn.Evaluable, tf.contrib.learn.Trainable):
 
   def __init__(self):
     self.eval_count = 0
diff --git a/tensorflow/contrib/learn/python/learn/tests/graph_actions_test.py b/tensorflow/contrib/learn/python/learn/tests/graph_actions_test.py
index 14a0c2c58ea..1acee3c4a32 100644
--- a/tensorflow/contrib/learn/python/learn/tests/graph_actions_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/graph_actions_test.py
@@ -207,9 +207,8 @@ class GraphActionsTest(tf.test.TestCase):
     with tf.Graph().as_default() as g, self.test_session(g):
       self._assert_ckpt(self._output_dir, False)
       in0, _, _ = self._build_inference_graph()
-      with self.assertRaisesRegexp(
-          tf.errors.InvalidArgumentError, 'both fed and fetched'):
-        learn.graph_actions.infer(None, {'a': in0}, feed_dict={in0: 4.0})
+      with self.assertRaisesRegexp(TypeError, 'Can not convert a NoneType'):
+        learn.graph_actions.infer(None, {'a': in0}, feed_dict={None: 4.0})
       self._assert_ckpt(self._output_dir, False)
 
   def test_infer_feed(self):
diff --git a/tensorflow/contrib/learn/python/learn/tests/monitors_test.py b/tensorflow/contrib/learn/python/learn/tests/monitors_test.py
index 29ec17400fc..574e9d13dd3 100644
--- a/tensorflow/contrib/learn/python/learn/tests/monitors_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/monitors_test.py
@@ -19,11 +19,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import shutil
+import tempfile
+import time
+
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
 
 from tensorflow.contrib import testing
 from tensorflow.contrib.learn.python import learn
+from tensorflow.contrib.learn.python.learn import supervised_session
 from tensorflow.python.platform import tf_logging as logging
 
 
@@ -328,5 +333,126 @@ class StopAtStepTest(tf.test.TestCase):
     self.assertTrue(m.step_end(15, None))
 
 
+class CheckpointSaverTest(tf.test.TestCase):
+
+  def setUp(self):
+    self.model_dir = tempfile.mkdtemp()
+    self.graph = tf.Graph()
+    with self.graph.as_default():
+      self.scaffold = supervised_session.Scaffold()
+      self.global_step = tf.contrib.framework.get_or_create_global_step()
+      self.train_op = tf.assign_add(self.global_step, 1)
+
+  def tearDown(self):
+    shutil.rmtree(self.model_dir, ignore_errors=True)
+
+  def _run(self, monitor, step, train_op, sess):
+    monitor.step_begin(step)
+    sess.run(train_op)
+    monitor.post_step(step, sess)
+
+  def test_raise_in_both_secs_and_steps(self):
+    with self.assertRaises(ValueError):
+      learn.monitors.CheckpointSaver(
+          self.model_dir, save_secs=10, save_steps=20)
+
+  def test_raise_in_none_secs_and_steps(self):
+    with self.assertRaises(ValueError):
+      learn.monitors.CheckpointSaver(self.model_dir)
+
+  def test_save_secs_saves_in_first_step(self):
+    with self.graph.as_default():
+      monitor = learn.monitors.CheckpointSaver(
+          self.model_dir, save_secs=2, scaffold=self.scaffold)
+      monitor.begin()
+      self.scaffold.finalize()
+      with tf.Session() as sess:
+        sess.run(self.scaffold.init_op)
+        self._run(monitor, 1, self.train_op, sess)
+        self.assertEqual(1, tf.contrib.framework.load_variable(
+            self.model_dir, self.global_step.name))
+
+  def test_save_secs_saves_periodically(self):
+    with self.graph.as_default():
+      monitor = learn.monitors.CheckpointSaver(
+          self.model_dir, save_secs=2, scaffold=self.scaffold)
+      monitor.begin()
+      self.scaffold.finalize()
+      with tf.Session() as sess:
+        sess.run(self.scaffold.init_op)
+        self._run(monitor, 1, self.train_op, sess)
+        self._run(monitor, 2, self.train_op, sess)
+        # Not saved
+        self.assertEqual(1, tf.contrib.framework.load_variable(
+            self.model_dir, self.global_step.name))
+        time.sleep(2.5)
+        self._run(monitor, 3, self.train_op, sess)
+        # saved
+        self.assertEqual(3, tf.contrib.framework.load_variable(
+            self.model_dir, self.global_step.name))
+        self._run(monitor, 4, self.train_op, sess)
+        self._run(monitor, 5, self.train_op, sess)
+        # Not saved
+        self.assertEqual(3, tf.contrib.framework.load_variable(
+            self.model_dir, self.global_step.name))
+        time.sleep(2.5)
+        self._run(monitor, 6, self.train_op, sess)
+        # saved
+        self.assertEqual(6, tf.contrib.framework.load_variable(
+            self.model_dir, self.global_step.name))
+
+  def test_save_steps_saves_in_first_step(self):
+    with self.graph.as_default():
+      monitor = learn.monitors.CheckpointSaver(
+          self.model_dir, save_steps=2, scaffold=self.scaffold)
+      monitor.begin()
+      self.scaffold.finalize()
+      with tf.Session() as sess:
+        sess.run(self.scaffold.init_op)
+        self._run(monitor, 1, self.train_op, sess)
+        self.assertEqual(1, tf.contrib.framework.load_variable(
+            self.model_dir, self.global_step.name))
+
+  def test_save_steps_saves_periodically(self):
+    with self.graph.as_default():
+      monitor = learn.monitors.CheckpointSaver(
+          self.model_dir, save_steps=2, scaffold=self.scaffold)
+      monitor.begin()
+      self.scaffold.finalize()
+      with tf.Session() as sess:
+        sess.run(self.scaffold.init_op)
+        self._run(monitor, 1, self.train_op, sess)
+        self._run(monitor, 2, self.train_op, sess)
+        # Not saved
+        self.assertEqual(1, tf.contrib.framework.load_variable(
+            self.model_dir, self.global_step.name))
+        self._run(monitor, 3, self.train_op, sess)
+        # saved
+        self.assertEqual(3, tf.contrib.framework.load_variable(
+            self.model_dir, self.global_step.name))
+        self._run(monitor, 4, self.train_op, sess)
+        # Not saved
+        self.assertEqual(3, tf.contrib.framework.load_variable(
+            self.model_dir, self.global_step.name))
+        self._run(monitor, 5, self.train_op, sess)
+        # saved
+        self.assertEqual(5, tf.contrib.framework.load_variable(
+            self.model_dir, self.global_step.name))
+
+  def test_save_saves_at_end(self):
+    with self.graph.as_default():
+      monitor = learn.monitors.CheckpointSaver(
+          self.model_dir, save_secs=2, scaffold=self.scaffold)
+      monitor.begin()
+      self.scaffold.finalize()
+      with tf.Session() as sess:
+        sess.run(self.scaffold.init_op)
+        self._run(monitor, 1, self.train_op, sess)
+        self._run(monitor, 2, self.train_op, sess)
+        monitor.end(sess)
+        self.assertEqual(2, tf.contrib.framework.load_variable(
+            self.model_dir, self.global_step.name))
+
+
 if __name__ == '__main__':
   tf.test.main()
diff --git a/tensorflow/contrib/learn/python/learn/tests/supervised_session_test.py b/tensorflow/contrib/learn/python/learn/tests/supervised_session_test.py
index 203878010d7..722333f62f1 100644
--- a/tensorflow/contrib/learn/python/learn/tests/supervised_session_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/supervised_session_test.py
@@ -30,9 +30,21 @@ from tensorflow.contrib.learn.python.learn import supervised_session
 class ScaffoldTest(tf.test.TestCase):
   """Scaffold tests."""
 
+  def test_nothing_created_before_finalize(self):
+    with tf.Graph().as_default():
+      scaffold = supervised_session.Scaffold()
+      self.assertEqual(None, scaffold.global_step_tensor)
+      self.assertEqual(None, scaffold.init_op)
+      self.assertEqual(None, scaffold.init_feed_dict)
+      self.assertEqual(None, scaffold.init_fn)
+      self.assertEqual(None, scaffold.ready_op)
+      self.assertEqual(None, scaffold.local_init_op)
+      self.assertEqual(None, scaffold.saver)
+
   def test_defaults_empty_graph(self):
     with tf.Graph().as_default():
       scaffold = supervised_session.Scaffold()
+      scaffold.finalize()
       self.assertTrue(isinstance(scaffold.global_step_tensor, tf.Variable))
       self.assertTrue(isinstance(scaffold.init_op, tf.Operation))
       self.assertEqual(None, scaffold.init_feed_dict)
@@ -49,7 +61,9 @@ class ScaffoldTest(tf.test.TestCase):
   def test_caches_values(self):
     with tf.Graph().as_default():
       scaffold1 = supervised_session.Scaffold()
+      scaffold1.finalize()
       scaffold2 = supervised_session.Scaffold()
+      scaffold2.finalize()
       self.assertEqual(scaffold1.global_step_tensor,
                        scaffold2.global_step_tensor)
       self.assertEqual(scaffold1.init_op, scaffold2.init_op)
@@ -63,7 +77,7 @@ class ScaffoldTest(tf.test.TestCase):
       tf.add_to_collection(tf.GraphKeys.SAVERS, tf.train.Saver())
       tf.add_to_collection(tf.GraphKeys.SAVERS, tf.train.Saver())
       with self.assertRaisesRegexp(RuntimeError, 'More than one item'):
-        supervised_session.Scaffold()
+        supervised_session.Scaffold().finalize()
 
   def test_uses_passed_values(self):
     with tf.Graph().as_default():
@@ -74,6 +88,7 @@ class ScaffoldTest(tf.test.TestCase):
                                              ready_op=5,
                                              local_init_op=6,
                                              saver=7)
+      scaffold.finalize()
       self.assertEqual(1, scaffold.global_step_tensor)
       self.assertEqual(2, scaffold.init_op)
       self.assertEqual(3, scaffold.init_feed_dict)
@@ -84,7 +99,7 @@ class ScaffoldTest(tf.test.TestCase):
 
   def test_graph_is_finalized(self):
     with tf.Graph().as_default():
-      supervised_session.Scaffold()
+      supervised_session.Scaffold().finalize()
       with self.assertRaisesRegexp(RuntimeError,
                                    'Graph is finalized and cannot be modified'):
         tf.constant([0])
@@ -214,7 +229,7 @@ class SupervisedSessionTest(tf.test.TestCase):
       # Use a monitor to save the model every 100 steps.  It also saves it at
       # the end.
       monitors = [tf.contrib.learn.monitors.CheckpointSaver(
-          100, scaffold.saver, logdir)]
+          logdir, save_steps=1, scaffold=scaffold)]
       with supervised_session.SupervisedSession('', scaffold=scaffold,
                                                 checkpoint_dir=logdir,
                                                 monitors=monitors) as session:
@@ -262,7 +277,7 @@ class SupervisedSessionTest(tf.test.TestCase):
           3, tf.errors.AbortedError(None, None, 'Abort'))
       # Save after each step.
       ckpt_monitor = tf.contrib.learn.monitors.CheckpointSaver(
-          1, scaffold.saver, logdir)
+          logdir, save_steps=1, scaffold=scaffold)
       monitors = [abort_monitor, ckpt_monitor]
       with supervised_session.SupervisedSession('', scaffold=scaffold,
                                                 checkpoint_dir=logdir,
diff --git a/tensorflow/contrib/learn/python/learn/trainable.py b/tensorflow/contrib/learn/python/learn/trainable.py
new file mode 100644
index 00000000000..de82ae6e1d9
--- /dev/null
+++ b/tensorflow/contrib/learn/python/learn/trainable.py
@@ -0,0 +1,63 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""`Trainable` interface."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+
+class Trainable(object):
+  """Interface for objects that are trainable by, e.g., `Experiment`.
+  """
+  __metaclass__ = abc.ABCMeta
+
+  @abc.abstractmethod
+  def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None,
+          monitors=None, max_steps=None):
+    """Trains a model given training data `x` predictions and `y` targets.
+
+    Args:
+      x: Matrix of shape [n_samples, n_features...]. Can be iterator that
+         returns arrays of features. The training input samples for fitting the
+         model. If set, `input_fn` must be `None`.
+      y: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
+         iterator that returns array of targets. The training target values
+         (class labels in classification, real numbers in regression). If set,
+         `input_fn` must be `None`.
+      input_fn: Input function. If set, `x`, `y`, and `batch_size` must be
+        `None`.
+      steps: Number of steps for which to train model. If `None`, train forever.
+        If set, `max_steps` must be `None`.
+      batch_size: minibatch size to use on the input, defaults to first
+        dimension of `x`. Must be `None` if `input_fn` is provided.
+      monitors: List of `BaseMonitor` subclass instances. Used for callbacks
+        inside the training loop.
+      max_steps: Number of total steps for which to train model. If `None`,
+        train forever. If set, `steps` must be `None`.
+
+        Two calls to `fit(steps=100)` means 200 training
+        iterations. On the other hand, two calls to `fit(max_steps=100)` means
+        that the second call will not do any iteration since first call did
+        all 100 steps.
+
+    Returns:
+      `self`, for chaining.
+    """
+    raise NotImplementedError
+
diff --git a/tensorflow/contrib/linear_optimizer/kernels/sdca_ops.cc b/tensorflow/contrib/linear_optimizer/kernels/sdca_ops.cc
index 8f01552defc..49d0cdd98c6 100644
--- a/tensorflow/contrib/linear_optimizer/kernels/sdca_ops.cc
+++ b/tensorflow/contrib/linear_optimizer/kernels/sdca_ops.cc
@@ -61,8 +61,9 @@ using UnalignedInt64Vector = TTypes<const int64>::UnalignedConstVec;
 
 // Statistics computed with input (ModelWeights, Example).
 struct ExampleStatistics {
-  // feature_weights dot feature_values for the example
+  // feature_weights dot feature_values for the example.
   double wx = 0;
+
   // sum of squared feature values occurring in the example divided by
   // L2 * sum(example_weights).
   double normalized_squared_norm = 0;
@@ -76,21 +77,26 @@ class Regularizations {
   Status Initialize(OpKernelConstruction* const context) {
     TF_RETURN_IF_ERROR(context->GetAttr("l1", &symmetric_l1_));
     TF_RETURN_IF_ERROR(context->GetAttr("l2", &symmetric_l2_));
-    shrinkage_factor_ = symmetric_l1_ / symmetric_l2_;
+    shrinkage_ = symmetric_l1_ / symmetric_l2_;
     return Status::OK();
   }
 
   // Proximal SDCA shrinking for L1 regularization.
   double Shrink(const double weight) const {
-    const double shrink_weight =
-        std::max(std::abs(weight) - shrinkage_factor_, 0.0);
-    if (shrink_weight > 0.0) {
-      return std::copysign(shrink_weight, weight);
+    const double shrinked = std::max(std::abs(weight) - shrinkage_, 0.0);
+    if (shrinked > 0.0) {
+      return std::copysign(shrinked, weight);
     }
     return 0.0;
   }
 
-  float shrinkage_factor() const { return shrinkage_factor_; }
+  // Vectorized float variant of the above.
+  Eigen::Tensor<float, 1, Eigen::RowMajor> EigenShrink(
+      const Eigen::Tensor<float, 1, Eigen::RowMajor> weights) const {
+    // Proximal step on the weights which is sign(w)*|w - shrinkage|+.
+    return weights.sign() * ((weights.abs() - weights.constant(shrinkage_))
+                                 .cwiseMax(weights.constant(0.0)));
+  }
 
   float symmetric_l2() const { return symmetric_l2_; }
 
@@ -98,42 +104,29 @@ class Regularizations {
   float symmetric_l1_ = 0;
   float symmetric_l2_ = 0;
 
-  // L1 divided by L2, precomputed for use during weight shrinking.
-  double shrinkage_factor_ = 0;
+  // L1 divided by L2, pre-computed for use during weight shrinking.
+  double shrinkage_ = 0;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Regularizations);
 };
 
-// A dense vector which is a row-slice of the underlying matrix.
-struct DenseVector {
-  // Returns a row slice from the matrix.
-  inline Eigen::TensorMap<Eigen::Tensor<const float, 1, Eigen::RowMajor>> row()
-      const {
-    // TensorMap to a row slice of the matrix.
-    return Eigen::TensorMap<Eigen::Tensor<const float, 1, Eigen::RowMajor>>(
-        data_matrix.data() + row_index * data_matrix.dimension(1),
-        data_matrix.dimension(1));
-  }
-
-  const TTypes<float>::ConstMatrix data_matrix;
-  const int row_index;
-};
-
 class ModelWeights;
 
 // Struct describing a single example.
 class Example {
  public:
-  float example_label() const { return example_label_; }
-  float example_weight() const { return example_weight_; }
-  double squared_norm() const { return squared_norm_; }
-
   // Compute dot product between weights, and example feature values. This
   // method also computes the normalized example norm used in SDCA update.
   const ExampleStatistics ComputeWxAndWeightedExampleNorm(
-      const int num_partitions, const ModelWeights& weights,
+      const int num_partitions, const ModelWeights& model_weights,
       const Regularizations& regularization) const;
 
+  float example_label() const { return example_label_; }
+
+  float example_weight() const { return example_weight_; }
+
+  double squared_norm() const { return squared_norm_; }
+
  private:
   // Sparse features associated with the example.
   // Indices and Values are the associated feature index, and values. Values
@@ -144,7 +137,23 @@ class Example {
     std::unique_ptr<UnalignedFloatVector> values;  // nullptr encodes optional.
   };
   std::vector<SparseFeatures> sparse_features_;
-  std::vector<std::unique_ptr<DenseVector>> dense_values_;
+
+  // A dense vector which is a row-slice of the underlying matrix.
+  struct DenseVector {
+    // Returns a row slice from the matrix.
+    Eigen::TensorMap<Eigen::Tensor<const float, 1, Eigen::RowMajor>> row()
+        const {
+      // TensorMap to a row slice of the matrix.
+      return Eigen::TensorMap<Eigen::Tensor<const float, 1, Eigen::RowMajor>>(
+          data_matrix.data() + row_index * data_matrix.dimension(1),
+          data_matrix.dimension(1));
+    }
+
+    const TTypes<float>::ConstMatrix data_matrix;
+    const int64 row_index;
+  };
+  std::vector<std::unique_ptr<DenseVector>> dense_vectors_;
+
   float example_label_ = 0;
   float example_weight_ = 0;
   double squared_norm_ = 0;  // sum squared norm of the features.
@@ -162,28 +171,32 @@ class ModelWeights {
  public:
   ModelWeights() {}
 
+  // Go through all the features present in the example, and update the
+  // weights based on the dual delta.
   void UpdateDeltaWeights(const Eigen::ThreadPoolDevice& device,
-                          const Example& example, const double dual_delta,
-                          const Regularizations& regularization) {
-    // Go through all the features present in the example, and update the
-    // weights based on the dual delta.
-    for (int j = 0; j < sparse_weights_.size(); ++j) {
+                          const Example& example,
+                          const double normalized_bounded_dual_delta) {
+    // Sparse weights.
+    for (size_t j = 0; j < sparse_weights_.size(); ++j) {
       const Example::SparseFeatures& sparse_features =
           example.sparse_features_[j];
-      for (int k = 0; k < sparse_features.indices->size(); ++k) {
-        double delta_w = dual_delta / regularization.symmetric_l2();
-        if (sparse_features.values) {
-          delta_w *= (*sparse_features.values)(k);
-        }
-        sparse_delta_weights_[j]((*sparse_features.indices)(k)) += delta_w;
+      FeatureWeights* const feature_weights = &sparse_weights_[j];
+      for (int64 k = 0; k < sparse_features.indices->size(); ++k) {
+        const double feature_value = sparse_features.values == nullptr
+                                         ? 1.0
+                                         : (*sparse_features.values)(k);
+        feature_weights->deltas((*sparse_features.indices)(k)) +=
+            feature_value * normalized_bounded_dual_delta;
       }
     }
-    for (int j = 0; j < dense_weights_.size(); ++j) {
-      TTypes<float>::Vec w = dense_delta_weights_[j];
-      w.device(device) =
-          w +
-          (example.dense_values_[j]->row()) *
-              w.constant(dual_delta / regularization.symmetric_l2());
+
+    // Dense weights.
+    for (size_t j = 0; j < dense_weights_.size(); ++j) {
+      const Example::DenseVector& dense_vector = *example.dense_vectors_[j];
+      TTypes<float>::Vec deltas = dense_weights_[j].deltas;
+      deltas.device(device) =
+          deltas +
+          dense_vector.row() * deltas.constant(normalized_bounded_dual_delta);
     }
   }
 
@@ -206,23 +219,22 @@ class ModelWeights {
     // Reads in the weights, and allocates and initializes the delta weights.
     const auto intialize_weights = [&](
         const OpInputList& weight_inputs, OpOutputList* const weight_outputs,
-        std::vector<TTypes<const float>::Vec>* const weights,
-        std::vector<TTypes<float>::Vec>* const delta_weights) {
-
+        std::vector<FeatureWeights>* const feature_weights) {
       for (int i = 0; i < weight_inputs.size(); ++i) {
-        weights->push_back(weight_inputs[i].flat<float>());
         Tensor* delta_t;
         weight_outputs->allocate(i, weight_inputs[i].shape(), &delta_t);
-        auto delta_vec = delta_t->flat<float>();
-        delta_vec.setZero();
-        delta_weights->push_back(delta_vec);
+        auto deltas = delta_t->flat<float>();
+        deltas.setZero();
+        feature_weights->emplace_back(
+            FeatureWeights{weight_inputs[i].flat<float>(), deltas});
       }
     };
 
     intialize_weights(sparse_weights_inputs, &sparse_weights_outputs,
-                      &sparse_weights_, &sparse_delta_weights_);
+                      &sparse_weights_);
     intialize_weights(dense_weights_inputs, &dense_weights_outputs,
-                      &dense_weights_, &dense_delta_weights_);
+                      &dense_weights_);
+
     return Status::OK();
   }
 
@@ -230,11 +242,18 @@ class ModelWeights {
   // TODO(sibyl-Aix6ihai): Refactor this to support both small-batch mode, and large
   // batch mode, where we use sparse storage (hashmap) vs dense storage
   // (vectors).
-  // Weights for each of the feature groups.
-  std::vector<TTypes<const float>::Vec> sparse_weights_;
-  std::vector<TTypes<float>::Vec> sparse_delta_weights_;
-  std::vector<TTypes<const float>::Vec> dense_weights_;
-  std::vector<TTypes<float>::Vec> dense_delta_weights_;
+
+  // Weights relate to a feature group.
+  struct FeatureWeights {
+    // The nominal value of the weight for a feature (indexed by its id).
+    TTypes<const float>::Vec nominals;
+
+    // The accumulated delta weight for a feature (indexed by its id).
+    TTypes<float>::Vec deltas;
+  };
+
+  std::vector<FeatureWeights> sparse_weights_;
+  std::vector<FeatureWeights> dense_weights_;
 
   // Example requires ModelWeights to compute the ExampleStatistics.
   friend class Example;
@@ -243,41 +262,48 @@ class ModelWeights {
 };
 
 const ExampleStatistics Example::ComputeWxAndWeightedExampleNorm(
-    const int num_partitions, const ModelWeights& weights,
+    const int num_partitions, const ModelWeights& model_weights,
     const Regularizations& regularization) const {
   ExampleStatistics result;
+
   result.normalized_squared_norm =
       squared_norm_ / regularization.symmetric_l2();
 
-  const int num_sparse_features = weights.sparse_weights_.size();
   // Compute the w \dot x.
-  for (int j = 0; j < num_sparse_features; ++j) {
+
+  // Sparse features contribution.
+  for (size_t j = 0; j < sparse_features_.size(); ++j) {
     const Example::SparseFeatures& sparse_features = sparse_features_[j];
-    const int num_features = sparse_features.indices->size();
-    for (int k = 0; k < num_features; ++k) {
-      const int feature_index = (*sparse_features.indices)(k);
-      const float w = regularization.Shrink(
-          (weights.sparse_weights_[j](feature_index) +
-           num_partitions * weights.sparse_delta_weights_[j](feature_index)));
-      if (sparse_features.values) {
-        result.wx += (*sparse_features.values)(k)*w;
-      } else {
-        result.wx += w;
-      }
+    const ModelWeights::FeatureWeights& sparse_weights =
+        model_weights.sparse_weights_[j];
+
+    for (int64 k = 0; k < sparse_features.indices->size(); ++k) {
+      const int64 feature_index = (*sparse_features.indices)(k);
+      const double feature_value = sparse_features.values == nullptr
+                                       ? 1.0
+                                       : (*sparse_features.values)(k);
+      const double feature_weight =
+          sparse_weights.nominals(feature_index) +
+          sparse_weights.deltas(feature_index) * num_partitions;
+      result.wx += feature_value * regularization.Shrink(feature_weight);
     }
   }
 
-  for (int j = 0; j < weights.dense_weights_.size(); ++j) {
-    auto w = (weights.dense_weights_[j] +
-              weights.dense_delta_weights_[j] *
-                  weights.dense_delta_weights_[j].constant(num_partitions));
+  // Dense features contribution.
+  for (size_t j = 0; j < dense_vectors_.size(); ++j) {
+    const Example::DenseVector& dense_vector = *dense_vectors_[j];
+    const ModelWeights::FeatureWeights& dense_weights =
+        model_weights.dense_weights_[j];
+
+    const Eigen::Tensor<float, 1, Eigen::RowMajor> feature_weights =
+        dense_weights.nominals +
+        dense_weights.deltas * dense_weights.deltas.constant(num_partitions);
     const Eigen::Tensor<float, 0, Eigen::RowMajor> prediction =
-        ((dense_values_[j]->row()) *
-         (w.sign() * ((w.abs() - w.constant(regularization.shrinkage_factor()))
-                          .cwiseMax(w.constant(0.0)))))
+        (dense_vector.row() * regularization.EigenShrink(feature_weights))
             .sum();
     result.wx += prediction();
   }
+
   return result;
 }
 
@@ -286,13 +312,14 @@ class Examples {
  public:
   Examples() {}
 
-  // Returns features for example at |example_index|.
+  // Returns the Example at |example_index|.
   const Example& example(const int example_index) const {
     return examples_.at(example_index);
   }
 
   int num_examples() const { return examples_.size(); }
-  int num_columns() const { return num_columns_; }
+
+  int num_features() const { return num_features_; }
 
   // Initialize() must be called immediately after construction.
   // TODO(sibyl-Aix6ihai): Refactor/shorten this function.
@@ -300,7 +327,8 @@ class Examples {
                     const int num_sparse_features,
                     const int num_sparse_features_with_values,
                     const int num_dense_features) {
-    num_columns_ = num_sparse_features + num_dense_features;
+    num_features_ = num_sparse_features + num_dense_features;
+
     OpInputList sparse_example_indices_inputs;
     TF_RETURN_IF_ERROR(context->input_list("sparse_example_indices",
                                            &sparse_example_indices_inputs));
@@ -329,9 +357,9 @@ class Examples {
     examples_.clear();
     examples_.resize(num_examples);
     for (int example_id = 0; example_id < num_examples; ++example_id) {
-      Example* example = &examples_[example_id];
+      Example* const example = &examples_[example_id];
       example->sparse_features_.resize(num_sparse_features);
-      example->dense_values_.resize(num_dense_features);
+      example->dense_vectors_.resize(num_dense_features);
       example->example_weight_ = example_weights(example_id);
       example->example_label_ = example_labels(example_id);
     }
@@ -359,7 +387,7 @@ class Examples {
             }
             if (start_id < example_indices.size() &&
                 example_indices(start_id) == example_id) {
-              Example::SparseFeatures* sparse_features =
+              Example::SparseFeatures* const sparse_features =
                   &examples_[example_id].sparse_features_[i];
               sparse_features->indices.reset(new UnalignedInt64Vector(
                   &(feature_indices(start_id)), end_id - start_id));
@@ -370,7 +398,7 @@ class Examples {
                     &(feature_weights(start_id)), end_id - start_id));
               }
             } else {
-              Example::SparseFeatures* sparse_features =
+              Example::SparseFeatures* const sparse_features =
                   &examples_[example_id].sparse_features_[i];
               // Add a Tensor that has size 0.
               sparse_features->indices.reset(
@@ -396,8 +424,8 @@ class Examples {
       Shard(worker_threads.num_threads, worker_threads.workers,
             num_sparse_features, num_examples, parse_partition);
     }
-    // Parse dense.
-    {
+
+    {  // Parse dense.
       auto parse_partition = [&](const int64 begin, const int64 end) {
         // The static_cast here is safe since begin and end can be at most
         // num_examples which is an int.
@@ -405,8 +433,8 @@ class Examples {
           auto dense_features =
               dense_features_inputs[i].template matrix<float>();
           for (int example_id = 0; example_id < num_examples; ++example_id) {
-            examples_[example_id].dense_values_[i].reset(
-                new DenseVector{dense_features, example_id});
+            examples_[example_id].dense_vectors_[i].reset(
+                new Example::DenseVector{dense_features, example_id});
           }
         }
       };
@@ -416,16 +444,17 @@ class Examples {
       Shard(worker_threads.num_threads, worker_threads.workers,
             num_dense_features, kCostPerUnit, parse_partition);
     }
-    // Compute norm of examples.
-    {
+
+    {  // Compute norm of examples.
       auto compute_example_norm = [&](const int64 begin, const int64 end) {
         // The static_cast here is safe since begin and end can be at most
         // num_examples which is an int.
-        for (int i = static_cast<int>(begin); i < end; ++i) {
+        for (int example_id = static_cast<int>(begin); example_id < end;
+             ++example_id) {
           double squared_norm = 0;
           for (int j = 0; j < num_sparse_features; ++j) {
             const Example::SparseFeatures& sparse_features =
-                examples_[i].sparse_features_[j];
+                examples_[example_id].sparse_features_[j];
             if (sparse_features.values) {
               const Eigen::Tensor<float, 0, Eigen::RowMajor> sn =
                   sparse_features.values->square().sum();
@@ -436,10 +465,10 @@ class Examples {
           }
           for (int j = 0; j < num_dense_features; ++j) {
             const Eigen::Tensor<float, 0, Eigen::RowMajor> sn =
-                examples_[i].dense_values_[j]->row().square().sum();
+                examples_[example_id].dense_vectors_[j]->row().square().sum();
             squared_norm += sn();
           }
-          examples_[i].squared_norm_ = squared_norm;
+          examples_[example_id].squared_norm_ = squared_norm;
         }
       };
       // TODO(sibyl-Aix6ihai): Compute the cost optimally.
@@ -455,7 +484,8 @@ class Examples {
  private:
   // All examples in the batch.
   std::vector<Example> examples_;
-  int num_columns_;
+
+  int num_features_ = 0;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Examples);
 };
@@ -478,7 +508,6 @@ class DistributedSdcaLargeBatchSolver : public OpKernel {
       OP_REQUIRES(context, false, errors::InvalidArgument(
                                       "Unsupported loss type: ", loss_type));
     }
-
     OP_REQUIRES_OK(context, context->GetAttr("num_sparse_features",
                                              &num_sparse_features_));
     OP_REQUIRES_OK(context,
@@ -558,9 +587,11 @@ class DistributedSdcaLargeBatchSolver : public OpKernel {
             primal_loss, dual_loss);
 
         // Compute new weights.
-        const double bounded_dual_delta = (new_dual - dual) * example_weight;
+        const double normalized_bounded_dual_delta =
+            (new_dual - dual) * example_weight /
+            regularizations_.symmetric_l2();
         model_weights.UpdateDeltaWeights(context->eigen_cpu_device(), example,
-                                         bounded_dual_delta, regularizations_);
+                                         normalized_bounded_dual_delta);
 
         // Update example data.
         example_state_data(example_index, 0) = new_dual;
@@ -571,7 +602,8 @@ class DistributedSdcaLargeBatchSolver : public OpKernel {
     };
     // TODO(sibyl-Aix6ihai): Tune this properly based on sparsity of the data,
     // number of cpus, and cost per example.
-    const int64 kCostPerUnit = examples.num_examples() * examples.num_columns();
+    const int64 kCostPerUnit =
+        examples.num_examples() * examples.num_features();
     const DeviceBase::CpuWorkerThreads& worker_threads =
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers,
@@ -584,11 +616,11 @@ class DistributedSdcaLargeBatchSolver : public OpKernel {
   // template the entire class to avoid the virtual table lookup penalty in
   // the inner loop.
   std::unique_ptr<DualLossUpdater> loss_updater_;
-  int num_sparse_features_;
-  int num_sparse_features_with_values_;
-  int num_dense_features_;
-  int num_inner_iterations_;
-  int num_partitions_;
+  int num_sparse_features_ = 0;
+  int num_sparse_features_with_values_ = 0;
+  int num_dense_features_ = 0;
+  int num_inner_iterations_ = 0;
+  int num_partitions_ = 0;
   Regularizations regularizations_;
 };
 REGISTER_KERNEL_BUILDER(
@@ -612,15 +644,14 @@ class SdcaShrinkL1 : public OpKernel {
                                                         &dense_weights_inputs));
 
     auto shrink_l1 = [&](OpMutableInputList* const inputs) {
+      // TODO(sibyl-Mooth6ku): Maybe parallelize this.
       for (int i = 0; i < inputs->size(); ++i) {
         auto prox_w = inputs->at(i, /*lock_held=*/true).flat<float>();
         prox_w.device(context->eigen_cpu_device()) =
-            prox_w.sign() *
-            ((prox_w.abs() -
-              prox_w.constant(regularizations_.shrinkage_factor()))
-                 .cwiseMax(prox_w.constant(0.0)));
+            regularizations_.EigenShrink(prox_w);
       }
     };
+
     // Shrink both sparse, and dense weights.
     shrink_l1(&sparse_weights_inputs);
     shrink_l1(&dense_weights_inputs);
diff --git a/tensorflow/contrib/metrics/ops/set_ops.cc b/tensorflow/contrib/metrics/ops/set_ops.cc
index 72eb352a460..0db12d05a7c 100644
--- a/tensorflow/contrib/metrics/ops/set_ops.cc
+++ b/tensorflow/contrib/metrics/ops/set_ops.cc
@@ -13,10 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
 
 namespace tensorflow {
 
+using shape_inference::InferenceContext;
+
 REGISTER_OP("SetSize")
     .Input("set_indices: int64")
     .Input("set_values: T")
@@ -24,6 +28,7 @@ REGISTER_OP("SetSize")
     .Attr("validate_indices: bool = true")
     .Attr("T: {int8, int16, int32, int64, uint8, uint16, string}")
     .Output("size: int32")
+    .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 Number of unique elements along last dimension of input `set`.
 
@@ -51,6 +56,12 @@ REGISTER_OP("DenseToDenseSetOperation")
     .Output("result_indices: int64")
     .Output("result_values: T")
     .Output("result_shape: int64")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Matrix(c->Dim(c->input(0), 0), 2));
+      c->set_output(1, c->Vector(c->UnknownDim()));
+      c->set_output(2, c->Vector(c->UnknownDim()));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Applies set operation along last dimension of 2 `Tensor` inputs.
 
@@ -84,6 +95,12 @@ REGISTER_OP("DenseToSparseSetOperation")
     .Output("result_indices: int64")
     .Output("result_values: T")
     .Output("result_shape: int64")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Matrix(c->Dim(c->input(0), 0), 2));
+      c->set_output(1, c->Vector(c->UnknownDim()));
+      c->set_output(2, c->Vector(c->UnknownDim()));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Applies set operation along last dimension of `Tensor` and `SparseTensor`.
 
@@ -132,6 +149,12 @@ REGISTER_OP("SparseToSparseSetOperation")
     .Output("result_indices: int64")
     .Output("result_values: T")
     .Output("result_shape: int64")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Matrix(c->UnknownDim(), 2));
+      c->set_output(1, c->Vector(c->UnknownDim()));
+      c->set_output(2, c->Vector(c->UnknownDim()));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Applies set operation along last dimension of 2 `SparseTensor` inputs.
 
diff --git a/tensorflow/contrib/metrics/python/kernel_tests/histogram_ops_test.py b/tensorflow/contrib/metrics/python/kernel_tests/histogram_ops_test.py
index 15fa107d4fb..6ba3b11f3ba 100644
--- a/tensorflow/contrib/metrics/python/kernel_tests/histogram_ops_test.py
+++ b/tensorflow/contrib/metrics/python/kernel_tests/histogram_ops_test.py
@@ -163,7 +163,7 @@ class AUCUsingHistogramTest(tf.test.TestCase):
                                           self.rng, frac_true)
       # Fetch current auc, and verify that fetching again doesn't change it.
       auc_eval = auc.eval()
-      self.assertEqual(auc_eval, auc.eval())
+      self.assertAlmostEqual(auc_eval, auc.eval(), places=5)
 
     msg = ('nbins: %s, desired_auc: %s, score_range: %s, '
            'num_records: %s, frac_true: %s, num_updates: %s') % (nbins,
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index 916fb0a1720..0671cb3d809 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -24,6 +24,7 @@ py_test(
     deps = [
         ":opt_py",
         "//tensorflow:tensorflow_py",
+        "//tensorflow/python:extra_py_tests_deps",
     ],
 )
 
diff --git a/tensorflow/contrib/opt/python/training/external_optimizer.py b/tensorflow/contrib/opt/python/training/external_optimizer.py
index 0627f5db66b..7629662b079 100644
--- a/tensorflow/contrib/opt/python/training/external_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/external_optimizer.py
@@ -23,6 +23,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import tf_logging as logging
 
 
 __all__ = ['ExternalOptimizerInterface', 'ScipyOptimizerInterface']
@@ -117,24 +118,24 @@ class ExternalOptimizerInterface(object):
       step_callback: A function to be called at each optimization step;
         arguments are the current values of all optimization variables
         flattened into a single vector.
-      loss_callback: A function to be called every time the loss is computed,
-        with evaluated fetches supplied as positional arguments.
-      grad_callback: A function to be called every time the loss gradient is
-        computed, with evaluated fetches supplied as positional arguments.
+      loss_callback: A function to be called every time the loss and gradients
+        are computed, with evaluated fetches supplied as positional arguments.
+      grad_callback: Deprecated.
     """
     session = session or ops.get_default_session()
     feed_dict = feed_dict or {}
     fetches = fetches or []
 
     loss_callback = loss_callback or (lambda *fetches: None)
-    grad_callback = grad_callback or (lambda *fetches: None)
     step_callback = step_callback or (lambda xk: None)
+    # TODO(chapelle): Remove grad_callback (b/30590858)
+    if grad_callback:
+      logging.warn('grad_callback is deprecated. Please use loss_callback.')
 
     # Construct loss function and associated gradient.
-    loss_func = self._make_eval_func(
-        self._loss, session, feed_dict, fetches, loss_callback)
     loss_grad_func = self._make_eval_func(
-        self._packed_loss_grad, session, feed_dict, fetches, grad_callback)
+        [self._loss, self._packed_loss_grad],
+        session, feed_dict, fetches, loss_callback)
 
     # Construct equality constraint functions and associated gradients.
     equality_funcs = self._make_eval_funcs(
@@ -153,8 +154,8 @@ class ExternalOptimizerInterface(object):
 
     # Perform minimization.
     packed_var_val = self._minimize(
-        initial_val=initial_packed_var_val, loss_func=loss_func,
-        loss_grad_func=loss_grad_func, equality_funcs=equality_funcs,
+        initial_val=initial_packed_var_val, loss_grad_func=loss_grad_func,
+        equality_funcs=equality_funcs,
         equality_grad_funcs=equality_grad_funcs,
         inequality_funcs=inequality_funcs,
         inequality_grad_funcs=inequality_grad_funcs,
@@ -166,7 +167,7 @@ class ExternalOptimizerInterface(object):
     session.run(self._var_updates,
                 feed_dict=dict(zip(self._update_placeholders, var_vals)))
 
-  def _minimize(self, initial_val, loss_func, loss_grad_func, equality_funcs,
+  def _minimize(self, initial_val, loss_grad_func, equality_funcs,
                 equality_grad_funcs, inequality_funcs, inequality_grad_funcs,
                 step_callback, optimizer_kwargs):
     """Wrapper for a particular optimization algorithm implementation.
@@ -177,9 +178,8 @@ class ExternalOptimizerInterface(object):
 
     Args:
       initial_val: A NumPy vector of initial values.
-      loss_func: A function accepting a NumPy packed variable vector and
-        returning a loss value.
-      loss_grad_func: A function that computes the gradient of loss_func with
+      loss_grad_func: A function accepting a NumPy packed variable vector and
+        returning two outputs, a loss value and the gradient of that loss with
         respect to the packed variable vector.
       equality_funcs: A list of functions each of which specifies a scalar
         quantity that an optimizer should hold exactly zero.
@@ -209,9 +209,13 @@ class ExternalOptimizerInterface(object):
       flattened = [array_ops.reshape(tensor, [-1]) for tensor in tensors]
       return array_ops.concat(0, flattened)
 
-  def _make_eval_func(self, tensor, session, feed_dict, fetches,
+  def _make_eval_func(self, tensors, session, feed_dict, fetches,
                       callback=None):
-    """Construct a function that evaluates a `Tensor`."""
+    """Construct a function that evaluates a `Tensor` or list of `Tensor`s."""
+    if not isinstance(tensors, list):
+      tensors = [tensors]
+    num_tensors = len(tensors)
+
     def eval_func(x):
       """Function to evaluate a `Tensor`."""
       augmented_feed_dict = {
@@ -219,15 +223,15 @@ class ExternalOptimizerInterface(object):
           for var, packing_slice in zip(self._vars, self._packing_slices)
       }
       augmented_feed_dict.update(feed_dict)
-      augmented_fetches = [tensor] + fetches
+      augmented_fetches = tensors + fetches
 
       augmented_fetch_vals = session.run(
           augmented_fetches, feed_dict=augmented_feed_dict)
 
       if callable(callback):
-        callback(*augmented_fetch_vals[1:])
+        callback(*augmented_fetch_vals[num_tensors:])
 
-      return augmented_fetch_vals[0]
+      return augmented_fetch_vals[:num_tensors]
 
     return eval_func
 
@@ -284,12 +288,13 @@ class ScipyOptimizerInterface(ExternalOptimizerInterface):
 
   _DEFAULT_METHOD = 'L-BFGS-B'
 
-  def _minimize(self, initial_val, loss_func, loss_grad_func, equality_funcs,
+  def _minimize(self, initial_val, loss_grad_func, equality_funcs,
                 equality_grad_funcs, inequality_funcs, inequality_grad_funcs,
                 step_callback, optimizer_kwargs):
-    def grad_func_wrapper(x):
+    def loss_grad_func_wrapper(x):
       # SciPy's L-BFGS-B Fortran implementation requires gradients as doubles.
-      return loss_grad_func(x).astype('float64')
+      loss, gradient = loss_grad_func(x)
+      return loss, gradient.astype('float64')
 
     method = optimizer_kwargs.pop('method', self._DEFAULT_METHOD)
 
@@ -299,9 +304,9 @@ class ScipyOptimizerInterface(ExternalOptimizerInterface):
     for func, grad_func in zip(inequality_funcs, inequality_grad_funcs):
       constraints.append({'type': 'ineq', 'fun': func, 'jac': grad_func})
 
-    minimize_args = [loss_func, initial_val]
+    minimize_args = [loss_grad_func_wrapper, initial_val]
     minimize_kwargs = {
-        'jac': grad_func_wrapper,
+        'jac': True,
         'callback': step_callback,
         'method': method,
         'constraints': constraints,
@@ -313,7 +318,15 @@ class ScipyOptimizerInterface(ExternalOptimizerInterface):
       del minimize_kwargs['callback']
 
     import scipy.optimize  # pylint: disable=g-import-not-at-top
-    return scipy.optimize.minimize(*minimize_args, **minimize_kwargs)['x']
+    result = scipy.optimize.minimize(*minimize_args, **minimize_kwargs)
+    logging.info('Optimization terminated with:\n'
+                 '  Message: %s\n'
+                 '  Objective function value: %f\n'
+                 '  Number of iterations: %d\n'
+                 '  Number of functions evaluations: %d',
+                 result.message, result.fun, result.nit, result.nfev)
+
+    return result['x']
 
 
 def _accumulate(list_):
diff --git a/tensorflow/contrib/opt/python/training/external_optimizer_test.py b/tensorflow/contrib/opt/python/training/external_optimizer_test.py
index 6226f22eae2..95d27d0fe9c 100644
--- a/tensorflow/contrib/opt/python/training/external_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/external_optimizer_test.py
@@ -28,36 +28,22 @@ try:
 except ImportError:
   import builtins
 
-try:
-  import mock
-except ImportError:
-  try:
-    import unittest.mock as mock
-  except ImportError:
-    # At the moment TensorFlow does not have access to mock when in Python 2.7
-    # mode, although mock is part of the standard Python 3 library. If mock is
-    # not available, indicate this by assigning None to it.
-    mock = None
-# pylint: enable=g-import-not-at-top,unused-import
-
 
 class MockOptimizerInterface(tf.contrib.opt.ExternalOptimizerInterface):
 
   NUM_STEP_CALLS = 5
   NUM_LOSS_CALLS = 2
-  NUM_GRAD_CALLS = 3
 
-  def _minimize(self, initial_val, loss_func, loss_grad_func, step_callback,
+  def _minimize(self, initial_val, loss_grad_func, step_callback,
                 optimizer_kwargs, **unused_kwargs):
     """Minimize (x - x0)**2 / 2 with respect to x."""
     for _ in range(self.NUM_LOSS_CALLS):
-      loss_func(initial_val)
-    for _ in range(self.NUM_GRAD_CALLS - 1):
       loss_grad_func(initial_val)
     for _ in range(self.NUM_STEP_CALLS):
       step_callback(initial_val)
 
-    return initial_val - loss_grad_func(initial_val)
+    _, grad = loss_grad_func(initial_val)
+    return initial_val - grad
 
 
 class TestCase(tf.test.TestCase):
@@ -72,30 +58,6 @@ class TestCase(tf.test.TestCase):
 
     super(TestCase, self).assertAllClose(array1, array2, rtol=1e-5, atol=1e-5)
 
-  def mock_import(self, module_name):
-    """Causes importing a specific module to return a mock.MagicMock instance.
-
-    Usage:
-      with mock_import('scipy'):
-        import scipy  # scipy is a MagicMock.
-        x = scipy.blah()[7]  # x is also a MagicMock.
-
-    Args:
-      module_name: Name of module that should be mocked.
-
-    Returns:
-      A context manager for use in a with statement.
-    """
-    orig_import = __import__
-    mocked_module = mock.MagicMock()
-
-    def import_mock(name, *args, **kwargs):
-      if name == module_name:
-        return mocked_module
-      return orig_import(name, *args, **kwargs)
-
-    return mock.patch.object(builtins, '__import__', side_effect=import_mock)
-
 
 class ExternalOptimizerInterfaceTest(TestCase):
 
@@ -123,11 +85,6 @@ class ExternalOptimizerInterfaceTest(TestCase):
       self.assertAllClose(np.arange(6).reshape(2, 3) + 3, sess.run(matrix))
 
   def test_callbacks(self):
-    if mock is None:
-      # This test requires mock. See comment in imports section at top.
-      tf.logging.warning('This test requires mock and will not be run')
-      return
-
     vector_val = np.array([7., -2.], dtype=np.float32)
     vector = tf.Variable(vector_val, 'vector')
 
@@ -146,21 +103,17 @@ class ExternalOptimizerInterfaceTest(TestCase):
 
       extra_fetches = [loss]
 
-      step_callback = mock.Mock()
-      loss_callback = mock.Mock()
-      grad_callback = mock.Mock()
+      step_callback = tf.test.mock.Mock()
+      loss_callback = tf.test.mock.Mock()
 
       optimizer.minimize(
           sess, fetches=extra_fetches, loss_callback=loss_callback,
-          grad_callback=grad_callback, step_callback=step_callback)
+          step_callback=step_callback)
 
-      call = mock.call(loss_val)
+      call = tf.test.mock.call(loss_val)
       loss_calls = [call] * MockOptimizerInterface.NUM_LOSS_CALLS
       loss_callback.assert_has_calls(loss_calls)
 
-      grad_calls = [call] * MockOptimizerInterface.NUM_GRAD_CALLS
-      grad_callback.assert_has_calls(grad_calls)
-
       args, _ = step_callback.call_args
       self.assertAllClose(initial_vector_val, args[0])
 
@@ -168,52 +121,35 @@ class ExternalOptimizerInterfaceTest(TestCase):
 class ScipyOptimizerInterfaceTest(TestCase):
 
   def test_unconstrained(self):
-    if mock is None:
-      # This test requires mock. See comment in imports section at top.
-      tf.logging.warning('This test requires mock and will not be run')
-      return
 
-    vector_initial_value = [7., 7.]
-    vector = tf.Variable(vector_initial_value, 'vector')
+    def objective(x):
+      """Rosenbrock function. (Carl Edward Rasmussen, 2001-07-21).
 
-    # Make norm as small as possible.
-    loss = tf.reduce_sum(tf.square(vector))
+      f(x) = sum_{i=1:D-1} 100*(x(i+1) - x(i)^2)^2 + (1-x(i))^2
 
-    optimizer = tf.contrib.opt.ScipyOptimizerInterface(loss)
+      Args:
+        x: a Variable
+      Returns:
+        f: a tensor (objective value)
+      """
+
+      d = tf.size(x)
+      s = tf.add(100 * tf.square(tf.sub(tf.slice(x, [1], [d - 1]),
+                                        tf.square(tf.slice(x, [0], [d - 1])))),
+                 tf.square(tf.sub(1.0, tf.slice(x, [0], [d - 1]))))
+      return tf.reduce_sum(s)
+
+    dimension = 5
+    x = tf.Variable(tf.zeros(dimension))
+    optimizer = tf.contrib.opt.ScipyOptimizerInterface(objective(x))
 
     with self.test_session() as sess:
       sess.run(tf.initialize_all_variables())
+      optimizer.minimize(sess)
 
-      with self.mock_import('scipy.optimize'):
-        import scipy.optimize  # pylint: disable=g-import-not-at-top
-        # scipy.optimize is now a mock.MagicMock.
-        optimized_vector = np.array([1.23, -0.1])
-        scipy.optimize.minimize.return_value = {'x': optimized_vector}
-        optimizer.minimize(sess)
-
-        self.assertAllClose(optimized_vector, sess.run(vector))
-
-        self.assertEqual(1, len(scipy.optimize.minimize.mock_calls))
-        call_signature = scipy.optimize.minimize.mock_calls[0]
-
-        args = call_signature[1]
-        self.assertEqual(2, len(args))
-        self.assertTrue(callable(args[0]))
-        self.assertAllClose(vector_initial_value, args[1])
-
-        kwargs = call_signature[2]
-        self.assertEqual(4, len(kwargs))
-        self.assertEqual('L-BFGS-B', kwargs['method'])
-        self.assertTrue(callable(kwargs['jac']))
-        self.assertTrue(callable(kwargs['callback']))
-        self.assertEqual([], kwargs['constraints'])
+      self.assertAllClose(np.ones(dimension), sess.run(x))
 
   def test_nonlinear_programming(self):
-    if mock is None:
-      # This test requires mock. See comment in imports section at top.
-      tf.logging.warning('This test requires mock and will not be run')
-      return
-
     vector_initial_value = [7., 7.]
     vector = tf.Variable(vector_initial_value, 'vector')
 
@@ -230,46 +166,8 @@ class ScipyOptimizerInterfaceTest(TestCase):
 
     with self.test_session() as sess:
       sess.run(tf.initialize_all_variables())
-
-      with self.mock_import('scipy.optimize'):
-        import scipy.optimize  # pylint: disable=g-import-not-at-top
-        # scipy.optimize is now a mock.MagicMock.
-        optimized_vector = np.array([1.23, -0.1])
-        scipy.optimize.minimize.return_value = {'x': optimized_vector}
-
-        optimizer.minimize(sess)
-
-        self.assertAllClose(optimized_vector, sess.run(vector))
-
-        self.assertEqual(1, len(scipy.optimize.minimize.mock_calls))
-        call_signature = scipy.optimize.minimize.mock_calls[0]
-
-        args = call_signature[1]
-        self.assertEqual(2, len(args))
-        self.assertTrue(callable(args[0]))
-        self.assertAllClose(vector_initial_value, args[1])
-
-        kwargs = call_signature[2]
-        self.assertEqual(3, len(kwargs))
-        self.assertEqual('SLSQP', kwargs['method'])
-        self.assertTrue(callable(kwargs['jac']))
-        # No callback keyword arg since SLSQP doesn't support it.
-
-        constraints = kwargs['constraints']
-        self.assertEqual(2, len(constraints))
-
-        eq_constraint = constraints[0]
-        self.assertEqual(3, len(eq_constraint))
-        self.assertEqual('eq', eq_constraint['type'])
-        self.assertTrue(callable(eq_constraint['fun']))
-        self.assertTrue(callable(eq_constraint['jac']))
-
-        ineq_constraint = constraints[1]
-        self.assertEqual(3, len(ineq_constraint))
-        self.assertEqual('ineq', ineq_constraint['type'])
-        self.assertTrue(callable(ineq_constraint['fun']))
-        self.assertTrue(callable(ineq_constraint['jac']))
-
+      optimizer.minimize(sess)
+      self.assertAllClose(np.ones(2), sess.run(vector))
 
 if __name__ == '__main__':
   tf.test.main()
diff --git a/tensorflow/contrib/quantization/kernels/quantization_utils.h b/tensorflow/contrib/quantization/kernels/quantization_utils.h
index c9a3c777977..45fda79ce50 100644
--- a/tensorflow/contrib/quantization/kernels/quantization_utils.h
+++ b/tensorflow/contrib/quantization/kernels/quantization_utils.h
@@ -25,7 +25,7 @@ limitations under the License.
 // to avoid a dependency on floating-point hardware.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "external/gemmlowp/public/gemmlowp.h"
+#include "public/gemmlowp.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 
diff --git a/tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc b/tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc
index 647e68ea121..b25bff45a11 100644
--- a/tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc
+++ b/tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <algorithm>
 #include <vector>
 
-#include "external/gemmlowp/public/gemmlowp.h"
+#include "public/gemmlowp.h"
 #include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
 #include "tensorflow/contrib/quantization/kernels/reference_gemm.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/contrib/quantization/kernels/quantized_matmul_op.cc b/tensorflow/contrib/quantization/kernels/quantized_matmul_op.cc
index 21abce932a1..18de2d1d97f 100644
--- a/tensorflow/contrib/quantization/kernels/quantized_matmul_op.cc
+++ b/tensorflow/contrib/quantization/kernels/quantized_matmul_op.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // Implements a quantized eight-bit version of the matmul operation.
 
-#include "external/gemmlowp/public/gemmlowp.h"
+#include "public/gemmlowp.h"
 #include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
 #include "tensorflow/contrib/quantization/kernels/reference_gemm.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/contrib/quantization/kernels/quantized_pooling_ops.cc b/tensorflow/contrib/quantization/kernels/quantized_pooling_ops.cc
index c078de7ab18..33a12c47466 100644
--- a/tensorflow/contrib/quantization/kernels/quantized_pooling_ops.cc
+++ b/tensorflow/contrib/quantization/kernels/quantized_pooling_ops.cc
@@ -29,11 +29,6 @@ limitations under the License.
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#if GOOGLE_CUDA
-#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
-#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
-#endif  // GOOGLE_CUDA
-
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
diff --git a/tensorflow/contrib/quantization/ops/math_ops.cc b/tensorflow/contrib/quantization/ops/math_ops.cc
index 6bc408531aa..ed0930c2d64 100644
--- a/tensorflow/contrib/quantization/ops/math_ops.cc
+++ b/tensorflow/contrib/quantization/ops/math_ops.cc
@@ -80,6 +80,15 @@ REGISTER_OP("QuantizeDownAndShrinkRange")
     .Output("output_max: float")
     .Attr("Tinput: quantizedtype")
     .Attr("out_type: quantizedtype")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::UnchangedShape(c));
+      const Shape* unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    })
     .Doc(R"doc(
 Convert the quantized 'input' tensor into a lower-precision 'output', using the
 actual distribution of the values to maximize the usage of the lower bit depth
diff --git a/tensorflow/contrib/quantization/ops/nn_ops.cc b/tensorflow/contrib/quantization/ops/nn_ops.cc
index 18db2b0eaa2..c33f318c6e7 100644
--- a/tensorflow/contrib/quantization/ops/nn_ops.cc
+++ b/tensorflow/contrib/quantization/ops/nn_ops.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 namespace tensorflow {
 
+using shape_inference::Dimension;
 using shape_inference::InferenceContext;
 using shape_inference::Shape;
 
@@ -292,6 +293,25 @@ REGISTER_OP("QuantizedBatchNormWithGlobalNormalization")
     .Attr("out_type: quantizedtype")
     .Attr("variance_epsilon: float")
     .Attr("scale_after_normalization: bool")
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input));
+
+      const Dimension* last_dim = c->Dim(input, 3);
+      for (int i = 1; i < 5; ++i) {  // covers m, v, beta, gamma
+        const Shape* vec;
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i * 3), 1, &vec));
+        TF_RETURN_IF_ERROR(c->Merge(last_dim, c->Dim(vec, 0), &last_dim));
+      }
+
+      const Shape* out;
+      TF_RETURN_IF_ERROR(c->ReplaceDim(input, 3, last_dim, &out));
+      c->set_output(0, out);
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+
+      return Status::OK();
+    })
     .Doc(R"doc(
 Quantized Batch normalization.
 
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index dffd139ec0d..f69c656c68b 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -18,6 +18,7 @@ py_library(
         ":python/ops/_lstm_ops.so",
     ],
     srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
 )
 
 cuda_py_tests(
diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
index 2ecc415d351..231d13caa6c 100644
--- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
@@ -411,7 +411,7 @@ class LSTMFusedCell(rnn_cell.RNNCell):
     Args:
       num_units: int, The number of units in the LSTM cell.
       forget_bias: float, The bias added to forget gates (see above).
-      use_peephole: Whether to use peephole connectios or not.
+      use_peephole: Whether to use peephole connections or not.
     """
     self._num_units = num_units
     self._forget_bias = forget_bias
diff --git a/tensorflow/contrib/session_bundle/session_bundle.py b/tensorflow/contrib/session_bundle/session_bundle.py
index 50d8be7452a..6f895cb2515 100644
--- a/tensorflow/contrib/session_bundle/session_bundle.py
+++ b/tensorflow/contrib/session_bundle/session_bundle.py
@@ -32,7 +32,7 @@ from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python.lib.io import file_io
 
 
-def LoadSessionBundleFromPath(export_dir, target="", config=None):
+def load_session_bundle_from_path(export_dir, target="", config=None):
   """Load session bundle from the given path.
 
   The function reads input from the export_dir, constructs the graph data to the
diff --git a/tensorflow/contrib/session_bundle/session_bundle_test.py b/tensorflow/contrib/session_bundle/session_bundle_test.py
index a9e157eb196..a080e16d1b4 100644
--- a/tensorflow/contrib/session_bundle/session_bundle_test.py
+++ b/tensorflow/contrib/session_bundle/session_bundle_test.py
@@ -33,7 +33,7 @@ class SessionBundleLoadTest(tf.test.TestCase):
     base_path = tf.test.test_src_dir_path(
         "contrib/session_bundle/example/half_plus_two/00000123")
     tf.reset_default_graph()
-    sess, meta_graph_def = session_bundle.LoadSessionBundleFromPath(
+    sess, meta_graph_def = session_bundle.load_session_bundle_from_path(
         base_path, target="", config=tf.ConfigProto(device_count={"CPU": 2}))
 
     self.assertTrue(sess)
@@ -66,7 +66,7 @@ class SessionBundleLoadTest(tf.test.TestCase):
     base_path = tf.test.test_src_dir_path("/no/such/a/dir")
     tf.reset_default_graph()
     with self.assertRaises(RuntimeError) as cm:
-      _, _ = session_bundle.LoadSessionBundleFromPath(
+      _, _ = session_bundle.load_session_bundle_from_path(
           base_path, target="local",
           config=tf.ConfigProto(device_count={"CPU": 2}))
     self.assertTrue("Expected meta graph file missing" in str(cm.exception))
diff --git a/tensorflow/contrib/slim/python/slim/data/README.md b/tensorflow/contrib/slim/python/slim/data/README.md
new file mode 100644
index 00000000000..858c6949902
--- /dev/null
+++ b/tensorflow/contrib/slim/python/slim/data/README.md
@@ -0,0 +1,153 @@
+# TensorFlow-Slim Data
+
+TF-Slim provides a data loading library for facilitating the reading of data
+from various formats. TF-Slim's data modules are composed of several layers of
+abstraction to make it flexible enough to support multiple file storage types,
+such as TFRecords or Text files, data encoding and features naming schemes.
+
+# Overview
+
+The task of loading data has two main components: (1) specification of how
+a dataset is represented so it can be read and interpreted and (2) instruction
+for providing the data to consumers of the dataset.
+
+Secondly, one must specify instructions for how
+the data is actually provided and housed in memory. For example, if the data is
+sharded over many sources, should it be read in parallel from these sources?
+Should it be read serially? Should the data be shuffled in memory?
+
+# Dataset Specification
+
+TF-Slim defines a dataset to be a set of files (that may or may not be encoded)
+representing a finite set of samples, and which can be read to provide a
+predefined set of entities or `items`. For example, a dataset might be stored
+over thousands of files or a single file. The files might store the data in
+clear text or some advanced encoding scheme. It might provide a single `item`,
+like an image, or several `items`, like an image, a class label and a scene
+label.
+
+More concretely, TF-Slim's
+[dataset](https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/data/dataset.py)
+is a tuple that encapsulates the following elements of a dataset specification:
+
+* `data_sources`: A list of file paths that together make up the dataset
+* `reader`: A TensorFlow
+[Reader](https://www.tensorflow.org/api_docs/python/io_ops.html#ReaderBase)
+appropriate for the file type in `data_sources`.
+* `decoder`: A TF-Slim
+[data_decoder](https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/data/data_decoder.py)
+class which is used to decode the content of the read dataset files.
+* `num_samples`: The number of samples in the dataset.
+* `items_to_descriptions`: A map from the items provided by the dataset to
+descriptions of each.
+
+In a nutshell, a dataset is read by (a) opening the files specified by
+`data_sources` using the given `reader` class (b) decoding the files using
+the given `decoder` and (c) allowing the user to request a list of `items` to
+be returned as `Tensors`.
+
+## Data Decoders
+
+A
+[data_decoder](https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/data/data_decoder.py)
+is a class which is given some (possibly serialized/encoded) data and returns a
+list of `Tensors`. In particular, a given data decoder is able to decode a
+predefined list of `items` and can return a subset or all of them, when
+requested:
+
+```python
+# Load the data
+my_encoded_data = ...
+data_decoder = MyDataDecoder()
+
+# Decode the inputs and labels:
+decoded_input, decoded_labels = data_decoder.Decode(data, ['input', 'labels'])
+
+# Decode just the inputs:
+decoded_input = data_decoder.Decode(data, ['input'])
+
+# Check which items a data decoder knows how to decode:
+for item in data_decoder.list_items():
+  print(item)
+```
+
+## Example: TFExampleDataDecoder
+
+The
+[tfexample_data_decoder.py](https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/data/tfexample_data_decoder.py)
+is a data decoder which decodes serialized `TFExample` protocol buffers. A
+`TFExample` protocol buffer is a map from keys (strings) to either a
+`tf.FixedLenFeature` or `tf.VarLenFeature`. Consequently, to decode a
+`TFExample`, one must provide a mapping from one or more `TFExample` fields
+to each of the `items` that the `tfexample_data_decoder` can provide. For
+example, a dataset of `TFExamples` might store images in various formats and
+each `TFExample` might contain an `encoding` key and a `format` key which can
+be used to decode the image using the appropriate decoder (jpg, png, etc).
+
+To make this possible, the `tfexample_data_decoder` is constructed by specifying
+the a map of `TFExample` keys to either `tf.FixedLenFeature` or
+`tf.VarLenFeature` as well as a set of `ItemHandlers`. An `ItemHandler`
+provides a mapping from `TFExample` keys to the item being provided. Because a
+`tfexample_data_decoder` might return multiple `items`, one often constructs a
+`tfexample_data_decoder` using multiple `ItemHandlers`.
+
+`tfexample_data_decoder` provides some predefined `ItemHandlers` which take care
+of the common cases of mapping `TFExamples` to images, `Tensors` and
+`SparseTensors`. For example, the following specification might be
+used to decode a dataset of images:
+
+```python
+keys_to_features = {
+    'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
+    'image/format': tf.FixedLenFeature((), tf.string, default_value='raw'),
+    'image/class/label': tf.FixedLenFeature(
+        [1], tf.int64, default_value=tf.zeros([1], dtype=tf.int64)),
+}
+
+items_to_handlers = {
+    'image': tfexample_decoder.Image(
+      image_key = 'image/encoded',
+      format_key = 'image/format',
+      shape=[28, 28],
+      channels=1),
+    'label': tfexample_decoder.Tensor('image/class/label'),
+}
+
+decoder = tfexample_decoder.TFExampleDecoder(
+    keys_to_features, items_to_handlers)
+```
+
+Notice that the TFExample is parsed using three keys: `image/encoded`,
+`image/format` and `image/class/label`. Additionally, the first two keys are
+mapped to a single `item` named 'image'. As defined, this `data_decoder`
+provides two `items` named 'image' and 'label'.
+
+# Data Provision
+
+A
+[data_provider](https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/data/data_provider.py)
+is a class which provides `Tensors` for each item requested:
+
+```python
+my_data_provider = ...
+image, class_label, bounding_box = my_data_provider.get(
+    ['image', 'label', 'bb'])
+```
+
+The
+[dataset_data_provider](https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py)
+is a `data_provider` that provides data from a given `dataset` specification:
+
+```python
+dataset = GetDataset(...)
+data_provider = dataset_data_provider.DatasetDataProvider(
+    dataset, common_queue_capacity=32, common_queue_min=8)
+```
+
+The `dataset_data_provider` enables control over several elements of data
+provision:
+
+* How many concurrent readers are used.
+* Whether the data is shuffled as its loaded into its queue
+* Whether to take a single pass over the data or read data indefinitely.
+
diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
index cd052576044..d768722cd8a 100644
--- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
+++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
@@ -1,4 +1,4 @@
-# Copyright 2016 Google Inc. All Rights Reserved.
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -97,49 +97,57 @@ class ItemHandlerCallback(ItemHandler):
 
 
 class Tensor(ItemHandler):
-  """An ItemHandler that returns a parsed Tensor or SparseTensor."""
+  """An ItemHandler that returns a parsed Tensor."""
 
-  def __init__(self, tensor_key, shape_key=None, shape=None, default_value=0):
+  def __init__(self, tensor_key, shape_keys=None, shape=None, default_value=0):
     """Initializes the Tensor handler.
 
     Tensors are, by default, returned without any reshaping. However, there are
-    two mechanisms which allow reshaping to occur at load time. If `shape_key`
-    is provided, both the `Tensor` corresponding to `tensor_key` and `shape_key`
-    is loaded and the former `Tensor` is reshaped with the values of the latter.
-    Alternatively, if a fixed `shape` is provided, the `Tensor` corresponding to
-    `tensor_key` is loaded and reshape appropriately. If neither `shape_key` nor
-    `shape` are provided, the `Tensor` will be returned without any reshaping.
+    two mechanisms which allow reshaping to occur at load time. If `shape_keys`
+    is provided, both the `Tensor` corresponding to `tensor_key` and
+    `shape_keys` is loaded and the former `Tensor` is reshaped with the values
+    of the latter. Alternatively, if a fixed `shape` is provided, the `Tensor`
+    corresponding to `tensor_key` is loaded and reshape appropriately.
+    If neither `shape_keys` nor `shape` are provided, the `Tensor` will be
+    returned without any reshaping.
 
     Args:
       tensor_key: the name of the `TFExample` feature to read the tensor from.
-      shape_key: Optional name of the TF-Example feature in which the tensor
-        shape is stored.
-      shape: Optional output shape of the Tensor. If provided, the `Tensor` is
+      shape_keys: Optional name or list of names of the TF-Example feature in
+        which the tensor shape is stored. If a list, then each corresponds to
+        one dimension of the shape.
+      shape: Optional output shape of the `Tensor`. If provided, the `Tensor` is
         reshaped accordingly.
-      default_value: Scalar value to set when making dense for indices not
-        specified in the `SparseTensor`.
+      default_value: The value used when the `tensor_key` is not found in a
+        particular `TFExample`.
 
     Raises:
-      ValueError: if both `shape_key` and `shape` are specified.
+      ValueError: if both `shape_keys` and `shape` are specified.
     """
-    if shape_key and shape is not None:
-      raise ValueError('Cannot specify both shape_key and shape parameters.')
+    if shape_keys and shape is not None:
+      raise ValueError('Cannot specify both shape_keys and shape parameters.')
+    if shape_keys and not isinstance(shape_keys, list):
+      shape_keys = [shape_keys]
     self._tensor_key = tensor_key
-    self._shape_key = shape_key
+    self._shape_keys = shape_keys
     self._shape = shape
     self._default_value = default_value
     keys = [tensor_key]
-    if shape_key:
-      keys.append(shape_key)
+    if shape_keys:
+      keys.extend(shape_keys)
     super(Tensor, self).__init__(keys)
 
   def tensors_to_item(self, keys_to_tensors):
     tensor = keys_to_tensors[self._tensor_key]
     shape = self._shape
-    if self._shape_key:
-      shape = keys_to_tensors[self._shape_key]
-      if isinstance(shape, ops.SparseTensor):
-        shape = sparse_ops.sparse_tensor_to_dense(shape)
+    if self._shape_keys:
+      shape_dims = []
+      for k in self._shape_keys:
+        shape_dim = keys_to_tensors[k]
+        if isinstance(shape_dim, ops.SparseTensor):
+          shape_dim = sparse_ops.sparse_tensor_to_dense(shape_dim)
+        shape_dims.append(shape_dim)
+      shape = array_ops.squeeze(array_ops.pack(shape_dims))
     if isinstance(tensor, ops.SparseTensor):
       if shape is not None:
         tensor = sparse_ops.sparse_reshape(tensor, shape)
diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py
index 7fd5ac6646a..7f0dd30ed9f 100644
--- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py
+++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py
@@ -315,9 +315,50 @@ class TFExampleDecoderTest(tf.test.TestCase):
       }
       items_to_handlers = {
           'image': slim.tfexample_decoder.Tensor('image',
-                                                 shape_key='image/shape'),
+                                                 shape_keys='image/shape'),
           'labels': slim.tfexample_decoder.Tensor('labels',
-                                                  shape_key='labels/shape'),
+                                                  shape_keys='labels/shape'),
+      }
+      decoder = slim.tfexample_decoder.TFExampleDecoder(
+          keys_to_features, items_to_handlers)
+      [tf_image, tf_labels] = decoder.decode(serialized_example,
+                                             ['image', 'labels'])
+      self.assertAllEqual(tf_image.eval(), np_image)
+      self.assertAllEqual(tf_labels.eval(), np_labels)
+
+  def testDecodeExampleMultiShapeKeyTensor(self):
+    np_image = np.random.rand(2, 3, 1).astype('f')
+    np_labels = np.array([[[1], [2], [3]],
+                          [[4], [5], [6]]])
+    height, width, depth = np_labels.shape
+
+    example = tf.train.Example(features=tf.train.Features(feature={
+        'image': self._EncodedFloatFeature(np_image),
+        'image/shape': self._EncodedInt64Feature(np.array(np_image.shape)),
+        'labels': self._EncodedInt64Feature(np_labels),
+        'labels/height': self._EncodedInt64Feature(np.array([height])),
+        'labels/width': self._EncodedInt64Feature(np.array([width])),
+        'labels/depth': self._EncodedInt64Feature(np.array([depth])),
+    }))
+
+    serialized_example = example.SerializeToString()
+
+    with self.test_session():
+      serialized_example = tf.reshape(serialized_example, shape=[])
+      keys_to_features = {
+          'image': tf.VarLenFeature(dtype=tf.float32),
+          'image/shape': tf.VarLenFeature(dtype=tf.int64),
+          'labels': tf.VarLenFeature(dtype=tf.int64),
+          'labels/height': tf.VarLenFeature(dtype=tf.int64),
+          'labels/width': tf.VarLenFeature(dtype=tf.int64),
+          'labels/depth': tf.VarLenFeature(dtype=tf.int64),
+      }
+      items_to_handlers = {
+          'image': slim.tfexample_decoder.Tensor(
+              'image', shape_keys='image/shape'),
+          'labels': slim.tfexample_decoder.Tensor(
+              'labels',
+              shape_keys=['labels/height', 'labels/width', 'labels/depth']),
       }
       decoder = slim.tfexample_decoder.TFExampleDecoder(
           keys_to_features, items_to_handlers)
diff --git a/tensorflow/contrib/slim/python/slim/evaluation.py b/tensorflow/contrib/slim/python/slim/evaluation.py
index e6314a9ce9c..433e4ae61f0 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation.py
@@ -253,7 +253,8 @@ def evaluation_loop(master,
                     summary_op_feed_dict=None,
                     variables_to_restore=None,
                     eval_interval_secs=60,
-                    max_number_of_evaluations=None):
+                    max_number_of_evaluations=None,
+                    session_config=None):
   """Runs TF-Slim's Evaluation Loop.
 
   Args:
@@ -276,6 +277,8 @@ def evaluation_loop(master,
     eval_interval_secs: The minimum number of seconds between evaluations.
     max_number_of_evaluations: the max number of iterations of the evaluation.
       If the value is left as 'None', the evaluation continues indefinitely.
+    session_config: An instance of `tf.ConfigProto` that will be used to
+      configure the `Session`. If left as `None`, the default will be used.
   """
   if summary_op == _USE_DEFAULT:
     summary_op = logging_ops.merge_all_summaries()
@@ -307,7 +310,8 @@ def evaluation_loop(master,
     logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                            time.gmtime()))
 
-    with sv.managed_session(master, start_standard_services=False) as sess:
+    with sv.managed_session(
+        master, start_standard_services=False, config=session_config) as sess:
       sv.saver.restore(sess, last_checkpoint)
       sv.start_queue_runners(sess)
       evaluation(sess,
diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py
index ccb26bba47b..c6312e4a001 100644
--- a/tensorflow/contrib/slim/python/slim/learning.py
+++ b/tensorflow/contrib/slim/python/slim/learning.py
@@ -538,12 +538,14 @@ def train(
     init_feed_dict=None,
     local_init_op=None,
     init_fn=None,
+    ready_op=_USE_DEFAULT,
     summary_op=_USE_DEFAULT,
     save_summaries_secs=600,
     startup_delay_steps=0,
     saver=None,
     save_interval_secs=600,
-    sync_optimizer=None):
+    sync_optimizer=None,
+    session_config=None):
   """Runs a training loop using a TensorFlow supervisor.
 
   When the sync_optimizer is supplied, gradient updates are applied
@@ -579,6 +581,9 @@ def train(
       `tf.initialize_local_variables()` and `tf.initialize_all_tables()`.
     init_fn: An optional callable to be executed after `init_op` is called. The
       callable must accept one argument, the session being initialized.
+    ready_op: Operation to check if the model is ready to use. If left to its
+      default value, then the session checks for readiness by calling
+      `tf.report_uninitialized_variables()`.
     summary_op: The summary operation.
     save_summaries_secs: How often, in seconds, to save summaries.
     startup_delay_steps: The number of steps to wait for before beginning. Note
@@ -589,6 +594,8 @@ def train(
     sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
       argument is supplied, gradient updates will be synchronous. If left as
       `None`, gradient updates will be asynchronous.
+    session_config: An instance of `tf.ConfigProto` that will be used to
+      configure the `Session`. If left as `None`, the default will be used.
 
   Returns:
     the value of the loss function after training.
@@ -624,6 +631,9 @@ def train(
     if init_op == _USE_DEFAULT:
       init_op = tf_variables.initialize_all_variables()
 
+    if ready_op == _USE_DEFAULT:
+      ready_op = tf_variables.report_uninitialized_variables()
+
     if summary_op == _USE_DEFAULT:
       summary_op = logging_ops.merge_all_summaries()
 
@@ -660,6 +670,7 @@ def train(
       init_op=init_op,
       init_feed_dict=init_feed_dict,
       local_init_op=local_init_op,
+      ready_op=ready_op,
       summary_op=summary_op,
       global_step=global_step,
       saver=saver,
@@ -671,7 +682,8 @@ def train(
   while should_retry:
     try:
       should_retry = False
-      with sv.managed_session(master, start_standard_services=False) as sess:
+      with sv.managed_session(
+          master, start_standard_services=False, config=session_config) as sess:
         logging.info('Starting Session.')
         if is_chief:
           if logdir:
@@ -694,10 +706,11 @@ def train(
           if logdir and sv.is_chief:
             logging.info('Finished training! Saving model to disk.')
             sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
-        finally:
+        except:
           if sv.is_chief and cleanup_op is not None:
             logging.info('About to execute sync_clean_up_op!')
             sess.run(cleanup_op)
+          raise
 
     except errors.AbortedError:
       # Always re-run on AbortedError as it indicates a restart of one of the
diff --git a/tensorflow/contrib/slim/python/slim/learning_test.py b/tensorflow/contrib/slim/python/slim/learning_test.py
index b57c8f8fe63..4b7e42ceb24 100644
--- a/tensorflow/contrib/slim/python/slim/learning_test.py
+++ b/tensorflow/contrib/slim/python/slim/learning_test.py
@@ -362,6 +362,32 @@ class TrainTest(tf.test.TestCase):
     self.assertIsNotNone(loss)
     self.assertLess(loss, .015)
 
+  def testTrainWithSessionConfig(self):
+    g = tf.Graph()
+    with g.as_default():
+      tf.set_random_seed(0)
+      tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
+      tf_labels = tf.constant(self._labels, dtype=tf.float32)
+
+      tf_predictions = LogisticClassifier(tf_inputs)
+      slim.losses.log_loss(tf_predictions, tf_labels)
+      total_loss = slim.losses.get_total_loss()
+
+      optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
+
+      train_op = slim.learning.create_train_op(total_loss, optimizer)
+
+    session_config = tf.ConfigProto(allow_soft_placement=True)
+    loss = slim.learning.train(
+        train_op,
+        None,
+        number_of_steps=300,
+        log_every_n_steps=10,
+        graph=g,
+        session_config=session_config)
+    self.assertIsNotNone(loss)
+    self.assertLess(loss, .015)
+
   def testTrainWithNoneAsLogdirWhenUsingSummariesRaisesError(self):
     with tf.Graph().as_default():
       tf.set_random_seed(0)
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index abf4f0ee1f1..e0a396fb9a5 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -156,6 +156,7 @@ cc_library(
         "lib/io/table_options.h",
         "lib/jpeg/jpeg_mem.h",
         "lib/monitoring/counter.h",
+        "lib/monitoring/export_registry.h",
         "lib/monitoring/metric_def.h",
         "lib/random/distribution_sampler.h",
         "lib/random/philox_random.h",
@@ -172,6 +173,7 @@ cc_library(
         "platform/logging.h",
         "platform/macros.h",
         "platform/mem.h",
+        "platform/net.h",
         "platform/mutex.h",
         "platform/protobuf.h",  # TODO(josh11b): make internal
         "platform/regexp.h",
@@ -1009,6 +1011,7 @@ filegroup(
         "platform/default/protobuf.h",
         "platform/default/thread_annotations.h",
         "platform/env.h",
+        "platform/file_statistics.h",
         "platform/file_system.h",
         "platform/fingerprint.h",
         "platform/host_info.h",
@@ -1016,6 +1019,7 @@ filegroup(
         "platform/macros.h",
         "platform/mem.h",
         "platform/mutex.h",
+        "platform/net.h",
         "platform/platform.h",
         "platform/protobuf.h",
         "platform/strong_hash.h",
@@ -1280,6 +1284,8 @@ tf_cc_tests(
         "lib/io/table_test.cc",
         "lib/io/zlib_buffers_test.cc",
         "lib/monitoring/counter_test.cc",
+        "lib/monitoring/export_registry_test.cc",
+        "lib/monitoring/metric_def_test.cc",
         "lib/random/distribution_sampler_test.cc",
         "lib/random/philox_random_test.cc",
         "lib/random/random_distributions_test.cc",
@@ -1294,6 +1300,7 @@ tf_cc_tests(
         "platform/fingerprint_test.cc",
         "platform/integral_types_test.cc",
         "platform/logging_test.cc",
+        "platform/net_test.cc",
         "platform/port_test.cc",
     ],
     deps = [
@@ -1778,6 +1785,7 @@ tf_cc_tests(
     tests = [
         "ops/array_ops_test.cc",
         "ops/candidate_sampling_ops_test.cc",
+        "ops/control_flow_ops_test.cc",
         "ops/ctc_ops_test.cc",
         "ops/data_flow_ops_test.cc",
         "ops/functional_ops_test.cc",
@@ -1789,6 +1797,7 @@ tf_cc_tests(
         "ops/parsing_ops_test.cc",
         "ops/random_ops_test.cc",
         "ops/sparse_ops_test.cc",
+        "ops/state_ops_test.cc",
         "ops/string_ops_test.cc",
         "ops/training_ops_test.cc",
     ],
diff --git a/tensorflow/core/common_runtime/constant_folding.cc b/tensorflow/core/common_runtime/constant_folding.cc
index 03b93cf9a98..234069a6268 100644
--- a/tensorflow/core/common_runtime/constant_folding.cc
+++ b/tensorflow/core/common_runtime/constant_folding.cc
@@ -39,7 +39,8 @@ namespace tensorflow {
 
 namespace {
 
-bool IsConstantFoldable(const Node* n,
+bool IsConstantFoldable(const FunctionLibraryDefinition* flib_def,
+                        const Node* n,
                         std::function<bool(const Node*)> consider) {
   if (n->op_def().is_stateful()) {
     return false;
@@ -61,18 +62,28 @@ bool IsConstantFoldable(const Node* n,
   if (n->IsSink()) {
     return false;
   }
+  // For now, don't try to constant-fold functions. (They may be inlined, in
+  // which case they will become subject to constant-folding again.)
+  // TODO(phawkins): support constant-folding for functions; functions may
+  // be arbitrarily expensive to execute.
+  if (flib_def && flib_def->Find(n->type_string())) {
+    return false;
+  }
   return true;
 }
 
 // Returns the constant foldable nodes in `nodes_result` in data flow order.
-void FindConstantFoldableNodes(const Graph* graph, ConstantFoldingOptions opts,
+void FindConstantFoldableNodes(const Graph* graph,
+                               const FunctionLibraryDefinition* flib_def,
+                               ConstantFoldingOptions opts,
                                std::vector<Node*>* nodes_result) {
   std::set<const Node*> node_set;
   std::vector<Node*>& nodes = *nodes_result;
   bool internal_node_inserted = false;
   // Walk the nodes in data flow order
   ReverseDFS(*graph, nullptr,
-             [&nodes, &node_set, &internal_node_inserted, opts](Node* n) {
+             [&nodes, &node_set, &internal_node_inserted, opts,
+              flib_def](Node* n) {
                if (n->IsConstant()) {
                  // Constants with no control inputs (except from _SOURCE node)
                  // are definitely constant foldable.
@@ -82,7 +93,7 @@ void FindConstantFoldableNodes(const Graph* graph, ConstantFoldingOptions opts,
                    node_set.insert(n);
                    nodes.push_back(n);
                  }
-               } else if (IsConstantFoldable(n, opts.consider)) {
+               } else if (IsConstantFoldable(flib_def, n, opts.consider)) {
                  // Check whether the set of this node's in_nodes is completely
                  // included in the set of constant foldable nodes. If true,
                  // then this node is also constant foldable.
@@ -303,6 +314,7 @@ bool ReplaceTensorWithConstant(Graph* graph, Device* partition_device,
 }
 
 bool DoConstantFolding(const ConstantFoldingOptions& opts,
+                       FunctionLibraryRuntime* function_library,
                        Device* partition_device, Graph* graph) {
   DumpGraph("Before", graph);
   Device* device = GetCPUDevice();
@@ -313,8 +325,12 @@ bool DoConstantFolding(const ConstantFoldingOptions& opts,
     return false;
   }
 
+  const FunctionLibraryDefinition* flib_def = nullptr;
+  if (function_library) {
+    flib_def = function_library->GetFunctionLibraryDefinition();
+  }
   std::vector<Node*> constant_foldable_nodes;
-  FindConstantFoldableNodes(graph, opts, &constant_foldable_nodes);
+  FindConstantFoldableNodes(graph, flib_def, opts, &constant_foldable_nodes);
   if (constant_foldable_nodes.empty()) {
     VLOG(1) << "No constant foldable nodes found";
     return false;
diff --git a/tensorflow/core/common_runtime/constant_folding.h b/tensorflow/core/common_runtime/constant_folding.h
index e0bc868bc63..f354aedc592 100644
--- a/tensorflow/core/common_runtime/constant_folding.h
+++ b/tensorflow/core/common_runtime/constant_folding.h
@@ -31,6 +31,7 @@ namespace tensorflow {
 // assumed to execute.
 // Returns true if and only if "graph" has been mutated.
 bool DoConstantFolding(const ConstantFoldingOptions& opts,
+                       FunctionLibraryRuntime* function_library,
                        Device* partition_device, Graph* graph);
 
 typedef std::pair<Node*, int> NodeAndOutput;
diff --git a/tensorflow/core/common_runtime/constant_folding_test.cc b/tensorflow/core/common_runtime/constant_folding_test.cc
index 704e9fb2fb0..946b939e9a1 100644
--- a/tensorflow/core/common_runtime/constant_folding_test.cc
+++ b/tensorflow/core/common_runtime/constant_folding_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -108,7 +109,7 @@ class ConstantFoldingTest : public ::testing::Test {
 
 TEST_F(ConstantFoldingTest, Basic) {
   SIMPLE_GRAPH;
-  EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, g));
+  EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, nullptr, g));
 
   // Nodes s1 and s2 now should now have a constant input
   EXPECT_EQ(1, s1->num_inputs());
@@ -124,7 +125,7 @@ TEST_F(ConstantFoldingTest, ConsiderFunction) {
   ConstantFoldingOptions opts;
   // Do not allow constant folding of m2
   opts.consider = [m2](const Node* n) { return m2 != n; };
-  EXPECT_TRUE(DoConstantFolding(opts, nullptr, g));
+  EXPECT_TRUE(DoConstantFolding(opts, nullptr, nullptr, g));
 
   // Node s1 now should now have a constant input
   EXPECT_EQ(1, s1->num_inputs());
@@ -141,7 +142,7 @@ TEST_F(ConstantFoldingTest, TestNoReplaceAnotherConstant) {
   g->AddControlEdge(g->source_node(), d);
   Node* s3 = test::graph::Send(g, d, "d", "sender", 0, "receiver");
   g->AddControlEdge(s3, g->sink_node());
-  EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, g));
+  EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, nullptr, g));
 
   // Nodes s3 should still have d as input
   EXPECT_EQ(1, s3->num_inputs());
@@ -167,7 +168,7 @@ TEST_F(ConstantFoldingTest, TwoOutputs) {
   g->AddControlEdge(b0, g->sink_node());
   g->AddControlEdge(b1, g->sink_node());
 
-  EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, g));
+  EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, nullptr, g));
   EXPECT_EQ(1, b0->num_inputs());
   ExpectNodeEqual<int>(*(b0->in_nodes().begin()), {0, 1}, {2});
   EXPECT_EQ(1, b1->num_inputs());
@@ -193,7 +194,7 @@ TEST_F(ConstantFoldingTest, TwoOutputsFoldOneOutput) {
 
   ConstantFoldingOptions opts;
   opts.consider = [b1_ident](const Node* n) { return b1_ident != n; };
-  EXPECT_TRUE(DoConstantFolding(opts, nullptr, g));
+  EXPECT_TRUE(DoConstantFolding(opts, nullptr, nullptr, g));
   // 0th output of b should have been folded.
   EXPECT_EQ(1, b0->num_inputs());
   ExpectNodeEqual<int>(*(b0->in_nodes().begin()), {0, 1}, {2});
@@ -229,11 +230,11 @@ TEST_F(ConstantFoldingTest, TestNoReplaceOnGPU) {
   g->AddControlEdge(send, g->sink_node());
 
   // No ops should be replaced, as there is no kernel for BFLOAT16 on GPU.
-  EXPECT_FALSE(DoConstantFolding(ConstantFoldingOptions{}, device, g));
+  EXPECT_FALSE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, device, g));
 
   // But constant folding should have replaced the cast op with a constant when
   // running on CPU.
-  EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, g));
+  EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, nullptr, g));
 
   for (auto d : devices) {
     delete d;
@@ -258,7 +259,39 @@ TEST_F(ConstantFoldingTest, TestNoReplaceLargeConstant) {
   g->AddControlEdge(concat_send, g->sink_node());
 
   // The above concat should not have been constant folded.
-  EXPECT_FALSE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, g));
+  EXPECT_FALSE(
+      DoConstantFolding(ConstantFoldingOptions{}, nullptr, nullptr, g));
+}
+
+TEST_F(ConstantFoldingTest, TestNoReplaceFunctionCall) {
+  FunctionDefLibrary fdef_lib;
+  *fdef_lib.add_function() = test::function::XTimesTwo();
+
+  FunctionLibraryDefinition flib_def(OpRegistry::Global(), fdef_lib);
+  g_.reset(new Graph(&flib_def));
+
+  Graph* g = g_.get();
+  Node* s =
+      Constant<int>(std::vector<int>(5 * 1024 * 256, 0), {5 * 1024 * 256});
+  g->AddControlEdge(g->source_node(), s);
+
+  NodeDef def;
+  TF_ASSERT_OK(NodeDefBuilder("times_two", "XTimesTwo", g->op_registry())
+                   .Input(s->name(), 0, DT_INT32)
+                   .Finalize(&def));
+  Status status;
+  Node* times_two = g->AddNode(def, &status);
+  TF_ASSERT_OK(status);
+
+  Node* times_two_send = test::graph::Send(g, times_two, "times_two_send",
+                                           "sender", 0, "receiver");
+  g->AddControlEdge(times_two_send, g->sink_node());
+
+  // The above function call should not have been constant folded.
+  EXPECT_FALSE(
+      DoConstantFolding(ConstantFoldingOptions{}, nullptr, nullptr, g));
+
+  g_ = nullptr;
 }
 
 }  // namespace
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 8621118bda0..9e20aee879a 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -730,12 +730,14 @@ Status DirectSession::GetOrCreateExecutors(
   options.fetch_endpoints = outputs_sorted;
   options.target_nodes = tn_sorted;
 
+  std::unique_ptr<ExecutorsAndKeys> ek(new ExecutorsAndKeys);
+
   // The executor_lock_ is intentionally released while executor is
   // being created.
   std::unordered_map<string, std::unique_ptr<Graph>> graphs;
-  TF_RETURN_IF_ERROR(CreateGraphs(options, &graphs, run_state_args));
+  TF_RETURN_IF_ERROR(
+      CreateGraphs(options, &graphs, &ek->flib_def, run_state_args));
 
-  std::unique_ptr<ExecutorsAndKeys> ek(new ExecutorsAndKeys);
   if (run_state_args->is_partial_run) {
     ek->graph = std::move(run_state_args->graph);
     std::unordered_set<StringPiece, StringPiece::Hasher> names;
@@ -769,7 +771,7 @@ Status DirectSession::GetOrCreateExecutors(
     auto* item = &(ek->items.back());
     item->flib.reset(
         NewFunctionLibraryRuntime(device_mgr_.get(), device, graph_def_version,
-                                  flib_def_.get(), optimizer_opts));
+                                  ek->flib_def.get(), optimizer_opts));
 
     LocalExecutorParams params;
     params.device = device;
@@ -848,6 +850,7 @@ Status DirectSession::GetOrCreateExecutors(
 Status DirectSession::CreateGraphs(
     const BuildGraphOptions& options,
     std::unordered_map<string, std::unique_ptr<Graph>>* outputs,
+    std::unique_ptr<FunctionLibraryDefinition>* flib_def,
     RunStateArgs* run_state_args) {
   mutex_lock l(graph_def_lock_);
   std::unique_ptr<SimpleClientGraph> client_graph;
@@ -964,7 +967,8 @@ Status DirectSession::CreateGraphs(
     if (!s.ok()) {
       break;
     }
-    std::unique_ptr<Graph> device_graph(new Graph(flib_def_.get()));
+    std::unique_ptr<Graph> device_graph(
+        new Graph(client_graph->flib_def.get()));
     GraphConstructorOptions device_opts;
     // There are internal operations (e.g., send/recv) that we now
     // allow.
@@ -974,6 +978,7 @@ Status DirectSession::CreateGraphs(
         ConvertGraphDefToGraph(device_opts, *graph_def, device_graph.get()));
     outputs->emplace(partition_name, std::move(device_graph));
   }
+  *flib_def = std::move(client_graph->flib_def);
   return s;
 }
 
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 21d5d9e5e2f..1b748954470 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -108,10 +108,15 @@ class DirectSession : public Session {
   // a partition of the graph bundled with its dependent library runtime.
   // 'input_keys' are the rendezvous keys for the feeds and 'output_keys'
   // are rendezvous keys for the fetches.
+  // 'flib_def' is the function library used by graphs in 'items'.
+  // TODO(phawkins): currently partitions always share the same function
+  // library. Consider giving each partition its own function library to enable
+  // per-partition rewrites.
   struct ExecutorsAndKeys {
     int64 step_count = 0;
     std::unique_ptr<Graph> graph;
     NameNodeMap name_to_node;
+    std::unique_ptr<FunctionLibraryDefinition> flib_def;
     std::vector<PerPartitionExecutorsAndLib> items;
     std::unordered_map<string, string> input_keys;
     std::unordered_map<string, string> output_keys;
@@ -157,10 +162,12 @@ class DirectSession : public Session {
       ExecutorsAndKeys** executors_and_keys, RunStateArgs* run_state_args);
 
   // Creates several graphs given the existing graph_def_ and the
-  // input feeds and fetches, given 'devices'.
+  // input feeds and fetches, given 'devices'. The graphs share a common
+  // function library 'flib_def'.
   ::tensorflow::Status CreateGraphs(
       const BuildGraphOptions& options,
       std::unordered_map<string, std::unique_ptr<Graph>>* outputs,
+      std::unique_ptr<FunctionLibraryDefinition>* flib_def,
       RunStateArgs* run_state_args);
 
   ::tensorflow::Status ExtendLocked(const GraphDef& graph)
@@ -237,6 +244,10 @@ class DirectSession : public Session {
   // Execution_state; used when placing the entire graph.
   std::unique_ptr<SimpleGraphExecutionState> execution_state_
       GUARDED_BY(graph_def_lock_);
+
+  // The function library, before any rewrites or optimizations have been
+  // performed. In particular, CreateGraphs() may need to modify the function
+  // library; it copies and modifies the function library.
   std::unique_ptr<FunctionLibraryDefinition> flib_def_;
 
   // For generating unique names.
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 51371804798..fc859426b57 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -272,6 +272,11 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
 
   bool IsStateful(const string& function) override;
 
+  const FunctionLibraryDefinition* GetFunctionLibraryDefinition()
+      const override {
+    return lib_def_;
+  }
+
   Device* device() override { return device_; }
 
  private:
diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc
index 8ac62e45d22..36fb1e97c76 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.cc
+++ b/tensorflow/core/common_runtime/graph_optimizer.cc
@@ -60,7 +60,7 @@ void GraphOptimizer::Optimize(FunctionLibraryRuntime* runtime, Device* device,
 
     if (opts_.do_constant_folding()) {
       ConstantFoldingOptions cf_opts;
-      if (DoConstantFolding(cf_opts, device, g)) {
+      if (DoConstantFolding(cf_opts, runtime, device, g)) {
         RemoveDeadNodes(g);
         DumpGraph("ConstFolding", g);
         changed = true;
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index eea3112b3fa..5dfaa160d0c 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -173,17 +173,6 @@ Status BiasAddGradShape(shape_inference::InferenceContext* c) {
   return Status::OK();
 }
 
-namespace {
-Status CheckKnownDim(shape_inference::InferenceContext* c, const Dimension* dim,
-                     const char* name) {
-  if (!c->ValueKnown(dim)) {
-    return errors::InvalidArgument("Cannot infer shape because dimension ",
-                                   name, " is not known.");
-  }
-  return Status::OK();
-}
-}  // namespace
-
 Status Conv2DShape(shape_inference::InferenceContext* c) {
   const Shape* input_shape;
   TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
@@ -224,10 +213,10 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
   const Dimension* output_depth_dim = c->Dim(filter_shape, 3);
 
   // At the moment we need to know the values of several fields.
-  TF_RETURN_IF_ERROR(CheckKnownDim(c, in_rows_dim, "in_rows"));
-  TF_RETURN_IF_ERROR(CheckKnownDim(c, in_cols_dim, "in_cols"));
-  TF_RETURN_IF_ERROR(CheckKnownDim(c, filter_rows_dim, "filter_rows"));
-  TF_RETURN_IF_ERROR(CheckKnownDim(c, filter_cols_dim, "filter_cols"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_rows_dim, "in_rows"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_cols_dim, "in_cols"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(filter_rows_dim, "filter_rows"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(filter_cols_dim, "filter_cols"));
 
   auto in_rows = c->Value(in_rows_dim);
   auto in_cols = c->Value(in_cols_dim);
@@ -263,6 +252,75 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
   return Status::OK();
 }
 
+Status Conv3DShape(shape_inference::InferenceContext* c) {
+  const Shape* input_shape;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 5, &input_shape));
+  const Shape* filter_shape;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 5, &filter_shape));
+
+  std::vector<int32> strides;
+  TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides));
+  if (strides.size() != 5) {
+    return errors::InvalidArgument(
+        "Conv3D requires the stride attribute to contain 5 values, but got: ",
+        strides.size());
+  }
+
+  int32 stride_planes = strides[1];
+  int32 stride_rows = strides[2];
+  int32 stride_cols = strides[3];
+
+  const Dimension* batch_size_dim = c->Dim(input_shape, 0);
+  const Dimension* in_planes_dim = c->Dim(input_shape, 1);
+  const Dimension* in_rows_dim = c->Dim(input_shape, 2);
+  const Dimension* in_cols_dim = c->Dim(input_shape, 3);
+
+  const Dimension* filter_planes_dim = c->Dim(filter_shape, 0);
+  const Dimension* filter_rows_dim = c->Dim(filter_shape, 1);
+  const Dimension* filter_cols_dim = c->Dim(filter_shape, 2);
+  const Dimension* output_depth_dim = c->Dim(filter_shape, 4);
+
+  // At the moment we need to know the values of several fields.
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_planes_dim, "in_planes"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_rows_dim, "in_rows"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_cols_dim, "in_cols"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(filter_planes_dim, "filter_planes"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(filter_rows_dim, "filter_rows"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(filter_cols_dim, "filter_cols"));
+
+  auto in_planes = c->Value(in_planes_dim);
+  auto in_rows = c->Value(in_rows_dim);
+  auto in_cols = c->Value(in_cols_dim);
+  auto filter_planes = c->Value(filter_planes_dim);
+  auto filter_rows = c->Value(filter_rows_dim);
+  auto filter_cols = c->Value(filter_cols_dim);
+
+  const Dimension* unused;
+  TF_RETURN_IF_ERROR(
+      c->Merge(c->Dim(input_shape, 4), c->Dim(filter_shape, 3), &unused));
+
+  Padding padding;
+  TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding));
+
+  int64 output_planes, output_rows, output_cols;
+  int64 padding_before, padding_after;
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose(
+      in_planes, filter_planes, stride_planes, padding, &output_planes,
+      &padding_before, &padding_after));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose(
+      in_rows, filter_rows, stride_rows, padding, &output_rows, &padding_before,
+      &padding_after));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose(
+      in_cols, filter_cols, stride_cols, padding, &output_cols, &padding_before,
+      &padding_after));
+
+  const Shape* output_shape =
+      c->MakeShape({batch_size_dim, output_planes, output_rows, output_cols,
+                    output_depth_dim});
+  c->set_output(0, output_shape);
+  return Status::OK();
+}
+
 Status DepthwiseConv2DNativeShape(shape_inference::InferenceContext* c) {
   const Shape* input_shape;
   TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
@@ -288,12 +346,12 @@ Status DepthwiseConv2DNativeShape(shape_inference::InferenceContext* c) {
   const Dimension* depth_multiplier = c->Dim(filter_shape, 3);
 
   // At the moment we need to know the values of several fields.
-  TF_RETURN_IF_ERROR(CheckKnownDim(c, in_rows_dim, "in_rows"));
-  TF_RETURN_IF_ERROR(CheckKnownDim(c, in_cols_dim, "in_cols"));
-  TF_RETURN_IF_ERROR(CheckKnownDim(c, filter_rows_dim, "filter_rows"));
-  TF_RETURN_IF_ERROR(CheckKnownDim(c, filter_cols_dim, "filter_cols"));
-  TF_RETURN_IF_ERROR(CheckKnownDim(c, input_depth, "depth"));
-  TF_RETURN_IF_ERROR(CheckKnownDim(c, depth_multiplier, "depth_multiplier"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_rows_dim, "in_rows"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_cols_dim, "in_cols"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(filter_rows_dim, "filter_rows"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(filter_cols_dim, "filter_cols"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(input_depth, "depth"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(depth_multiplier, "depth_multiplier"));
 
   // Check that the input depths are compatible.
   TF_RETURN_IF_ERROR(
@@ -380,8 +438,8 @@ Status AvgPoolShape(shape_inference::InferenceContext* c) {
   const Dimension* output_depth_dim = c->Dim(input_shape, 3);
 
   // At the moment we need to know the values of several fields.
-  TF_RETURN_IF_ERROR(CheckKnownDim(c, in_rows_dim, "in_rows"));
-  TF_RETURN_IF_ERROR(CheckKnownDim(c, in_cols_dim, "in_cols"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_rows_dim, "in_rows"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_cols_dim, "in_cols"));
 
   Padding padding;
   TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding));
@@ -467,9 +525,9 @@ Status MaxPoolShape(shape_inference::InferenceContext* c) {
   const Dimension* in_depth_dim = c->Dim(input_shape, 3);
 
   // At the moment we need to know the values of several fields.
-  TF_RETURN_IF_ERROR(CheckKnownDim(c, in_rows_dim, "in_rows"));
-  TF_RETURN_IF_ERROR(CheckKnownDim(c, in_cols_dim, "in_cols"));
-  TF_RETURN_IF_ERROR(CheckKnownDim(c, in_depth_dim, "in_depth"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_rows_dim, "in_rows"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_cols_dim, "in_cols"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_depth_dim, "in_depth"));
 
   Padding padding;
   TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding));
@@ -507,6 +565,78 @@ Status MaxPoolShape(shape_inference::InferenceContext* c) {
   return Status::OK();
 }
 
+Status Pool3DShape(shape_inference::InferenceContext* c) {
+  const Shape* input_shape;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 5, &input_shape));
+
+  std::vector<int32> strides;
+  TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides));
+  if (strides.size() != 5) {
+    return errors::InvalidArgument(
+        "Pool3D ops require the stride attribute to contain 5 values, but "
+        "got: ",
+        strides.size());
+  }
+
+  std::vector<int32> kernel_sizes;
+  TF_RETURN_IF_ERROR(c->GetAttr("ksize", &kernel_sizes));
+  if (kernel_sizes.size() != 5) {
+    return errors::InvalidArgument(
+        "Pool3D requires the ksize attribute to contain 5 values, but got: ",
+        kernel_sizes.size());
+  }
+
+  int32 stride_planes, stride_rows, stride_cols;
+  int32 kernel_planes, kernel_rows, kernel_cols;
+
+  stride_planes = strides[1];
+  stride_rows = strides[2];
+  stride_cols = strides[3];
+  kernel_planes = kernel_sizes[1];
+  kernel_rows = kernel_sizes[2];
+  kernel_cols = kernel_sizes[3];
+
+  const Dimension* batch_size_dim = c->Dim(input_shape, 0);
+  const Dimension* in_planes_dim = c->Dim(input_shape, 1);
+  const Dimension* in_rows_dim = c->Dim(input_shape, 2);
+  const Dimension* in_cols_dim = c->Dim(input_shape, 3);
+  const Dimension* output_depth_dim = c->Dim(input_shape, 4);
+
+  // At the moment we need to know the values of several fields.
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_planes_dim, "in_planes"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_rows_dim, "in_rows"));
+  TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_cols_dim, "in_cols"));
+
+  Padding padding;
+  TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding));
+
+  // TODO(mrry,shlens): Raise an error if the stride would cause
+  // information in the input to be ignored. This will require a change
+  // in the kernel implementation.
+  auto in_planes = c->Value(in_planes_dim);
+  auto in_rows = c->Value(in_rows_dim);
+  auto in_cols = c->Value(in_cols_dim);
+
+  int64 output_planes, output_rows, output_cols;
+  int64 padding_before, padding_after;
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose(
+      in_planes, kernel_planes, stride_planes, padding, &output_planes,
+      &padding_before, &padding_after));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose(
+      in_rows, kernel_rows, stride_rows, padding, &output_rows, &padding_before,
+      &padding_after));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose(
+      in_cols, kernel_cols, stride_cols, padding, &output_cols, &padding_before,
+      &padding_after));
+
+  const Shape* output_shape =
+      c->MakeShape({batch_size_dim, output_planes, output_rows, output_cols,
+                    output_depth_dim});
+
+  c->set_output(0, output_shape);
+  return Status::OK();
+}
+
 Status UnknownShape(shape_inference::InferenceContext* c) {
   for (int i = 0; i < c->num_outputs(); ++i) {
     c->set_output(i, c->UnknownShape());
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index f1bdd5ee8d1..0ca64990365 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -157,6 +157,9 @@ Status BiasAddGradShape(shape_inference::InferenceContext* c);
 // Shape function for Conv2D-like operations.
 Status Conv2DShape(shape_inference::InferenceContext* c);
 
+// Shape function for Conv3D-like operations.
+Status Conv3DShape(shape_inference::InferenceContext* c);
+
 // Shape function for DepthwiseConv2D-like operations.
 Status DepthwiseConv2DNativeShape(shape_inference::InferenceContext* c);
 
@@ -166,6 +169,9 @@ Status AvgPoolShape(shape_inference::InferenceContext* c);
 // Shape function for MaxPool-like operations.
 Status MaxPoolShape(shape_inference::InferenceContext* c);
 
+// Shape function for 3D Pooling operations.
+Status Pool3DShape(shape_inference::InferenceContext* c);
+
 // Shape function for use with ops whose output shapes are unknown.
 Status UnknownShape(shape_inference::InferenceContext* c);
 
diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc
index eada469b17a..6e0dd7f742d 100644
--- a/tensorflow/core/framework/common_shape_fns_test.cc
+++ b/tensorflow/core/framework/common_shape_fns_test.cc
@@ -419,6 +419,55 @@ TEST(CommonShapeFnsTest, Conv2DShapeTest) {
   INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,4,4,d1_3]");
 }
 
+TEST(CommonShapeFnsTest, Conv3DShapeTest) {
+  ShapeInferenceTestOp op("Conv3D");
+  auto set_op = [&op](const std::vector<int32>& strides,
+                      const string& padding) {
+    TF_CHECK_OK(NodeDefBuilder("test", "Conv3D")
+                    .Input("input", 0, DT_FLOAT)
+                    .Input("filter", 0, DT_FLOAT)
+                    .Attr("strides", strides)
+                    .Attr("padding", padding)
+                    .Finalize(&op.node_def));
+  };
+
+  // 1x1x1 filter
+  set_op({{1, 1, 1, 1, 1}}, "VALID");
+  INFER_OK(op, "[1,2,2,2,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]");
+
+  // Invalid rank for input
+  INFER_ERROR("must be rank 5", op, "[4,4];[2,1,1,1]");
+  // Invalid rank for filter
+  INFER_ERROR("must be rank 5", op, "[1,4,4,1];[2,1,1]");
+
+  // No unknown dims in the critical fields.
+  INFER_ERROR("is not known", op, "[1,?,2,2,1];[1,1,1,1,1]");
+  INFER_ERROR("is not known", op, "[1,2,?,2,1];[1,1,1,1,1]");
+  INFER_ERROR("is not known", op, "[1,2,2,?,1];[1,1,1,1,1]");
+  INFER_ERROR("is not known", op, "[1,2,2,2,1];[?,1,1,1,1]");
+  INFER_ERROR("is not known", op, "[1,2,2,2,1];[1,?,1,1,1]");
+
+  // input depths must match.
+  INFER_ERROR("Dimensions must be equal, but are 10 and 10000", op,
+              "[1,2,2,2,10];[1,1,1,10000,20]");
+
+  // 2x2x2 filter
+  set_op({{1, 1, 1, 1, 1}}, "VALID");
+  INFER_OK(op, "[1,2,2,2,1];[2,2,2,1,1]", "[d0_0,1,1,1,d1_4]");
+
+  // 3x3 input, 1x1 filter, 2x2 stride
+  set_op({{1, 2, 2, 2, 1}}, "VALID");
+  INFER_OK(op, "[1,3,3,3,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]");
+
+  // 3x3 input, 1x1 filter, 2x1x1 stride
+  set_op({{1, 2, 1, 1, 1}}, "VALID");
+  INFER_OK(op, "[1,3,3,3,1];[1,1,1,1,1]", "[d0_0,2,3,3,d1_4]");
+
+  // 4x4 input, 2x2 filter, 1x1 stride
+  set_op({{1, 1, 1, 1, 1}}, "SAME");
+  INFER_OK(op, "[1,4,4,4,1];[2,2,2,1,1]", "[d0_0,4,4,4,d1_4]");
+}
+
 TEST(CommonShapeFnsTest, DepthwiseConv2DShapeTest) {
   ShapeInferenceTestOp op("DepthwiseConv2dNative");
   std::vector<int32> strides = {{1, 1, 1, 1}};
@@ -512,6 +561,26 @@ TEST(CommonShapeFnsTest, MaxPool2DShapeTest) {
   INFER_OK(op, "[1,7,5,5]", "[d0_0,3,5,5]");
 }
 
+TEST(CommonShapeFnsTest, Pool3DShapeTest) {
+  ShapeInferenceTestOp op("MaxPool3D");
+  auto set_op = [&op](const std::vector<int32>& strides,
+                      const std::vector<int32>& ksizes, const string& padding) {
+    TF_CHECK_OK(NodeDefBuilder("test", "MaxPool3D")
+                    .Input("input", 0, DT_FLOAT)
+                    .Attr("strides", strides)
+                    .Attr("ksize", ksizes)
+                    .Attr("padding", padding)
+                    .Finalize(&op.node_def));
+  };
+
+  // Most of the functionality is tested by conv-like shapes,
+  // so we check that we handle the extra dimension properly.
+
+  // 2x3x4 stride, 1x1x1 filter.
+  set_op({1, 2, 3, 4, 1}, {1, 1, 1, 1, 1}, "VALID");
+  INFER_OK(op, "[1,24,24,24,1]", "[d0_0,12,8,6,d0_4]");
+}
+
 TEST(CommonShapeFnsTest, UnknownShapeTest) {
   {
     // Single output
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 03d4bde37b0..52afde9fac3 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -360,6 +360,10 @@ class FunctionLibraryRuntime {
 
   // Return the device on which the function executes.
   virtual Device* device() = 0;
+
+  // Returns the function library definition that backs this runtime.
+  virtual const FunctionLibraryDefinition* GetFunctionLibraryDefinition()
+      const = 0;
 };
 
 // To register a gradient function for a builtin op, one should use
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index c66d9fb4e14..9c90bfe0f50 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -99,6 +99,14 @@ InferenceContext::~InferenceContext() {
   for (auto* d : all_dims_) delete d;
 }
 
+bool InferenceContext::FullyDefined(const Shape* s) {
+  if (!RankKnown(s)) return false;
+  for (int i = 0; i < Rank(s); ++i) {
+    if (!ValueKnown(Dim(s, i))) return false;
+  }
+  return true;
+}
+
 const Dimension* InferenceContext::NumElements(const Shape* s) {
   const auto rank = Rank(s);
   if (rank == kUnknownRank) return UnknownDim();
@@ -379,12 +387,6 @@ Status InferenceContext::ReplaceDim(const Shape* s, int dim_index_in,
   return ReturnCreatedShape(dims, out);
 }
 
-const Dimension* InferenceContext::GetDimension(const DimensionOrConstant& d) {
-  if (d.dim != nullptr) return d.dim;
-  DCHECK(d.val >= 0 || d.val == kUnknownDim);
-  return MakeDim(d.val);
-}
-
 const Shape* InferenceContext::MakeShape(
     const std::vector<const Dimension*>& dims) {
   all_shapes_.push_back(new Shape(dims));
@@ -396,7 +398,7 @@ const Shape* InferenceContext::MakeShape(
   std::vector<const Dimension*> dims_actual;
   dims_actual.reserve(dims.size());
   for (const DimensionOrConstant& d : dims) {
-    dims_actual.push_back(GetDimension(d));
+    dims_actual.push_back(MakeDim(d));
   }
   return MakeShape(dims_actual);
 }
@@ -480,11 +482,6 @@ Status InferenceContext::MakeShapeFromShapeProto(const TensorShapeProto& proto,
   return ReturnCreatedShape(dims, out);
 }
 
-const Dimension* InferenceContext::MakeDim(int64 value) {
-  all_dims_.push_back(new Dimension(value));
-  return all_dims_.back();
-}
-
 // Returns a new dimension whose value is given by a scalar input tensor.
 Status InferenceContext::MakeDimForScalarInput(int idx, const Dimension** out) {
   const Tensor* t = input_tensor(idx);
@@ -514,11 +511,6 @@ Status InferenceContext::MakeDimForScalarInput(int idx, const Dimension** out) {
   return Status::OK();
 }
 
-const Dimension* InferenceContext::UnknownDim() {
-  all_dims_.push_back(new Dimension());
-  return all_dims_.back();
-}
-
 Status InferenceContext::Divide(const Dimension* dividend, int64 divisor,
                                 const Dimension** out) {
   if (divisor == 1) {
@@ -527,6 +519,10 @@ Status InferenceContext::Divide(const Dimension* dividend, int64 divisor,
     *out = UnknownDim();
   } else {
     const int64 v = Value(dividend);
+    if (divisor <= 0) {
+      return errors::InvalidArgument("Divisor must be positive but is ",
+                                     divisor);
+    }
     if ((v % divisor) != 0) {
       return errors::InvalidArgument("Dimension size must be divisible by ",
                                      divisor, " but is ", v);
@@ -538,87 +534,112 @@ Status InferenceContext::Divide(const Dimension* dividend, int64 divisor,
 
 Status InferenceContext::Add(const Dimension* first, DimensionOrConstant second,
                              const Dimension** out) {
-  const int64 second_value =
-      second.dim == nullptr ? second.val : Value(second.dim);
-  if (second.dim != nullptr && !ValueKnown(second.dim)) {
-    *out = UnknownDim();
+  const int64 first_value = Value(first);
+  const int64 second_value = Value(second);
+  // Special cases.
+  if (first_value == 0) {
+    *out = MakeDim(second);
   } else if (second_value == 0) {
-    *out = first;
-  } else if (!ValueKnown(first)) {
+    *out = MakeDim(first);
+  } else if (first_value == kUnknownDim || second_value == kUnknownDim) {
     *out = UnknownDim();
   } else {
-    const int64 v = Value(first);
-    const int64 sum = v + second_value;
-    if (second_value > 0 && sum < 0) {
-      return errors::InvalidArgument("Dimension size overflow from adding ", v,
-                                     " and ", second_value);
-    } else if (second_value < 0 && sum < 0) {
-      return errors::InvalidArgument("Negative dimension size from adding ", v,
-                                     " and ", second_value);
+    // Invariant: Both values are known and positive.
+    const int64 sum = first_value + second_value;
+    if (sum < 0) {
+      return errors::InvalidArgument("Dimension size overflow from adding ",
+                                     first_value, " and ", second_value);
     }
     *out = MakeDim(sum);
   }
   return Status::OK();
 }
 
+Status InferenceContext::Subtract(const Dimension* first,
+                                  DimensionOrConstant second,
+                                  const Dimension** out) {
+  const int64 first_value = Value(first);
+  const int64 second_value = Value(second);
+  // Special cases.
+  if (second_value == 0) {
+    *out = MakeDim(first);
+  } else if (first_value == kUnknownDim || second_value == kUnknownDim) {
+    *out = UnknownDim();
+  } else {
+    // Invariant: Both values are known, first_value is non-negative, and
+    // second_value is positive.
+    if (first_value < second_value) {
+      return errors::InvalidArgument(
+          "Negative dimension size caused by subtracting ", second_value,
+          " from ", first_value);
+    }
+    *out = MakeDim(first_value - second_value);
+  }
+  return Status::OK();
+}
+
 Status InferenceContext::Multiply(const Dimension* first,
                                   DimensionOrConstant second,
                                   const Dimension** out) {
-  int64 first_value = -1;
-  // Special cases for multiply are when the values are 0 or 1.
-  if (ValueKnown(first)) {
-    first_value = Value(first);
-    if (first_value == 0) {
-      *out = MakeDim(0);
-      return Status::OK();
-    }
-
-    // Output is whatever the second value is.
-    if (first_value == 1) {
-      *out = GetDimension(second);
-      return Status::OK();
-    }
-  }
-
-  // Same check for when the second argument is a known value.
-  // First find out if the value is known from DimOrConstant.
-  int64 second_value;
-  if (second.dim == nullptr) {
-    second_value = second.val;
-  } else {
-    if (!ValueKnown(second.dim)) {
-      // Second value is not known and first is not a special caase
-      *out = UnknownDim();
-      return Status::OK();
-    }
-    second_value = Value(second.dim);
-  }
-
-  // Now that we know whether the value is known, apply the special
-  // casing.
-  if (second_value == 0) {
-    *out = MakeDim(0);
-    return Status::OK();
-  }
-
-  // Output is whatever the first value is.
-  if (second_value == 1) {
+  const int64 first_value = Value(first);
+  const int64 second_value = Value(second);
+  // Special cases.
+  if (first_value == 0) {
     *out = first;
-    return Status::OK();
-  }
-
-  if (!ValueKnown(first)) {
-    // First value is not known and second is not a special caase
+  } else if (second_value == 0) {
+    *out = MakeDim(second);
+  } else if (first_value == 1) {
+    *out = MakeDim(second);
+  } else if (second_value == 1) {
+    *out = first;
+  } else if (first_value == kUnknownDim || second_value == kUnknownDim) {
     *out = UnknownDim();
-    return Status::OK();
+  } else {
+    // Invariant: Both values are known and and greater than 1.
+    const int64 product = first_value * second_value;
+    if (product < 0) {
+      return errors::InvalidArgument(
+          "Negative dimension size caused by overflow when multiplying ",
+          first_value, " and ", second_value);
+    }
+    *out = MakeDim(product);
   }
+  return Status::OK();
+}
 
-  const int64 product = first_value * second_value;
-  if (product < 0) {
-    return errors::InvalidArgument("Negative dimension size from multiplying ",
-                                   first_value, " and ", second_value);
+Status InferenceContext::Min(const Dimension* first, DimensionOrConstant second,
+                             const Dimension** out) {
+  const int64 first_value = Value(first);
+  const int64 second_value = Value(second);
+  if (first_value == 0) {
+    *out = first;
+  } else if (second_value == 0) {
+    *out = MakeDim(second);
+  } else if (first_value == kUnknownDim || second_value == kUnknownDim) {
+    *out = UnknownDim();
+  } else {
+    if (first_value <= second_value) {
+      *out = first;
+    } else {
+      *out = MakeDim(second);
+    }
+  }
+  return Status::OK();
+}
+
+Status InferenceContext::Max(const Dimension* first, DimensionOrConstant second,
+                             const Dimension** out) {
+  const int64 first_value = Value(first);
+  const int64 second_value = Value(second);
+  if (first_value == kUnknownDim || second_value == kUnknownDim) {
+    *out = UnknownDim();
+  } else {
+    if (first_value >= second_value) {
+      *out = first;
+    } else {
+      *out = MakeDim(second);
+    }
   }
-  *out = MakeDim(product);
   return Status::OK();
 }
 
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index a7a5c50d02d..f35c8a4c815 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -46,7 +46,7 @@ class Dimension {
 class Shape {
  private:
   Shape();
-  Shape(std::vector<const Dimension*> dims);
+  Shape(const std::vector<const Dimension*>& dims);
   ~Shape() {}
 
   const int32 rank_;
@@ -61,13 +61,17 @@ class Shape {
 struct DimensionOrConstant {
  public:
   // Intentionally not explicit.
-  DimensionOrConstant(const Dimension* dim) : dim(dim) {}
+  DimensionOrConstant(const Dimension* dim);
 
   // val must be non-negative or InferenceContext::kUnknownDim.
-  DimensionOrConstant(int64 val) : val(val) {}
+  DimensionOrConstant(int64 val);
 
-  const Dimension* dim = nullptr;
-  int64 val = 0;
+  // dim takes precedence. If dim != nullptr, val is ignored.
+  const Dimension* dim;
+  int64 val;
+
+ private:
+  DimensionOrConstant();
 };
 
 // Note: This is experimental support for op shape inference in C++.  Shape
@@ -81,8 +85,8 @@ struct DimensionOrConstant {
 // by the InferenceContext.
 class InferenceContext {
  public:
-  static constexpr int32 kUnknownRank = -1;
   static constexpr int64 kUnknownDim = -1;
+  static constexpr int32 kUnknownRank = -1;
 
   // This is a temporary constructor used for initial testing.
   //
@@ -127,8 +131,15 @@ class InferenceContext {
   }
   int32 Rank(const Shape* s) { return s->rank_; }
   bool RankKnown(const Shape* s) { return Rank(s) != kUnknownRank; }
-  int64 Value(const Dimension* d) { return d->value_; }
-  bool ValueKnown(const Dimension* d) { return Value(d) != kUnknownDim; }
+  inline int64 Value(DimensionOrConstant d) {
+    return d.dim ? d.dim->value_ : d.val;
+  }
+  inline bool ValueKnown(DimensionOrConstant d) {
+    return Value(d) != kUnknownDim;
+  }
+
+  // Returns true if the rank and all dimensions of the Shape are known.
+  bool FullyDefined(const Shape* s);
 
   // Returns the total number of elements, or an unknown dimension for an
   // incomplete shape.
@@ -229,8 +240,15 @@ class InferenceContext {
 
   // Returns a new dimension of the given size.  The returned value is owned by
   // this context.
-  const Dimension* MakeDim(int64 value);
-  const Dimension* UnknownDim();
+  inline const Dimension* MakeDim(DimensionOrConstant d) {
+    if (d.dim) {
+      return d.dim;
+    } else {
+      all_dims_.push_back(new Dimension(d.val));
+      return all_dims_.back();
+    }
+  }
+  inline const Dimension* UnknownDim() { return MakeDim(kUnknownDim); }
 
   // Returns a new dimension whose value is given by a scalar input tensor.
   // The input tensor must be in host memory, since it is dereferenced to get
@@ -244,7 +262,8 @@ class InferenceContext {
   Status GetAttr(StringPiece attr_name, T* value) const;
 
   // Returns in <out> the result of dividing <dividend> by <divisor>.
-  // Returns an error if <divisor> does not evenly divide <dividend>.
+  // Returns an error if <divisor>  is not positive or does not evenly
+  // divide <dividend>.
   Status Divide(const Dimension* dividend, int64 divisor,
                 const Dimension** out);
 
@@ -252,12 +271,37 @@ class InferenceContext {
   Status Add(const Dimension* first, DimensionOrConstant second,
              const Dimension** out);
 
+  // Returns in <out> the dimension that is <first> minus <second>.
+  Status Subtract(const Dimension* first, DimensionOrConstant second,
+                  const Dimension** out);
+
   // Returns in <out> the product of <first> and <second>.
   Status Multiply(const Dimension* first, DimensionOrConstant second,
                   const Dimension** out);
 
+  // Returns in <out> the minimum of <first> and <second>. If either <first> or
+  // <second> is zero the results is zero. Otherwise, if either <first> or
+  // <second> is unknown the results is unknown.
+  Status Min(const Dimension* first, DimensionOrConstant second,
+             const Dimension** out);
+
+  // Returns in <out> the maximum of <first> and <second>. If either <first> or
+  // <second> is unknown the results is unknown.
+  Status Max(const Dimension* first, DimensionOrConstant second,
+             const Dimension** out);
+
   Status construction_status() const { return construction_status_; }
 
+  // Validates that 'dim' has a known value, and prints an error
+  // message containing 'name' if validation fails.
+  Status ValidateKnownDim(const Dimension* dim, const char* name) {
+    if (!ValueKnown(dim)) {
+      return errors::InvalidArgument("Cannot infer shape because dimension ",
+                                     name, " is not known.");
+    }
+    return Status::OK();
+  }
+
  private:
   const Dimension* GetDimension(const DimensionOrConstant& d);
 
@@ -294,12 +338,30 @@ class InferenceContext {
 // Template and inline method implementations, please ignore
 
 inline Dimension::Dimension() : value_(InferenceContext::kUnknownDim) {}
-inline Dimension::Dimension(int64 value) : value_(value) {}
+inline Dimension::Dimension(int64 value) : value_(value) {
+  DCHECK(value >= 0 || value == InferenceContext::kUnknownDim)
+      << "Dimension must be non-negative or equal to "
+         "InferenceContext::kUnknownDim but got"
+      << value;
+}
 
 inline Shape::Shape() : rank_(InferenceContext::kUnknownRank) {}
-inline Shape::Shape(const std::vector<const Dimension*> dims)
+inline Shape::Shape(const std::vector<const Dimension*>& dims)
     : rank_(dims.size()), dims_(dims) {}
 
+inline DimensionOrConstant::DimensionOrConstant(const Dimension* dim)
+    : dim(dim) {
+  DCHECK(dim != nullptr) << "Internal error: Got nullptr for Dimension.";
+}
+
+inline DimensionOrConstant::DimensionOrConstant(int64 val)
+    : dim(nullptr), val(val) {
+  DCHECK(val >= 0 || val == InferenceContext::kUnknownDim)
+      << "Dimension must be non-negative or equal to "
+         "InferenceContext::kUnknownDim but got"
+      << val;
+}
+
 template <class T>
 Status InferenceContext::GetAttr(StringPiece attr_name, T* value) const {
   return GetNodeAttr(node_def_, attr_name, value);
diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc
index a1557912c70..1ecba2839a7 100644
--- a/tensorflow/core/framework/shape_inference_test.cc
+++ b/tensorflow/core/framework/shape_inference_test.cc
@@ -36,6 +36,19 @@ static OpDef MakeOpDef(int num_inputs, int num_outputs) {
   return op_reg_data.op_def;
 }
 
+TEST(ShapeInferenceTest, DimensionOrConstant) {
+  NodeDef def;
+  InferenceContext c(&def, MakeOpDef(1, 1), {"?"}, {});
+  EXPECT_EQ(InferenceContext::kUnknownDim,
+            c.Value(InferenceContext::kUnknownDim));
+  EXPECT_EQ(1, c.Value(1));
+
+#ifndef NDEBUG
+  // Only run death test if DCHECKS are enabled.
+  EXPECT_DEATH(c.Value(-7), "Dimension must be non\\-negative or equal to");
+#endif
+}
+
 TEST(ShapeInferenceTest, RankAndDimInspection) {
   NodeDef def;
   InferenceContext c(&def, MakeOpDef(3, 2), {"?", "[1,?,3]", "[]"}, {});
@@ -767,15 +780,20 @@ TEST(ShapeInferenceTest, Divide) {
 
   EXPECT_EQ("Dimension size must be divisible by 5 but is 6",
             c.Divide(d_6, 5, &out).error_message());
+  EXPECT_EQ("Divisor must be positive but is 0",
+            c.Divide(d_6, 0, &out).error_message());
+  EXPECT_EQ("Divisor must be positive but is -1",
+            c.Divide(d_6, -1, &out).error_message());
 }
 
 TEST(ShapeInferenceTest, Add) {
   NodeDef def;
-  InferenceContext c(&def, MakeOpDef(1, 2), {"[6,?]"}, {});
+  InferenceContext c(&def, MakeOpDef(1, 2), {"[6,?,0]"}, {});
 
   auto s = c.input(0);
   auto d_6 = c.Dim(s, 0);
   auto d_unknown = c.Dim(s, 1);
+  auto d_0 = c.Dim(s, 2);
 
   // Adding non-zero to unknown gives new unknown.
   const Dimension* out;
@@ -790,16 +808,14 @@ TEST(ShapeInferenceTest, Add) {
   EXPECT_TRUE(out == d_6);
 
   // Adding dimension with value 0 to anything gives input.
-  EXPECT_TRUE(c.Add(d_unknown, c.MakeDim(0), &out).ok());
+  EXPECT_TRUE(c.Add(d_unknown, c.MakeDim(0ll), &out).ok());
   EXPECT_TRUE(out == d_unknown);
-  EXPECT_TRUE(c.Add(d_6, c.MakeDim(0), &out).ok());
+  EXPECT_TRUE(c.Add(d_6, c.MakeDim(0ll), &out).ok());
   EXPECT_TRUE(out == d_6);
 
   // Test addition.
   EXPECT_TRUE(c.Add(d_6, 2, &out).ok());
   EXPECT_EQ("8", c.DebugString(out));
-  EXPECT_TRUE(c.Add(d_6, -6, &out).ok());
-  EXPECT_EQ("0", c.DebugString(out));
   EXPECT_TRUE(c.Add(d_6, std::numeric_limits<int64>::max() - 6, &out).ok());
   EXPECT_EQ(std::numeric_limits<int64>::max(), c.Value(out));
 
@@ -811,14 +827,62 @@ TEST(ShapeInferenceTest, Add) {
   EXPECT_EQ(std::numeric_limits<int64>::max(), c.Value(out));
   EXPECT_TRUE(c.Add(d_6, c.UnknownDim(), &out).ok());
   EXPECT_EQ("?", c.DebugString(out));
+  EXPECT_TRUE(c.Add(d_0, d_6, &out).ok());
+  EXPECT_TRUE(out == d_6);
 
-  EXPECT_EQ("Negative dimension size from adding 6 and -7",
-            c.Add(d_6, -7, &out).error_message());
   EXPECT_EQ(
       "Dimension size overflow from adding 6 and 9223372036854775802",
       c.Add(d_6, std::numeric_limits<int64>::max() - 5, &out).error_message());
 }
 
+TEST(ShapeInferenceTest, Subtract) {
+  NodeDef def;
+  InferenceContext c(&def, MakeOpDef(1, 2), {"[6,?,0,5]"}, {});
+
+  auto s = c.input(0);
+  auto d_6 = c.Dim(s, 0);
+  auto d_unknown = c.Dim(s, 1);
+  auto d_0 = c.Dim(s, 2);
+  auto d_5 = c.Dim(s, 3);
+
+  // Subtracting non-zero from unknown gives new unknown.
+  const Dimension* out;
+  EXPECT_TRUE(c.Subtract(d_unknown, 1, &out).ok());
+  EXPECT_EQ("?", c.DebugString(out));
+  EXPECT_TRUE(out != d_unknown);
+
+  // Subtracting 0 from anything gives input.
+  EXPECT_TRUE(c.Subtract(d_unknown, 0ll, &out).ok());
+  EXPECT_TRUE(out == d_unknown);
+  EXPECT_TRUE(c.Subtract(d_6, 0ll, &out).ok());
+  EXPECT_TRUE(out == d_6);
+
+  // Subtracting dimension with value 0 from anything gives input.
+  EXPECT_TRUE(c.Subtract(d_unknown, c.MakeDim(0ll), &out).ok());
+  EXPECT_TRUE(out == d_unknown);
+  EXPECT_TRUE(c.Subtract(d_6, c.MakeDim(0ll), &out).ok());
+  EXPECT_TRUE(out == d_6);
+
+  // Test subtraction.
+  EXPECT_TRUE(c.Subtract(d_6, 2, &out).ok());
+  EXPECT_EQ("4", c.DebugString(out));
+  EXPECT_TRUE(c.Subtract(d_6, 6, &out).ok());
+  EXPECT_EQ("0", c.DebugString(out));
+
+  // Test subtraction using dimension as second value.
+  EXPECT_TRUE(c.Subtract(d_6, c.MakeDim(2), &out).ok());
+  EXPECT_EQ("4", c.DebugString(out));
+  EXPECT_TRUE(c.Subtract(d_6, d_5, &out).ok());
+  EXPECT_EQ("1", c.DebugString(out));
+  EXPECT_TRUE(c.Subtract(d_6, c.UnknownDim(), &out).ok());
+  EXPECT_EQ("?", c.DebugString(out));
+  EXPECT_TRUE(c.Subtract(d_6, d_0, &out).ok());
+  EXPECT_TRUE(out == d_6);
+
+  EXPECT_EQ("Negative dimension size caused by subtracting 6 from 5",
+            c.Subtract(d_5, d_6, &out).error_message());
+}
+
 TEST(ShapeInferenceTest, Multiply) {
   NodeDef def;
   InferenceContext c(&def, MakeOpDef(1, 2), {"[6,?,0,1]"}, {});
@@ -831,7 +895,7 @@ TEST(ShapeInferenceTest, Multiply) {
 
   // Multiplying non-zero to unknown gives new unknown.
   const Dimension* out;
-  EXPECT_TRUE(c.Multiply(d_unknown, 1, &out).ok());
+  EXPECT_TRUE(c.Multiply(d_unknown, 2, &out).ok());
   EXPECT_EQ("?", c.DebugString(out));
 
   // Multiplying 0 to anything gives 0.
@@ -844,19 +908,19 @@ TEST(ShapeInferenceTest, Multiply) {
 
   // Multiplying 1 to anything gives the original.
   // (unknown -> unknown)
-  EXPECT_TRUE(c.Multiply(d_unknown, static_cast<int64>(1), &out).ok());
-  EXPECT_EQ("?", c.DebugString(out));
+  EXPECT_TRUE(c.Multiply(d_unknown, 1, &out).ok());
+  EXPECT_EQ(d_unknown, out);
   EXPECT_TRUE(c.Multiply(d_unknown, d_1, &out).ok());
-  EXPECT_EQ("?", c.DebugString(out));
+  EXPECT_EQ(d_unknown, out);
   EXPECT_TRUE(c.Multiply(d_1, d_unknown, &out).ok());
-  EXPECT_EQ("?", c.DebugString(out));
+  EXPECT_EQ(d_unknown, out);
   // (known -> known)
-  EXPECT_TRUE(c.Multiply(d_6, static_cast<int64>(1), &out).ok());
-  EXPECT_EQ("6", c.DebugString(out));
+  EXPECT_TRUE(c.Multiply(d_6, 1, &out).ok());
+  EXPECT_EQ(d_6, out);
   EXPECT_TRUE(c.Multiply(d_6, d_1, &out).ok());
-  EXPECT_EQ("6", c.DebugString(out));
+  EXPECT_EQ(d_6, out);
   EXPECT_TRUE(c.Multiply(d_1, d_6, &out).ok());
-  EXPECT_EQ("6", c.DebugString(out));
+  EXPECT_EQ(d_6, out);
 
   // Test multiplication.
   EXPECT_TRUE(c.Multiply(d_6, 2, &out).ok());
@@ -869,9 +933,112 @@ TEST(ShapeInferenceTest, Multiply) {
   EXPECT_EQ("12", c.DebugString(out));
   EXPECT_TRUE(c.Multiply(d_6, c.UnknownDim(), &out).ok());
   EXPECT_EQ("?", c.DebugString(out));
+}
 
-  EXPECT_EQ("Negative dimension size from multiplying 6 and -7",
-            c.Multiply(d_6, -7, &out).error_message());
+TEST(ShapeInferenceTest, FullyDefined) {
+  NodeDef def;
+  InferenceContext c(&def, MakeOpDef(0, 2), {}, {});
+
+  // No rank or missing dimension information should return false.
+  EXPECT_FALSE(c.FullyDefined(c.UnknownShape()));
+  EXPECT_FALSE(c.FullyDefined(c.Matrix(c.MakeDim(1), c.UnknownDim())));
+
+  // Return true if all information exists.
+  EXPECT_TRUE(c.FullyDefined(c.Matrix(c.MakeDim(1), c.MakeDim(2))));
+  EXPECT_TRUE(c.FullyDefined(c.Scalar()));
+}
+
+TEST(ShapeInferenceTest, ValidateKnownDim) {
+  NodeDef def;
+  InferenceContext c(&def, MakeOpDef(0, 2), {}, {});
+
+  EXPECT_FALSE(c.ValidateKnownDim(c.UnknownDim(), "unknown").ok());
+  EXPECT_TRUE(c.ValidateKnownDim(c.Dim(c.Matrix(1, 2), 0), "known").ok());
+}
+
+TEST(ShapeInferenceTest, Min) {
+  NodeDef def;
+  InferenceContext c(&def, MakeOpDef(1, 2), {"[1,2,?,0]"}, {});
+
+  auto s = c.input(0);
+  auto d_1 = c.Dim(s, 0);
+  auto d_2 = c.Dim(s, 1);
+  auto d_unknown = c.Dim(s, 2);
+  auto d_0 = c.Dim(s, 3);
+
+  // Minimum involving zero and unknown returns zero.
+  const Dimension* out;
+  EXPECT_TRUE(c.Min(d_0, d_unknown, &out).ok());
+  EXPECT_EQ(d_0, out);
+  EXPECT_TRUE(c.Min(d_unknown, d_0, &out).ok());
+  EXPECT_EQ(d_0, out);
+  EXPECT_TRUE(c.Min(c.MakeDim(0ll), d_unknown, &out).ok());
+  EXPECT_EQ("0", c.DebugString(out));
+  EXPECT_TRUE(c.Min(d_unknown, 0ll, &out).ok());
+  EXPECT_EQ("0", c.DebugString(out));
+
+  // Minimum involving unknowns and non-zeros gives new unknown.
+  EXPECT_TRUE(c.Min(d_unknown, d_unknown, &out).ok());
+  EXPECT_EQ("?", c.DebugString(out));
+  EXPECT_TRUE(c.Min(d_unknown, 1, &out).ok());
+  EXPECT_EQ("?", c.DebugString(out));
+  EXPECT_TRUE(c.Min(d_1, d_unknown, &out).ok());
+  EXPECT_EQ("?", c.DebugString(out));
+
+  // Minimum with constant second arg.
+  EXPECT_TRUE(c.Min(d_1, 1, &out).ok());
+  EXPECT_EQ(d_1, out);
+  EXPECT_TRUE(c.Min(d_1, 3, &out).ok());
+  EXPECT_EQ(d_1, out);
+  EXPECT_TRUE(c.Min(d_2, 1, &out).ok());
+  EXPECT_EQ("1", c.DebugString(out));
+
+  // Minimum with two dimensions.
+  EXPECT_TRUE(c.Min(d_1, d_1, &out).ok());
+  EXPECT_EQ(d_1, out);
+  EXPECT_TRUE(c.Min(d_1, d_2, &out).ok());
+  EXPECT_EQ(d_1, out);
+  EXPECT_TRUE(c.Min(d_2, d_1, &out).ok());
+  EXPECT_EQ(d_1, out);
+  EXPECT_TRUE(c.Min(d_2, d_2, &out).ok());
+  EXPECT_EQ(d_2, out);
+}
+
+TEST(ShapeInferenceTest, Max) {
+  NodeDef def;
+  InferenceContext c(&def, MakeOpDef(1, 2), {"[1,2,?]"}, {});
+
+  auto s = c.input(0);
+  auto d_1 = c.Dim(s, 0);
+  auto d_2 = c.Dim(s, 1);
+  auto d_unknown = c.Dim(s, 2);
+
+  // Maximum involving unknowns gives new unknown.
+  const Dimension* out;
+  EXPECT_TRUE(c.Max(d_unknown, d_unknown, &out).ok());
+  EXPECT_EQ("?", c.DebugString(out));
+  EXPECT_TRUE(c.Max(d_unknown, 1, &out).ok());
+  EXPECT_EQ("?", c.DebugString(out));
+  EXPECT_TRUE(c.Max(d_1, d_unknown, &out).ok());
+  EXPECT_EQ("?", c.DebugString(out));
+
+  // Maximum with constant second arg.
+  EXPECT_TRUE(c.Max(d_1, 1, &out).ok());
+  EXPECT_EQ(d_1, out);
+  EXPECT_TRUE(c.Max(d_2, 1, &out).ok());
+  EXPECT_EQ(d_2, out);
+  EXPECT_TRUE(c.Max(d_2, 3, &out).ok());
+  EXPECT_EQ("3", c.DebugString(out));
+
+  // Maximum with two dimensions.
+  EXPECT_TRUE(c.Max(d_1, d_1, &out).ok());
+  EXPECT_EQ(d_1, out);
+  EXPECT_TRUE(c.Max(d_1, d_2, &out).ok());
+  EXPECT_EQ(d_2, out);
+  EXPECT_TRUE(c.Max(d_2, d_1, &out).ok());
+  EXPECT_EQ(d_2, out);
+  EXPECT_TRUE(c.Max(d_2, d_2, &out).ok());
+  EXPECT_EQ(d_2, out);
 }
 
 }  // namespace shape_inference
diff --git a/tensorflow/core/framework/shape_inference_testutil.cc b/tensorflow/core/framework/shape_inference_testutil.cc
index c1e55d032d6..60a9cb101fd 100644
--- a/tensorflow/core/framework/shape_inference_testutil.cc
+++ b/tensorflow/core/framework/shape_inference_testutil.cc
@@ -40,6 +40,11 @@ Status InferShapes(ShapeInferenceTestOp op, const string& ins,
   shape_inference::InferenceContext c(&op.node_def, op_reg_data->op_def, ins_v,
                                       op.input_tensors);
   TF_RETURN_IF_ERROR(c.construction_status());
+  if (op_reg_data->shape_inference_fn == nullptr) {
+    return errors::InvalidArgument(
+        "No shape inference function exists for op '", op.name,
+        "', did you forget to define it?");
+  }
   TF_RETURN_IF_ERROR(op_reg_data->shape_inference_fn(&c));
   const int num_outputs = c.num_outputs();
 
diff --git a/tensorflow/core/framework/tensor_shape.cc b/tensorflow/core/framework/tensor_shape.cc
index 9f61d3d47e6..4e1a99acd68 100644
--- a/tensorflow/core/framework/tensor_shape.cc
+++ b/tensorflow/core/framework/tensor_shape.cc
@@ -33,13 +33,14 @@ static void AppendTo(const TensorShape& s, gtl::InlinedVector<int64, 8>* vals) {
 }
 
 void TensorShape::CheckDimsEqual(int NDIMS) const {
-  CHECK_EQ(NDIMS, dims()) << "Asking for tensor of " << NDIMS
-                          << " for a tensor of " << dims() << " dimensions";
+  CHECK_EQ(NDIMS, dims()) << "Asking for tensor of " << NDIMS << "dimensions"
+                          << " from a tensor of " << dims() << " dimensions";
 }
 
 void TensorShape::CheckDimsAtLeast(int NDIMS) const {
   CHECK_GE(NDIMS, dims()) << "Asking for tensor of at least " << NDIMS
-                          << " for a tensor of " << dims() << " dimensions";
+                          << " dimensions from a tensor of " << dims()
+                          << " dimensions";
 }
 
 bool TensorShape::IsValid(const TensorShapeProto& proto) {
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index 7cf25ba48f4..7098bed572f 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -935,13 +935,15 @@ Status Partition(const PartitionOptions& opts, Graph* g,
     ref_recvs.clear();
     ref_control_inputs.clear();
     const Edge* control_flow_edge = nullptr;
+    int32 num_control_flow_edges = 0;
     for (const Edge* edge : dst->in_edges()) {
       if (edge->IsControlEdge()) {
         if (IsMerge(edge->src()) && IsControlLoop(edge->src())) {
           // This is one of the control edges added for control flow. There
           // can be multiple such edges as the dest node may have multiple
-          // remote inputs. We will just take one and ignore the others.
+          // remote inputs. We keep track of the number of such edges.
           control_flow_edge = edge;
+          ++num_control_flow_edges;
         } else {
           inputs.push_back(edge);
         }
@@ -953,7 +955,6 @@ Status Partition(const PartitionOptions& opts, Graph* g,
 
     // Process in order so that all data edges are added as inputs to
     // dst in Edge::dst_input() order.
-    bool recv_added = false;
     for (const Edge* edge : inputs) {
       const Node* src = edge->src();
       if (!src->IsOp()) continue;  // Skip Sink/Source nodes.
@@ -1041,21 +1042,21 @@ Status Partition(const PartitionOptions& opts, Graph* g,
           AddRecv(opts, g_info, dst_graph, edge, &real_recv, &status);
       if (!status.ok()) return status;
 
-      // Fix up the control flow edge. Redirect it to the recv.
+      // Fix up the control flow edge.
       // NOTE(yuanbyu): 'real_recv' must be the real recv node.
-      recv_added = true;
-      if (control_flow_edge != nullptr) {
+      if (src_graph == dst_graph) {
+        // For same device send/recv, add a control edge from send to recv.
+        // This prevents the asynchronous recv kernel from being scheduled
+        // before the data is available.
+        AddInput(real_recv, send->name(), Graph::kControlSlot);
+      } else if (control_flow_edge != nullptr) {
+        // Redirect control edge to the real recv since this is not a same
+        // device send/recv.
+        --num_control_flow_edges;
         AddInput(real_recv, control_flow_edge->src()->name(),
                  Graph::kControlSlot);
       }
 
-      // For same device send/recv, add a control edge from send to recv.
-      // This prevents the asynchronous recv kernel from being scheduled
-      // immediately.
-      if (src_graph == dst_graph) {
-        AddInput(real_recv, send->name(), Graph::kControlSlot);
-      }
-
       if (!edge->IsControlEdge() &&
           IsRefType(src->output_type(edge->src_output()))) {
         AddNodeAttr("_start_time", recv_start_time, recv);
@@ -1092,9 +1093,12 @@ Status Partition(const PartitionOptions& opts, Graph* g,
     // execution of recvs until all the other inputs become available.
     AddReadControl(ref_recvs, ref_control_inputs);
 
-    // Add back this control edge for control flow if not used.
-    if (!recv_added && (control_flow_edge != nullptr)) {
-      AddInput(dst_def, control_flow_edge->src()->name(), Graph::kControlSlot);
+    // Add back the control edges for control flow that are not used.
+    if (control_flow_edge != nullptr) {
+      for (int i = 0; i < num_control_flow_edges; ++i) {
+        AddInput(dst_def, control_flow_edge->src()->name(),
+                 Graph::kControlSlot);
+      }
     }
   }
 
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index a078488dd18..3c2dab98b39 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -379,8 +379,8 @@ tf_kernel_libraries(
         "batch_matrix_diag_op",
         "batch_matrix_set_diag_op",
         "edit_distance_op",
-        "gather_nd_op",
         "gather_op",
+        "gather_nd_op",
         "identity_op",
         "immutable_constant_op",
         "listdiff_op",
@@ -1019,10 +1019,12 @@ tf_kernel_libraries(
         "cholesky_grad",
         "determinant_op",
         "self_adjoint_eig_op",
+        "self_adjoint_eig_v2_op",
         "matrix_inverse_op",
         "matrix_solve_ls_op",
         "matrix_solve_op",
         "matrix_triangular_solve_op",
+        "svd_op",
     ],
     deps = [
         ":linalg_ops_common",
diff --git a/tensorflow/core/kernels/cwise_op_lgamma.cc b/tensorflow/core/kernels/cwise_op_lgamma.cc
index 930a861eae2..b7fe4472dc2 100644
--- a/tensorflow/core/kernels/cwise_op_lgamma.cc
+++ b/tensorflow/core/kernels/cwise_op_lgamma.cc
@@ -16,8 +16,17 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER3(UnaryOp, CPU, "Lgamma", functor::lgamma, float, Eigen::half, double);
+
+template <typename Device, typename Functor>
+class LgammaOp : public UnaryOp<Device, Functor> {
+ public:
+  explicit LgammaOp(OpKernelConstruction* ctx) : UnaryOp<Device, Functor>(ctx) {
+    TF_ANNOTATE_BENIGN_RACE(&signgam, "signgam output from lgamma is unused");
+  }
+};
+
+REGISTER3(LgammaOp, CPU, "Lgamma", functor::lgamma, float, Eigen::half, double);
 #if GOOGLE_CUDA
-REGISTER3(UnaryOp, GPU, "Lgamma", functor::lgamma, float, Eigen::half, double);
+REGISTER3(LgammaOp, GPU, "Lgamma", functor::lgamma, float, Eigen::half, double);
 #endif
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc
index b4d9f03efc6..c2a5192efb1 100644
--- a/tensorflow/core/kernels/gather_nd_op.cc
+++ b/tensorflow/core/kernels/gather_nd_op.cc
@@ -16,13 +16,11 @@ limitations under the License.
 // See docs in ../ops/array_ops.cc.
 #define EIGEN_USE_THREADS
 
-#include <atomic>
-
+#include "tensorflow/core/kernels/gather_nd_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/gather_nd_op.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"
@@ -155,97 +153,6 @@ class GatherNdOp : public OpKernel {
   }
 };
 
-// Specialization of GatherNdSlice to CPU
-namespace generator {
-
-template <typename T, typename Index, int IXDIM>
-class GatherNdSliceGenerator {
- public:
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE GatherNdSliceGenerator(
-      const Index slice_size, typename TTypes<Index>::ConstMatrix Tindices,
-      typename TTypes<T, IXDIM + 1>::ConstTensor Tparams,
-      typename TTypes<T>::Matrix Tout, std::atomic<Index>* error_loc)
-      : slice_size_(slice_size),
-        Tindices_(Tindices),
-        Tparams_(Tparams),
-        Tout_(Tout),
-        error_loc_(error_loc) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool GenerateIndices(
-      const Index loc, Eigen::array<Eigen::DenseIndex, IXDIM + 1>* ix) const {
-    (*ix)[IXDIM] = 0;
-    bool out_of_bounds = false;
-    for (int i = 0; i < IXDIM; ++i) {
-      const Index ix_i = internal::SubtleMustCopy(Tindices_(loc, i));
-      (*ix)[i] = ix_i;
-      out_of_bounds |= !FastBoundsCheck(ix_i, Tparams_.dimension(i));
-    }
-    return out_of_bounds;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int32
-  operator()(const Eigen::array<Eigen::DenseIndex, 1>& loc_array) const {
-    const Index loc = loc_array[0];
-    Eigen::array<Eigen::DenseIndex, IXDIM + 1> ix;
-    Eigen::array<Eigen::DenseIndex, 2> ix_out;
-    ix_out[0] = loc;
-    ix_out[1] = 0;
-    const bool out_of_bounds = GenerateIndices(loc, &ix);
-    if (TF_PREDICT_FALSE(out_of_bounds)) {
-      error_loc_->store(loc);
-      std::fill_n(&Tout_(ix_out), slice_size_, T());
-    } else {
-      std::copy_n(&Tparams_(ix), slice_size_, &Tout_(ix_out));
-    }
-
-    return static_cast<int32>(0);  // Return something...
-  }
-
- private:
-  const Index slice_size_;
-  const typename TTypes<Index>::ConstMatrix Tindices_;
-  const typename TTypes<T, IXDIM + 1>::ConstTensor Tparams_;
-  mutable typename TTypes<T>::Matrix Tout_;
-  std::atomic<Index>* error_loc_;
-};
-
-}  // namespace generator
-
-namespace functor {
-
-template <typename T, typename Index, int IXDIM>
-struct GatherNdSlice<CPUDevice, T, Index, IXDIM> {
-  Index operator()(const CPUDevice& d, const Index slice_size,
-                   typename TTypes<int32>::Scalar Tscratch,
-                   typename TTypes<T, IXDIM + 1>::ConstTensor Tparams,
-                   typename TTypes<Index>::ConstMatrix Tindices,
-                   typename TTypes<T>::Matrix Tout) {
-    std::atomic<Index> error_loc(-1);
-
-    const Eigen::DenseIndex batch_size = Tindices.dimension(0);
-#if !defined(EIGEN_HAS_INDEX_LIST)
-    Eigen::Tensor<Eigen::DenseIndex, 1>::Dimensions reshape_dims{{ 1 }};
-    Eigen::array<Eigen::DenseIndex, 1> broadcast_dims{{ batch_size }};
-#else
-    Eigen::IndexList<Eigen::type2index<1> > reshape_dims;
-    Eigen::IndexList<Eigen::DenseIndex> broadcast_dims;
-    broadcast_dims.set(0, batch_size);
-#endif
-    generator::GatherNdSliceGenerator<T, Index, IXDIM> gather_nd_generator(
-        slice_size, Tindices, Tparams, Tout, &error_loc);
-    Tscratch.device(d) = Tscratch.reshape(reshape_dims)
-                             .broadcast(broadcast_dims)
-                             .generate(gather_nd_generator)
-                             .sum();
-
-    // error_loc() returns -1 if there's no out-of-bounds index,
-    // otherwise it returns the location of an OOB index in Tindices.
-    return error_loc.load();
-  }
-};
-
-}  // namespace functor
-
 #define REGISTER_GATHER_ND_FULL(dev, type, index_type)                 \
   REGISTER_KERNEL_BUILDER(Name("GatherNd")                             \
                               .Device(DEVICE_##dev)                    \
diff --git a/tensorflow/core/kernels/gather_nd_op.h b/tensorflow/core/kernels/gather_nd_op.h
index 0ee783bd593..d7279d5712a 100644
--- a/tensorflow/core/kernels/gather_nd_op.h
+++ b/tensorflow/core/kernels/gather_nd_op.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl.h b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h
new file mode 100644
index 00000000000..dc028c2f1e9
--- /dev/null
+++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h
@@ -0,0 +1,145 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_GATHER_ND_OP_CPU_IMPL_H_
+#define TENSORFLOW_KERNELS_GATHER_ND_OP_CPU_IMPL_H_
+
+// Specialization of GatherNdSlice to CPU
+
+#define EIGEN_USE_THREADS
+
+#include <atomic>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/gather_nd_op.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mem.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/util.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+namespace generator {
+
+template <typename T, typename Index, int IXDIM>
+class GatherNdSliceGenerator {
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE GatherNdSliceGenerator(
+      const Index slice_size, typename TTypes<Index>::ConstMatrix Tindices,
+      typename TTypes<T, IXDIM + 1>::ConstTensor Tparams,
+      typename TTypes<T>::Matrix Tout, std::atomic<Index>* error_loc)
+      : slice_size_(slice_size),
+        Tindices_(Tindices),
+        Tparams_(Tparams),
+        Tout_(Tout),
+        error_loc_(error_loc) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool GenerateIndices(
+      const Index loc, Eigen::array<Eigen::DenseIndex, IXDIM + 1>* ix) const {
+    (*ix)[IXDIM] = 0;
+    bool out_of_bounds = false;
+    for (int i = 0; i < IXDIM; ++i) {
+      const Index ix_i = internal::SubtleMustCopy(Tindices_(loc, i));
+      (*ix)[i] = ix_i;
+      out_of_bounds |= !FastBoundsCheck(ix_i, Tparams_.dimension(i));
+    }
+    return out_of_bounds;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int32
+  operator()(const Eigen::array<Eigen::DenseIndex, 1>& loc_array) const {
+    const Index loc = loc_array[0];
+    Eigen::array<Eigen::DenseIndex, IXDIM + 1> ix;
+    Eigen::array<Eigen::DenseIndex, 2> ix_out;
+    ix_out[0] = loc;
+    ix_out[1] = 0;
+    const bool out_of_bounds = GenerateIndices(loc, &ix);
+    if (TF_PREDICT_FALSE(out_of_bounds)) {
+      error_loc_->store(loc);
+      std::fill_n(&Tout_(ix_out), slice_size_, T());
+    } else {
+      std::copy_n(&Tparams_(ix), slice_size_, &Tout_(ix_out));
+    }
+
+    return static_cast<int32>(0);  // Return something...
+  }
+
+ private:
+  const Index slice_size_;
+  const typename TTypes<Index>::ConstMatrix Tindices_;
+  const typename TTypes<T, IXDIM + 1>::ConstTensor Tparams_;
+  mutable typename TTypes<T>::Matrix Tout_;
+  std::atomic<Index>* error_loc_;
+};
+
+}  // namespace generator
+
+namespace functor {
+
+template <typename T, typename Index, int IXDIM>
+struct GatherNdSlice<CPUDevice, T, Index, IXDIM> {
+  Index operator()(const CPUDevice& d, const Index slice_size,
+                   typename TTypes<int32>::Scalar Tscratch,
+                   typename TTypes<T, IXDIM + 1>::ConstTensor Tparams,
+                   typename TTypes<Index>::ConstMatrix Tindices,
+                   typename TTypes<T>::Matrix Tout) {
+    std::atomic<Index> error_loc(-1);
+
+    const Eigen::DenseIndex batch_size = Tindices.dimension(0);
+#if !defined(EIGEN_HAS_INDEX_LIST)
+    Eigen::Tensor<Eigen::DenseIndex, 1>::Dimensions reshape_dims{{ 1 }};
+    Eigen::array<Eigen::DenseIndex, 1> broadcast_dims{{ batch_size }};
+#else
+    Eigen::IndexList<Eigen::type2index<1> > reshape_dims;
+    Eigen::IndexList<Eigen::DenseIndex> broadcast_dims;
+    broadcast_dims.set(0, batch_size);
+#endif
+    generator::GatherNdSliceGenerator<T, Index, IXDIM> gather_nd_generator(
+        slice_size, Tindices, Tparams, Tout, &error_loc);
+    Tscratch.device(d) = Tscratch.reshape(reshape_dims)
+                             .broadcast(broadcast_dims)
+                             .generate(gather_nd_generator)
+                             .sum();
+
+    // error_loc() returns -1 if there's no out-of-bounds index,
+    // otherwise it returns the location of an OOB index in Tindices.
+    return error_loc.load();
+  }
+};
+
+#define REGISTER_GATHER_ND_FULL(T, Index)                                     \
+  template Index GatherNdSlice<CPUDevice, T, Index, CPU_PROVIDED_IXDIM>::     \
+  operator()(const CPUDevice& d, const Index slice_size,                      \
+             typename TTypes<int32>::Scalar Tscratch,                         \
+             typename TTypes<T, CPU_PROVIDED_IXDIM + 1>::ConstTensor Tparams, \
+             typename TTypes<Index>::ConstMatrix Tindices,                    \
+             typename TTypes<T>::Matrix Tout);
+
+#define REGISTER_GATHER_ND_CPU(type)    \
+  REGISTER_GATHER_ND_FULL(type, int32); \
+  REGISTER_GATHER_ND_FULL(type, int64)
+
+TF_CALL_ALL_TYPES(REGISTER_GATHER_ND_CPU);
+
+}  // namespace functor
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_GATHER_ND_OP_CPU_IMPL_H_
diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl_0.cc b/tensorflow/core/kernels/gather_nd_op_cpu_impl_0.cc
new file mode 100644
index 00000000000..246e9f729b8
--- /dev/null
+++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl_0.cc
@@ -0,0 +1,18 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define CPU_PROVIDED_IXDIM 0
+#include "tensorflow/core/kernels/gather_nd_op_cpu_impl.h"
+#undef CPU_PROVIDED_IXDIM
diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl_1.cc b/tensorflow/core/kernels/gather_nd_op_cpu_impl_1.cc
new file mode 100644
index 00000000000..5b7720fc4ef
--- /dev/null
+++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl_1.cc
@@ -0,0 +1,18 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define CPU_PROVIDED_IXDIM 1
+#include "tensorflow/core/kernels/gather_nd_op_cpu_impl.h"
+#undef CPU_PROVIDED_IXDIM
diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl_2.cc b/tensorflow/core/kernels/gather_nd_op_cpu_impl_2.cc
new file mode 100644
index 00000000000..0f6932394ed
--- /dev/null
+++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl_2.cc
@@ -0,0 +1,18 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define CPU_PROVIDED_IXDIM 2
+#include "tensorflow/core/kernels/gather_nd_op_cpu_impl.h"
+#undef CPU_PROVIDED_IXDIM
diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl_3.cc b/tensorflow/core/kernels/gather_nd_op_cpu_impl_3.cc
new file mode 100644
index 00000000000..1c2aec7820a
--- /dev/null
+++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl_3.cc
@@ -0,0 +1,18 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define CPU_PROVIDED_IXDIM 3
+#include "tensorflow/core/kernels/gather_nd_op_cpu_impl.h"
+#undef CPU_PROVIDED_IXDIM
diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl_4.cc b/tensorflow/core/kernels/gather_nd_op_cpu_impl_4.cc
new file mode 100644
index 00000000000..3e164668c5b
--- /dev/null
+++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl_4.cc
@@ -0,0 +1,18 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define CPU_PROVIDED_IXDIM 4
+#include "tensorflow/core/kernels/gather_nd_op_cpu_impl.h"
+#undef CPU_PROVIDED_IXDIM
diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl_5.cc b/tensorflow/core/kernels/gather_nd_op_cpu_impl_5.cc
new file mode 100644
index 00000000000..7141ea70df9
--- /dev/null
+++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl_5.cc
@@ -0,0 +1,18 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define CPU_PROVIDED_IXDIM 5
+#include "tensorflow/core/kernels/gather_nd_op_cpu_impl.h"
+#undef CPU_PROVIDED_IXDIM
diff --git a/tensorflow/core/kernels/linalg_ops_common.cc b/tensorflow/core/kernels/linalg_ops_common.cc
index 9fbb6db9cf0..575c7e2e7c2 100644
--- a/tensorflow/core/kernels/linalg_ops_common.cc
+++ b/tensorflow/core/kernels/linalg_ops_common.cc
@@ -90,19 +90,35 @@ void LinearAlgebraOp<Scalar, SupportsBatchOperation>::Compute(
   TensorInputs inputs;
   TensorShapes input_matrix_shapes;
   TensorShape batch_shape;
+  AnalyzeInputs(context, &inputs, &input_matrix_shapes, &batch_shape);
+
+  TensorShapes output_matrix_shapes;
+  TensorOutputs outputs;
+  PrepareOutputs(context, input_matrix_shapes, batch_shape, &outputs,
+                 &output_matrix_shapes);
+
+  // Process the individual matrix problems in parallel using a threadpool.
+  auto shard = [this, &inputs, &input_matrix_shapes, &outputs,
+                &output_matrix_shapes, context](int64 begin, int64 end) {
+    for (int64 i = begin; i < end; ++i) {
+      ComputeTensorSlice(context, i, inputs, input_matrix_shapes, outputs,
+                         output_matrix_shapes);
+    }
+  };
+  auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+  Shard(worker_threads.num_threads, worker_threads.workers,
+        batch_shape.num_elements(), GetCostPerUnit(input_matrix_shapes), shard);
+}
+
+template <typename Scalar, bool SupportsBatchOperation>
+void LinearAlgebraOp<Scalar, SupportsBatchOperation>::AnalyzeInputs(
+    OpKernelContext* context, TensorInputs* inputs,
+    TensorShapes* input_matrix_shapes, TensorShape* batch_shape) {
   int input_rank = -1;
-  int num_batch_matrices = 1;
   for (int i = 0; i < NumMatrixInputs(context); ++i) {
     const Tensor& in = context->input(i);
     if (i == 0) {
-      // If the tensor rank is greater than 2, we consider the inner-most
-      // dimensions as matrices, and loop over all the other outer ("batch")
-      // dimensions to compute the results.
       input_rank = in.dims();
-      for (int dim = 0; dim < input_rank - 2; ++dim) {
-        num_batch_matrices *= in.dim_size(dim);
-        batch_shape.AddDim(in.dim_size(dim));
-      }
       if (SupportsBatchOperation) {
         OP_REQUIRES(
             context, input_rank >= 2,
@@ -114,6 +130,13 @@ void LinearAlgebraOp<Scalar, SupportsBatchOperation>::Compute(
             errors::InvalidArgument("Input tensor ", i,
                                     " must have rank == 2, got", input_rank));
       }
+
+      // If the tensor rank is greater than 2, we consider the inner-most
+      // dimensions as matrices, and loop over all the other outer ("batch")
+      // dimensions to compute the results.
+      for (int dim = 0; dim < input_rank - 2; ++dim) {
+        batch_shape->AddDim(in.dim_size(dim));
+      }
     } else {
       // Make sure that all inputs have the same rank and outer dimensions.
       OP_REQUIRES(context, input_rank == in.dims(),
@@ -121,7 +144,7 @@ void LinearAlgebraOp<Scalar, SupportsBatchOperation>::Compute(
                       "All input tensors must have the same rank."));
       for (int dim = 0; dim < input_rank - 2; ++dim) {
         OP_REQUIRES(
-            context, in.dim_size(dim) == batch_shape.dim_size(dim),
+            context, in.dim_size(dim) == batch_shape->dim_size(dim),
             errors::InvalidArgument(
                 "All input tensors must have the same outer dimensions."));
       }
@@ -131,64 +154,59 @@ void LinearAlgebraOp<Scalar, SupportsBatchOperation>::Compute(
     const int col_dimension = input_rank - 1;
     const int64 num_rows = in.dim_size(row_dimension);
     const int64 num_cols = in.dim_size(col_dimension);
-    input_matrix_shapes.push_back(TensorShape({num_rows, num_cols}));
-    inputs.push_back(in);
+    // TODO(rmlarsen): Use emplace_back when it is added to InlinedVector. Same
+    // in several places below.
+    input_matrix_shapes->push_back(TensorShape({num_rows, num_cols}));
+    inputs->push_back(in);
   }
   // Have the derived class validate that the inputs are as expected.
-  ValidateInputMatrixShapes(context, input_matrix_shapes);
-
-  // Get shape for each of the matrix outputs.
-  const TensorShapes output_matrix_shapes =
-      GetOutputMatrixShapes(input_matrix_shapes);
-  // Make sure the number of outputs is what the derived class expects.
-  OP_REQUIRES(
-      context, output_matrix_shapes.size() == context->num_outputs(),
-      errors::Internal(
-          "Derived class expected (%d) output matrices for op, got (%d).",
-          output_matrix_shapes.size(), context->num_outputs()));
-
-  // Allocate outputs.
-  TensorShapes output_shapes;
-  TensorOutputs outputs;
-  for (int i = 0; i < context->num_outputs(); ++i) {
-    OP_REQUIRES(context, output_matrix_shapes[i].dims() <= 2,
-                errors::InvalidArgument(
-                    "Rank of matrix output no. %d must be 0, 1 or 2, got %d.",
-                    i, output_matrix_shapes[i].dims()));
-
-    // The final output has the shape of the outer batch dimensions concatenated
-    // with the output_matrix_shape (if the output is not scalar).
-    TensorShape output_shape;
-    if (input_rank == 2) {
-      output_shape = output_matrix_shapes[i];
-    } else {
-      output_shape = batch_shape;
-      // Add the inner dimensions that depend on the operation implemented by
-      // the derived class.
-      for (int dim = 0; dim < output_matrix_shapes[i].dims(); ++dim) {
-        output_shape.AddDim(output_matrix_shapes[i].dim_size(dim));
-      }
-    }
-    output_shapes.push_back(output_shape);
-    Tensor* out = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(i, output_shape, &out));
-    outputs.push_back(out);
-  }
-
-  auto shard = [this, &inputs, &input_matrix_shapes, &outputs,
-                &output_matrix_shapes, context](int64 begin, int64 end) {
-    for (int64 i = begin; i < end; ++i) {
-      ComputeTensorSlice(context, i, inputs, input_matrix_shapes, outputs,
-                         output_matrix_shapes);
-    }
-  };
-  auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-  Shard(worker_threads.num_threads, worker_threads.workers, num_batch_matrices,
-        GetCostPerUnit(input_matrix_shapes), shard);
+  ValidateInputMatrixShapes(context, *input_matrix_shapes);
 }
 
-template <typename Scalar, bool SupportsBatchOperationT>
-void LinearAlgebraOp<Scalar, SupportsBatchOperationT>::ComputeTensorSlice(
+template <typename Scalar, bool SupportsBatchOperation>
+void LinearAlgebraOp<Scalar, SupportsBatchOperation>::PrepareOutputs(
+    OpKernelContext* context, const TensorShapes& input_matrix_shapes,
+    const TensorShape& batch_shape, TensorOutputs* outputs,
+    TensorShapes* output_matrix_shapes) {
+  // Get shape for each of the matrix outputs produced by the derived class.
+  *output_matrix_shapes = GetOutputMatrixShapes(input_matrix_shapes);
+  const int num_outputs = output_matrix_shapes->size();
+
+  // Make sure the number of op outputs is what the derived class expects.
+  OP_REQUIRES(
+      context, num_outputs <= context->num_outputs(),
+      errors::Internal(
+          "Derived class expected more outputs (%d) that the op has (%d).",
+          num_outputs, context->num_outputs()));
+
+  // Allocate outputs.
+  for (int i = 0; i < context->num_outputs(); ++i) {
+    TensorShape output_tensor_shape({0});
+    if (i < num_outputs) {
+      // This output is used, set up output shape and allocate it.
+      const TensorShape& output_matrix_shape = output_matrix_shapes->at(i);
+      OP_REQUIRES(context, output_matrix_shape.dims() <= 2,
+                  errors::InvalidArgument(
+                      "Rank of matrix output no. %d must be 0, 1 or 2, got %d.",
+                      i, output_matrix_shape.dims()));
+
+      // The final output has the shape of the outer batch dimensions
+      // concatenated with the output_matrix_shape (if the output is not
+      // scalar).
+      output_tensor_shape = batch_shape;
+      for (int dim = 0; dim < output_matrix_shape.dims(); ++dim) {
+        output_tensor_shape.AddDim(output_matrix_shape.dim_size(dim));
+      }
+    }
+    Tensor* out = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(i, output_tensor_shape, &out));
+    outputs->push_back(out);
+  }
+}
+
+template <typename Scalar, bool SupportsBatchOperation>
+void LinearAlgebraOp<Scalar, SupportsBatchOperation>::ComputeTensorSlice(
     OpKernelContext* context, int64 matrix_index, const TensorInputs& inputs,
     const TensorShapes& input_matrix_shapes, const TensorOutputs& outputs,
     const TensorShapes& output_matrix_shapes) {
@@ -204,7 +222,7 @@ void LinearAlgebraOp<Scalar, SupportsBatchOperationT>::ComputeTensorSlice(
   }
 
   MatrixMaps matrix_outputs;
-  for (int i = 0; i < outputs.size(); ++i) {
+  for (int i = 0; i < output_matrix_shapes.size(); ++i) {
     // The output matrix shape may not be a matrix.
     int num_output_rows = output_matrix_shapes[i].dims() >= 1
                               ? output_matrix_shapes[i].dim_size(0)
diff --git a/tensorflow/core/kernels/linalg_ops_common.h b/tensorflow/core/kernels/linalg_ops_common.h
index dda83ad2d12..3be9853c6cf 100644
--- a/tensorflow/core/kernels/linalg_ops_common.h
+++ b/tensorflow/core/kernels/linalg_ops_common.h
@@ -43,7 +43,7 @@ template <typename Scalar, bool SupportsBatchOperationT>
 class LinearAlgebraOp : public OpKernel {
  public:
   explicit LinearAlgebraOp(OpKernelConstruction* context) : OpKernel(context) {}
-  ~LinearAlgebraOp() override {}
+
   void Compute(OpKernelContext* context) override;
 
  protected:
@@ -80,19 +80,26 @@ class LinearAlgebraOp : public OpKernel {
                                    const TensorShapes& input_matrix_shapes);
 
   // Returns the output shapes of each individual matrix operation. Output
-  // matrices shapes must be rank 0, 1, or 2.  Scalar outputs are rank 0.
-  // For many ops the output dimensions are the same as the input dimensions,
+  // matrices shapes must be rank 0, 1, or 2. Scalar outputs are rank 0.
+  //
+  // The derived class may return a number of shapes (N) less than
+  // context->num_outputs() (M) to indicate that a only leading subset of
+  // the outputs will be populated. In this case, a dummy scalar tensor with
+  // value zero will be return for the last M-N outputs.
+  //
+  // For many ops, the output dimensions are the same as the input dimensions,
   // so we provide that as a default implementation for convenience.
   virtual TensorShapes GetOutputMatrixShapes(
       const TensorShapes& input_matrix_shapes) const {
     return input_matrix_shapes;
   }
 
-  // Returns the cost per matrix operation. Cost per unit is assumed to be
-  // roughly 1ns, based on comments in core/util/work_sharder.cc.
-  // Many linear algebra ops take roughly max(m,n) * min(m,n)^2, where the first
-  // input matrix is m-by-n. We provide that as a default implementation for
-  // convenience.
+  // Returns the cost per matrix operation. This is used to determine the
+  // number of threads to use for parallelizing calls to ComputeMatrix in
+  // batch mode. Cost per unit is assumed to be roughly 1ns, based on comments
+  // in core/util/work_sharder.cc. Many linear algebra ops take roughly max(m,n)
+  // * min(m,n)^2, where the first input matrix is m-by-n. We provide that as a
+  // default implementation for convenience.
   virtual int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const {
     double m = static_cast<double>(input_matrix_shapes[0].dim_size(0));
     double n = static_cast<double>(input_matrix_shapes[0].dim_size(1));
@@ -111,7 +118,9 @@ class LinearAlgebraOp : public OpKernel {
   // Performs a single matrix computation given input matrices, and
   // stores the result in outputs. For batch operations, this will be called
   // repeatedly for a single call to Compute() when multiple matrices exist in
-  // input Tensors with rank > 2.
+  // input Tensors with rank > 2. In this case the calls to ComputeMatrix are
+  // parallelized. The number of threads used is determined by a cost model from
+  // the value returned by GetCostPerUnit().
   virtual void ComputeMatrix(OpKernelContext* context,
                              const ConstMatrixMaps& inputs,
                              MatrixMaps* outputs) = 0;
@@ -142,6 +151,15 @@ class LinearAlgebraOp : public OpKernel {
                           const TensorShapes& input_matrix_shapes,
                           const TensorOutputs& outputs,
                           const TensorShapes& output_matrix_shapes);
+
+  void AnalyzeInputs(OpKernelContext* context, TensorInputs* inputs,
+                     TensorShapes* input_matrix_shapes,
+                     TensorShape* batch_shape);
+
+  void PrepareOutputs(OpKernelContext* context,
+                      const TensorShapes& input_matrix_shapes,
+                      const TensorShape& batch_shape, TensorOutputs* outputs,
+                      TensorShapes* output_matrix_shapes);
 };
 
 // Declare that LinearAlgebraOp is explicitly instantiated in
diff --git a/tensorflow/core/kernels/reader_ops.cc b/tensorflow/core/kernels/reader_ops.cc
index 1c7fbae81cd..bb8e35cc089 100644
--- a/tensorflow/core/kernels/reader_ops.cc
+++ b/tensorflow/core/kernels/reader_ops.cc
@@ -55,8 +55,9 @@ class ReaderVerbAsyncOpKernel : public AsyncOpKernel {
 
   void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
     ReaderInterface* reader;
-    OP_REQUIRES_OK(context,
-                   GetResourceFromContext(context, "reader_handle", &reader));
+    OP_REQUIRES_OK_ASYNC(
+        context, GetResourceFromContext(context, "reader_handle", &reader),
+        done);
     thread_pool_->Schedule([this, context, reader, done]() {
       ComputeWithReader(context, reader);
       reader->Unref();
diff --git a/tensorflow/core/kernels/self_adjoint_eig_op.cc b/tensorflow/core/kernels/self_adjoint_eig_op.cc
index 39aa69be529..9d3a411f3b2 100644
--- a/tensorflow/core/kernels/self_adjoint_eig_op.cc
+++ b/tensorflow/core/kernels/self_adjoint_eig_op.cc
@@ -25,8 +25,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
-// TODO(rmlarsen): Change this op to return the eigenvalues and eigenvectors in
-// separate output tensors.
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/self_adjoint_eig_v2_op.cc b/tensorflow/core/kernels/self_adjoint_eig_v2_op.cc
new file mode 100644
index 00000000000..1b457ebe9ef
--- /dev/null
+++ b/tensorflow/core/kernels/self_adjoint_eig_v2_op.cc
@@ -0,0 +1,91 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/linalg_ops.cc.
+
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/Eigen/Eigenvalues"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/linalg_ops_common.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+template <class Scalar, bool SupportsBatchOperation>
+class SelfAdjointEigV2Op
+    : public LinearAlgebraOp<Scalar, SupportsBatchOperation> {
+ public:
+  typedef LinearAlgebraOp<Scalar, SupportsBatchOperation> Base;
+
+  explicit SelfAdjointEigV2Op(OpKernelConstruction* context) : Base(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("compute_v", &compute_v_));
+  }
+
+  using TensorShapes = typename Base::TensorShapes;
+  using Matrix = typename Base::Matrix;
+  using MatrixMaps = typename Base::MatrixMaps;
+  using ConstMatrixMap = typename Base::ConstMatrixMap;
+  using ConstMatrixMaps = typename Base::ConstMatrixMaps;
+
+  TensorShapes GetOutputMatrixShapes(
+      const TensorShapes& input_matrix_shapes) const final {
+    int64 n = input_matrix_shapes[0].dim_size(0);
+    if (compute_v_) {
+      return TensorShapes({TensorShape({n}), TensorShape({n, n})});
+    } else {
+      return TensorShapes({TensorShape({n})});
+    }
+  }
+
+  void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
+                     MatrixMaps* outputs) final {
+    const int64 rows = inputs[0].rows();
+    if (rows == 0) {
+      // If X is an empty matrix (0 rows, 0 col), X * X' == X.
+      // Therefore, we return X.
+      return;
+    }
+
+    Eigen::SelfAdjointEigenSolver<Matrix> eig(
+        inputs[0],
+        compute_v_ ? Eigen::ComputeEigenvectors : Eigen::EigenvaluesOnly);
+    OP_REQUIRES(
+        context, eig.info() == Eigen::Success,
+        errors::InvalidArgument("Self Adjoint Eigen decomposition was not "
+                                "successful. The input might not be valid."));
+
+    outputs->at(0) = eig.eigenvalues();
+    if (compute_v_) {
+      outputs->at(1) = eig.eigenvectors();
+    }
+  }
+
+ private:
+  bool compute_v_;
+};
+
+REGISTER_LINALG_OP("SelfAdjointEigV2", (SelfAdjointEigV2Op<float, false>),
+                   float);
+REGISTER_LINALG_OP("SelfAdjointEigV2", (SelfAdjointEigV2Op<double, false>),
+                   double);
+REGISTER_LINALG_OP("BatchSelfAdjointEigV2", (SelfAdjointEigV2Op<float, true>),
+                   float);
+REGISTER_LINALG_OP("BatchSelfAdjointEigV2", (SelfAdjointEigV2Op<double, true>),
+                   double);
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/svd_op.cc b/tensorflow/core/kernels/svd_op.cc
new file mode 100644
index 00000000000..c3686947dda
--- /dev/null
+++ b/tensorflow/core/kernels/svd_op.cc
@@ -0,0 +1,105 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/linalg_ops.cc.
+#include <algorithm>
+
+#include "third_party/eigen3/Eigen/SVD"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/linalg_ops_common.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+template <class Scalar, bool SupportsBatchOperation>
+class SvdOp : public LinearAlgebraOp<Scalar, SupportsBatchOperation> {
+ public:
+  typedef LinearAlgebraOp<Scalar, SupportsBatchOperation> Base;
+
+  explicit SvdOp(OpKernelConstruction* context) : Base(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("compute_uv", &compute_uv_));
+    OP_REQUIRES_OK(context, context->GetAttr("full_matrices", &full_matrices_));
+  }
+
+  using TensorShapes = typename Base::TensorShapes;
+
+  void ValidateInputMatrixShapes(
+      OpKernelContext* context,
+      const TensorShapes& input_matrix_shapes) const final {
+    Base::ValidateSingleMatrix(context, input_matrix_shapes);
+  }
+
+  TensorShapes GetOutputMatrixShapes(
+      const TensorShapes& input_matrix_shapes) const final {
+    int64 m = input_matrix_shapes[0].dim_size(0);
+    int64 n = input_matrix_shapes[0].dim_size(1);
+    int64 min_size = std::min(m, n);
+    if (compute_uv_) {
+      return TensorShapes({TensorShape({min_size}),
+                           TensorShape({m, full_matrices_ ? m : min_size}),
+                           TensorShape({n, full_matrices_ ? n : min_size})});
+    } else {
+      return TensorShapes({TensorShape({min_size})});
+    }
+  }
+
+  // TODO(rmlarsen): This should depend on compute_uv. See b/30409375.
+  int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final {
+    double m = static_cast<double>(input_matrix_shapes[0].dim_size(0));
+    double n = static_cast<double>(input_matrix_shapes[0].dim_size(1));
+    double cost = 12 * std::max(m, n) * std::min(m, n) * std::min(m, n);
+    return cost >= static_cast<double>(kint64max) ? kint64max
+                                                  : static_cast<int64>(cost);
+  }
+
+  using Matrix = typename Base::Matrix;
+  using MatrixMaps = typename Base::MatrixMaps;
+  using ConstMatrixMap = typename Base::ConstMatrixMap;
+  using ConstMatrixMaps = typename Base::ConstMatrixMaps;
+
+  void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
+                     MatrixMaps* outputs) final {
+    Eigen::JacobiSVD<Matrix, Eigen::HouseholderQRPreconditioner> svd;
+    if (compute_uv_) {
+      svd.compute(inputs[0],
+                  (full_matrices_ ? Eigen::ComputeFullU | Eigen::ComputeFullV
+                                  : Eigen::ComputeThinU | Eigen::ComputeThinV));
+      outputs->at(0) = svd.singularValues();
+      outputs->at(1) = svd.matrixU();
+      outputs->at(2) = svd.matrixV();
+    } else {
+      svd.compute(inputs[0]);
+      outputs->at(0) = svd.singularValues();
+    }
+  }
+
+ private:
+  bool compute_uv_;
+  bool full_matrices_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(SvdOp);
+};
+
+REGISTER_LINALG_OP("Svd", (SvdOp<float, false>), float);
+REGISTER_LINALG_OP("Svd", (SvdOp<double, false>), double);
+REGISTER_LINALG_OP("BatchSvd", (SvdOp<float, true>), float);
+REGISTER_LINALG_OP("BatchSvd", (SvdOp<double, true>), double);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/lib/core/threadpool.cc b/tensorflow/core/lib/core/threadpool.cc
index 59225049fa7..c3704da0b12 100644
--- a/tensorflow/core/lib/core/threadpool.cc
+++ b/tensorflow/core/lib/core/threadpool.cc
@@ -29,11 +29,14 @@ namespace thread {
 
 struct EigenEnvironment {
   typedef Thread EnvThread;
-  struct Task {
+  struct TaskImpl {
     std::function<void()> f;
     Context context;
     uint64 trace_id;
   };
+  struct Task {
+    std::unique_ptr<TaskImpl> f;
+  };
 
   Env* const env_;
   const ThreadOptions thread_options_;
@@ -58,17 +61,21 @@ struct EigenEnvironment {
       port::Tracing::RecordEvent(port::Tracing::EventCategory::kScheduleClosure,
                                  id);
     }
-    return Task{std::move(f), Context(), id};
+    return Task{
+        std::unique_ptr<TaskImpl>(new TaskImpl{
+            std::move(f), Context(ContextKind::kThread), id,
+        }),
+    };
   }
 
   void ExecuteTask(const Task& t) {
-    WithContext wc(t.context);
-    if (t.trace_id != 0) {
+    WithContext wc(t.f->context);
+    if (t.f->trace_id != 0) {
       port::Tracing::ScopedActivity region(
-          port::Tracing::EventCategory::kRunClosure, t.trace_id);
-      t.f();
+          port::Tracing::EventCategory::kRunClosure, t.f->trace_id);
+      t.f->f();
     } else {
-      t.f();
+      t.f->f();
     }
   }
 };
diff --git a/tensorflow/core/lib/monitoring/counter.h b/tensorflow/core/lib/monitoring/counter.h
index 7de85b75cb6..0fcbe90ea89 100644
--- a/tensorflow/core/lib/monitoring/counter.h
+++ b/tensorflow/core/lib/monitoring/counter.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <atomic>
 #include <map>
 
+#include "tensorflow/core/lib/monitoring/export_registry.h"
 #include "tensorflow/core/lib/monitoring/metric_def.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
@@ -73,11 +74,14 @@ class CounterCell {
 template <int NumLabels>
 class Counter {
  public:
-  ~Counter() {}
+  ~Counter() {
+    // Deleted here, before the metric_def is destroyed.
+    registration_handle_.reset();
+  }
 
-  explicit Counter(
-      const MetricDef<MetricKind::CUMULATIVE, int64, NumLabels>& metric_def)
-      : metric_def_(metric_def) {}
+  // Creates the metric based on the metric-definition.
+  static Counter* New(
+      const MetricDef<MetricKind::CUMULATIVE, int64, NumLabels>& metric_def);
 
   // Retrieves the cell for the specified labels, creating it on demand if
   // not already present.
@@ -85,12 +89,20 @@ class Counter {
   CounterCell* GetCell(const Labels&... labels) LOCKS_EXCLUDED(mu_);
 
  private:
+  explicit Counter(
+      const MetricDef<MetricKind::CUMULATIVE, int64, NumLabels>& metric_def)
+      : metric_def_(metric_def),
+        registration_handle_(
+            ExportRegistry::Default()->Register(&metric_def_)) {}
+
   mutable mutex mu_;
 
   // The metric definition. This will be used to identify the metric when we
   // register it for exporting.
   const MetricDef<MetricKind::CUMULATIVE, int64, NumLabels> metric_def_;
 
+  std::unique_ptr<ExportRegistry::RegistrationHandle> registration_handle_;
+
   using LabelArray = std::array<string, NumLabels>;
   std::map<LabelArray, CounterCell> cells_ GUARDED_BY(mu_);
 
@@ -101,6 +113,12 @@ class Counter {
 //  Implementation details follow. API readers may skip.
 ////
 
+template <int NumLabels>
+Counter<NumLabels>* Counter<NumLabels>::New(
+    const MetricDef<MetricKind::CUMULATIVE, int64, NumLabels>& metric_def) {
+  return new Counter<NumLabels>(metric_def);
+}
+
 inline void CounterCell::IncrementBy(const int64 step) {
   DCHECK_LE(0, step) << "Must not decrement cumulative metrics.";
   value_ += step;
diff --git a/tensorflow/core/lib/monitoring/counter_test.cc b/tensorflow/core/lib/monitoring/counter_test.cc
index 0e42aed794d..2bf361a534a 100644
--- a/tensorflow/core/lib/monitoring/counter_test.cc
+++ b/tensorflow/core/lib/monitoring/counter_test.cc
@@ -21,26 +21,22 @@ namespace tensorflow {
 namespace monitoring {
 namespace {
 
-class LabeledCounterTest : public ::testing::Test {
- protected:
-  LabeledCounterTest() {}
+auto* counter_with_labels =
+    Counter<1>::New({"/tensorflow/test/counter_with_labels",
+                     "Counter with one label.", "One label"});
 
-  Counter<1> counter_with_labels_{{"/tensorflow/test/counter_with_labels_",
-                                   "Counter with one label.", "One label"}};
-};
-
-TEST_F(LabeledCounterTest, InitializedWithZero) {
-  EXPECT_EQ(0, counter_with_labels_.GetCell("Empty")->value());
+TEST(LabeledCounterTest, InitializedWithZero) {
+  EXPECT_EQ(0, counter_with_labels->GetCell("Empty")->value());
 }
 
-TEST_F(LabeledCounterTest, GetCell) {
-  auto* cell = counter_with_labels_.GetCell("GetCellOp");
+TEST(LabeledCounterTest, GetCell) {
+  auto* cell = counter_with_labels->GetCell("GetCellOp");
   EXPECT_EQ(0, cell->value());
 
   cell->IncrementBy(42);
   EXPECT_EQ(42, cell->value());
 
-  auto* same_cell = counter_with_labels_.GetCell("GetCellOp");
+  auto* same_cell = counter_with_labels->GetCell("GetCellOp");
   EXPECT_EQ(42, same_cell->value());
 
   same_cell->IncrementBy(58);
@@ -48,34 +44,31 @@ TEST_F(LabeledCounterTest, GetCell) {
   EXPECT_EQ(100, same_cell->value());
 }
 
-using LabeledCounterDeathTest = LabeledCounterTest;
-
-TEST_F(LabeledCounterDeathTest, DiesOnDecrement) {
+TEST(LabeledCounterDeathTest, DiesOnDecrement) {
   EXPECT_DEBUG_DEATH(
-      { counter_with_labels_.GetCell("DyingOp")->IncrementBy(-1); },
+      { counter_with_labels->GetCell("DyingOp")->IncrementBy(-1); },
       "decrement");
 }
 
-class UnlabeledCounterTest : public ::testing::Test {
- protected:
-  UnlabeledCounterTest() {}
+auto* init_counter_without_labels = Counter<0>::New(
+    {"/tensorflow/test/init_counter_without_labels",
+     "Counter without any labels to check if it is initialized as 0."});
 
-  Counter<0> counter_without_labels_{
-      {"/tensorflow/test/counter0", "Counter without any labels."}};
-};
-
-TEST_F(UnlabeledCounterTest, InitializedWithZero) {
-  EXPECT_EQ(0, counter_without_labels_.GetCell()->value());
+TEST(UnlabeledCounterTest, InitializedWithZero) {
+  EXPECT_EQ(0, init_counter_without_labels->GetCell()->value());
 }
 
-TEST_F(UnlabeledCounterTest, GetCell) {
-  auto* cell = counter_without_labels_.GetCell();
+auto* counter_without_labels = Counter<0>::New(
+    {"/tensorflow/test/counter_without_labels", "Counter without any labels."});
+
+TEST(UnlabeledCounterTest, GetCell) {
+  auto* cell = counter_without_labels->GetCell();
   EXPECT_EQ(0, cell->value());
 
   cell->IncrementBy(42);
   EXPECT_EQ(42, cell->value());
 
-  auto* same_cell = counter_without_labels_.GetCell();
+  auto* same_cell = counter_without_labels->GetCell();
   EXPECT_EQ(42, same_cell->value());
 
   same_cell->IncrementBy(58);
@@ -83,11 +76,14 @@ TEST_F(UnlabeledCounterTest, GetCell) {
   EXPECT_EQ(100, same_cell->value());
 }
 
-using UnlabeledCounterDeathTest = UnlabeledCounterTest;
+auto* dead_counter_without_labels = Counter<0>::New(
+    {"/tensorflow/test/dead_counter_without_labels",
+     "Counter without any labels which goes on to die on decrement."});
 
-TEST_F(UnlabeledCounterDeathTest, DiesOnDecrement) {
-  EXPECT_DEBUG_DEATH({ counter_without_labels_.GetCell()->IncrementBy(-1); },
-                     "decrement");
+TEST(UnlabeledCounterDeathTest, DiesOnDecrement) {
+  EXPECT_DEBUG_DEATH(
+      { dead_counter_without_labels->GetCell()->IncrementBy(-1); },
+      "decrement");
 }
 
 }  // namespace
diff --git a/tensorflow/core/lib/monitoring/export_registry.cc b/tensorflow/core/lib/monitoring/export_registry.cc
new file mode 100644
index 00000000000..4c0eed668a4
--- /dev/null
+++ b/tensorflow/core/lib/monitoring/export_registry.cc
@@ -0,0 +1,53 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/monitoring/export_registry.h"
+
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace monitoring {
+
+ExportRegistry* ExportRegistry::Default() {
+  static ExportRegistry* default_registry = new ExportRegistry();
+  return default_registry;
+}
+
+std::unique_ptr<ExportRegistry::RegistrationHandle> ExportRegistry::Register(
+    const AbstractMetricDef* const metric_def) {
+  mutex_lock l(mu_);
+
+  LOG(INFO) << "Here." << registry_.size();
+  const auto found_it = registry_.find(metric_def->name());
+  if (found_it != registry_.end()) {
+    LOG(INFO) << "Here2";
+    LOG(FATAL) << "Cannot register 2 metrics with the same name: "
+               << metric_def->name();
+  }
+  LOG(INFO) << "Here3";
+  registry_.insert({metric_def->name(), metric_def});
+  LOG(INFO) << "Here4." << registry_.size();
+
+  return std::unique_ptr<RegistrationHandle>(
+      new RegistrationHandle(this, metric_def));
+}
+
+void ExportRegistry::Unregister(const AbstractMetricDef* const metric_def) {
+  mutex_lock l(mu_);
+  registry_.erase(metric_def->name());
+}
+
+}  // namespace monitoring
+}  // namespace tensorflow
diff --git a/tensorflow/core/lib/monitoring/export_registry.h b/tensorflow/core/lib/monitoring/export_registry.h
new file mode 100644
index 00000000000..aca47735718
--- /dev/null
+++ b/tensorflow/core/lib/monitoring/export_registry.h
@@ -0,0 +1,88 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_EXPORT_REGISTRY_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_EXPORT_REGISTRY_H_
+
+#include <map>
+#include <memory>
+
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/monitoring/metric_def.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+
+namespace tensorflow {
+namespace monitoring {
+
+// An export registry for metrics.
+//
+// Metrics are registered here so that their state can be exported later using
+// an exporter.
+//
+// This class is thread-safe.
+class ExportRegistry {
+ public:
+  ~ExportRegistry() = default;
+
+  // Returns the default registry for the process.
+  //
+  // This registry belongs to this library and should never be deleted.
+  static ExportRegistry* Default();
+
+  // Registers the metric and returns a Registration object. The destruction of
+  // the registration object would cause the metric to be unregistered from this
+  // registry.
+  //
+  // IMPORTANT: Delete the handle before the metric-def is deleted.
+  class RegistrationHandle;
+  std::unique_ptr<RegistrationHandle> Register(
+      const AbstractMetricDef* metric_def)
+      LOCKS_EXCLUDED(mu_) TF_MUST_USE_RESULT;
+
+ private:
+  ExportRegistry() = default;
+
+  // Unregisters the metric from this registry. This is private because the
+  // public interface provides a Registration handle which automatically calls
+  // this upon destruction.
+  void Unregister(const AbstractMetricDef* metric_def) LOCKS_EXCLUDED(mu_);
+
+  mutable mutex mu_;
+  std::map<StringPiece, const AbstractMetricDef*> registry_ GUARDED_BY(mu_);
+};
+
+////
+// Implementation details follow. API readers may skip.
+////
+
+class ExportRegistry::RegistrationHandle {
+ public:
+  RegistrationHandle(ExportRegistry* const export_registry,
+                     const AbstractMetricDef* const metric_def)
+      : export_registry_(export_registry), metric_def_(metric_def) {}
+
+  ~RegistrationHandle() { export_registry_->Unregister(metric_def_); }
+
+ private:
+  ExportRegistry* const export_registry_;
+  const AbstractMetricDef* const metric_def_;
+};
+
+}  // namespace monitoring
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_EXPORT_REGISTRY_H_
diff --git a/tensorflow/core/lib/monitoring/export_registry_test.cc b/tensorflow/core/lib/monitoring/export_registry_test.cc
new file mode 100644
index 00000000000..a7cb0e8e52e
--- /dev/null
+++ b/tensorflow/core/lib/monitoring/export_registry_test.cc
@@ -0,0 +1,60 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/monitoring/export_registry.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace monitoring {
+namespace {
+
+TEST(ExportRegistryTest, RegistrationUnregistration) {
+  auto* export_registry = ExportRegistry::Default();
+  const MetricDef<MetricKind::CUMULATIVE, int64, 0> metric_def0(
+      "/tensorflow/metric0", "An example metric with no labels.");
+  const MetricDef<MetricKind::GAUGE, double, 1> metric_def1(
+      "/tensorflow/metric1", "An example metric with one label.", "LabelName");
+
+  {
+    // Enclosed in a scope so that we unregister before the stack variables
+    // above are destroyed.
+
+    std::unique_ptr<ExportRegistry::RegistrationHandle> handle0 =
+        export_registry->Register(&metric_def0);
+    std::unique_ptr<ExportRegistry::RegistrationHandle> handle1 =
+        export_registry->Register(&metric_def1);
+
+    handle0.reset();
+
+    // Able to register again because it was unregistered earlier.
+    handle0 = export_registry->Register(&metric_def0);
+  }
+}
+
+TEST(ExportRegistryDeathTest, DuplicateRegistration) {
+  auto* export_registry = ExportRegistry::Default();
+  const MetricDef<MetricKind::CUMULATIVE, int64, 0> metric_def(
+      "/tensorflow/metric", "An example metric with no labels.");
+
+  auto handle = export_registry->Register(&metric_def);
+  EXPECT_DEATH(
+      { auto duplicate_handle = export_registry->Register(&metric_def); },
+      "/tensorflow/metric");
+}
+
+}  // namespace
+}  // namespace monitoring
+}  // namespace tensorflow
diff --git a/tensorflow/core/lib/monitoring/metric_def.h b/tensorflow/core/lib/monitoring/metric_def.h
index f7037359eb3..01210e370ad 100644
--- a/tensorflow/core/lib/monitoring/metric_def.h
+++ b/tensorflow/core/lib/monitoring/metric_def.h
@@ -33,7 +33,7 @@ class StringLiteral {
  public:
   // We allow implicit conversions here on purpose.
   template <int N>
-  StringLiteral(const char (&data)[N]) : literal_(data, N) {}
+  StringLiteral(const char (&data)[N]) : literal_(data, N - 1) {}
 
   // This ctor will be called for non-literals, causing compile-time failure.
   template <typename NotStringLiteral>
diff --git a/tensorflow/core/lib/monitoring/metric_def_test.cc b/tensorflow/core/lib/monitoring/metric_def_test.cc
new file mode 100644
index 00000000000..5d371cca1b1
--- /dev/null
+++ b/tensorflow/core/lib/monitoring/metric_def_test.cc
@@ -0,0 +1,46 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/monitoring/metric_def.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace monitoring {
+namespace {
+
+TEST(MetricDefTest, Simple) {
+  const MetricDef<MetricKind::CUMULATIVE, int64, 0> metric_def0(
+      "/tensorflow/metric0", "An example metric with no labels.");
+  const MetricDef<MetricKind::GAUGE, double, 1> metric_def1(
+      "/tensorflow/metric1", "An example metric with one label.", "LabelName");
+
+  EXPECT_EQ("/tensorflow/metric0", metric_def0.name());
+  EXPECT_EQ("/tensorflow/metric1", metric_def1.name());
+
+  EXPECT_EQ(MetricKind::CUMULATIVE, metric_def0.kind());
+  EXPECT_EQ(MetricKind::GAUGE, metric_def1.kind());
+
+  EXPECT_EQ("An example metric with no labels.", metric_def0.description());
+  EXPECT_EQ("An example metric with one label.", metric_def1.description());
+
+  EXPECT_EQ(0, metric_def0.label_descriptions().size());
+  ASSERT_EQ(1, metric_def1.label_descriptions().size());
+  EXPECT_EQ("LabelName", metric_def1.label_descriptions()[0]);
+}
+
+}  // namespace
+}  // namespace monitoring
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 5ba4e0cce69..a6968888678 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -849,6 +849,32 @@ REGISTER_OP("EditDistance")
     .Attr("normalize: bool = true")
     .Attr("T: type")
     .Output("output: float")
+    .SetShapeFn([](InferenceContext* c) {
+      const Tensor* hypothesis_shape_t = c->input_tensor(2);
+      const Tensor* truth_shape_t = c->input_tensor(5);
+      if (hypothesis_shape_t == nullptr || truth_shape_t == nullptr) {
+        // We need to know the runtime shape of the two tensors,
+        // or else the output shape is unknown.
+        return shape_inference::UnknownShape(c);
+      }
+
+      if (hypothesis_shape_t->NumElements() != truth_shape_t->NumElements()) {
+        return errors::InvalidArgument(
+            "Num elements of hypothesis_shape does not match truth_shape: ",
+            hypothesis_shape_t->NumElements(), " vs. ",
+            truth_shape_t->NumElements());
+      }
+
+      auto h_values = hypothesis_shape_t->flat<int64>();
+      auto t_values = truth_shape_t->flat<int64>();
+      std::vector<const Dimension*> dims(hypothesis_shape_t->NumElements() - 1);
+      for (int i = 0; i < dims.size(); ++i) {
+        dims[i] = c->MakeDim(std::max(h_values(i), t_values(i)));
+      }
+
+      c->set_output(0, c->MakeShape(dims));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Computes the (possibly normalized) Levenshtein Edit Distance.
 
@@ -1782,6 +1808,44 @@ REGISTER_OP("Tile")
     .Input("multiples: int32")
     .Output("output: T")
     .Attr("T: type")
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* input;
+      const Shape* multiples;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &multiples));
+      const Dimension* multiples_dim0 = c->Dim(multiples, 0);
+      if (!c->ValueKnown(multiples_dim0)) {
+        // Length of multiples vector unknown, so output is unknown.
+        //
+        // NOTE: we could potentially merge the input rank with the
+        // multiples length.
+        return shape_inference::UnknownShape(c);
+      }
+
+      int32 rank = c->Value(multiples_dim0);
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), rank, &input));
+      const Tensor* multiples_t = c->input_tensor(1);
+      if (multiples_t == nullptr) {
+        // If multiples vector isn't available, we only know the
+        // output rank, not the sizes.
+        std::vector<const Dimension*> dims;
+        for (int64 i = 0; i < rank; ++i) {
+          dims.push_back(c->UnknownDim());
+        }
+        c->set_output(0, c->MakeShape(dims));
+        return Status::OK();
+      }
+
+      // Multiply each input dimension by its corresponding value
+      // from the multiples tensor.
+      auto multiples_data = multiples_t->vec<int32>();
+      std::vector<const Dimension*> dims(rank);
+      for (int i = 0; i < rank; ++i) {
+        const int32 multiple = multiples_data(i);
+        TF_RETURN_IF_ERROR(c->Multiply(c->Dim(input, i), multiple, &dims[i]));
+      }
+      c->set_output(0, c->MakeShape(dims));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Constructs a tensor by tiling a given tensor.
 
@@ -1966,6 +2030,49 @@ REGISTER_OP("MirrorPadGrad")
     .Output("output: T")
     .Attr("T: type")
     .Attr(GetMirrorPadModeAttrString())
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* paddings;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &paddings));
+      const Dimension* pad_0 = c->Dim(paddings, 0);
+      if (!c->ValueKnown(pad_0)) {
+        // We don't know the rank of the output since the first
+        // padding dimension is unknown.
+        c->set_output(0, c->UnknownShape());
+        return Status::OK();
+      }
+
+      int64 input_rank = c->Value(pad_0);
+      const Shape* input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), input_rank, &input));
+      TF_RETURN_IF_ERROR(
+          c->Merge(paddings, c->Matrix(input_rank, 2), &paddings));
+
+      const Tensor* paddings_t = c->input_tensor(1);
+      if (paddings_t == nullptr) {
+        // Values of 'paddings' is not available, but we know the
+        // input rank, so return the rank of the output with unknown
+        // dimensions.
+        std::vector<const Dimension*> dims;
+        for (int64 i = 0; i < input_rank; ++i) dims.push_back(c->UnknownDim());
+        c->set_output(0, c->MakeShape(dims));
+        return Status::OK();
+      }
+
+      auto paddings_data = paddings_t->matrix<int32>();
+      std::vector<const Dimension*> dims(input_rank);
+      for (int i = 0; i < input_rank; ++i) {
+        const int64 pad0 = static_cast<int64>(paddings_data(i, 0));
+        const int64 pad1 = static_cast<int64>(paddings_data(i, 1));
+        if (pad0 < 0 || pad1 < 0) {
+          return errors::InvalidArgument("Paddings must be non-negative");
+        }
+
+        TF_RETURN_IF_ERROR(
+            c->Subtract(c->Dim(input, i), pad0 + pad1, &dims[i]));
+      }
+      c->set_output(0, c->MakeShape(dims));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor.
 
@@ -2665,6 +2772,76 @@ REGISTER_OP("ExtractImagePatches")
     .Attr("rates: list(int) >= 4")
     .Attr("T: realnumbertype")
     .Attr(GetPaddingAttrString())
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* input_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+
+      std::vector<int32> ksizes;
+      TF_RETURN_IF_ERROR(c->GetAttr("ksizes", &ksizes));
+      if (ksizes.size() != 4) {
+        return errors::InvalidArgument(
+            "ExtractImagePatches requires the ksizes attribute to contain 4 "
+            "values, but got: ",
+            ksizes.size());
+      }
+
+      std::vector<int32> strides;
+      TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides));
+      if (strides.size() != 4) {
+        return errors::InvalidArgument(
+            "ExtractImagePatches requires the stride attribute to contain 4 "
+            "values, but got: ",
+            strides.size());
+      }
+
+      std::vector<int32> rates;
+      TF_RETURN_IF_ERROR(c->GetAttr("rates", &rates));
+      if (rates.size() != 4) {
+        return errors::InvalidArgument(
+            "ExtractImagePatches requires the rates attribute to contain 4 "
+            "values, but got: ",
+            rates.size());
+      }
+
+      int32 ksize_rows = ksizes[1];
+      int32 ksize_cols = ksizes[2];
+
+      int32 stride_rows = strides[1];
+      int32 stride_cols = strides[2];
+
+      int32 rate_rows = rates[1];
+      int32 rate_cols = rates[2];
+
+      int32 ksize_rows_eff = ksize_rows + (ksize_rows - 1) * (rate_rows - 1);
+      int32 ksize_cols_eff = ksize_cols + (ksize_cols - 1) * (rate_cols - 1);
+
+      const Dimension* batch_size_dim = c->Dim(input_shape, 0);
+      const Dimension* in_rows_dim = c->Dim(input_shape, 1);
+      const Dimension* in_cols_dim = c->Dim(input_shape, 2);
+      const Dimension* output_depth_dim = c->Dim(input_shape, 3);
+
+      // At the moment we need to know the values of several fields.
+      TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_rows_dim, "in_rows"));
+      TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_cols_dim, "in_cols"));
+      auto in_rows = c->Value(in_rows_dim);
+      auto in_cols = c->Value(in_cols_dim);
+
+      Padding padding;
+      TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding));
+
+      int64 output_rows, output_cols;
+      int64 padding_before, padding_after;
+      TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose(
+          in_rows, ksize_rows_eff, stride_rows, padding, &output_rows,
+          &padding_before, &padding_after));
+      TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose(
+          in_cols, ksize_cols_eff, stride_cols, padding, &output_cols,
+          &padding_before, &padding_after));
+      const Shape* output_shape = c->MakeShape(
+          {batch_size_dim, output_rows, output_cols, output_depth_dim});
+      c->set_output(0, output_shape);
+      return Status::OK();
+    })
     .Doc(R"doc(
 Extract `patches` from `images` and put them in the "depth" output dimension.
 
@@ -2771,6 +2948,32 @@ REGISTER_OP("OneHot")
     .Output("output: T")
     .Attr("T: type")
     .Attr("TI: {uint8, int32, int64} = DT_INT64")
+    .SetShapeFn([](InferenceContext* c) {
+      int32 axis;
+      TF_RETURN_IF_ERROR(c->GetAttr("axis", &axis));
+      if (axis < -1) return errors::InvalidArgument("axis must be >= -1");
+
+      const Dimension* depth;
+      TF_RETURN_IF_ERROR(c->MakeDimForScalarInput(1, &depth));
+
+      const Shape* indices = c->input(0);
+      if (!c->RankKnown(indices)) return shape_inference::UnknownShape(c);
+
+      int32 new_rank = c->Rank(indices) + 1;
+      // We need to add new_rank to axis in the case the axis is -1 because
+      // C++ returns negative values from % if the dividend is negative.
+      int32 depth_index = (axis + new_rank) % new_rank;
+      // Out shape is indices[0:depth_index] + [depth] + indices[depth_index:].
+      const Shape* front;
+      const Shape* back;
+      const Shape* out;
+      TF_RETURN_IF_ERROR(c->Subshape(indices, 0, depth_index, &front));
+      TF_RETURN_IF_ERROR(c->Subshape(indices, depth_index, &back));
+      TF_RETURN_IF_ERROR(c->Concatenate(front, c->Vector(depth), &front));
+      TF_RETURN_IF_ERROR(c->Concatenate(front, back, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    })
     .Doc(R"doc(
 Returns a one-hot tensor.
 
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index 6516b24f0b5..6345db128e7 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -301,6 +301,38 @@ TEST(ArrayOpsTest, PadD_ShapeFn) {
   }
 }
 
+TEST(ArrayOpsTest, MirrorPadGrad_ShapeFn) {
+  ShapeInferenceTestOp op("MirrorPadGrad");
+  op.input_tensors.resize(2);
+
+  // Inputs are input and paddings.
+  INFER_OK(op, "?;?", "?");
+
+  // First padding dimension is unknown, so rank is unknown.
+  INFER_OK(op, "?;[?,4]", "?");
+
+  // Input tensor rank doesn't match paddings dimension.
+  INFER_ERROR("must be rank 3 but is rank 2", op, "[?,?];[3,2]");
+
+  // Paddings tensor is not a [rank x 2] matrix.
+  INFER_ERROR("Dimension 1 in both shapes must be equal, but are 3 and 2", op,
+              "[?,?,?];[3,3]");
+
+  // Paddings tensor is unknown, but rank is known, so the output
+  // shape is a rank 3 unknown shape.
+  INFER_OK(op, "[?,?,?];[3,2]", "[?,?,?]");
+
+  // Make the paddings tensor known and verify padding values get
+  // subtracted.  E.g., if padding is ((1,10),(2,20),(3,30)) then
+  // values 11,22,23 are subtracted to input dims to get output.
+  Tensor paddings_t(DT_INT32, TensorShape{3, 2});
+  test::FillValues<int32>(&paddings_t, {1, 10, 2, 20, 3, 30});
+  op.input_tensors[1] = &paddings_t;
+
+  INFER_OK(op, "[111,222,333];[3,2]", "[100,200,300]");
+  INFER_OK(op, "[111,?,333];[3,2]", "[100,?,300]");
+}
+
 TEST(ArrayOpsTest, BroadcastGradientArgs_ShapeFn) {
   ShapeInferenceTestOp op("BroadcastGradientArgs");
   // Output is always two unknown vectors.
@@ -767,4 +799,114 @@ TEST(ArrayOpsTest, Split_ShapeFn) {
   INFER_ERROR("Dimension size must be divisible by 2 but is 5", op, "?;[1,5]");
 }
 
+TEST(ArrayOpsTest, Tile_ShapeFn) {
+  ShapeInferenceTestOp op("Tile");
+  op.input_tensors.resize(2);
+
+  // No value for split_dim and no input.
+  TF_CHECK_OK(NodeDefBuilder("test", "Tile")
+                  .Input("input", 0, DT_FLOAT)
+                  .Input("multiples", 1, DT_INT32)
+                  .Finalize(&op.node_def));
+
+  // If multiples rank is unknown, output is unknown.
+  INFER_OK(op, "[2,3,1,4];?", "?");
+
+  // Bad rank for 'multiples'
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "[2,3,1,4];[4,1]");
+
+  // No multiples tensor available, but output rank is known.
+  INFER_OK(op, "[2,3,1,4];[4]", "[?,?,?,?]");
+
+  // Test a tile of a 4D input.
+  Tensor multiples = test::AsTensor<int32>({2, 3, 4, 5});
+  op.input_tensors[1] = &multiples;
+  INFER_OK(op, "[2,3,1,4];[4]", "[4,9,4,20]");
+}
+
+TEST(ArrayOpsTest, EditDistance_ShapeFn) {
+  ShapeInferenceTestOp op("EditDistance");
+  op.input_tensors.resize(6);
+
+  // If the shape tensors are not available, the output shape is unknown.
+  INFER_OK(op, "[?];[?];[4];[?];[?];[4]", "?");
+
+  Tensor hypothesis_shape = test::AsTensor<int64>({2, 30, 4, 50});
+  op.input_tensors[2] = &hypothesis_shape;
+  Tensor truth_shape = test::AsTensor<int64>({20, 3, 40, 5});
+  op.input_tensors[5] = &truth_shape;
+  INFER_OK(op, "[?];[?];[4];[?];[?];[4]", "[20,30,40]");
+
+  // Shape elements don't match
+  hypothesis_shape = test::AsTensor<int64>({2});
+  op.input_tensors[2] = &hypothesis_shape;
+  INFER_ERROR("Num elements of hypothesis_shape does not match truth_shape", op,
+              "[?];[?];[1];[?];[?];[4]");
+}
+
+TEST(ArrayOpsTest, OneHot_ShapeFn) {
+  ShapeInferenceTestOp op("OneHot");
+  op.input_tensors.resize(4);
+  auto set_axis = [&op](int axis) {
+    TF_CHECK_OK(NodeDefBuilder("test", "OneHot")
+                    .Input("indices", 0, DT_FLOAT)
+                    .Input("depth", 1, DT_INT32)
+                    .Input("on_value", 2, DT_FLOAT)
+                    .Input("off_value", 3, DT_FLOAT)
+                    .Attr("axis", axis)
+                    .Finalize(&op.node_def));
+  };
+
+  // Invalid axis value.
+  set_axis(-2);
+  INFER_ERROR("axis must be >= -1", op, "?;?;?;?");
+  set_axis(1);
+
+  // If indices shape is unknown, we return an unknown shape.
+  INFER_OK(op, "?;[];?;?", "?");
+
+  // Depth must be scalar.
+  Tensor depth = test::AsTensor<int32>({1, 2});
+  op.input_tensors[1] = &depth;
+  INFER_ERROR("Input must be scalar but has rank 1", op, "?;[2];?;?");
+
+  // Full information is available.
+  depth = test::AsScalar<int32>(2);
+  INFER_OK(op, "[1,3,4];[];?;?", "[d0_0,2,d0_1,d0_2]");
+  set_axis(-1);
+  INFER_OK(op, "[1,3,4];[];?;?", "[d0_0,d0_1,d0_2,2]");
+}
+
+TEST(NNOpsTest, ExtractImagePatchesShapeTest) {
+  ShapeInferenceTestOp op("ExtractImagePatches");
+  auto set_op = [&op](const std::vector<int32>& ksizes,
+                      const std::vector<int32>& strides,
+                      const std::vector<int32>& rates, const string& padding) {
+    TF_CHECK_OK(NodeDefBuilder("test", "ExtractImagePatches")
+                    .Input("input", 0, DT_FLOAT)
+                    .Attr("ksizes", ksizes)
+                    .Attr("strides", strides)
+                    .Attr("rates", rates)
+                    .Attr("padding", padding)
+                    .Finalize(&op.node_def));
+  };
+
+  // Just tests that the ksize calculation with rates works.  Most of
+  // the other code is boilerplate that is tested by a variety of
+  // other ops.
+  //
+  // ksizes is 2x2.  rate rows and cols is 2, so ksize_rows and
+  // cols are changed to be 2 + (2 - 1) = 3.  7x7 input with 3x3
+  // filter and 1x1 stride gives a 5x5 output.
+  set_op({1, 2, 2, 1}, {1, 1, 1, 1}, {1, 2, 2, 1}, "VALID");
+  INFER_OK(op, "[1,7,7,2]", "[d0_0,5,5,d0_3]");
+
+  // Bad ksize rank
+  set_op({1, 2, 2, 1, 1}, {1, 1, 1, 1}, {1, 2, 2, 1}, "VALID");
+  INFER_ERROR(
+      "ExtractImagePatches requires the ksizes attribute to contain 4 values, "
+      "but got: 5",
+      op, "[1,7,7,2]");
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/compat/ops_history.v0.pbtxt b/tensorflow/core/ops/compat/ops_history.v0.pbtxt
index 6c7556076a9..5f40949b95c 100644
--- a/tensorflow/core/ops/compat/ops_history.v0.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v0.pbtxt
@@ -5246,6 +5246,148 @@ op {
     }
   }
 }
+op {
+  name: "BatchSelfAdjointEig"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  deprecation {
+    version: 11
+  }
+}
+op {
+  name: "BatchSelfAdjointEigV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "e"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_v"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "BatchSvd"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "s"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "u"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_uv"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "BatchSvd"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "s"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "u"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_uv"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+}
 op {
   name: "BatchToSpace"
   input_arg {
@@ -20628,6 +20770,62 @@ op {
     }
   }
 }
+op {
+  name: "SelfAdjointEig"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  deprecation {
+    version: 11
+  }
+}
+op {
+  name: "SelfAdjointEigV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "e"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_v"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+}
 op {
   name: "SerializeManySparse"
   input_arg {
@@ -25183,6 +25381,92 @@ op {
     }
   }
 }
+op {
+  name: "Svd"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "s"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "u"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_uv"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "Svd"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "s"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "u"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_uv"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+}
 op {
   name: "Switch"
   input_arg {
diff --git a/tensorflow/core/ops/control_flow_ops.cc b/tensorflow/core/ops/control_flow_ops.cc
index 121a38d7d8a..c423c742209 100644
--- a/tensorflow/core/ops/control_flow_ops.cc
+++ b/tensorflow/core/ops/control_flow_ops.cc
@@ -15,18 +15,32 @@ limitations under the License.
 
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
 
 namespace tensorflow {
 
 using shape_inference::InferenceContext;
+using shape_inference::Shape;
 
 // --------------------------------------------------------------------------
+namespace {
+Status SwitchShape(InferenceContext* c) {
+  const Shape* unused;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+  const Shape* out = c->input(0);
+  c->set_output(0, out);
+  c->set_output(1, out);
+  return Status::OK();
+}
+}  // namespace
+
 REGISTER_OP("Switch")
     .Input("data: T")
     .Input("pred: bool")
     .Output("output_false: T")
     .Output("output_true: T")
     .Attr("T: type")
+    .SetShapeFn(SwitchShape)
     .Doc(R"doc(
 Forwards `data` to the output port determined by `pred`.
 
@@ -41,7 +55,6 @@ output_false: If `pred` is false, data will be forwarded to this output.
 output_true: If `pred` is true, data will be forwarded to this output.
 )doc");
 
-// --------------------------------------------------------------------------
 REGISTER_OP("RefSwitch")
     .Input("data: Ref(T)")
     .Input("pred: bool")
@@ -49,6 +62,7 @@ REGISTER_OP("RefSwitch")
     .Output("output_true: Ref(T)")
     .Attr("T: type")
     .SetAllowsUninitializedInput()
+    .SetShapeFn(SwitchShape)
     .Doc(R"doc(
 Forwards the ref tensor `data` to the output port determined by `pred`.
 
@@ -70,6 +84,26 @@ REGISTER_OP("RefSelect")
     .Output("output: Ref(T)")
     .Attr("T: type")
     .Attr("N: int >= 1")
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      const Shape* first_input = c->input(1);
+      if (!c->FullyDefined(first_input)) {
+        c->set_output(0, c->UnknownShape());
+        return Status::OK();
+      }
+      // If any inputs aren't fully defined or don't match, we return unknown.
+      for (int i = 2; i < c->num_inputs(); ++i) {
+        const Shape* input = c->input(i);
+        if (!c->FullyDefined(input) ||
+            !c->Merge(first_input, input, &unused).ok()) {
+          c->set_output(0, c->UnknownShape());
+          return Status::OK();
+        }
+      }
+      c->set_output(0, first_input);
+      return Status::OK();
+    })
     .Doc(R"doc(
 Forwards the `index`th element of `inputs` to `output`.
 
@@ -79,12 +113,40 @@ output: The forwarded tensor.
 )doc");
 
 // --------------------------------------------------------------------------
+namespace {
+Status MergeShape(InferenceContext* c) {
+  const Shape* out = c->input(0);
+  if (!c->RankKnown(out)) {
+    out = c->UnknownShape();
+  } else {
+    int32 rank = c->Rank(out);
+    for (int i = 1; i < c->num_inputs(); ++i) {
+      const Shape* input = c->input(i);
+      if (c->Rank(input) != rank) {
+        out = c->UnknownShape();
+        break;
+      }
+
+      for (int d = 0; d < rank; ++d) {
+        if (c->Value(c->Dim(input, d)) != c->Value(c->Dim(out, d))) {
+          TF_RETURN_IF_ERROR(c->ReplaceDim(out, d, c->UnknownDim(), &out));
+        }
+      }
+    }
+  }
+  c->set_output(0, out);
+  c->set_output(1, c->Scalar());
+  return Status::OK();
+}
+}  // namespace
+
 REGISTER_OP("Merge")
     .Input("inputs: N * T")
     .Output("output: T")
     .Output("value_index: int32")
     .Attr("T: type")
     .Attr("N: int >= 1")
+    .SetShapeFn(MergeShape)
     .Doc(R"doc(
 Forwards the value of an available tensor from `inputs` to `output`.
 
@@ -107,6 +169,7 @@ REGISTER_OP("RefMerge")
     .Output("value_index: int32")
     .Attr("T: type")
     .Attr("N: int >= 1")
+    .SetShapeFn(MergeShape)
     .Doc(R"doc(
 Forwards the value of an available tensor from `inputs` to `output`.
 
diff --git a/tensorflow/core/ops/control_flow_ops_test.cc b/tensorflow/core/ops/control_flow_ops_test.cc
new file mode 100644
index 00000000000..9aa14e27a0a
--- /dev/null
+++ b/tensorflow/core/ops/control_flow_ops_test.cc
@@ -0,0 +1,79 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference_testutil.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+TEST(ControlFlowOpsTest, Merge_ShapeFn) {
+  ShapeInferenceTestOp op("Merge");
+
+  int n = 3;
+  std::vector<NodeDefBuilder::NodeOut> src_list;
+  for (int i = 0; i < n; ++i) src_list.emplace_back("a", 0, DT_FLOAT);
+  TF_ASSERT_OK(NodeDefBuilder("test", "Merge")
+                   .Input(src_list)
+                   .Attr("N", n)
+                   .Finalize(&op.node_def));
+
+  // The second output should always be scalar.
+  // The first output should be unknown if any of the inputs are unknown, or
+  // if two inputs disagree about rank.
+  INFER_OK(op, "?;?;?", "?;[]");
+  INFER_OK(op, "[2,1];?;[2,1]", "?;[]");
+  INFER_OK(op, "[2,1];[2,1];?", "?;[]");
+  INFER_OK(op, "[2,1];[2,1];[3,1,2]", "?;[]");
+  // If inputs on rank, but disagree on specific dimensions, those dimensions
+  // should be unknown.
+  INFER_OK(op, "[2,1];[2,1];[3,1]", "[?,d0_1];[]");
+  INFER_OK(op, "[2,1];[2,2];[3,1]", "[?,?];[]");
+  // Otherwise, all inputs agree and we return the first input.
+  INFER_OK(op, "[2,1];[2,1];[2,1]", "in0;[]");
+}
+
+TEST(ControlFlowOpsTest, RefSelect_ShapeFn) {
+  ShapeInferenceTestOp op("RefSelect");
+
+  int n = 3;
+  std::vector<NodeDefBuilder::NodeOut> src_list;
+  for (int i = 0; i < n; ++i) src_list.emplace_back("a", 1, DT_FLOAT_REF);
+  TF_ASSERT_OK(NodeDefBuilder("test", "RefSelect")
+                   .Input("index", 0, DT_INT32)
+                   .Input(src_list)
+                   .Attr("N", n)
+                   .Finalize(&op.node_def));
+
+  // The first argument should be scalar.
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "[2];?;?;?");
+
+  // If any inputs aren't fully defined, we return an unknown shape.
+  INFER_OK(op, "?;?;?;?", "?");
+  INFER_OK(op, "[];?;?;?", "?");
+  INFER_OK(op, "[];[1,2,3];?;?", "?");
+  INFER_OK(op, "[];[1,2,3];[1,2,?];[1,2,3]", "?");
+  // If inputs disagree on rank or dimension, we return an unknown shape.
+  INFER_OK(op, "[];[1,2,3];[1,2];[1,2,3]", "?");
+  INFER_OK(op, "[];[1,2,3];[1,2,4];[1,2,3]", "?");
+  // Otherwise, all inputs agree and we return the first input.
+  INFER_OK(op, "[];[1,2,3];[1,2,3];[1,2,3]", "in1");
+}
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc
index 0a0aa4ef7bd..8bd806af576 100644
--- a/tensorflow/core/ops/data_flow_ops.cc
+++ b/tensorflow/core/ops/data_flow_ops.cc
@@ -32,6 +32,40 @@ REGISTER_OP("DynamicPartition")
     .Output("outputs: num_partitions * T")
     .Attr("num_partitions: int")
     .Attr("T: type")
+    .SetShapeFn([](InferenceContext* c) {
+      int64 num_partitions;
+      TF_RETURN_IF_ERROR(c->GetAttr("num_partitions", &num_partitions));
+
+      const Shape* data_shape = c->input(0);
+      const Shape* partitions_shape = c->input(1);
+
+      if (!c->RankKnown(partitions_shape)) {
+        return shape_inference::UnknownShape(c);
+      }
+
+      const int64 rank = c->Rank(partitions_shape);
+
+      // data shape must start with partitions_shape
+      const Shape* unused;
+      TF_RETURN_IF_ERROR(
+          c->MergePrefix(data_shape, partitions_shape, &unused, &unused));
+
+      // The partition shape is dynamic in the 0th dimension, and matches
+      // data_shape in the remaining dimensions.
+      const Shape* unknown_dim0 = c->MakeShape({c->UnknownDim()});
+
+      const Shape* data_suffix_shape;
+      TF_RETURN_IF_ERROR(c->Subshape(data_shape, rank, &data_suffix_shape));
+      const Shape* result_shape;
+      TF_RETURN_IF_ERROR(
+          c->Concatenate(unknown_dim0, data_suffix_shape, &result_shape));
+
+      for (int i = 0; i < c->num_outputs(); ++i) {
+        c->set_output(i, result_shape);
+      }
+
+      return Status::OK();
+    })
     .Doc(R"doc(
 Partitions `data` into `num_partitions` tensors using indices from `partitions`.
 
@@ -77,6 +111,37 @@ REGISTER_OP("DynamicStitch")
     .Output("merged: T")
     .Attr("N : int >= 2")
     .Attr("T : type")
+    .SetShapeFn([](InferenceContext* c) {
+      int64 num_partitions;
+      TF_RETURN_IF_ERROR(c->GetAttr("N", &num_partitions));
+
+      const Shape* extra_shape = c->UnknownShape();
+      for (int i = 0; i < num_partitions; ++i) {
+        const Shape* indices_shape = c->input(i);
+        const Shape* data_shape = c->input(i + num_partitions);
+        if (!c->RankKnown(indices_shape)) {
+          continue;
+        }
+
+        const int64 indices_rank = c->Rank(indices_shape);
+
+        // Assert that data_shape starts with indices_shape.
+        const Shape* unused;
+        TF_RETURN_IF_ERROR(
+            c->MergePrefix(data_shape, indices_shape, &unused, &unused));
+
+        // The rest belongs to output.
+        const Shape* rest;
+        TF_RETURN_IF_ERROR(c->Subshape(data_shape, indices_rank, &rest));
+        TF_RETURN_IF_ERROR(c->Merge(extra_shape, rest, &extra_shape));
+      }
+
+      const Shape* output_shape = c->Vector(c->UnknownDim());
+      TF_RETURN_IF_ERROR(
+          c->Concatenate(output_shape, extra_shape, &output_shape));
+      c->set_output(0, output_shape);
+      return Status::OK();
+    })
     .Doc(R"doc(
 Interleave the values from the `data` tensors into a single tensor.
 
@@ -465,9 +530,7 @@ elem: The tensor that is popped from the top of the stack.
 elem_type: The type of the elem that is popped.
 )doc");
 
-REGISTER_OP("StackClose")
-    .Input("handle: Ref(string)")
-    .Doc(R"doc(
+REGISTER_OP("StackClose").Input("handle: Ref(string)").Doc(R"doc(
 Delete the stack from its resource container.
 
 handle: The handle to a stack.
@@ -483,6 +546,12 @@ REGISTER_OP("TensorArray")
     .Attr("tensor_array_name: string = ''")
     .Output("handle: Ref(string)")
     .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      c->set_output(0, c->Vector(2));
+      return Status::OK();
+    })
     .Doc(R"doc(
 An array of Tensors of given size, with data written via Write and read
 via Read or Pack.
@@ -506,6 +575,14 @@ REGISTER_OP("TensorArrayGrad")
     .Output("grad_handle: Ref(string)")
     .Attr("source: string")
     .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* unused;
+      const Dimension* unused_dim;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim));
+      c->set_output(0, c->Vector(2));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Creates a TensorArray for storing the gradients of values in the given handle.
 
@@ -559,6 +636,15 @@ REGISTER_OP("TensorArrayWrite")
     .Input("flow_in: float")
     .Output("flow_out: float")
     .Attr("T: type")
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* unused;
+      const Dimension* unused_dim;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    })
     .Doc(R"doc(
 Push an element onto the tensor_array.
 
@@ -575,6 +661,15 @@ REGISTER_OP("TensorArrayRead")
     .Input("flow_in: float")
     .Output("value: dtype")
     .Attr("dtype: type")
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* unused;
+      const Dimension* unused_dim;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::UnknownShape(c);
+    })
     .Doc(R"doc(
 Read an element from the TensorArray into output `value`.
 
@@ -590,6 +685,14 @@ REGISTER_OP("TensorArrayPack")
     .Output("value: dtype")
     .Attr("dtype: type")
     .Attr("element_shape: shape = { unknown_rank: true }")
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* unused;
+      const Dimension* unused_dim;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::UnknownShape(c);
+    })
     .Doc(R"doc(
 Pack the elements from the TensorArray into output `value`.
 
@@ -611,6 +714,14 @@ REGISTER_OP("TensorArrayUnpack")
     .Input("flow_in: float")
     .Output("flow_out: float")
     .Attr("T: type")
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* unused;
+      const Dimension* unused_dim;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    })
     .Doc(R"doc(
 Unpack the data from the input value into TensorArray elements.
 
@@ -627,6 +738,16 @@ REGISTER_OP("TensorArrayConcat")
     .Output("lengths: int64")
     .Attr("dtype: type")
     .Attr("element_shape_except0: shape = { unknown_rank: true }")
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* unused;
+      const Dimension* unused_dim;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      c->set_output(0, c->UnknownShape());
+      c->set_output(1, c->Vector(c->UnknownDim()));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Concat the elements from the TensorArray into value `value`.
 
@@ -663,6 +784,15 @@ REGISTER_OP("TensorArraySplit")
     .Input("flow_in: float")
     .Output("flow_out: float")
     .Attr("T: type")
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* unused;
+      const Dimension* unused_dim;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    })
     .Doc(R"doc(
 Split the data from the input value into TensorArray elements.
 
@@ -696,6 +826,13 @@ REGISTER_OP("TensorArraySize")
     .Input("handle: Ref(string)")
     .Input("flow_in: float")
     .Output("size: int32")
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* unused;
+      const Dimension* unused_dim;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim));
+      return shape_inference::ScalarShape(c);
+    })
     .Doc(R"doc(
 Get the current size of the TensorArray.
 
@@ -706,6 +843,13 @@ size: The current size of the TensorArray.
 
 REGISTER_OP("TensorArrayClose")
     .Input("handle: Ref(string)")
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* unused;
+      const Dimension* unused_dim;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Delete the TensorArray from its resource container.  This enables
 the user to close and release the resource in the middle of a step/run.
@@ -755,6 +899,16 @@ REGISTER_OP("BarrierInsertMany")
     .Input("values: T")
     .Attr("T: type")
     .Attr("component_index: int")
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* keys = c->input(1);
+      const Shape* values = c->input(2);
+      const Shape* unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(keys, 1, &keys));
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(values, 1, &values));
+      TF_RETURN_IF_ERROR(c->Merge(keys, c->Vector(c->Dim(values, 0)), &unused));
+      return Status::OK();
+    })
     .Doc(R"doc(
 For each key, assigns the respective value to the specified component.
 
@@ -954,6 +1108,7 @@ REGISTER_OP("HashTable")
     .Attr("key_dtype: type")
     .Attr("value_dtype: type")
     .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape)
     .Doc(R"doc(
 Creates a non-initialized hash table.
 
@@ -977,6 +1132,7 @@ REGISTER_OP("MutableHashTable")
     .Attr("key_dtype: type")
     .Attr("value_dtype: type")
     .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape)
     .Doc(R"doc(
 Creates an empty hash table.
 
@@ -1001,6 +1157,7 @@ REGISTER_OP("MutableHashTableOfTensors")
     .Attr("value_dtype: type")
     .Attr("value_shape: shape = {}")
     .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape)
     .Doc(R"doc(
 Creates an empty hash table.
 
@@ -1100,9 +1257,7 @@ value: The tensor for the given handle.
 dtype: The type of the output value.
 )doc");
 
-REGISTER_OP("DeleteSessionTensor")
-    .Input("handle: string")
-    .Doc(R"doc(
+REGISTER_OP("DeleteSessionTensor").Input("handle: string").Doc(R"doc(
 Delete the tensor specified by its handle in the session.
 
 handle: The handle for a tensor stored in the session state.
diff --git a/tensorflow/core/ops/data_flow_ops_test.cc b/tensorflow/core/ops/data_flow_ops_test.cc
index e1f815a2520..d00c989f4b1 100644
--- a/tensorflow/core/ops/data_flow_ops_test.cc
+++ b/tensorflow/core/ops/data_flow_ops_test.cc
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference_testutil.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -90,4 +92,52 @@ TEST(MathOpsTest, InitializeTableFromTextFile) {
   INFER_ERROR("Shape must be rank 0 but is rank 1", op, "[];[1]");
 }
 
+TEST(MathOpsTest, DynamicPartition) {
+  ShapeInferenceTestOp op("DynamicPartition");
+  TF_ASSERT_OK(NodeDefBuilder("test", "DynamicPartition")
+                   .Input("data", 0, DT_FLOAT_REF)
+                   .Input("indices", 0, DT_INT32)
+                   .Attr("num_partitions", 4)
+                   .Finalize(&op.node_def));
+
+  // Unknown rank for indices, so unknown shape.
+  INFER_OK(op, "?;?", "?;?;?;?");
+
+  // 3 dimensional data, 2 dimensional indices.
+  INFER_OK(op, "[3,4,5];[3,4]", "[?,d0_2];[?,d0_2];[?,d0_2];[?,d0_2]");
+
+  TF_ASSERT_OK(NodeDefBuilder("test", "DynamicPartition")
+                   .Input("data", 0, DT_FLOAT)
+                   .Input("indices", 0, DT_INT32)
+                   .Attr("num_partitions", 2)
+                   .Finalize(&op.node_def));
+
+  // Suffix after matching prefix is copied over.
+  INFER_OK(op, "[3,4,5,6];[3,4]", "[?,d0_2,d0_3];[?,d0_2,d0_3]");
+
+  // Does not start with proper prefix
+  INFER_ERROR("Dimensions must be equal, but are 4 and 100", op,
+              "[3,4,5];[3,100]");
+}
+
+TEST(MathOpsTest, DynamicStitch) {
+  ShapeInferenceTestOp op("DynamicStitch");
+  TF_ASSERT_OK(
+      NodeDefBuilder("test", "DynamicStitch")
+          .Input({{"indices", 0, DT_INT32}, {"indices_2", 1, DT_INT32}})
+          .Input({{"data", 0, DT_FLOAT}, {"data_2", 1, DT_FLOAT}})
+          .Attr("N", 2)
+          .Finalize(&op.node_def));
+
+  INFER_OK(op, "[2,3];[5,6];[2,3,4,5];[5,6,4,5]", "[?,d2_2,d2_3]");
+
+  // Bad prefix for the second data input.
+  INFER_ERROR("Dimensions must be equal, but are 10 and 5", op,
+              "[2,3];[5,6];[2,3,4,5];[10,11,4,5]");
+
+  // Inconsistent suffix dimensions
+  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 4 and 13", op,
+              "[2,3];[5,6];[2,3,4,5];[5,6,13,14]");
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index 18869205971..5a55493517b 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -177,6 +177,10 @@ REGISTER_OP("ResizeBilinearGrad")
     .Output("output: T")
     .Attr("T: {float, half, double}")
     .Attr("align_corners: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->input(1));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Computes the gradient of bilinear interpolation.
 
@@ -219,6 +223,27 @@ REGISTER_OP("ResizeNearestNeighborGrad")
     .Output("output: T")
     .Attr("T: {uint8, int8, int32, half, float, double}")
     .Attr("align_corners: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input));
+      const Shape* unused;
+      const Dimension* unused_dim;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(unused, 0), 2, &unused_dim));
+      const Tensor* size = c->input_tensor(1);
+      if (size == nullptr) {
+        TF_RETURN_IF_ERROR(c->ReplaceDim(input, 1, c->UnknownDim(), &input));
+        TF_RETURN_IF_ERROR(c->ReplaceDim(input, 2, c->UnknownDim(), &input));
+      } else {
+        auto size_vec = size->vec<int32>();
+        TF_RETURN_IF_ERROR(
+            c->ReplaceDim(input, 1, c->MakeDim(size_vec(0)), &input));
+        TF_RETURN_IF_ERROR(
+            c->ReplaceDim(input, 2, c->MakeDim(size_vec(1)), &input));
+      }
+      c->set_output(0, input);
+      return Status::OK();
+    })
     .Doc(R"doc(
 Computes the gradient of nearest neighbor interpolation.
 
@@ -771,6 +796,13 @@ REGISTER_OP("CropAndResizeGradImage")
     .Output("output: T")
     .Attr("T: {float, half, double}")
     .Attr("method: {'bilinear'} = 'bilinear'")
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(3, &out));
+      TF_RETURN_IF_ERROR(c->WithRank(out, 4, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    })
     .Doc(R"doc(
 Computes the gradient of the crop_and_resize op wrt the input image tensor.
 
@@ -803,6 +835,10 @@ REGISTER_OP("CropAndResizeGradBoxes")
     .Output("output: float")
     .Attr("T: {uint8, int8, int16, int32, int64, half, float, double}")
     .Attr("method: {'bilinear'} = 'bilinear'")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->input(2));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
 
@@ -834,6 +870,10 @@ REGISTER_OP("NonMaxSuppression")
     .Input("max_output_size: int32")
     .Output("selected_indices: int32")
     .Attr("iou_threshold: float = 0.5")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Vector(c->UnknownDim()));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Greedily selects a subset of bounding boxes in descending order of score,
 pruning away boxes that have high intersection-over-union (IOU) overlap
diff --git a/tensorflow/core/ops/image_ops_test.cc b/tensorflow/core/ops/image_ops_test.cc
index 3cb33fe889b..fc9640ffb40 100644
--- a/tensorflow/core/ops/image_ops_test.cc
+++ b/tensorflow/core/ops/image_ops_test.cc
@@ -160,4 +160,37 @@ TEST(ImageOpsTest, CropAndResize_ShapeFn) {
   INFER_ERROR("Dimension must be 4 but is 3", op, "?;[?,3];?;?");
 }
 
+TEST(ImageOpsTest, ResizeNearestNeighborGrad_ShapeFn) {
+  ShapeInferenceTestOp op("ResizeNearestNeighborGrad");
+  op.input_tensors.resize(2);
+
+  // Rank and size checks.
+  INFER_ERROR("Shape must be rank 4 but is rank 3", op, "[1,2,3];?");
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "?;[1,2]")
+  INFER_ERROR("Dimension must be 2 but is 1", op, "?;[1]");
+
+  // When the size tensor is not a constant, the middle dims are unknown.
+  INFER_OK(op, "[1,?,3,?];[2]", "[d0_0,?,?,d0_3]");
+
+  Tensor size_tensor = test::AsTensor<int32>({20, 30});
+  op.input_tensors[1] = &size_tensor;
+  INFER_OK(op, "[1,?,3,?];[2]", "[d0_0,20,30,d0_3]");
+}
+
+TEST(ImageOpsTest, CropAndResizeGradImage_ShapeFn) {
+  ShapeInferenceTestOp op("CropAndResizeGradImage");
+  op.input_tensors.resize(4);
+
+  // Rank checks.
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "?;?;?;[1,2]");
+
+  // Unknown image_size should result in output of rank 4 with unknown dims.
+  INFER_OK(op, "?;?;?;?", "[?,?,?,?]");
+
+  // Known image_size should result in full shape information.
+  Tensor image_size = test::AsTensor<int32>({10, 20, 30, 40});
+  op.input_tensors[3] = &image_size;
+  INFER_OK(op, "?;?;?;[1]", "[10, 20, 30, 40]");
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/linalg_ops.cc b/tensorflow/core/ops/linalg_ops.cc
index ab4b2644b24..54b8e22b7ee 100644
--- a/tensorflow/core/ops/linalg_ops.cc
+++ b/tensorflow/core/ops/linalg_ops.cc
@@ -115,6 +115,111 @@ Status BatchMatrixSolveShapeFn(InferenceContext* c, bool square) {
   return Status::OK();
 }
 
+Status BatchSvdShapeHelperFn(InferenceContext* c, const Shape* input) {
+  const Dimension* m = c->Dim(input, -2);
+  const Dimension* n = c->Dim(input, -1);
+  const Dimension* p;
+  TF_RETURN_IF_ERROR(c->Min(m, n, &p));
+  const Shape* batch_shape;
+  TF_RETURN_IF_ERROR(c->Subshape(input, 0, -2, &batch_shape));
+  const Shape* e_shape;
+  TF_RETURN_IF_ERROR(c->Concatenate(batch_shape, c->Vector(p), &e_shape));
+  c->set_output(0, e_shape);
+  bool compute_uv;
+  TF_RETURN_IF_ERROR(c->GetAttr("compute_uv", &compute_uv));
+  if (compute_uv) {
+    const Shape* u_shape;
+    const Shape* v_shape;
+    bool full_matrices;
+    TF_RETURN_IF_ERROR(c->GetAttr("full_matrices", &full_matrices));
+    if (full_matrices) {
+      TF_RETURN_IF_ERROR(
+          c->Concatenate(batch_shape, c->Matrix(m, m), &u_shape));
+      TF_RETURN_IF_ERROR(
+          c->Concatenate(batch_shape, c->Matrix(n, n), &v_shape));
+    } else {
+      TF_RETURN_IF_ERROR(
+          c->Concatenate(batch_shape, c->Matrix(m, p), &u_shape));
+      TF_RETURN_IF_ERROR(
+          c->Concatenate(batch_shape, c->Matrix(n, p), &v_shape));
+    }
+    c->set_output(1, u_shape);
+    c->set_output(2, v_shape);
+  } else {
+    c->set_output(1, c->Vector(0ll));
+    c->set_output(2, c->Vector(0ll));
+  }
+  return Status::OK();
+}
+
+// Input is [M,N].  First output is [min(M,N)].
+// Second and third outputs are:
+//   [0]; [0], if compute_uv is false.
+//   [M,M]; [N,N], if compute_uv is true and full_matrices is true,
+//   [M,P]; [N,P], if compute_uv is true and full_matrices is false,
+// where P = min(M,N).
+Status SvdShapeFn(InferenceContext* c) {
+  const Shape* input;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
+  return BatchSvdShapeHelperFn(c, input);
+}
+
+// Input is [...,M,N].  First output is [...,min(M,N)].
+// Second and third outputs are:
+//   [0]; [0], if compute_uv is false.
+//   [...,M,M]; [...,N,N], if compute_uv is true and full_matrices is true,
+//   [...,M,P]; [...,N,P], if compute_uv is true and full_matrices is false,
+// where P = min(M,N).
+Status BatchSvdShapeFn(InferenceContext* c) {
+  const Shape* input;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 2, &input));
+  return BatchSvdShapeHelperFn(c, input);
+}
+
+// Input is [N,N]. Outputs are:
+//   [N];[0], if compute_v is false,
+//   [N];[N,N], if compute_v is true.
+Status SelfAdjointEigV2ShapeFn(InferenceContext* c) {
+  const Shape* input;
+  TF_RETURN_IF_ERROR(MakeSquareMatrix(c, c->input(0), &input));
+  const Dimension* n;
+  TF_RETURN_IF_ERROR(c->Merge(c->Dim(input, 0), c->Dim(input, 1), &n));
+  c->set_output(0, c->Vector(n));
+  bool compute_v;
+  TF_RETURN_IF_ERROR(c->GetAttr("compute_v", &compute_v));
+  if (compute_v) {
+    c->set_output(1, c->Matrix(n, n));
+  } else {
+    c->set_output(1, c->Vector(0ll));
+  }
+  return Status::OK();
+}
+
+// Input is [...,N,N]. Outputs are:
+//   [...,N];[0], if compute_v is false,
+//   [...,N];[...,N,N], if compute_v is true.
+Status BatchSelfAdjointEigV2ShapeFn(InferenceContext* c) {
+  const Shape* input;
+  TF_RETURN_IF_ERROR(MakeBatchSquareMatrix(c, c->input(0), &input));
+  const Dimension* n;
+  TF_RETURN_IF_ERROR(c->Merge(c->Dim(input, -2), c->Dim(input, -1), &n));
+  const Shape* batch_shape;
+  TF_RETURN_IF_ERROR(c->Subshape(input, 0, -2, &batch_shape));
+  const Shape* e_shape;
+  TF_RETURN_IF_ERROR(c->Concatenate(batch_shape, c->Vector(n), &e_shape));
+  c->set_output(0, e_shape);
+  bool compute_v;
+  TF_RETURN_IF_ERROR(c->GetAttr("compute_v", &compute_v));
+  if (compute_v) {
+    const Shape* v_shape;
+    TF_RETURN_IF_ERROR(c->Concatenate(batch_shape, c->Matrix(n, n), &v_shape));
+    c->set_output(1, v_shape);
+  } else {
+    c->set_output(1, c->Vector(0ll));
+  }
+  return Status::OK();
+}
+
 }  // namespace
 
 REGISTER_OP("MatrixDeterminant")
@@ -128,7 +233,7 @@ REGISTER_OP("MatrixDeterminant")
       return Status::OK();
     })
     .Doc(R"doc(
-Calculates the determinant of a square matrix.
+Computes the determinant of a square matrix.
 
 input: A tensor of shape `[M, M]`.
 output: A scalar, equal to the determinant of the input.
@@ -152,7 +257,7 @@ REGISTER_OP("BatchMatrixDeterminant")
       return Status::OK();
     })
     .Doc(R"doc(
-Calculates the determinants for a batch of square matrices.
+Computes the determinants for a batch of square matrices.
 
 The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
 form square matrices. The output is a tensor containing the determinants
@@ -169,7 +274,7 @@ REGISTER_OP("MatrixInverse")
     .Attr("T: {double, float}")
     .SetShapeFn(UnchangedSquareShapeFn)
     .Doc(R"doc(
-Calculates the inverse of a square invertible matrix or its adjoint (conjugate
+Computes the inverse of a square invertible matrix or its adjoint (conjugate
 transpose).
 
 The op uses LU decomposition with partial pivoting to compute the inverse.
@@ -191,7 +296,7 @@ REGISTER_OP("BatchMatrixInverse")
     .Attr("T: {double, float}")
     .SetShapeFn(BatchUnchangedSquareShapeFn)
     .Doc(R"doc(
-Calculates the inverse of square invertible matrices or their adjoints
+Computes the inverse of square invertible matrices or their adjoints
 (conjugate transposes).
 
 The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
@@ -214,7 +319,7 @@ REGISTER_OP("Cholesky")
     .Attr("T: {double, float}")
     .SetShapeFn(UnchangedSquareShapeFn)
     .Doc(R"doc(
-Calculates the Cholesky decomposition of a square matrix.
+Computes the Cholesky decomposition of a square matrix.
 
 The input has to be symmetric and positive definite. Only the lower-triangular
 part of the input will be used for this operation. The upper-triangular part
@@ -233,7 +338,7 @@ REGISTER_OP("BatchCholesky")
     .Attr("T: {double, float}")
     .SetShapeFn(BatchUnchangedSquareShapeFn)
     .Doc(R"doc(
-Calculates the Cholesky decomposition of a batch of square matrices.
+Computes the Cholesky decomposition of a batch of square matrices.
 
 The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
 form square matrices, with the same constraints as the single matrix Cholesky
@@ -251,16 +356,16 @@ REGISTER_OP("CholeskyGrad")
     .Attr("T: {float, double}")
     .SetShapeFn(UnchangedSquareShapeFn)
     .Doc(R"doc(
-Calculates the reverse mode backpropagated gradient of the Cholesky algorithm.
+Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
 
 For an explanation see "Differentiation of the Cholesky algorithm" by
 Iain Murray http://arxiv.org/abs/1602.07527.
 
 l: Output of Cholesky algorithm l = chol(A). Shape is `[M, M]`.
   Algorithm depends only on lower triangular part of this matrix.
-grad: df/dl where f is some scalar function. Shape is `[M, M]'.
+grad: df/dl where f is some scalar function. Shape is `[M, M]`.
   Algorithm depends only on lower triangular part of this matrix.
-output: Symmetrized version of df/dA . Shape is `[M, M]'.
+output: Symmetrized version of df/dA . Shape is `[M, M]`.
 )doc");
 
 REGISTER_OP("BatchCholeskyGrad")
@@ -270,7 +375,7 @@ REGISTER_OP("BatchCholeskyGrad")
     .Attr("T: {float, double}")
     .SetShapeFn(BatchUnchangedSquareShapeFn)
     .Doc(R"doc(
-Calculates the reverse mode backpropagated gradient of the Cholesky algorithm.
+Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
 
 For an explanation see "Differentiation of the Cholesky algorithm" by
 Iain Murray http://arxiv.org/abs/1602.07527.
@@ -278,16 +383,17 @@ Iain Murray http://arxiv.org/abs/1602.07527.
 l: Output of batch Cholesky algorithm l = batch_cholesky(A). Shape is `[..., M, M]`.
   Algorithm depends only on lower triangular part of the innermost matrices of
   this tensor.
-grad: df/dl where f is some scalar function. Shape is `[..., M, M]'.
+grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
   Algorithm depends only on lower triangular part of the innermost matrices of
   this tensor.
-output: Symmetrized version of df/dA . Shape is `[..., M, M]'
+output: Symmetrized version of df/dA . Shape is `[..., M, M]`
 )doc");
 
 REGISTER_OP("SelfAdjointEig")
     .Input("input: T")
     .Output("output: T")
     .Attr("T: {double, float}")
+    .Deprecated(11, "Use SelfAdjointEigV2 instead.")
     .SetShapeFn([](InferenceContext* c) {
       const Shape* input;
       TF_RETURN_IF_ERROR(MakeSquareMatrix(c, c->input(0), &input));
@@ -299,7 +405,7 @@ REGISTER_OP("SelfAdjointEig")
       return Status::OK();
     })
     .Doc(R"doc(
-Calculates the Eigen Decomposition of a square Self-Adjoint matrix.
+Computes the Eigen Decomposition of a square Self-Adjoint matrix.
 
 Only the lower-triangular part of the input will be used in this case. The
 upper-triangular part will not be read.
@@ -315,6 +421,7 @@ REGISTER_OP("BatchSelfAdjointEig")
     .Input("input: T")
     .Output("output: T")
     .Attr("T: {double, float}")
+    .Deprecated(11, "Use BatchSelfAdjointEigV2 instead.")
     .SetShapeFn([](InferenceContext* c) {
       const Shape* input;
       TF_RETURN_IF_ERROR(MakeBatchSquareMatrix(c, c->input(0), &input));
@@ -330,19 +437,75 @@ REGISTER_OP("BatchSelfAdjointEig")
       return Status::OK();
     })
     .Doc(R"doc(
-Calculates the Eigen Decomposition of a batch of square self-adjoint matrices.
+Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
 
 The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
 form square matrices, with the same constraints as the single matrix
 SelfAdjointEig.
 
-The result is a '[..., M+1, M] matrix with [..., 0,:] containing the
+The result is a [..., M+1, M] matrix with [..., 0,:] containing the
 eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
 
 input: Shape is `[..., M, M]`.
 output: Shape is `[..., M+1, M]`.
 )doc");
 
+REGISTER_OP("SelfAdjointEigV2")
+    .Input("input: T")
+    .Output("e: T")
+    .Output("v: T")
+    .Attr("compute_v: bool = True")
+    .Attr("T: {double, float}")
+    .SetShapeFn(SelfAdjointEigV2ShapeFn)
+    .Doc(R"doc(
+Computes the eigen decomposition of a self-adjoint (\"symmetric\") matrix.
+
+Computes the eigenvalues and (optionally) eigenvectors such that
+`input = v * diag(e)`.
+
+```prettyprint
+# a is a self-adjoint matrix.
+# e is a vector of eigenvalues.
+# v is a matrix of eigenvectors.
+e, v = self_adjoint_eig(a)
+e = self_adjoint_eig(a, compute_v=False)
+```
+
+input: `Tensor` input of shape `[N, N]`.
+compute_v: If `True` then eigenvectors will be computed and returned in `v`.
+  Otherwise, only the eigenvalues will be computed.
+e: Eigenvalues. Shape is `[N]`.
+v: Eigenvectors. Shape is `[N, N]`.
+)doc");
+
+REGISTER_OP("BatchSelfAdjointEigV2")
+    .Input("input: T")
+    .Output("e: T")
+    .Output("v: T")
+    .Attr("compute_v: bool = True")
+    .Attr("T: {double, float}")
+    .SetShapeFn(BatchSelfAdjointEigV2ShapeFn)
+    .Doc(R"doc(
+Computes the eigen decomposition of a batch of square self-adjoint matrices.
+
+Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
+`input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.
+
+```prettyprint
+# a is a tensor.
+# e is a tensor of eigenvalues.
+# v is a tensor of eigenvectors.
+e, v = batch_self_adjoint_eig(a)
+e = batch_self_adjoint_eig(a, compute_v=False)
+```
+
+input: `Tensor` input of shape `[N, N]`.
+compute_v: If `True` then eigenvectors will be computed and returned in `v`.
+  Otherwise, only the eigenvalues will be computed.
+e: Eigenvalues. Shape is `[N]`.
+v: Eigenvectors. Shape is `[N, N]`.
+)doc");
+
 REGISTER_OP("MatrixSolve")
     .Input("matrix: T")
     .Input("rhs: T")
@@ -526,10 +689,10 @@ REGISTER_OP("BatchMatrixSolveLs")
 Solves multiple linear least-squares problems.
 
 `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-form square matrices. Rhs is a tensor of shape `[..., M, K]`. The output
-is a tensor shape `[..., N, K]` where each output matrix solves each of
-the equations matrix[..., :, :] * output[..., :, :] = rhs[..., :, :] in the
-least squares sense.
+form matrices of size `[M, N]`. Rhs is a tensor of shape `[..., M, K]`.
+The output is a tensor shape `[..., N, K]` where each output matrix solves
+each of the equations matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]
+in the least squares sense.
 
 Below we will use the following notation for each pair of
 matrix and right-hand sides in the batch:
@@ -563,4 +726,84 @@ rhs: Shape is `[..., M, K]`.
 output: Shape is `[..., N, K]`.
 )doc");
 
+REGISTER_OP("Svd")
+    .Input("input: T")
+    .Output("s: T")
+    .Output("u: T")
+    .Output("v: T")
+    .Attr("compute_uv: bool = True")
+    .Attr("full_matrices: bool = False")
+    .Attr("T: {double, float}")
+    .SetShapeFn(SvdShapeFn)
+    .Doc(R"doc(
+Computes the singular value decomposition of a matrix.
+
+Computes the SVD of if `input` such that `input = u * diag(s) * transpose(v)`
+
+```prettyprint
+# a is a matrix.
+# s is a vector of singular values.
+# u is the matrix of left singular vectors.
+# v is a matrix of right singular vectors.
+s, u, v = svd(a)
+s, _, _ = svd(a, compute_uv=False)
+```
+
+input: Shape is `[M, N]`. Let `P` be the minimum of `M` and `N`.
+s: Singular values. Shape is `[P]`.
+u: Left singular vectors; if `full_matrices` is `False` then shape is `[M, M]`.
+  If `full_matrices` is `True` then shape is `[M, P]`.
+  Undefined if `compute_uv` is `False`.
+v: Left singular vectors. If `full_matrices` is `False` then shape is `[N, N]`.
+  If `full_matrices` is `True` then shape is `[N, P]`.
+  Undefined if `compute_uv` is false.
+compute_uv: If true, left and right singular vectors will be
+  computed and returned in `u` and `v`, respectively.
+  If false, `u` and `v` are not set and should never referenced.
+full_matrices: If true, compute full-sized `u` and `v`. If false
+  (the default), compute only the leading `P` singular vectors.
+  Ignored if `compute_uv` is `False`.
+)doc");
+
+REGISTER_OP("BatchSvd")
+    .Input("input: T")
+    .Output("s: T")
+    .Output("u: T")
+    .Output("v: T")
+    .Attr("compute_uv: bool = True")
+    .Attr("full_matrices: bool = False")
+    .Attr("T: {double, float}")
+    .SetShapeFn(BatchSvdShapeFn)
+    .Doc(R"doc(
+Computes the singular value decompositions of a batch of matrices.
+
+Computes the SVD of each inner matrix in `input` such that
+`input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
+
+```prettyprint
+# a is a tensor containing a batch of matrices.
+# s is a tensor of singular values for each matrix.
+# u is the tensor containing of left singular vectors for each matrix.
+# v is the tensor containing of right singular vectors for each matrix.
+s, u, v = batch_svd(a)
+s, _, _ = batch_svd(a, compute_uv=False)
+```
+
+input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+  form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+s: Singular values. Shape is `[..., P]`.
+u: Left singular vectors. If `full_matrices` is `False` then shape is
+  `[..., M, M]`; if `full_matrices` is `True` then shape is
+  `[..., M, P]`. Undefined if `compute_uv` is `False`.
+v: Left singular vectors. If `full_matrices` is `False` then shape is
+  `[..., N, N]`. If `full_matrices` is `True` then shape is `[..., N, P]`.
+  Undefined if `compute_uv` is false.
+compute_uv: If true, left and right singular vectors will be
+  computed and returned in `u` and `v`, respectively.
+  If false, `u` and `v` are not set and should never referenced.
+full_matrices: If true, compute full-sized `u` and `v`. If false
+  (the default), compute only the leading `P` singular vectors.
+  Ignored if `compute_uv` is `False`.
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/linalg_ops_test.cc b/tensorflow/core/ops/linalg_ops_test.cc
index 84e888bb9c9..6414db13a41 100644
--- a/tensorflow/core/ops/linalg_ops_test.cc
+++ b/tensorflow/core/ops/linalg_ops_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference_testutil.h"
 #include "tensorflow/core/platform/test.h"
@@ -112,6 +113,70 @@ TEST(LinalgOpsTest, BatchSelfAdjointEig_ShapeFn) {
   INFER_OK(op, "[5,?,7,?,1]", "[d0_0,d0_1,d0_2,2,d0_4]");
 }
 
+TEST(LinalgOpsTest, SelfAdjointEigV2_ShapeFn) {
+  ShapeInferenceTestOp op("SelfAdjointEigV2");
+  auto set_compute_v = [&op](bool compute_v) {
+    TF_CHECK_OK(NodeDefBuilder("test", "Pack")
+                    .Input({{"input", 0, DT_FLOAT}})
+                    .Attr("compute_v", compute_v)
+                    .Finalize(&op.node_def));
+  };
+  set_compute_v(false);
+  INFER_OK(op, "?", "[?];[0]");
+  INFER_OK(op, "[?,?]", "[d0_0|d0_1];[0]");
+  INFER_OK(op, "[1,?]", "[d0_0|d0_1];[0]");
+  INFER_OK(op, "[?,1]", "[d0_0|d0_1];[0]");
+  INFER_ERROR("Shape must be rank 2 but is rank 1", op, "[1]");
+  INFER_ERROR("Dimensions must be equal, but are 1 and 2", op, "[1,2]");
+
+  set_compute_v(true);
+  INFER_OK(op, "?", "[?];[?,?]");
+  INFER_OK(op, "[?,?]", "[d0_0|d0_1];[d0_0|d0_1,d0_0|d0_1]");
+  INFER_OK(op, "[1,?]", "[d0_0|d0_1];[d0_0|d0_1,d0_0|d0_1]");
+  INFER_OK(op, "[?,1]", "[d0_0|d0_1];[d0_0|d0_1,d0_0|d0_1]");
+  INFER_ERROR("Shape must be rank 2 but is rank 1", op, "[1]");
+  INFER_ERROR("Dimensions must be equal, but are 1 and 2", op, "[1,2]");
+}
+
+TEST(LinalgOpsTest, BatchSelfAdjointEigV2_ShapeFn) {
+  ShapeInferenceTestOp op("BatchSelfAdjointEigV2");
+  auto set_compute_v = [&op](bool compute_v) {
+    TF_CHECK_OK(NodeDefBuilder("test", "Pack")
+                    .Input({{"input", 0, DT_FLOAT}})
+                    .Attr("compute_v", compute_v)
+                    .Finalize(&op.node_def));
+  };
+
+  set_compute_v(false);
+  INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]");
+  INFER_ERROR("Dimensions must be equal, but are 1 and 2", op, "[1,2]");
+  INFER_ERROR("Dimensions must be equal, but are 1 and 2", op, "[3,1,2]");
+
+  INFER_OK(op, "?", "?;[0]");
+  INFER_OK(op, "[?,?]", "[d0_0|d0_1];[0]");
+  INFER_OK(op, "[1,?]", "[d0_0|d0_1];[0]");
+  INFER_OK(op, "[?,1]", "[d0_0|d0_1];[0]");
+
+  // Repeat previous block of tests with input rank > 2.
+  INFER_OK(op, "[5,?,7,?,?]", "[d0_0,d0_1,d0_2,d0_3|d0_4];[0]");
+  INFER_OK(op, "[5,?,7,1,?]", "[d0_0,d0_1,d0_2,d0_3|d0_4];[0]");
+  INFER_OK(op, "[5,?,7,?,1]", "[d0_0,d0_1,d0_2,d0_3|d0_4];[0]");
+
+  set_compute_v(true);
+  INFER_OK(op, "?", "?;?");
+  INFER_OK(op, "[?,?]", "[d0_0|d0_1];[d0_0|d0_1,d0_0|d0_1]");
+  INFER_OK(op, "[1,?]", "[d0_0|d0_1];[d0_0|d0_1,d0_0|d0_1]");
+  INFER_OK(op, "[?,1]", "[d0_0|d0_1];[d0_0|d0_1,d0_0|d0_1]");
+
+  // Repeat previous block of tests with input rank > 2.
+  INFER_OK(op, "[5,?,7,?,?]",
+           "[d0_0,d0_1,d0_2,d0_3|d0_4];[d0_0,d0_1,d0_2,d0_3|d0_4,d0_3|d0_4]");
+  INFER_OK(op, "[5,?,7,1,?]",
+           "[d0_0,d0_1,d0_2,d0_3|d0_4];[d0_0,d0_1,d0_2,d0_3|d0_4,d0_3|d0_4]");
+  INFER_OK(op, "[5,?,7,?,1]",
+           "[d0_0,d0_1,d0_2,d0_3|d0_4];[d0_0,d0_1,d0_2,d0_3|d0_4,d0_3|d0_4]");
+}
+
 TEST(LinalgOpsTest, SquareMatrixSolve_ShapeFn) {
   for (const char* op_name : {"MatrixSolve", "MatrixTriangularSolve"}) {
     ShapeInferenceTestOp op(op_name);
@@ -200,4 +265,100 @@ TEST(LinalgOpsTest, BatchMatrixSolveLs_ShapeFn) {
   INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "?;[1];?");
 }
 
+TEST(LinalgOpsTest, Svd_ShapeFn) {
+  ShapeInferenceTestOp op("Svd");
+  auto set_attrs = [&op](bool compute_uv, bool full_matrices) {
+    TF_CHECK_OK(NodeDefBuilder("test", "Svd")
+                    .Input({"input", 0, DT_FLOAT})
+                    .Attr("compute_uv", compute_uv)
+                    .Attr("full_matrices", full_matrices)
+                    .Finalize(&op.node_def));
+  };
+
+  set_attrs(false, false);
+  INFER_OK(op, "?", "[?];[0];[0]");
+  INFER_OK(op, "[?,?]", "[?];[0];[0]");
+  INFER_OK(op, "[2,?]", "[?];[0];[0]");
+  INFER_OK(op, "[?,2]", "[?];[0];[0]");
+  INFER_OK(op, "[2,2]", "[d0_0];[0];[0]");
+  INFER_OK(op, "[3,2]", "[d0_1];[0];[0]");
+  INFER_OK(op, "[2,3]", "[d0_0];[0];[0]");
+  INFER_ERROR("Shape must be rank 2 but is rank 1", op, "[1]");
+  INFER_ERROR("Shape must be rank 2 but is rank 3", op, "[1,2,3]");
+
+  set_attrs(true, false);
+  INFER_OK(op, "?", "[?];[?,?];[?,?]");
+  INFER_OK(op, "[?,?]", "[?];[d0_0,?];[d0_1,?]");
+  INFER_OK(op, "[2,?]", "[?];[d0_0,?];[d0_1,?]");
+  INFER_OK(op, "[?,2]", "[?];[d0_0,?];[d0_1,?]");
+  INFER_OK(op, "[2,2]", "[d0_0];[d0_0,d0_0];[d0_1,d0_0]");
+  INFER_OK(op, "[3,2]", "[d0_1];[d0_0,d0_1];[d0_1,d0_1]");
+  INFER_OK(op, "[2,3]", "[d0_0];[d0_0,d0_0];[d0_1,d0_0]");
+  INFER_ERROR("Shape must be rank 2 but is rank 1", op, "[1]");
+  INFER_ERROR("Shape must be rank 2 but is rank 3", op, "[1,2,3]");
+
+  set_attrs(true, true);
+  INFER_OK(op, "?", "[?];[?,?];[?,?]");
+  INFER_OK(op, "[?,?]", "[?];[d0_0,d0_0];[d0_1,d0_1]");
+  INFER_OK(op, "[2,?]", "[?];[d0_0,d0_0];[d0_1,d0_1]");
+  INFER_OK(op, "[?,2]", "[?];[d0_0,d0_0];[d0_1,d0_1]");
+  INFER_OK(op, "[2,2]", "[d0_0];[d0_0,d0_0];[d0_1,d0_1]");
+  INFER_OK(op, "[3,2]", "[d0_1];[d0_0,d0_0];[d0_1,d0_1]");
+  INFER_OK(op, "[2,3]", "[d0_0];[d0_0,d0_0];[d0_1,d0_1]");
+  INFER_ERROR("Shape must be rank 2 but is rank 1", op, "[1]");
+  INFER_ERROR("Shape must be rank 2 but is rank 3", op, "[1,2,3]");
+}
+
+TEST(LinalgOpsTest, BatchSvd_ShapeFn) {
+  ShapeInferenceTestOp op("BatchSvd");
+  auto set_attrs = [&op](bool compute_uv, bool full_matrices) {
+    TF_CHECK_OK(NodeDefBuilder("test", "BatchSvd")
+                    .Input({"input", 0, DT_FLOAT})
+                    .Attr("compute_uv", compute_uv)
+                    .Attr("full_matrices", full_matrices)
+                    .Finalize(&op.node_def));
+  };
+  set_attrs(false, false);
+  INFER_OK(op, "?", "?;[0];[0]");
+  INFER_OK(op, "[?,?,?]", "[d0_0,?];[0];[0]");
+  INFER_OK(op, "[4,?,?]", "[d0_0,?];[0];[0]");
+  INFER_OK(op, "[4,2,?]", "[d0_0,?];[0];[0]");
+  INFER_OK(op, "[4,?,2]", "[d0_0,?];[0];[0]");
+  INFER_OK(op, "[?,2,2]", "[d0_0,d0_1];[0];[0]");
+  INFER_OK(op, "[4,2,2]", "[d0_0,d0_1];[0];[0]");
+  INFER_OK(op, "[?,3,2]", "[d0_0,d0_2];[0];[0]");
+  INFER_OK(op, "[4,3,2]", "[d0_0,d0_2];[0];[0]");
+  INFER_OK(op, "[?,2,3]", "[d0_0,d0_1];[0];[0]");
+  INFER_OK(op, "[4,2,3]", "[d0_0,d0_1];[0];[0]");
+  INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]");
+
+  set_attrs(true, false);
+  INFER_OK(op, "?", "?;?;?");
+  INFER_OK(op, "[?,?,?]", "[d0_0,?];[d0_0,d0_1,?];[d0_0,d0_2,?]");
+  INFER_OK(op, "[4,?,?]", "[d0_0,?];[d0_0,d0_1,?];[d0_0,d0_2,?]");
+  INFER_OK(op, "[4,2,?]", "[d0_0,?];[d0_0,d0_1,?];[d0_0,d0_2,?]");
+  INFER_OK(op, "[4,?,2]", "[d0_0,?];[d0_0,d0_1,?];[d0_0,d0_2,?]");
+  INFER_OK(op, "[?,2,2]", "[d0_0,d0_1];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_1]");
+  INFER_OK(op, "[4,2,2]", "[d0_0,d0_1];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_1]");
+  INFER_OK(op, "[?,3,2]", "[d0_0,d0_2];[d0_0,d0_1,d0_2];[d0_0,d0_2,d0_2]");
+  INFER_OK(op, "[4,3,2]", "[d0_0,d0_2];[d0_0,d0_1,d0_2];[d0_0,d0_2,d0_2]");
+  INFER_OK(op, "[?,2,3]", "[d0_0,d0_1];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_1]");
+  INFER_OK(op, "[4,2,3]", "[d0_0,d0_1];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_1]");
+  INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]");
+
+  set_attrs(true, true);
+  INFER_OK(op, "?", "?;?;?");
+  INFER_OK(op, "[?,?,?]", "[d0_0,?];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]");
+  INFER_OK(op, "[4,?,?]", "[d0_0,?];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]");
+  INFER_OK(op, "[4,2,?]", "[d0_0,?];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]");
+  INFER_OK(op, "[4,?,2]", "[d0_0,?];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]");
+  INFER_OK(op, "[?,2,2]", "[d0_0,d0_1];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]");
+  INFER_OK(op, "[4,2,2]", "[d0_0,d0_1];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]");
+  INFER_OK(op, "[?,3,2]", "[d0_0,d0_2];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]");
+  INFER_OK(op, "[4,3,2]", "[d0_0,d0_2];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]");
+  INFER_OK(op, "[?,2,3]", "[d0_0,d0_1];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]");
+  INFER_OK(op, "[4,2,3]", "[d0_0,d0_1];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]");
+  INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]");
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/logging_ops.cc b/tensorflow/core/ops/logging_ops.cc
index 5d8d34988df..49fd72a4f0c 100644
--- a/tensorflow/core/ops/logging_ops.cc
+++ b/tensorflow/core/ops/logging_ops.cc
@@ -92,6 +92,7 @@ REGISTER_OP("ScalarSummary")
     .Input("values: T")
     .Output("summary: string")
     .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::ScalarShape)
     .Doc(R"doc(
 Outputs a `Summary` protocol buffer with scalar values.
 
@@ -108,6 +109,7 @@ REGISTER_OP("HistogramSummary")
     .Input("values: T")
     .Output("summary: string")
     .Attr("T: realnumbertype = DT_FLOAT")
+    .SetShapeFn(shape_inference::ScalarShape)
     .Doc(R"doc(
 Outputs a `Summary` protocol buffer with a histogram.
 
@@ -132,6 +134,7 @@ REGISTER_OP("ImageSummary")
         "bad_color: tensor = { dtype: DT_UINT8 "
         "tensor_shape: { dim { size: 4 } } "
         "int_val: 255 int_val: 0 int_val: 0 int_val: 255 }")
+    .SetShapeFn(shape_inference::ScalarShape)
     .Doc(R"doc(
 Outputs a `Summary` protocol buffer with images.
 
@@ -183,6 +186,7 @@ REGISTER_OP("AudioSummary")
     .Output("summary: string")
     .Attr("sample_rate: float")
     .Attr("max_outputs: int >= 1 = 3")
+    .SetShapeFn(shape_inference::ScalarShape)
     .Doc(R"doc(
 Outputs a `Summary` protocol buffer with audio.
 
@@ -209,6 +213,7 @@ REGISTER_OP("MergeSummary")
     .Input("inputs: N * string")
     .Output("summary: string")
     .Attr("N : int >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
     .Doc(R"doc(
 Merges summaries.
 
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 03ada875112..d38a5350831 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -533,6 +533,7 @@ REGISTER_OP("Conv3D")
     .Attr("T: numbertype")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
+    .SetShapeFn(shape_inference::Conv3DShape)
     .Doc(R"doc(
 Computes a 3-D convolution given 5-D `input` and `filter` tensors.
 
@@ -677,6 +678,7 @@ REGISTER_OP("AvgPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr("T: numbertype")
+    .SetShapeFn(shape_inference::Pool3DShape)
     .Doc(R"doc(
 Performs 3D average pooling on the input.
 
@@ -726,6 +728,7 @@ REGISTER_OP("MaxPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr("T: numbertype")
+    .SetShapeFn(shape_inference::Pool3DShape)
     .Doc(R"doc(
 Performs 3D max pooling on the input.
 
@@ -973,6 +976,77 @@ REGISTER_OP("Dilation2D")
     .Attr("strides: list(int) >= 4")
     .Attr("rates: list(int) >= 4")
     .Attr(GetPaddingAttrString())
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* input_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+      const Shape* filter_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &filter_shape));
+
+      std::vector<int32> strides;
+      TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides));
+      if (strides.size() != 4) {
+        return errors::InvalidArgument(
+            "Dilation2D requires the stride attribute to contain 4 values, but "
+            "got: ",
+            strides.size());
+      }
+
+      std::vector<int32> rates;
+      TF_RETURN_IF_ERROR(c->GetAttr("rates", &rates));
+      if (rates.size() != 4) {
+        return errors::InvalidArgument(
+            "Dilation2D requires the rates attribute to contain 4 values, but "
+            "got: ",
+            rates.size());
+      }
+
+      int32 stride_rows = strides[1];
+      int32 stride_cols = strides[2];
+
+      int32 rate_rows = rates[1];
+      int32 rate_cols = rates[2];
+
+      const Dimension* batch_size_dim = c->Dim(input_shape, 0);
+      const Dimension* in_rows_dim = c->Dim(input_shape, 1);
+      const Dimension* in_cols_dim = c->Dim(input_shape, 2);
+      const Dimension* filter_rows_dim = c->Dim(filter_shape, 0);
+      const Dimension* filter_cols_dim = c->Dim(filter_shape, 1);
+      const Dimension* output_depth_dim = c->Dim(filter_shape, 2);
+
+      const Dimension* unused;
+      TF_RETURN_IF_ERROR(
+          c->Merge(c->Dim(input_shape, 3), output_depth_dim, &unused));
+
+      // At the moment we need to know the values of several fields.
+      TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_rows_dim, "in_rows"));
+      TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_cols_dim, "in_cols"));
+      TF_RETURN_IF_ERROR(c->ValidateKnownDim(filter_rows_dim, "filter_rows"));
+      TF_RETURN_IF_ERROR(c->ValidateKnownDim(filter_cols_dim, "filter_cols"));
+
+      auto in_rows = c->Value(in_rows_dim);
+      auto in_cols = c->Value(in_cols_dim);
+      auto filter_rows = c->Value(filter_rows_dim);
+      auto filter_cols = c->Value(filter_cols_dim);
+      auto filter_rows_eff = filter_rows + (filter_rows - 1) * (rate_rows - 1);
+      auto filter_cols_eff = filter_cols + (filter_cols - 1) * (rate_cols - 1);
+
+      Padding padding;
+      TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding));
+
+      int64 output_rows, output_cols;
+      int64 padding_before, padding_after;
+      TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose(
+          in_rows, filter_rows_eff, stride_rows, padding, &output_rows,
+          &padding_before, &padding_after));
+      TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose(
+          in_cols, filter_cols_eff, stride_cols, padding, &output_cols,
+          &padding_before, &padding_after));
+
+      const Shape* output_shape = c->MakeShape(
+          {batch_size_dim, output_rows, output_cols, output_depth_dim});
+      c->set_output(0, output_shape);
+      return Status::OK();
+    })
     .Doc(R"doc(
 Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
 
diff --git a/tensorflow/core/ops/nn_ops_test.cc b/tensorflow/core/ops/nn_ops_test.cc
index 9056217db3a..773a8f30080 100644
--- a/tensorflow/core/ops/nn_ops_test.cc
+++ b/tensorflow/core/ops/nn_ops_test.cc
@@ -312,4 +312,29 @@ TEST(NNOpsTest, InTopK_ShapeFn) {
   INFER_ERROR("Shape must be rank 1 but is rank 2", op, "?;[1,2]");
 }
 
+TEST(NNOpsTest, Dilation2DShapeTest) {
+  ShapeInferenceTestOp op("Dilation2D");
+  auto set_op = [&op](const std::vector<int32>& strides,
+                      const std::vector<int32>& rates, const string& padding) {
+    TF_CHECK_OK(NodeDefBuilder("test", "Dilation2D")
+                    .Input("input", 0, DT_FLOAT)
+                    .Input("filter", 0, DT_FLOAT)
+                    .Attr("strides", strides)
+                    .Attr("rates", rates)
+                    .Attr("padding", padding)
+                    .Finalize(&op.node_def));
+  };
+
+  // rate rows and cols is 1, so filter_rows and cols are unchanged.
+  // We have a 1x1 filter so the output is still 2x2.
+  set_op({1, 1, 1, 1}, {1, 1, 1, 1}, "VALID");
+  INFER_OK(op, "[1,2,2,2];[1,1,2]", "[d0_0,2,2,d1_2]");
+
+  // rate rows and cols is 2, so filter_rows and cols are changed to
+  // be 2 + (2 - 1) = 3.  7x7 input with 3x3 filter and 1x1 stride
+  // gives a 5x5 output.
+  set_op({1, 1, 1, 1}, {1, 2, 2, 1}, "VALID");
+  INFER_OK(op, "[1,7,7,2];[2,2,2]", "[d0_0,5,5,d1_2]");
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 01bb4bc82f8..8bc7e5b86c9 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -98,7 +98,7 @@ op {
     }
   }
   summary: "Returns x + y element-wise."
-  description: "*NOTE*: Add supports broadcasting. AddN does not."
+  description: "*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "AddN"
@@ -1886,7 +1886,7 @@ op {
       }
     }
   }
-  summary: "Calculates the Cholesky decomposition of a batch of square matrices."
+  summary: "Computes the Cholesky decomposition of a batch of square matrices."
   description: "The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices, with the same constraints as the single matrix Cholesky\ndecomposition above. The output is a tensor of the same shape as the input\ncontaining the Cholesky decompositions for all input submatrices `[..., :, :]`."
 }
 op {
@@ -1898,12 +1898,12 @@ op {
   }
   input_arg {
     name: "grad"
-    description: "df/dl where f is some scalar function. Shape is `[..., M, M]\'.\nAlgorithm depends only on lower triangular part of the innermost matrices of\nthis tensor."
+    description: "df/dl where f is some scalar function. Shape is `[..., M, M]`.\nAlgorithm depends only on lower triangular part of the innermost matrices of\nthis tensor."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Symmetrized version of df/dA . Shape is `[..., M, M]\'"
+    description: "Symmetrized version of df/dA . Shape is `[..., M, M]`"
     type_attr: "T"
   }
   attr {
@@ -1916,7 +1916,7 @@ op {
       }
     }
   }
-  summary: "Calculates the reverse mode backpropagated gradient of the Cholesky algorithm."
+  summary: "Computes the reverse mode backpropagated gradient of the Cholesky algorithm."
   description: "For an explanation see \"Differentiation of the Cholesky algorithm\" by\nIain Murray http://arxiv.org/abs/1602.07527."
 }
 op {
@@ -2110,7 +2110,7 @@ op {
       }
     }
   }
-  summary: "Calculates the determinants for a batch of square matrices."
+  summary: "Computes the determinants for a batch of square matrices."
   description: "The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices. The output is a tensor containing the determinants\nfor all input submatrices `[..., :, :]`."
 }
 op {
@@ -2180,7 +2180,7 @@ op {
       }
     }
   }
-  summary: "Calculates the inverse of square invertible matrices or their adjoints"
+  summary: "Computes the inverse of square invertible matrices or their adjoints"
   description: "(conjugate transposes).\n\nThe input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices. The output is a tensor of the same shape as the input\ncontaining the inverse for all input submatrices `[..., :, :]`.\n\nThe op uses LU decomposition with partial pivoting to compute the inverses.\n\nIf a matrix is not invertible there is no guarantee what the op does. It\nmay detect the condition and raise an exception or it may simply return a\ngarbage result."
 }
 op {
@@ -2284,7 +2284,7 @@ op {
     }
   }
   summary: "Solves multiple linear least-squares problems."
-  description: "`matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions\nform square matrices. Rhs is a tensor of shape `[..., M, K]`. The output\nis a tensor shape `[..., N, K]` where each output matrix solves each of\nthe equations matrix[..., :, :] * output[..., :, :] = rhs[..., :, :] in the\nleast squares sense.\n\nBelow we will use the following notation for each pair of\nmatrix and right-hand sides in the batch:\n\n`matrix`=\\\\(A \\in \\Re^{m \\times n}\\\\),\n`rhs`=\\\\(B  \\in \\Re^{m \\times k}\\\\),\n`output`=\\\\(X  \\in \\Re^{n \\times k}\\\\),\n`l2_regularizer`=\\\\(\\lambda\\\\).\n\nIf `fast` is `True`, then the solution is computed by solving the normal\nequations using Cholesky decomposition. Specifically, if \\\\(m \\ge n\\\\) then\n\\\\(X = (A^T A + \\lambda I)^{-1} A^T B\\\\), which solves the least-squares\nproblem \\\\(X = \\mathrm{argmin}_{Z \\in \\Re^{n \\times k}} ||A Z - B||_F^2 +\n\\lambda ||Z||_F^2\\\\). If \\\\(m \\lt n\\\\) then `output` is computed as\n\\\\(X = A^T (A A^T + \\lambda I)^{-1} B\\\\), which (for \\\\(\\lambda = 0\\\\)) is the\nminimum-norm solution to the under-determined linear system, i.e.\n\\\\(X = \\mathrm{argmin}_{Z \\in \\Re^{n \\times k}} ||Z||_F^2 \\\\), subject to\n\\\\(A Z = B\\\\). Notice that the fast path is only numerically stable when\n\\\\(A\\\\) is numerically full rank and has a condition number\n\\\\(\\mathrm{cond}(A) \\lt \\frac{1}{\\sqrt{\\epsilon_{mach}}}\\\\) or\\\\(\\lambda\\\\) is\nsufficiently large.\n\nIf `fast` is `False` an algorithm based on the numerically robust complete\northogonal decomposition is used. This computes the minimum-norm\nleast-squares solution, even when \\\\(A\\\\) is rank deficient. This path is\ntypically 6-7 times slower than the fast path. If `fast` is `False` then\n`l2_regularizer` is ignored."
+  description: "`matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions\nform matrices of size `[M, N]`. Rhs is a tensor of shape `[..., M, K]`.\nThe output is a tensor shape `[..., N, K]` where each output matrix solves\neach of the equations matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]\nin the least squares sense.\n\nBelow we will use the following notation for each pair of\nmatrix and right-hand sides in the batch:\n\n`matrix`=\\\\(A \\in \\Re^{m \\times n}\\\\),\n`rhs`=\\\\(B  \\in \\Re^{m \\times k}\\\\),\n`output`=\\\\(X  \\in \\Re^{n \\times k}\\\\),\n`l2_regularizer`=\\\\(\\lambda\\\\).\n\nIf `fast` is `True`, then the solution is computed by solving the normal\nequations using Cholesky decomposition. Specifically, if \\\\(m \\ge n\\\\) then\n\\\\(X = (A^T A + \\lambda I)^{-1} A^T B\\\\), which solves the least-squares\nproblem \\\\(X = \\mathrm{argmin}_{Z \\in \\Re^{n \\times k}} ||A Z - B||_F^2 +\n\\lambda ||Z||_F^2\\\\). If \\\\(m \\lt n\\\\) then `output` is computed as\n\\\\(X = A^T (A A^T + \\lambda I)^{-1} B\\\\), which (for \\\\(\\lambda = 0\\\\)) is the\nminimum-norm solution to the under-determined linear system, i.e.\n\\\\(X = \\mathrm{argmin}_{Z \\in \\Re^{n \\times k}} ||Z||_F^2 \\\\), subject to\n\\\\(A Z = B\\\\). Notice that the fast path is only numerically stable when\n\\\\(A\\\\) is numerically full rank and has a condition number\n\\\\(\\mathrm{cond}(A) \\lt \\frac{1}{\\sqrt{\\epsilon_{mach}}}\\\\) or\\\\(\\lambda\\\\) is\nsufficiently large.\n\nIf `fast` is `False` an algorithm based on the numerically robust complete\northogonal decomposition is used. This computes the minimum-norm\nleast-squares solution, even when \\\\(A\\\\) is rank deficient. This path is\ntypically 6-7 times slower than the fast path. If `fast` is `False` then\n`l2_regularizer` is ignored."
 }
 op {
   name: "BatchMatrixTriangularSolve"
@@ -2515,8 +2515,101 @@ op {
       }
     }
   }
-  summary: "Calculates the Eigen Decomposition of a batch of square self-adjoint matrices."
-  description: "The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices, with the same constraints as the single matrix\nSelfAdjointEig.\n\nThe result is a \'[..., M+1, M] matrix with [..., 0,:] containing the\neigenvalues, and subsequent [...,1:, :] containing the eigenvectors."
+  summary: "Computes the Eigen Decomposition of a batch of square self-adjoint matrices."
+  description: "The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices, with the same constraints as the single matrix\nSelfAdjointEig.\n\nThe result is a [..., M+1, M] matrix with [..., 0,:] containing the\neigenvalues, and subsequent [...,1:, :] containing the eigenvectors."
+  deprecation {
+    version: 11
+    explanation: "Use BatchSelfAdjointEigV2 instead."
+  }
+}
+op {
+  name: "BatchSelfAdjointEigV2"
+  input_arg {
+    name: "input"
+    description: "`Tensor` input of shape `[N, N]`."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "e"
+    description: "Eigenvalues. Shape is `[N]`."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    description: "Eigenvectors. Shape is `[N, N]`."
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_v"
+    type: "bool"
+    default_value {
+      b: true
+    }
+    description: "If `True` then eigenvectors will be computed and returned in `v`.\nOtherwise, only the eigenvalues will be computed."
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  summary: "Computes the eigen decomposition of a batch of square self-adjoint matrices."
+  description: "Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in\n`input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.\n\n```prettyprint\n# a is a tensor.\n# e is a tensor of eigenvalues.\n# v is a tensor of eigenvectors.\ne, v = batch_self_adjoint_eig(a)\ne = batch_self_adjoint_eig(a, compute_v=False)\n```"
+}
+op {
+  name: "BatchSvd"
+  input_arg {
+    name: "input"
+    description: "A tensor of shape `[..., M, N]` whose inner-most 2 dimensions\nform matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "s"
+    description: "Singular values. Shape is `[..., P]`."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "u"
+    description: "Left singular vectors. If `full_matrices` is `False` then shape is\n`[..., M, M]`; if `full_matrices` is `True` then shape is\n`[..., M, P]`. Undefined if `compute_uv` is `False`."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    description: "Left singular vectors. If `full_matrices` is `False` then shape is\n`[..., N, N]`. If `full_matrices` is `True` then shape is `[..., N, P]`.\nUndefined if `compute_uv` is false."
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_uv"
+    type: "bool"
+    default_value {
+      b: true
+    }
+    description: "If true, left and right singular vectors will be\ncomputed and returned in `u` and `v`, respectively.\nIf false, `u` and `v` are not set and should never referenced."
+  }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If true, compute full-sized `u` and `v`. If false\n(the default), compute only the leading `P` singular vectors.\nIgnored if `compute_uv` is `False`."
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  summary: "Computes the singular value decompositions of a batch of matrices."
+  description: "Computes the SVD of each inner matrix in `input` such that\n`input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`\n\n```prettyprint\n# a is a tensor containing a batch of matrices.\n# s is a tensor of singular values for each matrix.\n# u is the tensor containing of left singular vectors for each matrix.\n# v is the tensor containing of right singular vectors for each matrix.\ns, u, v = batch_svd(a)\ns, _, _ = batch_svd(a, compute_uv=False)\n```"
 }
 op {
   name: "BatchToSpace"
@@ -3023,7 +3116,7 @@ op {
       }
     }
   }
-  summary: "Calculates the Cholesky decomposition of a square matrix."
+  summary: "Computes the Cholesky decomposition of a square matrix."
   description: "The input has to be symmetric and positive definite. Only the lower-triangular\npart of the input will be used for this operation. The upper-triangular part\nwill not be read.\n\nThe result is the lower-triangular matrix of the Cholesky decomposition of the\ninput, `L`, so that `input = L L^*`."
 }
 op {
@@ -3035,12 +3128,12 @@ op {
   }
   input_arg {
     name: "grad"
-    description: "df/dl where f is some scalar function. Shape is `[M, M]\'.\nAlgorithm depends only on lower triangular part of this matrix."
+    description: "df/dl where f is some scalar function. Shape is `[M, M]`.\nAlgorithm depends only on lower triangular part of this matrix."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Symmetrized version of df/dA . Shape is `[M, M]\'."
+    description: "Symmetrized version of df/dA . Shape is `[M, M]`."
     type_attr: "T"
   }
   attr {
@@ -3053,7 +3146,7 @@ op {
       }
     }
   }
-  summary: "Calculates the reverse mode backpropagated gradient of the Cholesky algorithm."
+  summary: "Computes the reverse mode backpropagated gradient of the Cholesky algorithm."
   description: "For an explanation see \"Differentiation of the Cholesky algorithm\" by\nIain Murray http://arxiv.org/abs/1602.07527."
 }
 op {
@@ -5095,6 +5188,7 @@ op {
     }
   }
   summary: "Returns x / y element-wise."
+  description: "*NOTE*: `Div` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "DrawBoundingBoxes"
@@ -5506,6 +5600,7 @@ op {
     }
   }
   summary: "Returns the truth value of (x == y) element-wise."
+  description: "*NOTE*: `Equal` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
   is_commutative: true
 }
 op {
@@ -6186,6 +6281,7 @@ op {
     }
   }
   summary: "Returns the truth value of (x > y) element-wise."
+  description: "*NOTE*: `Greater` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "GreaterEqual"
@@ -6219,6 +6315,7 @@ op {
     }
   }
   summary: "Returns the truth value of (x >= y) element-wise."
+  description: "*NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "HSVToRGB"
@@ -7104,6 +7201,7 @@ op {
     }
   }
   summary: "Returns the truth value of (x < y) element-wise."
+  description: "*NOTE*: `Less` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "LessEqual"
@@ -7137,6 +7235,7 @@ op {
     }
   }
   summary: "Returns the truth value of (x <= y) element-wise."
+  description: "*NOTE*: `LessEqual` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "Lgamma"
@@ -7359,6 +7458,7 @@ op {
     type: DT_BOOL
   }
   summary: "Returns the truth value of x AND y element-wise."
+  description: "*NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
   is_commutative: true
 }
 op {
@@ -7388,6 +7488,7 @@ op {
     type: DT_BOOL
   }
   summary: "Returns the truth value of x OR y element-wise."
+  description: "*NOTE*: `LogicalOr` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
   is_commutative: true
 }
 op {
@@ -7594,7 +7695,7 @@ op {
       }
     }
   }
-  summary: "Calculates the determinant of a square matrix."
+  summary: "Computes the determinant of a square matrix."
 }
 op {
   name: "MatrixInverse"
@@ -7625,7 +7726,7 @@ op {
       }
     }
   }
-  summary: "Calculates the inverse of a square invertible matrix or its adjoint (conjugate"
+  summary: "Computes the inverse of a square invertible matrix or its adjoint (conjugate"
   description: "transpose).\n\nThe op uses LU decomposition with partial pivoting to compute the inverse.\n\nIf the matrix is not invertible there is no guarantee what the op does. It\nmay detect the condition and raise an exception or it may simply return a\ngarbage result."
 }
 op {
@@ -8245,7 +8346,8 @@ op {
       }
     }
   }
-  summary: "Returns the max of x and y (i.e. x > y ? x : y) element-wise, broadcasts."
+  summary: "Returns the max of x and y (i.e. x > y ? x : y) element-wise."
+  description: "*NOTE*: `Maximum` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
   is_commutative: true
 }
 op {
@@ -8428,7 +8530,8 @@ op {
       }
     }
   }
-  summary: "Returns the min of x and y (i.e. x < y ? x : y) element-wise, broadcasts."
+  summary: "Returns the min of x and y (i.e. x < y ? x : y) element-wise."
+  description: "*NOTE*: `Minimum` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
   is_commutative: true
 }
 op {
@@ -8528,6 +8631,7 @@ op {
     }
   }
   summary: "Returns element-wise remainder of division."
+  description: "*NOTE*: `Mod` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "Mul"
@@ -8562,6 +8666,7 @@ op {
     }
   }
   summary: "Returns x * y element-wise."
+  description: "*NOTE*: `Mul` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
   is_commutative: true
 }
 op {
@@ -8863,6 +8968,7 @@ op {
     }
   }
   summary: "Returns the truth value of (x != y) element-wise."
+  description: "*NOTE*: `NotEqual` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
   is_commutative: true
 }
 op {
@@ -12113,8 +12219,50 @@ op {
       }
     }
   }
-  summary: "Calculates the Eigen Decomposition of a square Self-Adjoint matrix."
+  summary: "Computes the Eigen Decomposition of a square Self-Adjoint matrix."
   description: "Only the lower-triangular part of the input will be used in this case. The\nupper-triangular part will not be read.\n\nThe result is a M+1 x M matrix whose first row is the eigenvalues, and\nsubsequent rows are eigenvectors."
+  deprecation {
+    version: 11
+    explanation: "Use SelfAdjointEigV2 instead."
+  }
+}
+op {
+  name: "SelfAdjointEigV2"
+  input_arg {
+    name: "input"
+    description: "`Tensor` input of shape `[N, N]`."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "e"
+    description: "Eigenvalues. Shape is `[N]`."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    description: "Eigenvectors. Shape is `[N, N]`."
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_v"
+    type: "bool"
+    default_value {
+      b: true
+    }
+    description: "If `True` then eigenvectors will be computed and returned in `v`.\nOtherwise, only the eigenvalues will be computed."
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  summary: "Computes the eigen decomposition of a self-adjoint (\\\"symmetric\\\") matrix."
+  description: "Computes the eigenvalues and (optionally) eigenvectors such that\n`input = v * diag(e)`.\n\n```prettyprint\n# a is a self-adjoint matrix.\n# e is a vector of eigenvalues.\n# v is a matrix of eigenvectors.\ne, v = self_adjoint_eig(a)\ne = self_adjoint_eig(a, compute_v=False)\n```"
 }
 op {
   name: "SerializeManySparse"
@@ -14643,6 +14791,7 @@ op {
     }
   }
   summary: "Returns (x - y)(x - y) element-wise."
+  description: "*NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
   is_commutative: true
 }
 op {
@@ -15094,6 +15243,7 @@ op {
     }
   }
   summary: "Returns x - y element-wise."
+  description: "*NOTE*: `Sub` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "Sum"
@@ -15145,6 +15295,57 @@ op {
   summary: "Computes the sum of elements across dimensions of a tensor."
   description: "Reduces `input` along the dimensions given in `reduction_indices`. Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_indices`. If `keep_dims` is true, the reduced dimensions are\nretained with length 1."
 }
+op {
+  name: "Svd"
+  input_arg {
+    name: "input"
+    description: "Shape is `[M, N]`. Let `P` be the minimum of `M` and `N`."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "s"
+    description: "Singular values. Shape is `[P]`."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "u"
+    description: "Left singular vectors; if `full_matrices` is `False` then shape is `[M, M]`.\nIf `full_matrices` is `True` then shape is `[M, P]`.\nUndefined if `compute_uv` is `False`."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    description: "Left singular vectors. If `full_matrices` is `False` then shape is `[N, N]`.\nIf `full_matrices` is `True` then shape is `[N, P]`.\nUndefined if `compute_uv` is false."
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_uv"
+    type: "bool"
+    default_value {
+      b: true
+    }
+    description: "If true, left and right singular vectors will be\ncomputed and returned in `u` and `v`, respectively.\nIf false, `u` and `v` are not set and should never referenced."
+  }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If true, compute full-sized `u` and `v`. If false\n(the default), compute only the leading `P` singular vectors.\nIgnored if `compute_uv` is `False`."
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  summary: "Computes the singular value decomposition of a matrix."
+  description: "Computes the SVD of if `input` such that `input = u * diag(s) * transpose(v)`\n\n```prettyprint\n# a is a matrix.\n# s is a vector of singular values.\n# u is the matrix of left singular vectors.\n# v is a matrix of right singular vectors.\ns, u, v = svd(a)\ns, _, _ = svd(a, compute_uv=False)\n```"
+}
 op {
   name: "Switch"
   input_arg {
diff --git a/tensorflow/core/ops/sparse_ops.cc b/tensorflow/core/ops/sparse_ops.cc
index ac213385054..17d5983d76f 100644
--- a/tensorflow/core/ops/sparse_ops.cc
+++ b/tensorflow/core/ops/sparse_ops.cc
@@ -363,7 +363,7 @@ REGISTER_OP("SparseConcat")
     .Attr("T: type")
     .SetShapeFn([](InferenceContext* c) {
       // These accumulates the sum.
-      const Dimension* output_row_count = c->MakeDim(0);
+      const Dimension* output_row_count = c->MakeDim(0ll);
 
       // These are only merged.
       const Dimension* output_ind_cols = c->UnknownDim();
diff --git a/tensorflow/core/ops/state_ops.cc b/tensorflow/core/ops/state_ops.cc
index 1e267657a26..684e86a00dc 100644
--- a/tensorflow/core/ops/state_ops.cc
+++ b/tensorflow/core/ops/state_ops.cc
@@ -18,6 +18,9 @@ limitations under the License.
 
 namespace tensorflow {
 
+using shape_inference::InferenceContext;
+using shape_inference::Shape;
+
 REGISTER_OP("Variable")
     .Output("ref: Ref(dtype)")
     .Attr("shape: shape")
@@ -63,6 +66,14 @@ REGISTER_OP("TemporaryVariable")
     .Attr("dtype: type")
     .Attr("var_name: string = ''")
     .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      TensorShapeProto shape_proto;
+      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape_proto));
+      const Shape* output;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeProto(shape_proto, &output));
+      c->set_output(0, output);
+      return Status::OK();
+    })
     .Doc(R"doc(
 Returns a tensor that may be mutated, but only persists within a single step.
 
@@ -93,6 +104,7 @@ REGISTER_OP("DestroyTemporaryVariable")
     .Output("value: T")
     .Attr("T: type")
     .Attr("var_name: string")
+    .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Destroys the temporary variable and returns its final value.
 
@@ -117,6 +129,16 @@ REGISTER_OP("Assign")
     .Attr("validate_shape: bool = true")
     .Attr("use_locking: bool = true")
     .SetAllowsUninitializedInput()
+    .SetShapeFn([](InferenceContext* c) {
+      bool validate_shape;
+      TF_RETURN_IF_ERROR(c->GetAttr("validate_shape", &validate_shape));
+      if (validate_shape) {
+        return shape_inference::MergeBothInputsShapeFn(c);
+      }
+
+      c->set_output(0, c->input(1));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Update 'ref' by assigning 'value' to it.
 
@@ -140,6 +162,7 @@ REGISTER_OP("AssignAdd")
     .Output("output_ref: Ref(T)")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
     .Doc(R"doc(
 Update 'ref' by adding 'value' to it.
 
@@ -160,6 +183,7 @@ REGISTER_OP("AssignSub")
     .Output("output_ref: Ref(T)")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
     .Doc(R"doc(
 Update 'ref' by subtracting 'value' from it.
 
@@ -174,6 +198,25 @@ output_ref:= Same as "ref".  Returned as a convenience for operations that want
   to use the new value after the variable has been updated.
 )doc");
 
+namespace {
+
+Status ScatterUpdateShape(InferenceContext* c) {
+  const Shape* var_shape = c->input(0);
+  const Shape* indices_shape = c->input(1);
+
+  const Shape* unused_updates_shape;
+  const Shape* concat;
+  const Shape* var_subshape;
+  TF_RETURN_IF_ERROR(c->Subshape(var_shape, 1, &var_subshape));
+  TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, var_subshape, &concat));
+  TF_RETURN_IF_ERROR(c->Merge(c->input(2), concat, &unused_updates_shape));
+
+  c->set_output(0, var_shape);
+  return Status::OK();
+}
+
+}  // namespace
+
 REGISTER_OP("ScatterUpdate")
     .Input("ref: Ref(T)")
     .Input("indices: Tindices")
@@ -182,6 +225,7 @@ REGISTER_OP("ScatterUpdate")
     .Attr("T: type")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = true")
+    .SetShapeFn(ScatterUpdateShape)
     .Doc(R"doc(
 Applies sparse updates to a variable reference.
 
@@ -226,6 +270,7 @@ REGISTER_OP("ScatterAdd")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = false")
+    .SetShapeFn(ScatterUpdateShape)
     .Doc(R"doc(
 Adds sparse updates to a variable reference.
 
@@ -269,6 +314,7 @@ REGISTER_OP("ScatterSub")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = false")
+    .SetShapeFn(ScatterUpdateShape)
     .Doc(R"doc(
 Subtracts sparse updates to a variable reference.
 
@@ -307,6 +353,12 @@ REGISTER_OP("CountUpTo")
     .Output("output: T")
     .Attr("limit: int")
     .Attr("T: {int32, int64}")
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* output;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &output));
+      c->set_output(0, output);
+      return Status::OK();
+    })
     .Doc(R"doc(
 Increments 'ref' until it reaches 'limit'.
 
diff --git a/tensorflow/core/ops/state_ops_test.cc b/tensorflow/core/ops/state_ops_test.cc
new file mode 100644
index 00000000000..586de77edc8
--- /dev/null
+++ b/tensorflow/core/ops/state_ops_test.cc
@@ -0,0 +1,74 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+TEST(StateOpsTest, Assign_ShapeFn) {
+  ShapeInferenceTestOp op("Assign");
+
+  TF_ASSERT_OK(NodeDefBuilder("test", "Assign")
+                   .Input("ref", 0, DT_FLOAT_REF)
+                   .Input("value", 1, DT_FLOAT)
+                   .Attr("validate_shape", true)
+                   .Finalize(&op.node_def));
+  INFER_OK(op, "[1,2];[1,2]", "in0");
+
+  // Resolves shapes when validate_shape is True.
+  INFER_OK(op, "[1,?];[?,2]", "[d0_0,d1_1]");
+
+  // validate_shape=True, fails when the shapes are not compatible.
+  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 3", op,
+              "[1,?];[3,2]");
+
+  // Test for validate_shape=False
+  TF_ASSERT_OK(NodeDefBuilder("test", "Assign")
+                   .Input("ref", 0, DT_FLOAT_REF)
+                   .Input("value", 1, DT_FLOAT)
+                   .Attr("validate_shape", false)
+                   .Finalize(&op.node_def));
+  INFER_OK(op, "[1,2];[1,2,3,4]", "in1");
+}
+
+TEST(StateOpsTest, ScatterUpdate_ShapeFn) {
+  ShapeInferenceTestOp op("ScatterUpdate");
+  TF_ASSERT_OK(NodeDefBuilder("test", "ScatterUpdate")
+                   .Input("ref", 0, DT_FLOAT_REF)
+                   .Input("indices", 0, DT_INT32)
+                   .Input("updates", 1, DT_FLOAT)
+                   .Finalize(&op.node_def));
+  INFER_OK(op, "[1,2];[3];[3,2]", "in0");
+
+  // Resolve shape on first updates dimension.
+  INFER_OK(op, "[1,2];[3];[?,2]", "in0");
+}
+
+TEST(StateOpsTest, TemporaryVariable_ShapeFn) {
+  ShapeInferenceTestOp op("TemporaryVariable");
+  TensorShape shape({1, 2, 3});
+  TensorShapeProto shape_proto;
+  shape.AsProto(&shape_proto);
+  TF_ASSERT_OK(NodeDefBuilder("test", "TemporaryVariable")
+                   .Attr("shape", shape_proto)
+                   .Finalize(&op.node_def));
+  INFER_OK(op, "", "[1,2,3]");
+}
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/platform/context.h b/tensorflow/core/platform/context.h
index e6555029fd8..728ef916312 100644
--- a/tensorflow/core/platform/context.h
+++ b/tensorflow/core/platform/context.h
@@ -18,6 +18,13 @@ limitations under the License.
 
 namespace tensorflow {
 
+enum class ContextKind {
+  // Initial state with default (empty) values.
+  kDefault,
+  // Initial state inherited from the creating or scheduling thread.
+  kThread,
+};
+
 // Context is a container for request-specific information that should be passed
 // to threads that perform related work. The default constructor should capture
 // all relevant context.
diff --git a/tensorflow/core/platform/default/context.h b/tensorflow/core/platform/default/context.h
index 5d261ea9fbf..d8afeb47a9c 100644
--- a/tensorflow/core/platform/default/context.h
+++ b/tensorflow/core/platform/default/context.h
@@ -19,6 +19,9 @@ limitations under the License.
 namespace tensorflow {
 
 class Context {
+ public:
+  Context() {}
+  Context(const ContextKind kind) {}
 };
 
 class WithContext {
diff --git a/tensorflow/core/platform/default/dynamic_annotations.h b/tensorflow/core/platform/default/dynamic_annotations.h
index c86603117e7..d087035b5ab 100644
--- a/tensorflow/core/platform/default/dynamic_annotations.h
+++ b/tensorflow/core/platform/default/dynamic_annotations.h
@@ -19,9 +19,14 @@ limitations under the License.
 // IWYU pragma: private, include "third_party/tensorflow/core/platform/mem.h"
 // IWYU pragma: friend third_party/tensorflow/core/platform/mem.h
 
-// Do nothing for this platform
+// Do nothing for this platform.
+
 #define TF_ANNOTATE_MEMORY_IS_INITIALIZED(ptr, bytes) \
   do {                                                \
   } while (0)
 
+#define TF_ANNOTATE_BENIGN_RACE(ptr, description) \
+  do {                                            \
+  } while (0)
+
 #endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_DYNAMIC_ANNOTATIONS_H_
diff --git a/tensorflow/core/platform/env_test.cc b/tensorflow/core/platform/env_test.cc
index 3da8d3f1245..7cbcc40ccb8 100644
--- a/tensorflow/core/platform/env_test.cc
+++ b/tensorflow/core/platform/env_test.cc
@@ -188,9 +188,14 @@ TEST(EnvTest, GetSchemeForURI) {
 TEST(EnvTest, SleepForMicroseconds) {
   Env* env = Env::Default();
   const int64 start = env->NowMicros();
-  env->SleepForMicroseconds(1e6 + 5e5);
+  const int64 sleep_time = 1e6 + 5e5;
+  env->SleepForMicroseconds(sleep_time);
   const int64 delta = env->NowMicros() - start;
-  EXPECT_GE(delta, 1e6 + 5e5);
+
+  // Subtract 10 from the sleep_time for this check because NowMicros can
+  // sometimes give slightly inconsistent values between the start and the
+  // finish (e.g. because the two calls run on different CPUs).
+  EXPECT_GE(delta, sleep_time - 10);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/file_statistics.h b/tensorflow/core/platform/file_statistics.h
new file mode 100644
index 00000000000..cc781e0a7f2
--- /dev/null
+++ b/tensorflow/core/platform/file_statistics.h
@@ -0,0 +1,38 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_
+
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+struct FileStatistics {
+  // The length of the file or -1 if finding file length is not supported.
+  int64 length = -1;
+  // The last modified time in nanoseconds.
+  int64 mtime_nsec = 0;
+  // This is the mode_t from stat.h containing file type and permission
+  // information.
+  mode_t mode = 0;
+
+  FileStatistics() {}
+  ~FileStatistics() {}
+};
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_
diff --git a/tensorflow/core/platform/file_system.h b/tensorflow/core/platform/file_system.h
index f372b379f53..51074768c5a 100644
--- a/tensorflow/core/platform/file_system.h
+++ b/tensorflow/core/platform/file_system.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/file_statistics.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
@@ -34,16 +35,6 @@ class RandomAccessFile;
 class ReadOnlyMemoryRegion;
 class WritableFile;
 
-struct FileStatistics {
-  // The length of the file or -1 if finding file length is not supported.
-  int64 length;
-  // The last modified time in nanoseconds.
-  int64 mtime_nsec;
-  // This field contains more than just the permissions bits.  More information
-  // can be found on the man page for stat(2).
-  mode_t mode;
-};
-
 /// A generic interface for accessing a file system.
 class FileSystem {
  public:
diff --git a/tensorflow/core/platform/net.h b/tensorflow/core/platform/net.h
new file mode 100644
index 00000000000..9e7851728dd
--- /dev/null
+++ b/tensorflow/core/platform/net.h
@@ -0,0 +1,27 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_PLATFORM_NET_H_
+#define TENSORFLOW_PLATFORM_NET_H_
+
+namespace tensorflow {
+namespace internal {
+
+int PickUnusedPortOrDie();
+
+}  // namespace internal
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_PLATFORM_NET_H_
diff --git a/tensorflow/core/platform/net_test.cc b/tensorflow/core/platform/net_test.cc
new file mode 100644
index 00000000000..475f4340167
--- /dev/null
+++ b/tensorflow/core/platform/net_test.cc
@@ -0,0 +1,34 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/net.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace internal {
+
+TEST(Net, PickUnusedPortOrDie) {
+  int port0 = PickUnusedPortOrDie();
+  int port1 = PickUnusedPortOrDie();
+  CHECK_GE(port0, 0);
+  CHECK_LT(port0, 65536);
+  CHECK_GE(port1, 0);
+  CHECK_LT(port1, 65536);
+  CHECK_NE(port0, port1);
+}
+
+}  // namespace internal
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/posix/net.cc b/tensorflow/core/platform/posix/net.cc
new file mode 100644
index 00000000000..2f01b779341
--- /dev/null
+++ b/tensorflow/core/platform/posix/net.cc
@@ -0,0 +1,129 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/net.h"
+
+#include <cstdlib>
+#include <unordered_set>
+
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace internal {
+
+namespace {
+bool IsPortAvailable(int* port, bool is_tcp) {
+  const int protocol = is_tcp ? IPPROTO_TCP : 0;
+  const int fd = socket(AF_INET, is_tcp ? SOCK_STREAM : SOCK_DGRAM, protocol);
+
+  struct sockaddr_in addr;
+  socklen_t addr_len = sizeof(addr);
+  int actual_port;
+
+  CHECK_GE(*port, 0);
+  CHECK_LE(*port, 65535);
+  if (fd < 0) {
+    LOG(ERROR) << "socket() failed: " << strerror(errno);
+    return false;
+  }
+
+  // SO_REUSEADDR lets us start up a server immediately after it exists.
+  int one = 1;
+  if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0) {
+    LOG(ERROR) << "setsockopt() failed: " << strerror(errno);
+    close(fd);
+    return false;
+  }
+
+  // Try binding to port.
+  addr.sin_family = AF_INET;
+  addr.sin_addr.s_addr = INADDR_ANY;
+  addr.sin_port = htons((uint16_t)*port);
+  if (bind(fd, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
+    LOG(WARNING) << "bind(port=" << *port << ") failed: " << strerror(errno);
+    close(fd);
+    return false;
+  }
+
+  // Get the bound port number.
+  if (getsockname(fd, (struct sockaddr*)&addr, &addr_len) < 0) {
+    LOG(WARNING) << "getsockname() failed: " << strerror(errno);
+    close(fd);
+    return false;
+  }
+  CHECK_LE(addr_len, sizeof(addr));
+  actual_port = ntohs(addr.sin_port);
+  CHECK_GT(actual_port, 0);
+  if (*port == 0) {
+    *port = actual_port;
+  } else {
+    CHECK_EQ(*port, actual_port);
+  }
+  close(fd);
+  return true;
+}
+
+const int kNumRandomPortsToPick = 100;
+const int kMaximumTrials = 1000;
+
+}  // namespace
+
+int PickUnusedPortOrDie() {
+  static std::unordered_set<int> chosen_ports;
+
+  // Type of port to first pick in the next iteration.
+  bool is_tcp = true;
+  int trial = 0;
+  while (true) {
+    int port;
+    trial++;
+    CHECK_LE(trial, kMaximumTrials)
+        << "Failed to pick an unused port for testing.";
+    if (trial == 1) {
+      port = getpid() % (65536 - 30000) + 30000;
+    } else if (trial <= kNumRandomPortsToPick) {
+      port = rand() % (65536 - 30000) + 30000;
+    } else {
+      port = 0;
+    }
+
+    if (chosen_ports.find(port) != chosen_ports.end()) {
+      continue;
+    }
+    if (!IsPortAvailable(&port, is_tcp)) {
+      continue;
+    }
+
+    CHECK_GT(port, 0);
+    if (!IsPortAvailable(&port, !is_tcp)) {
+      is_tcp = !is_tcp;
+      continue;
+    }
+
+    chosen_ports.insert(port);
+    return port;
+  }
+
+  return 0;
+}
+
+}  // namespace internal
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/posix/test.cc b/tensorflow/core/platform/posix/test.cc
index fe16a898788..f83fccaa227 100644
--- a/tensorflow/core/platform/posix/test.cc
+++ b/tensorflow/core/platform/posix/test.cc
@@ -13,16 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/test.h"
-
-#include <cstdlib>
-#include <unordered_set>
-
-#include <netinet/in.h>
 #include <signal.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <unistd.h>
+
+#include "tensorflow/core/platform/net.h"
+#include "tensorflow/core/platform/test.h"
 
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
@@ -84,101 +78,7 @@ std::unique_ptr<SubProcess> CreateSubProcess(const std::vector<string>& argv) {
   return std::unique_ptr<SubProcess>(new PosixSubProcess(argv));
 }
 
-namespace {
-bool IsPortAvailable(int* port, bool is_tcp) {
-  const int protocol = is_tcp ? IPPROTO_TCP : 0;
-  const int fd = socket(AF_INET, is_tcp ? SOCK_STREAM : SOCK_DGRAM, protocol);
-
-  struct sockaddr_in addr;
-  socklen_t addr_len = sizeof(addr);
-  int actual_port;
-
-  CHECK_GE(*port, 0);
-  CHECK_LE(*port, 65535);
-  if (fd < 0) {
-    LOG(ERROR) << "socket() failed: " << strerror(errno);
-    return false;
-  }
-
-  // SO_REUSEADDR lets us start up a server immediately after it exists.
-  int one = 1;
-  if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0) {
-    LOG(ERROR) << "setsockopt() failed: " << strerror(errno);
-    close(fd);
-    return false;
-  }
-
-  // Try binding to port.
-  addr.sin_family = AF_INET;
-  addr.sin_addr.s_addr = INADDR_ANY;
-  addr.sin_port = htons((uint16_t)*port);
-  if (bind(fd, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
-    LOG(WARNING) << "bind(port=" << *port << ") failed: " << strerror(errno);
-    close(fd);
-    return false;
-  }
-
-  // Get the bound port number.
-  if (getsockname(fd, (struct sockaddr*)&addr, &addr_len) < 0) {
-    LOG(WARNING) << "getsockname() failed: " << strerror(errno);
-    close(fd);
-    return false;
-  }
-  CHECK_LE(addr_len, sizeof(addr));
-  actual_port = ntohs(addr.sin_port);
-  CHECK_GT(actual_port, 0);
-  if (*port == 0) {
-    *port = actual_port;
-  } else {
-    CHECK_EQ(*port, actual_port);
-  }
-  close(fd);
-  return true;
-}
-
-const int kNumRandomPortsToPick = 100;
-const int kMaximumTrials = 1000;
-
-}  // namespace
-
-int PickUnusedPortOrDie() {
-  static std::unordered_set<int> chosen_ports;
-
-  // Type of port to first pick in the next iteration.
-  bool is_tcp = true;
-  int trial = 0;
-  while (true) {
-    int port;
-    trial++;
-    CHECK_LE(trial, kMaximumTrials)
-        << "Failed to pick an unused port for testing.";
-    if (trial == 1) {
-      port = getpid() % (65536 - 30000) + 30000;
-    } else if (trial <= kNumRandomPortsToPick) {
-      port = rand() % (65536 - 30000) + 30000;
-    } else {
-      port = 0;
-    }
-
-    if (chosen_ports.find(port) != chosen_ports.end()) {
-      continue;
-    }
-    if (!IsPortAvailable(&port, is_tcp)) {
-      continue;
-    }
-
-    CHECK_GT(port, 0);
-    if (!IsPortAvailable(&port, !is_tcp)) {
-      is_tcp = !is_tcp;
-      continue;
-    }
-
-    chosen_ports.insert(port);
-    return port;
-  }
-
-  return 0;
-}
+int PickUnusedPortOrDie() { return internal::PickUnusedPortOrDie(); }
 
 string TensorFlowSrcRoot() {
   // 'bazel test' sets TEST_SRCDIR, and also TEST_WORKSPACE if a new
diff --git a/tensorflow/core/protobuf/saver.proto b/tensorflow/core/protobuf/saver.proto
index b130c7343b4..c6b5e1c938a 100644
--- a/tensorflow/core/protobuf/saver.proto
+++ b/tensorflow/core/protobuf/saver.proto
@@ -6,7 +6,7 @@ option java_outer_classname = "SaverProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.util";
 
-// Protocol buffer representing the configuration of a SaveRestoreHelper.
+// Protocol buffer representing the configuration of a Saver.
 message SaverDef {
   // The name of the tensor in which to specify the filename when saving or
   // restoring a model checkpoint.
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 2c260b1a9a0..3f6fa3826a0 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -64,9 +64,10 @@ limitations under the License.
 // 8. Replace RandomCrop from C++ with pure Python (5feb2016).
 // 9. Deprecate batch_norm_with_global_normalization (16feb2016).
 // 10. Deprecate conv3d_backprop_{filter,input} (10jun2016).
+// 11. Deprecate {batch}_self_adjoint_eig (3aug2016).
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 10
+#define TF_GRAPH_DEF_VERSION 11
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //
diff --git a/tensorflow/core/util/example_proto_helper.cc b/tensorflow/core/util/example_proto_helper.cc
index f2072d7b211..397ee8bb7d8 100644
--- a/tensorflow/core/util/example_proto_helper.cc
+++ b/tensorflow/core/util/example_proto_helper.cc
@@ -230,8 +230,11 @@ Status SingleExampleProtoToTensors(
     const Tensor& default_value = feature_config.default_value;
     bool required = (default_value.NumElements() == 0);
     const auto& feature_found = feature_dict.find(key);
+    const bool feature_has_data =  // Found key & data type is set
+        (feature_found != feature_dict.end() &&
+         (feature_found->second.kind_case() != Feature::KIND_NOT_SET));
 
-    bool required_ok = (feature_found != feature_dict.end()) || !required;
+    const bool required_ok = feature_has_data || !required;
     if (!required_ok) {
       return errors::InvalidArgument("Name: ", example_name, ", Feature: ", key,
                                      " is required but could not be found.");
@@ -239,7 +242,7 @@ Status SingleExampleProtoToTensors(
 
     // Perform the FeatureDenseCopy into the output dense_values tensor (if
     // the value is present).
-    if (feature_found != feature_dict.end()) {
+    if (feature_has_data) {
       const Feature& f = feature_found->second;
       bool types_match;
       TF_RETURN_IF_ERROR(CheckTypesMatch(f, dtype, &types_match));
@@ -266,7 +269,7 @@ Status SingleExampleProtoToTensors(
     const DataType& dtype = feature_config.dtype;
     const auto& feature_found = feature_dict.find(key);
 
-    bool feature_has_data =  // Found key & data type is set
+    const bool feature_has_data =  // Found key & data type is set
         (feature_found != feature_dict.end() &&
          (feature_found->second.kind_case() != Feature::KIND_NOT_SET));
 
@@ -318,9 +321,9 @@ Status BatchExampleProtoToTensors(
     std::vector<Tensor>* output_sparse_indices_tensor,
     std::vector<Tensor>* output_sparse_values_tensor,
     std::vector<Tensor>* output_sparse_shapes_tensor) {
-  int batch_size = examples.size();
+  const int batch_size = examples.size();
 
-  bool has_names = (names.size() > 0);
+  const bool has_names = (names.size() > 0);
   if (has_names) {
     if (names.size() != examples.size()) {
       return errors::InvalidArgument(
diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h
index 4992c772848..79064e9988d 100644
--- a/tensorflow/core/util/sparse/sparse_tensor.h
+++ b/tensorflow/core/util/sparse/sparse_tensor.h
@@ -150,7 +150,7 @@ class SparseTensor {
   // Picks out the dimensions according to `dim_indices`.
   std::vector<int64> PickDims(gtl::ArraySlice<int64> dim_indices) {
     std::vector<int64> res(dim_indices.size());
-    for (int i = 0; i < dim_indices.size(); ++i) {
+    for (size_t i = 0; i < dim_indices.size(); ++i) {
       res[i] = shape_.dim_size(dim_indices[i]);
     }
     return res;
diff --git a/tensorflow/g3doc/api_docs/python/array_ops.md b/tensorflow/g3doc/api_docs/python/array_ops.md
index 19b9516e13e..9072a483c10 100644
--- a/tensorflow/g3doc/api_docs/python/array_ops.md
+++ b/tensorflow/g3doc/api_docs/python/array_ops.md
@@ -2026,6 +2026,33 @@ endian orderings will give different results.
   A `Tensor` of type `type`.
 
 
+- - -
+
+### `tf.contrib.graph_editor.copy(sgv, dst_graph=None, dst_scope='', src_scope='')` {#copy}
+
+Copy a subgraph.
+
+##### Args:
+
+
+*  <b>`sgv`</b>: the source subgraph-view. This argument is converted to a subgraph
+    using the same rules than the function subgraph.make_view.
+*  <b>`dst_graph`</b>: the destination graph.
+*  <b>`dst_scope`</b>: the destination scope.
+*  <b>`src_scope`</b>: the source scope.
+
+##### Returns:
+
+  the subgraph view of the copied subgraph.
+
+##### Raises:
+
+
+*  <b>`TypeError`</b>: if dst_graph is not a tf.Graph.
+*  <b>`StandardError`</b>: if sgv cannot be converted to a SubGraphView using
+    the same rules than the function subgraph.make_view.
+
+
 - - -
 
 ### `tf.shape_n(input, name=None)` {#shape_n}
diff --git a/tensorflow/g3doc/api_docs/python/constant_op.md b/tensorflow/g3doc/api_docs/python/constant_op.md
index d5803f925b2..50bcac8506a 100644
--- a/tensorflow/g3doc/api_docs/python/constant_op.md
+++ b/tensorflow/g3doc/api_docs/python/constant_op.md
@@ -684,3 +684,38 @@ with tf.Session() as sess2:
 *  <b>`seed`</b>: integer.
 
 
+
+## Other Functions and Classes
+- - -
+
+### `tf.contrib.graph_editor.ops(*args, **kwargs)` {#ops}
+
+Helper to select operations.
+
+##### Args:
+
+
+*  <b>`*args`</b>: list of 1) regular expressions (compiled or not) or  2) (array of)
+    tf.Operation. tf.Tensor instances are silently ignored.
+*  <b>`**kwargs`</b>: 'graph': tf.Graph in which to perform the regex query.This is
+    required when using regex.
+    'positive_filter': an elem if selected only if positive_filter(elem) is
+      True. This is optional.
+    'restrict_ops_regex': a regular expression is ignored if it doesn't start
+      with the substring "(?#ops)".
+
+##### Returns:
+
+  list of tf.Operation
+
+##### Raises:
+
+
+*  <b>`TypeError`</b>: if the optional keyword argument graph is not a tf.Graph
+    or if an argument in args is not an (array of) tf.Operation
+    or an (array of) tf.Tensor (silently ignored) or a string
+    or a regular expression.
+*  <b>`ValueError`</b>: if one of the keyword arguments is unexpected or if a regular
+    expression is used without passing a graph as a keyword argument.
+
+
diff --git a/tensorflow/g3doc/api_docs/python/contrib.distributions.md b/tensorflow/g3doc/api_docs/python/contrib.distributions.md
index d48816ba7d2..78a0fb390e9 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.distributions.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.distributions.md
@@ -352,6 +352,412 @@ Variance of the distribution.
 
 ### Univariate (scalar) distributions
 
+- - -
+
+### `class tf.contrib.distributions.Binomial` {#Binomial}
+
+Binomial distribution.
+
+This distribution is parameterized by a vector `p` of probabilities and `n`,
+the total counts.
+
+#### Mathematical details
+
+The Binomial is a distribution over the number of successes in `n` independent
+trials, with each trial having the same probability of success `p`.
+The probability mass function (pmf):
+
+```pmf(k) = n! / (k! * (n - k)!) * (p)^k * (1 - p)^(n - k)```
+
+#### Examples
+
+Create a single distribution, corresponding to 5 coin flips.
+
+```python
+dist = Binomial(n=5., p=.5)
+```
+
+Create a single distribution (using logits), corresponding to 5 coin flips.
+
+```python
+dist = Binomial(n=5., logits=0.)
+```
+
+Creates 3 distributions with the third distribution most likely to have
+successes.
+
+```python
+p = [.2, .3, .8]
+# n will be broadcast to [4., 4., 4.], to match p.
+dist = Binomial(n=4., p=p)
+```
+
+The distribution functions can be evaluated on counts.
+
+```python
+# counts same shape as p.
+counts = [1., 2, 3]
+dist.prob(counts)  # Shape [3]
+
+# p will be broadcast to [[.2, .3, .8], [.2, .3, .8]] to match counts.
+counts = [[1., 2, 1], [2, 2, 4]]
+dist.prob(counts)  # Shape [2, 3]
+
+# p will be broadcast to shape [5, 7, 3] to match counts.
+counts = [[...]]  # Shape [5, 7, 3]
+dist.prob(counts)  # Shape [5, 7, 3]
+```
+- - -
+
+#### `tf.contrib.distributions.Binomial.__init__(n, logits=None, p=None, validate_args=True, allow_nan_stats=False, name='Binomial')` {#Binomial.__init__}
+
+Initialize a batch of Binomial distributions.
+
+##### Args:
+
+
+*  <b>`n`</b>: Non-negative floating point tensor with shape broadcastable to
+    `[N1,..., Nm]` with `m >= 0` and the same dtype as `p` or `logits`.
+    Defines this as a batch of `N1 x ... x Nm` different Binomial
+    distributions. Its components should be equal to integer values.
+*  <b>`logits`</b>: Floating point tensor representing the log-odds of a
+    positive event with shape broadcastable to `[N1,..., Nm]` `m >= 0`, and
+    the same dtype as `n`. Each entry represents logits for the probability
+    of success for independent Binomial distributions.
+*  <b>`p`</b>: Positive floating point tensor with shape broadcastable to
+    `[N1,..., Nm]` `m >= 0`, `p in [0, 1]`. Each entry represents the
+    probability of success for independent Binomial distributions.
+*  <b>`validate_args`</b>: Whether to assert valid values for parameters `n` and `p`,
+    and `x` in `prob` and `log_prob`.  If `False`, correct behavior is not
+    guaranteed.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
+*  <b>`name`</b>: The name to prefix Ops created by this distribution class.
+
+
+*  <b>`Examples`</b>: 
+
+```python
+# Define 1-batch of a binomial distribution.
+dist = Binomial(n=2., p=.9)
+
+# Define a 2-batch.
+dist = Binomial(n=[4., 5], p=[.1, .3])
+```
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.allow_nan_stats` {#Binomial.allow_nan_stats}
+
+Boolean describing behavior when a stat is undefined for batch member.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.batch_shape(name='batch_shape')` {#Binomial.batch_shape}
+
+Batch dimensions of this instance as a 1-D int32 `Tensor`.
+
+The product of the dimensions of the `batch_shape` is the number of
+independent distributions of this kind the instance represents.
+
+##### Args:
+
+
+*  <b>`name`</b>: name to give to the op
+
+##### Returns:
+
+  `Tensor` `batch_shape`
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.cdf(value, name='cdf')` {#Binomial.cdf}
+
+Cumulative distribution function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.dtype` {#Binomial.dtype}
+
+dtype of samples from this distribution.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.entropy(name='entropy')` {#Binomial.entropy}
+
+Entropy of the distribution in nats.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.event_shape(name='event_shape')` {#Binomial.event_shape}
+
+Shape of a sample from a single distribution as a 1-D int32 `Tensor`.
+
+##### Args:
+
+
+*  <b>`name`</b>: name to give to the op
+
+##### Returns:
+
+  `Tensor` `event_shape`
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.get_batch_shape()` {#Binomial.get_batch_shape}
+
+`TensorShape` available at graph construction time.
+
+Same meaning as `batch_shape`. May be only partially defined.
+
+##### Returns:
+
+  batch shape
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.get_event_shape()` {#Binomial.get_event_shape}
+
+`TensorShape` available at graph construction time.
+
+Same meaning as `event_shape`. May be only partially defined.
+
+##### Returns:
+
+  event shape
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.is_continuous` {#Binomial.is_continuous}
+
+
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.is_reparameterized` {#Binomial.is_reparameterized}
+
+
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.log_cdf(value, name='log_cdf')` {#Binomial.log_cdf}
+
+Log CDF.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.log_pdf(value, name='log_pdf')` {#Binomial.log_pdf}
+
+Log of the probability density function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.log_pmf(value, name='log_pmf')` {#Binomial.log_pmf}
+
+Log of the probability mass function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.log_prob(counts, name='log_prob')` {#Binomial.log_prob}
+
+`Log(P[counts])`, computed for every batch member.
+
+For each batch member of counts `k`, `P[counts]` is the probability that
+after sampling `n` draws from this Binomial distribution, the number of
+successes is `k`.  Note that different sequences of draws can result in the
+same counts, thus the probability includes a combinatorial coefficient.
+
+##### Args:
+
+
+*  <b>`counts`</b>: Non-negative tensor with dtype `dtype` and whose shape can be
+    broadcast with `self.p` and `self.n`. `counts` is only legal if it is
+    less than or equal to `n` and its components are equal to integer
+    values.
+*  <b>`name`</b>: Name to give this Op, defaults to "log_prob".
+
+##### Returns:
+
+  Log probabilities for each record, shape `[N1,...,Nm]`.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.logits` {#Binomial.logits}
+
+Log-odds.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.mean(name='mean')` {#Binomial.mean}
+
+Mean of the distribution.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.mode(name='mode')` {#Binomial.mode}
+
+Mode of the distribution.
+
+Note that when `(n + 1) * p` is an integer, there are actually two modes.
+Namely, `(n + 1) * p` and `(n + 1) * p - 1` are both modes. Here we return
+only the larger of the two modes.
+
+##### Args:
+
+
+*  <b>`name`</b>: The name for this op.
+
+##### Returns:
+
+  The mode of the Binomial distribution.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.n` {#Binomial.n}
+
+Number of trials.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.name` {#Binomial.name}
+
+Name to prepend to all ops.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.p` {#Binomial.p}
+
+Probability of success.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.pdf(value, name='pdf')` {#Binomial.pdf}
+
+The probability density function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.pmf(value, name='pmf')` {#Binomial.pmf}
+
+The probability mass function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.prob(counts, name='prob')` {#Binomial.prob}
+
+`P[counts]`, computed for every batch member.
+
+
+For each batch member of counts `k`, `P[counts]` is the probability that
+after sampling `n` draws from this Binomial distribution, the number of
+successes is `k`.  Note that different sequences of draws can result in the
+same counts, thus the probability includes a combinatorial coefficient.
+
+##### Args:
+
+
+*  <b>`counts`</b>: Non-negative tensor with dtype `dtype` and whose shape can be
+    broadcast with `self.p` and `self.n`. `counts` is only legal if it is
+    less than or equal to `n` and its components are equal to integer
+    values.
+*  <b>`name`</b>: Name to give this Op, defaults to "prob".
+
+##### Returns:
+
+  Probabilities for each record, shape `[N1,...,Nm]`.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.sample(sample_shape=(), seed=None, name='sample')` {#Binomial.sample}
+
+Generate samples of the specified shape for each batched distribution.
+
+Note that a call to `sample()` without arguments will generate a single
+sample per batched distribution.
+
+##### Args:
+
+
+*  <b>`sample_shape`</b>: `int32` `Tensor` or tuple or list. Shape of the generated
+    samples.
+*  <b>`seed`</b>: Python integer seed for RNG
+*  <b>`name`</b>: name to give to the op.
+
+##### Returns:
+
+
+*  <b>`samples`</b>: a `Tensor` of dtype `self.dtype` and shape
+      `sample_shape + self.batch_shape + self.event_shape`.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.sample_n(n, seed=None, name='sample_n')` {#Binomial.sample_n}
+
+Generate `n` samples.
+
+##### Args:
+
+
+*  <b>`n`</b>: scalar. Number of samples to draw from each distribution.
+*  <b>`seed`</b>: Python integer seed for RNG
+*  <b>`name`</b>: name to give to the op.
+
+##### Returns:
+
+
+*  <b>`samples`</b>: a `Tensor` of shape `(n,) + self.batch_shape + self.event_shape`
+      with values of type `self.dtype`.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.std(name='std')` {#Binomial.std}
+
+Standard deviation of the distribution.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.validate_args` {#Binomial.validate_args}
+
+Boolean describing behavior on invalid input.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.variance(name='variance')` {#Binomial.variance}
+
+Variance of the distribution.
+
+
+
 - - -
 
 ### `class tf.contrib.distributions.Bernoulli` {#Bernoulli}
@@ -360,10 +766,6 @@ Bernoulli distribution.
 
 The Bernoulli distribution is parameterized by p, the probability of a
 positive event.
-
-Note, the following methods of the base class aren't implemented:
-  * cdf
-  * log_cdf
 - - -
 
 #### `tf.contrib.distributions.Bernoulli.__init__(logits=None, p=None, dtype=tf.int32, validate_args=True, allow_nan_stats=False, name='Bernoulli')` {#Bernoulli.__init__}
@@ -383,10 +785,10 @@ Construct Bernoulli distributions.
 *  <b>`dtype`</b>: dtype for samples.
 *  <b>`validate_args`</b>: Whether to assert that `0 <= p <= 1`. If not validate_args,
    `log_pmf` may return nans.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: A name for this distribution.
 
 ##### Raises:
@@ -767,20 +1169,20 @@ Initialize a batch of Beta distributions.
 ##### Args:
 
 
-*  <b>`a`</b>: Positive `float` or `double` tensor with shape broadcastable to
+*  <b>`a`</b>: Positive floating point tensor with shape broadcastable to
     `[N1,..., Nm]` `m >= 0`.  Defines this as a batch of `N1 x ... x Nm`
      different Beta distributions. This also defines the
      dtype of the distribution.
-*  <b>`b`</b>: Positive `float` or `double` tensor with shape broadcastable to
+*  <b>`b`</b>: Positive floating point tensor with shape broadcastable to
     `[N1,..., Nm]` `m >= 0`.  Defines this as a batch of `N1 x ... x Nm`
      different Beta distributions.
 *  <b>`validate_args`</b>: Whether to assert valid values for parameters `a` and `b`,
-    and `x` in `prob` and `log_prob`.  If False, correct behavior is not
+    and `x` in `prob` and `log_prob`.  If `False`, correct behavior is not
     guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to prefix Ops created by this distribution class.
 
 
@@ -942,7 +1344,7 @@ Log of the probability mass function.
 ##### Args:
 
 
-*  <b>`x`</b>: Non-negative `float` or `double`, tensor whose shape can
+*  <b>`x`</b>: Non-negative floating point tensor whose shape can
     be broadcast with `self.a` and `self.b`.  For fixed leading
     dimensions, the last dimension represents counts for the corresponding
     Beta distribution in `self.a` and `self.b`. `x` is only legal if
@@ -1012,7 +1414,7 @@ The probability mass function.
 ##### Args:
 
 
-*  <b>`x`</b>: Non-negative `float`, `double` tensor whose shape can
+*  <b>`x`</b>: Non-negative floating point tensor whose shape can
     be broadcast with `self.a` and `self.b`.  For fixed leading
     dimensions, the last dimension represents x for the corresponding Beta
     distribution in `self.a` and `self.b`. `x` is only legal if is
@@ -1098,11 +1500,6 @@ Categorical distribution.
 
 The categorical distribution is parameterized by the log-probabilities
 of a set of classes.
-
-Note, the following methods of the base class aren't implemented:
-  * mean
-  * cdf
-  * log_cdf
 - - -
 
 #### `tf.contrib.distributions.Categorical.__init__(logits, dtype=tf.int32, validate_args=True, allow_nan_stats=False, name='Categorical')` {#Categorical.__init__}
@@ -1118,10 +1515,10 @@ Initialize Categorical distributions using class log-probabilities.
       indexes into the classes.
 *  <b>`dtype`</b>: The type of the event samples (default: int32).
 *  <b>`validate_args`</b>: Unused in this distribution.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: A name for this distribution (optional).
 
 
@@ -1385,15 +1782,15 @@ Construct Chi2 distributions with parameter `df`.
 ##### Args:
 
 
-*  <b>`df`</b>: `float` or `double` tensor, the degrees of freedom of the
+*  <b>`df`</b>: Floating point tensor, the degrees of freedom of the
     distribution(s).  `df` must contain only positive values.
 *  <b>`validate_args`</b>: Whether to assert that `df > 0`, and that `x > 0` in the
-    methods `prob(x)` and `log_prob(x)`. If `validate_args` is False
+    methods `prob(x)` and `log_prob(x)`. If `validate_args` is `False`
     and the inputs are invalid, correct behavior is not guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to prepend to all ops created by this distribution.
 
 
@@ -1767,15 +2164,15 @@ Construct Exponential distribution with parameter `lam`.
 ##### Args:
 
 
-*  <b>`lam`</b>: `float` or `double` tensor, the rate of the distribution(s).
+*  <b>`lam`</b>: Floating point tensor, the rate of the distribution(s).
     `lam` must contain only positive values.
 *  <b>`validate_args`</b>: Whether to assert that `lam > 0`, and that `x > 0` in the
-    methods `prob(x)` and `log_prob(x)`.  If `validate_args` is False
+    methods `prob(x)` and `log_prob(x)`.  If `validate_args` is `False`
     and the inputs are invalid, correct behavior is not guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member. If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to prepend to all ops created by this distribution.
 
 
@@ -2161,19 +2558,19 @@ broadcasting (e.g. `alpha + beta` is a valid operation).
 ##### Args:
 
 
-*  <b>`alpha`</b>: `float` or `double` tensor, the shape params of the
+*  <b>`alpha`</b>: Floating point tensor, the shape params of the
     distribution(s).
     alpha must contain only positive values.
-*  <b>`beta`</b>: `float` or `double` tensor, the inverse scale params of the
+*  <b>`beta`</b>: Floating point tensor, the inverse scale params of the
     distribution(s).
     beta must contain only positive values.
 *  <b>`validate_args`</b>: Whether to assert that `a > 0, b > 0`, and that `x > 0` in
-    the methods `prob(x)` and `log_prob(x)`.  If `validate_args` is False
+    the methods `prob(x)` and `log_prob(x)`.  If `validate_args` is `False`
     and the inputs are invalid, correct behavior is not guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to prepend to all ops created by this distribution.
 
 ##### Raises:
@@ -2560,18 +2957,18 @@ broadcasting (e.g. `alpha + beta` is a valid operation).
 ##### Args:
 
 
-*  <b>`alpha`</b>: `float` or `double` tensor, the shape params of the
+*  <b>`alpha`</b>: Floating point tensor, the shape params of the
     distribution(s).
     alpha must contain only positive values.
-*  <b>`beta`</b>: `float` or `double` tensor, the scale params of the distribution(s).
+*  <b>`beta`</b>: Floating point tensor, the scale params of the distribution(s).
     beta must contain only positive values.
 *  <b>`validate_args`</b>: Whether to assert that `a > 0, b > 0`, and that `x > 0` in
-    the methods `prob(x)` and `log_prob(x)`.  If `validate_args` is False
+    the methods `prob(x)` and `log_prob(x)`.  If `validate_args` is `False`
     and the inputs are invalid, correct behavior is not guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to prepend to all ops created by this distribution.
 
 ##### Raises:
@@ -2972,17 +3369,17 @@ broadcasting (e.g., `loc / scale` is a valid operation).
 ##### Args:
 
 
-*  <b>`loc`</b>: `float` or `double` tensor which characterizes the location (center)
+*  <b>`loc`</b>: Floating point tensor which characterizes the location (center)
     of the distribution.
-*  <b>`scale`</b>: `float` or `double`, positive-valued tensor which characterzes the
-    spread of the distribution.
+*  <b>`scale`</b>: Positive floating point tensor which characterizes the spread of
+    the distribution.
 *  <b>`validate_args`</b>: Whether to validate input with asserts.  If `validate_args`
     is `False`, and the inputs are invalid, correct behavior is not
     guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to give Ops created by the initializer.
 
 ##### Raises:
@@ -3363,15 +3760,15 @@ broadcasting (e.g. `mu + sigma` is a valid operation).
 ##### Args:
 
 
-*  <b>`mu`</b>: `float` or `double` tensor, the means of the distribution(s).
-*  <b>`sigma`</b>: `float` or `double` tensor, the stddevs of the distribution(s).
+*  <b>`mu`</b>: Floating point tensor, the means of the distribution(s).
+*  <b>`sigma`</b>: Floating point tensor, the stddevs of the distribution(s).
     sigma must contain only positive values.
 *  <b>`validate_args`</b>: Whether to assert that `sigma > 0`. If `validate_args` is
-    False, correct output is not guaranteed when input is invalid.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+    `False`, correct output is not guaranteed when input is invalid.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to give Ops created by the initializer.
 
 ##### Raises:
@@ -3750,19 +4147,19 @@ broadcasting (e.g. `df + mu + sigma` is a valid operation).
 ##### Args:
 
 
-*  <b>`df`</b>: `float` or `double` tensor, the degrees of freedom of the
+*  <b>`df`</b>: Floating point tensor, the degrees of freedom of the
     distribution(s). `df` must contain only positive values.
-*  <b>`mu`</b>: `float` or `double` tensor, the means of the distribution(s).
-*  <b>`sigma`</b>: `float` or `double` tensor, the scaling factor for the
+*  <b>`mu`</b>: Floating point tensor, the means of the distribution(s).
+*  <b>`sigma`</b>: Floating point tensor, the scaling factor for the
     distribution(s). `sigma` must contain only positive values.
     Note that `sigma` is not the standard deviation of this distribution.
 *  <b>`validate_args`</b>: Whether to assert that `df > 0, sigma > 0`. If
-    `validate_args` is False and inputs are invalid, correct behavior is not
-    guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+    `validate_args` is `False` and inputs are invalid, correct behavior is
+    not guaranteed.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to give Ops created by the initializer.
 
 ##### Raises:
@@ -4102,14 +4499,14 @@ u1 = Uniform(3.0, [5.0, 6.0, 7.0])  # 3 distributions
 ##### Args:
 
 
-*  <b>`a`</b>: `float` or `double` tensor, the minimum endpoint.
-*  <b>`b`</b>: `float` or `double` tensor, the maximum endpoint. Must be > `a`.
-*  <b>`validate_args`</b>: Whether to assert that `a > b`. If `validate_args` is False
-    and inputs are invalid, correct behavior is not guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`a`</b>: Floating point tensor, the minimum endpoint.
+*  <b>`b`</b>: Floating point tensor, the maximum endpoint. Must be > `a`.
+*  <b>`validate_args`</b>: Whether to assert that `a > b`. If `validate_args` is
+    `False` and inputs are invalid, correct behavior is not guaranteed.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to prefix Ops created by this distribution class.
 
 ##### Raises:
@@ -4446,7 +4843,7 @@ The mean of `X_i` is `mu[i]`, and the standard deviation is `diag_stdev[i]`.
 ##### Args:
 
 
-*  <b>`mu`</b>: Rank `N + 1` `float` or `double` tensor with shape `[N1,...,Nb, k]`,
+*  <b>`mu`</b>: Rank `N + 1` floating point tensor with shape `[N1,...,Nb, k]`,
     `b >= 0`.
 *  <b>`diag_stdev`</b>: Rank `N + 1` `Tensor` with same `dtype` and shape as `mu`,
     representing the standard deviations.  Must be positive.
@@ -4803,7 +5200,7 @@ User must provide means `mu` and `sigma`, the mean and covariance.
 ##### Args:
 
 
-*  <b>`mu`</b>: `(N+1)-D`  `float` or `double` tensor with shape `[N1,...,Nb, k]`,
+*  <b>`mu`</b>: `(N+1)-D` floating point tensor with shape `[N1,...,Nb, k]`,
     `b >= 0`.
 *  <b>`sigma`</b>: `(N+2)-D` `Tensor` with same `dtype` as `mu` and shape
     `[N1,...,Nb, k, k]`.  Each batch member must be positive definite.
@@ -5168,7 +5565,7 @@ factors, such that the covariance of each batch member is `chol chol^T`.
 ##### Args:
 
 
-*  <b>`mu`</b>: `(N+1)-D`  `float` or `double` tensor with shape `[N1,...,Nb, k]`,
+*  <b>`mu`</b>: `(N+1)-D` floating point tensor with shape `[N1,...,Nb, k]`,
     `b >= 0`.
 *  <b>`chol`</b>: `(N+2)-D` `Tensor` with same `dtype` as `mu` and shape
     `[N1,...,Nb, k, k]`.  The upper triangular part is ignored (treated as
@@ -5605,16 +6002,16 @@ Initialize a batch of Dirichlet distributions.
 ##### Args:
 
 
-*  <b>`alpha`</b>: Positive `float` or `double` tensor with shape broadcastable to
+*  <b>`alpha`</b>: Positive floating point tensor with shape broadcastable to
     `[N1,..., Nm, k]` `m >= 0`.  Defines this as a batch of `N1 x ... x Nm`
      different `k` class Dirichlet distributions.
 *  <b>`validate_args`</b>: Whether to assert valid values for parameters `alpha` and
-    `x` in `prob` and `log_prob`.  If False, correct behavior is not
+    `x` in `prob` and `log_prob`.  If `False`, correct behavior is not
     guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to prefix Ops created by this distribution class.
 
 
@@ -5770,7 +6167,7 @@ Log of the probability mass function.
 ##### Args:
 
 
-*  <b>`x`</b>: Non-negative `float` or `double`, tensor whose shape can
+*  <b>`x`</b>: Non-negative tensor with dtype `dtype` and whose shape can
     be broadcast with `self.alpha`.  For fixed leading dimensions, the last
     dimension represents counts for the corresponding Dirichlet distribution
     in `self.alpha`. `x` is only legal if it sums up to one.
@@ -5839,7 +6236,7 @@ The probability mass function.
 ##### Args:
 
 
-*  <b>`x`</b>: Non-negative `float`, `double` tensor whose shape can
+*  <b>`x`</b>: Non-negative tensor with dtype `dtype` and whose shape can
     be broadcast with `self.alpha`.  For fixed leading dimensions, the last
     dimension represents x for the corresponding Dirichlet distribution in
     `self.alpha` and `self.beta`. `x` is only legal if it sums up to one.
@@ -5996,22 +6393,22 @@ Initialize a batch of DirichletMultinomial distributions.
 ##### Args:
 
 
-*  <b>`n`</b>: Non-negative `float` or `double` tensor, whose dtype is the same as
+*  <b>`n`</b>: Non-negative floating point tensor, whose dtype is the same as
     `alpha`. The shape is broadcastable to `[N1,..., Nm]` with `m >= 0`.
     Defines this as a batch of `N1 x ... x Nm` different Dirichlet
-    multinomial distributions. Its components should be equal to integral
+    multinomial distributions. Its components should be equal to integer
     values.
-*  <b>`alpha`</b>: Positive `float` or `double` tensor, whose dtype is the same as
+*  <b>`alpha`</b>: Positive floating point tensor, whose dtype is the same as
     `n` with shape broadcastable to `[N1,..., Nm, k]` `m >= 0`.  Defines
     this as a batch of `N1 x ... x Nm` different `k` class Dirichlet
     multinomial distributions.
 *  <b>`validate_args`</b>: Whether to assert valid values for parameters `alpha` and
-    `n`, and `x` in `prob` and `log_prob`.  If False, correct behavior is
+    `n`, and `x` in `prob` and `log_prob`.  If `False`, correct behavior is
     not guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to prefix Ops created by this distribution class.
 
 
@@ -6173,12 +6570,11 @@ probability includes a combinatorial coefficient.
 ##### Args:
 
 
-*  <b>`counts`</b>: Non-negative `float` or `double` tensor whose dtype is the same
-    `self` and whose shape can be broadcast with `self.alpha`.  For fixed
-    leading dimensions, the last dimension represents counts for the
-    corresponding Dirichlet Multinomial distribution in `self.alpha`.
-    `counts` is only legal if it sums up to `n` and its components are
-    equal to integral values.
+*  <b>`counts`</b>: Non-negative tensor with dtype `dtype` and whose shape can be
+    broadcast with `self.alpha`.  For fixed leading dimensions, the last
+    dimension represents counts for the corresponding Dirichlet Multinomial
+    distribution in `self.alpha`. `counts` is only legal if it sums up to
+    `n` and its components are equal to integer values.
 *  <b>`name`</b>: Name to give this Op, defaults to "log_prob".
 
 ##### Returns:
@@ -6243,12 +6639,11 @@ probability includes a combinatorial coefficient.
 ##### Args:
 
 
-*  <b>`counts`</b>: Non-negative `float` or `double` tensor whose dtype is the same
-    `self` and whose shape can be broadcast with `self.alpha`.  For fixed
-    leading dimensions, the last dimension represents counts for the
-    corresponding Dirichlet Multinomial distribution in `self.alpha`.
-    `counts` is only legal if it sums up to `n` and its components are
-    equal to integral values.
+*  <b>`counts`</b>: Non-negative tensor with dtype `dtype` and whose shape can be
+    broadcast with `self.alpha`.  For fixed leading dimensions, the last
+    dimension represents counts for the corresponding Dirichlet Multinomial
+    distribution in `self.alpha`. `counts` is only legal if it sums up to
+    `n` and its components are equal to integer values.
 *  <b>`name`</b>: Name to give this Op, defaults to "prob".
 
 ##### Returns:
@@ -6347,6 +6742,413 @@ Cov(X_i, X_j) = -n * alpha_i * alpha_j / alpha_0 ** 2 *
 
 
 
+- - -
+
+### `class tf.contrib.distributions.Multinomial` {#Multinomial}
+
+Multinomial distribution.
+
+This distribution is parameterized by a vector `p` of probability
+parameters for `k` classes and `n`, the counts per each class..
+
+#### Mathematical details
+
+The Multinomial is a distribution over k-class count data, meaning
+for each k-tuple of non-negative integer `counts = [n_1,...,n_k]`, we have a
+probability of these draws being made from the distribution.  The distribution
+has hyperparameters `p = (p_1,...,p_k)`, and probability mass
+function (pmf):
+
+```pmf(counts) = n! / (n_1!...n_k!) * (p_1)^n_1*(p_2)^n_2*...(p_k)^n_k```
+
+where above `n = sum_j n_j`, `n!` is `n` factorial.
+
+#### Examples
+
+Create a 3-class distribution, with the 3rd class is most likely to be drawn,
+using logits..
+
+```python
+logits = [-50., -43, 0]
+dist = Multinomial(n=4., logits=logits)
+```
+
+Create a 3-class distribution, with the 3rd class is most likely to be drawn.
+
+```python
+p = [.2, .3, .5]
+dist = Multinomial(n=4., p=p)
+```
+
+The distribution functions can be evaluated on counts.
+
+```python
+# counts same shape as p.
+counts = [1., 0, 3]
+dist.prob(counts)  # Shape []
+
+# p will be broadcast to [[.2, .3, .5], [.2, .3, .5]] to match counts.
+counts = [[1., 2, 1], [2, 2, 0]]
+dist.prob(counts)  # Shape [2]
+
+# p will be broadcast to shape [5, 7, 3] to match counts.
+counts = [[...]]  # Shape [5, 7, 3]
+dist.prob(counts)  # Shape [5, 7]
+```
+
+Create a 2-batch of 3-class distributions.
+
+```python
+p = [[.1, .2, .7], [.3, .3, .4]]  # Shape [2, 3]
+dist = Multinomial(n=[4., 5], p=p)
+
+counts = [[2., 1, 1], [3, 1, 1]]
+dist.prob(counts)  # Shape [2]
+```
+- - -
+
+#### `tf.contrib.distributions.Multinomial.__init__(n, logits=None, p=None, validate_args=True, allow_nan_stats=False, name='Multinomial')` {#Multinomial.__init__}
+
+Initialize a batch of Multinomial distributions.
+
+##### Args:
+
+
+*  <b>`n`</b>: Non-negative floating point tensor with shape broadcastable to
+    `[N1,..., Nm]` with `m >= 0`. Defines this as a batch of
+    `N1 x ... x Nm` different Multinomial distributions.  Its components
+    should be equal to integer values.
+*  <b>`logits`</b>: Floating point tensor representing the log-odds of a
+    positive event with shape broadcastable to `[N1,..., Nm, k], m >= 0`,
+    and the same dtype as `n`. Defines this as a batch of `N1 x ... x Nm`
+    different `k` class Multinomial distributions.
+*  <b>`p`</b>: Positive floating point tensor with shape broadcastable to
+    `[N1,..., Nm, k]` `m >= 0` and same dtype as `n`.  Defines this as
+    a batch of `N1 x ... x Nm` different `k` class Multinomial
+    distributions. `p`'s components in the last portion of its shape should
+    sum up to 1.
+*  <b>`validate_args`</b>: Whether to assert valid values for parameters `n` and `p`,
+    and `x` in `prob` and `log_prob`.  If `False`, correct behavior is not
+    guaranteed.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
+*  <b>`name`</b>: The name to prefix Ops created by this distribution class.
+
+
+*  <b>`Examples`</b>: 
+
+```python
+# Define 1-batch of 2-class multinomial distribution,
+# also known as a Binomial distribution.
+dist = Multinomial(n=2., p=[.1, .9])
+
+# Define a 2-batch of 3-class distributions.
+dist = Multinomial(n=[4., 5], p=[[.1, .3, .6], [.4, .05, .55]])
+```
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.allow_nan_stats` {#Multinomial.allow_nan_stats}
+
+Boolean describing behavior when a stat is undefined for batch member.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.batch_shape(name='batch_shape')` {#Multinomial.batch_shape}
+
+Batch dimensions of this instance as a 1-D int32 `Tensor`.
+
+The product of the dimensions of the `batch_shape` is the number of
+independent distributions of this kind the instance represents.
+
+##### Args:
+
+
+*  <b>`name`</b>: name to give to the op
+
+##### Returns:
+
+  `Tensor` `batch_shape`
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.cdf(value, name='cdf')` {#Multinomial.cdf}
+
+Cumulative distribution function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.dtype` {#Multinomial.dtype}
+
+dtype of samples from this distribution.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.entropy(name='entropy')` {#Multinomial.entropy}
+
+Entropy of the distribution in nats.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.event_shape(name='event_shape')` {#Multinomial.event_shape}
+
+Shape of a sample from a single distribution as a 1-D int32 `Tensor`.
+
+##### Args:
+
+
+*  <b>`name`</b>: name to give to the op
+
+##### Returns:
+
+  `Tensor` `event_shape`
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.get_batch_shape()` {#Multinomial.get_batch_shape}
+
+`TensorShape` available at graph construction time.
+
+Same meaning as `batch_shape`. May be only partially defined.
+
+##### Returns:
+
+  batch shape
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.get_event_shape()` {#Multinomial.get_event_shape}
+
+`TensorShape` available at graph construction time.
+
+Same meaning as `event_shape`. May be only partially defined.
+
+##### Returns:
+
+  event shape
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.is_continuous` {#Multinomial.is_continuous}
+
+
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.is_reparameterized` {#Multinomial.is_reparameterized}
+
+
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.log_cdf(value, name='log_cdf')` {#Multinomial.log_cdf}
+
+Log CDF.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.log_pdf(value, name='log_pdf')` {#Multinomial.log_pdf}
+
+Log of the probability density function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.log_pmf(value, name='log_pmf')` {#Multinomial.log_pmf}
+
+Log of the probability mass function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.log_prob(counts, name='log_prob')` {#Multinomial.log_prob}
+
+`Log(P[counts])`, computed for every batch member.
+
+For each batch of counts `[n_1,...,n_k]`, `P[counts]` is the probability
+that after sampling `n` draws from this Multinomial distribution, the
+number of draws falling in class `j` is `n_j`.  Note that different
+sequences of draws can result in the same counts, thus the probability
+includes a combinatorial coefficient.
+
+##### Args:
+
+
+*  <b>`counts`</b>: Non-negative tensor with dtype `dtype` and whose shape can
+    be broadcast with `self.p` and `self.n`.  For fixed leading dimensions,
+    the last dimension represents counts for the corresponding Multinomial
+    distribution in `self.p`. `counts` is only legal if it sums up to `n`
+    and its components are equal to integer values.
+*  <b>`name`</b>: Name to give this Op, defaults to "log_prob".
+
+##### Returns:
+
+  Log probabilities for each record, shape `[N1,...,Nm]`.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.logits` {#Multinomial.logits}
+
+Log-odds.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.mean(name='mean')` {#Multinomial.mean}
+
+Mean of the distribution.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.mode(name='mode')` {#Multinomial.mode}
+
+Mode of the distribution.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.n` {#Multinomial.n}
+
+Number of trials.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.name` {#Multinomial.name}
+
+Name to prepend to all ops.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.p` {#Multinomial.p}
+
+Event probabilities.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.pdf(value, name='pdf')` {#Multinomial.pdf}
+
+The probability density function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.pmf(value, name='pmf')` {#Multinomial.pmf}
+
+The probability mass function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.prob(counts, name='prob')` {#Multinomial.prob}
+
+`P[counts]`, computed for every batch member.
+
+For each batch of counts `[n_1,...,n_k]`, `P[counts]` is the probability
+that after sampling `n` draws from this Multinomial distribution, the
+number of draws falling in class `j` is `n_j`.  Note that different
+sequences of draws can result in the same counts, thus the probability
+includes a combinatorial coefficient.
+
+##### Args:
+
+
+*  <b>`counts`</b>: Non-negative tensor with dtype `dtype` and whose shape can
+    be broadcast with `self.p` and `self.n`.  For fixed leading dimensions,
+    the last dimension represents counts for the corresponding Multinomial
+    distribution in `self.p`. `counts` is only legal if it sums up to `n`
+    and its components are equal to integer values.
+*  <b>`name`</b>: Name to give this Op, defaults to "prob".
+
+##### Returns:
+
+  Probabilities for each record, shape `[N1,...,Nm]`.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.sample(sample_shape=(), seed=None, name='sample')` {#Multinomial.sample}
+
+Generate samples of the specified shape for each batched distribution.
+
+Note that a call to `sample()` without arguments will generate a single
+sample per batched distribution.
+
+##### Args:
+
+
+*  <b>`sample_shape`</b>: `int32` `Tensor` or tuple or list. Shape of the generated
+    samples.
+*  <b>`seed`</b>: Python integer seed for RNG
+*  <b>`name`</b>: name to give to the op.
+
+##### Returns:
+
+
+*  <b>`samples`</b>: a `Tensor` of dtype `self.dtype` and shape
+      `sample_shape + self.batch_shape + self.event_shape`.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.sample_n(n, seed=None, name='sample_n')` {#Multinomial.sample_n}
+
+Generate `n` samples.
+
+##### Args:
+
+
+*  <b>`n`</b>: scalar. Number of samples to draw from each distribution.
+*  <b>`seed`</b>: Python integer seed for RNG
+*  <b>`name`</b>: name to give to the op.
+
+##### Returns:
+
+
+*  <b>`samples`</b>: a `Tensor` of shape `(n,) + self.batch_shape + self.event_shape`
+      with values of type `self.dtype`.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.std(name='std')` {#Multinomial.std}
+
+Standard deviation of the distribution.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.validate_args` {#Multinomial.validate_args}
+
+Boolean describing behavior on invalid input.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.variance(name='variance')` {#Multinomial.variance}
+
+Variance of the distribution.
+
+
+
 
 ### Transformed distributions
 
@@ -6847,9 +7649,9 @@ Get the KL-divergence KL(dist_a || dist_b).
 
 *  <b>`dist_a`</b>: instance of distributions.Distribution.
 *  <b>`dist_b`</b>: instance of distributions.Distribution.
-*  <b>`allow_nan`</b>: If False (default), a runtime error is raised
+*  <b>`allow_nan`</b>: If `False` (default), a runtime error is raised
     if the KL returns NaN values for any batch entry of the given
-    distributions.  If True, the KL may return a NaN for the given entry.
+    distributions.  If `True`, the KL may return a NaN for the given entry.
 *  <b>`name`</b>: (optional) Name scope to use for created operations.
 
 ##### Returns:
@@ -7059,13 +7861,13 @@ D = is diagonal (r x r), optional (defaults to identity).
 ##### Args:
 
 
-*  <b>`mu`</b>: Rank `n + 1` `float` or `double` tensor with shape `[N1,...,Nn, k]`,
+*  <b>`mu`</b>: Rank `n + 1` floating point tensor with shape `[N1,...,Nn, k]`,
     `n >= 0`.  The means.
-*  <b>`diag_large`</b>: Optional rank `n + 1` `float` or `double` tensor, shape
+*  <b>`diag_large`</b>: Optional rank `n + 1` floating point tensor, shape
     `[N1,...,Nn, k]` `n >= 0`.  Defines the diagonal matrix `M`.
-*  <b>`v`</b>: Rank `n + 1` `float` or `double` tensor, shape `[N1,...,Nn, k, r]`
+*  <b>`v`</b>: Rank `n + 1` floating point tensor, shape `[N1,...,Nn, k, r]`
     `n >= 0`.  Defines the matrix `V`.
-*  <b>`diag_small`</b>: Rank `n + 1` `float` or `double` tensor, shape
+*  <b>`diag_small`</b>: Rank `n + 1` floating point tensor, shape
     `[N1,...,Nn, k]` `n >= 0`.  Defines the diagonal matrix `D`.  Default
     is `None`, which means `D` will be the identity matrix.
 *  <b>`validate_args`</b>: Whether to validate input with asserts.  If `validate_args`
diff --git a/tensorflow/g3doc/api_docs/python/contrib.framework.md b/tensorflow/g3doc/api_docs/python/contrib.framework.md
index df4df30d199..0a6c8119248 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.framework.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.framework.md
@@ -324,15 +324,14 @@ Assert tensors are the same shape, from the same graph.
 
 Decorator for marking functions or methods deprecated.
 
-This decorator adds a deprecation warning to a function's docstring. It has
-the following format:
+This decorator logs a deprecation warning whenever the decorated function is
+called. It has the following format:
 
   <function> (from <module>) is deprecated and will be removed after <date>.
   Instructions for updating:
   <instructions>
 
-whenever the decorated function is called. <function> will include the class
-name if it is a method.
+<function> will include the class name if it is a method.
 
 It also edits the docstring of the function: ' (deprecated)' is appended
 to the first line of the docstring and a deprecation notice is prepended
@@ -356,6 +355,44 @@ to the rest of the docstring.
 *  <b>`ValueError`</b>: If date is not in ISO 8601 format, or instructions are empty.
 
 
+- - -
+
+### `tf.contrib.framework.deprecated_arg_values(date, instructions, **deprecated_kwargs)` {#deprecated_arg_values}
+
+Decorator for marking specific function argument values as deprecated.
+
+This decorator logs a deprecation warning whenever the decorated function is
+called with the deprecated argument values. It has the following format:
+
+  Calling <function> (from <module>) with <arg>=<value> is deprecated and
+  will be removed after <date>. Instructions for updating:
+    <instructions>
+
+<function> will include the class name if it is a method.
+
+It also edits the docstring of the function: ' (deprecated arguments)' is
+appended to the first line of the docstring and a deprecation notice is
+prepended to the rest of the docstring.
+
+##### Args:
+
+
+*  <b>`date`</b>: String. The date the function is scheduled to be removed. Must be
+    ISO 8601 (YYYY-MM-DD).
+*  <b>`instructions`</b>: String. Instructions on how to update code using the
+    deprecated function.
+*  <b>`**deprecated_kwargs`</b>: The deprecated argument values.
+
+##### Returns:
+
+  Decorated function or method.
+
+##### Raises:
+
+
+*  <b>`ValueError`</b>: If date is not in ISO 8601 format, or instructions are empty.
+
+
 
 ## Arg_Scope
 - - -
diff --git a/tensorflow/g3doc/api_docs/python/contrib.graph_editor.md b/tensorflow/g3doc/api_docs/python/contrib.graph_editor.md
new file mode 100644
index 00000000000..be6fa7bde55
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/contrib.graph_editor.md
@@ -0,0 +1,859 @@
+<!-- This file is machine generated: DO NOT EDIT! -->
+
+# Graph Editor (contrib)
+[TOC]
+
+Graph editor module allows to modify an existing graph in place.
+
+## Other Functions and Classes
+- - -
+
+### `class tf.contrib.graph_editor.SubGraphView` {#SubGraphView}
+
+A subgraph view on an existing tf.Graph.
+
+An instance of this class is a subgraph view on an existing tf.Graph.
+"subgraph" means that it can represent part of the whole tf.Graph.
+"view" means that it only provides a passive observation and do not to act
+on the tf.Graph. Note that in this documentation, the term "subgraph" is often
+used as substitute to "subgraph view".
+
+A subgraph contains:
+- a list of input tensors, accessible via the "inputs" property.
+- a list of output tensors, accessible via the "outputs" property.
+- and the operations in between, accessible via the "ops" property.
+
+An subgraph can be seen as a function F(i0, i1, ...) -> o0, o1, ... It is a
+function which takes as input some input tensors and returns as output some
+output tensors. The computation that the function performs is encoded in the
+operations of the subgraph.
+
+The tensors (input or output) can be of two kinds:
+- connected: a connected tensor connects to at least one operation contained
+in the subgraph. One example is a subgraph representing a single operation
+and its inputs and outputs: all the input and output tensors of the op
+are "connected".
+- passthrough: a passthrough tensor does not connect to any operation
+contained in the subgraph. One example is a subgraph representing a
+single tensor: this tensor is passthrough. By default a passthrough tensor is
+present both in the input and output tensors of the subgraph. It can however
+be remapped to only appear as an input (or output) only.
+
+The input and output tensors can be remapped. For instance, some input tensor
+can be ommited. For instance, a subgraph representing an operation with two
+inputs can be remapped to only take one input. Note that this does not change
+at all the underlying tf.Graph (remember, it is a view). It means that
+the other input is being ignored, or is being treated as "given".
+The analogy with functions can be extended like this: F(x,y) is the original
+function. Remapping the inputs from [x, y] to just [x] means that the subgraph
+now represent the function F_y(x) (y is "given").
+
+The output tensors can also be remapped. For instance, some output tensor can
+be ommited. Other output tensor can be duplicated as well. As mentioned
+before, this does not change at all the underlying tf.Graph.
+The analogy with functions can be extended like this: F(...)->x,y is the
+original function. Remapping the outputs from [x, y] to just [y,y] means that
+the subgraph now represent the function M(F(...)) where M is the function
+M(a,b)->b,b.
+
+It is useful to describe three other kind of tensors:
+- internal: an internal tensor is a tensor connecting operations contained
+in the subgraph. One example in the subgraph representing the two operations
+A and B connected sequentially: -> A -> B ->. The middle arrow is an internal
+tensor.
+- actual input: an input tensor of the subgraph, regardless of whether it is
+  listed in "inputs" or not (masked-out).
+- actual output: an output tensor of the subgraph, regardless of whether it is
+  listed in "outputs" or not (masked-out).
+- hidden input: an actual input which has been masked-out using an
+  input remapping. In other word, a hidden input is a non-internal tensor
+  not listed as a input tensor and one of whose consumers belongs to
+  the subgraph.
+- hidden output: a actual output which has been masked-out using an output
+  remapping. In other word, a hidden output is a non-internal tensor
+  not listed as an output and one of whose generating operations belongs to
+  the subgraph.
+
+Here are some usefull guarantees about an instance of a SubGraphView:
+- the input (or output) tensors are not internal.
+- the input (or output) tensors are either "connected" or "passthrough".
+- the passthrough tensors are not connected to any of the operation of
+the subgraph.
+
+Note that there is no guarantee that an operation in a subgraph contributes
+at all to its inputs or outputs. For instance, remapping both the inputs and
+outputs to empty lists will produce a subgraph which still contains all the
+original operations. However, the remove_unused_ops function can be used to
+make a new subgraph view whose operations are connected to at least one of
+the input or output tensors.
+
+An instance of this class is meant to be a lightweight object which is not
+modified in-place by the user. Rather, the user can create new modified
+instances of a given subgraph. In that sense, the class SubGraphView is meant
+to be used like an immutable python object.
+
+A common problem when using views is that they can get out-of-sync with the
+data they observe (in this case, a tf.Graph). This is up to the user to insure
+that this doesn't happen. To keep on the safe sife, it is recommended that
+the life time of subgraph views are kept very short. One way to achieve this
+is to use subgraphs within a "with make_sgv(...) as sgv:" Python context.
+
+To alleviate the out-of-sync problem, some functions are granted the right to
+modified subgraph in place. This is typically the case of graph manipulation
+functions which, given some subgraphs as arguments, can modify the underlying
+tf.Graph. Since this modification is likely to render the subgraph view
+invalid, those functions can modify the argument in place to reflect the
+change. For instance, calling the function swap_inputs(svg0, svg1) will modify
+svg0 and svg1 in place to reflect the fact that their inputs have now being
+swapped.
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.__init__(inside_ops=(), passthrough_ts=())` {#SubGraphView.__init__}
+
+Create a subgraph containing the given ops and the "passthrough" tensors.
+
+##### Args:
+
+
+*  <b>`inside_ops`</b>: an object convertible to a list of tf.Operation. This list
+    defines all the operations in the subgraph.
+*  <b>`passthrough_ts`</b>: an object convertible to a list of tf.Tensor. This list
+    define all the "passthrough" tensors. A passthrough tensor is a tensor
+    which goes directly from the input of the subgraph to it output, without
+    any intermediate operations. All the non passthrough tensors are
+    silently ignored.
+
+##### Raises:
+
+
+*  <b>`TypeError`</b>: if inside_ops cannot be converted to a list of tf.Operation or
+    if passthrough_ts cannot be converted to a list of tf.Tensor.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.connected_inputs` {#SubGraphView.connected_inputs}
+
+The connected input tensors of this subgraph view.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.connected_outputs` {#SubGraphView.connected_outputs}
+
+The connected output tensors of this subgraph view.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.consumers()` {#SubGraphView.consumers}
+
+Return a Python set of all the consumers of this subgraph view.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.copy()` {#SubGraphView.copy}
+
+Return a copy of itself.
+
+Note that this class is a "view", copying it only create another view and
+does not copy the underlying part of the tf.Graph.
+
+##### Returns:
+
+  a new instance identical to the original one.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.find_op_by_name(op_name)` {#SubGraphView.find_op_by_name}
+
+Return the op named op_name.
+
+##### Args:
+
+
+*  <b>`op_name`</b>: the name to search for
+
+##### Returns:
+
+  The op named op_name.
+
+##### Raises:
+
+
+*  <b>`ValueError`</b>: if the op_name could not be found.
+*  <b>`AssertionError`</b>: if the name was found multiple time.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.graph` {#SubGraphView.graph}
+
+The underlying tf.Graph.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.input_index(t)` {#SubGraphView.input_index}
+
+Find the input index corresponding to the given input tensor t.
+
+##### Args:
+
+
+*  <b>`t`</b>: the input tensor of this subgraph view.
+
+##### Returns:
+
+  the index in the self.inputs list.
+
+##### Raises:
+
+
+*  <b>`Error`</b>: if t in not an input tensor.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.inputs` {#SubGraphView.inputs}
+
+The input tensors of this subgraph view.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.is_passthrough(t)` {#SubGraphView.is_passthrough}
+
+Check whether a tensor is passthrough.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.op(op_id)` {#SubGraphView.op}
+
+Get an op by its index.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.ops` {#SubGraphView.ops}
+
+The operations in this subgraph view.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.output_index(t)` {#SubGraphView.output_index}
+
+Find the output index corresponding to given output tensor t.
+
+##### Args:
+
+
+*  <b>`t`</b>: the output tensor of this subgraph view.
+
+##### Returns:
+
+  the index in the self.outputs list.
+
+##### Raises:
+
+
+*  <b>`Error`</b>: if t in not an output tensor.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.outputs` {#SubGraphView.outputs}
+
+The output tensors of this subgraph view.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.passthroughs` {#SubGraphView.passthroughs}
+
+The passthrough tensors, going straight from input to output.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.remap(new_input_indices=None, new_output_indices=None)` {#SubGraphView.remap}
+
+Remap the inputs and outputs of the subgraph.
+
+Note that this is only modifying the view: the underlying tf.Graph is not
+affected.
+
+##### Args:
+
+
+*  <b>`new_input_indices`</b>: an iterable of integers representing a mapping between
+    the old inputs and the new ones. This mapping can be under-complete and
+    must be without repetitions.
+*  <b>`new_output_indices`</b>: an iterable of integers representing a mapping between
+    the old outputs and the new ones. This mapping can be under-complete and
+    can have repetitions.
+
+##### Returns:
+
+  A new modified instance of the original subgraph view with remapped
+    inputs and outputs.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.remap_default(remove_input_map=True, remove_output_map=True)` {#SubGraphView.remap_default}
+
+Remap the inputs and/or outputs to the default mapping.
+
+##### Args:
+
+
+*  <b>`remove_input_map`</b>: if True the input map is reset to the default one.
+*  <b>`remove_output_map`</b>: if True the output map is reset to the default one.
+
+##### Returns:
+
+  A new modified instance of the original subgraph view with its
+    input and/or output mapping reset to the default one.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.remap_inputs(new_input_indices)` {#SubGraphView.remap_inputs}
+
+Remap the inputs of the subgraph.
+
+If the inputs of the original subgraph are [t0, t1, t2], remapping to [2,0]
+will create a new instance whose inputs is [t2, t0].
+
+Note that this is only modifying the view: the underlying tf.Graph is not
+affected.
+
+##### Args:
+
+
+*  <b>`new_input_indices`</b>: an iterable of integers representing a mapping between
+    the old inputs and the new ones. This mapping can be under-complete and
+    must be without repetitions.
+
+##### Returns:
+
+  A new modified instance of the original subgraph view with remapped
+    inputs.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.remap_outputs(new_output_indices)` {#SubGraphView.remap_outputs}
+
+Remap the output of the subgraph.
+
+If the output of the original subgraph are [t0, t1, t2], remapping to
+[1,1,0] will create a new instance whose outputs is [t1, t1, t0].
+
+Note that this is only modifying the view: the underlying tf.Graph is not
+affected.
+
+##### Args:
+
+
+*  <b>`new_output_indices`</b>: an iterable of integers representing a mapping between
+    the old outputs and the new ones. This mapping can be under-complete and
+    can have repetitions.
+
+##### Returns:
+
+  A new modified instance of the original subgraph view with remapped
+    outputs.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.remap_outputs_make_unique()` {#SubGraphView.remap_outputs_make_unique}
+
+Remap the outputs so that all the tensors appears only once.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.remap_outputs_to_consumers()` {#SubGraphView.remap_outputs_to_consumers}
+
+Remap the outputs to match the number of consumers.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.remove_unused_ops(control_inputs=True)` {#SubGraphView.remove_unused_ops}
+
+Remove unused ops.
+
+##### Args:
+
+
+*  <b>`control_inputs`</b>: if True, control inputs are used to detect used ops.
+
+##### Returns:
+
+  A new subgraph view which only contains used operations.
+
+
+
+- - -
+
+### `class tf.contrib.graph_editor.Transformer` {#Transformer}
+
+Transform a subgraph into another one.
+
+By default, the constructor create a transform which copy a subgraph and
+replaces inputs with placeholders. This behavior can be modified by changing
+the handlers.
+- - -
+
+#### `tf.contrib.graph_editor.Transformer.__init__()` {#Transformer.__init__}
+
+Transformer constructor.
+
+The following members can be modified:
+transform_op_handler: handle the transformation of a tf.Operation.
+  This handler defaults to a simple copy.
+assign_collections_handler: handle the assignment of collections.
+  This handler defaults to assigning new collections created under the
+  given name-scope.
+transform_input_handler: handle the transform of the inputs to the given
+  subgraph. This handler defaults to creating placeholders instead of the
+  ops just before the input tensors of the subgraph.
+transform_hidden_input_handler: handle the transform of the hidden inputs of
+  the subgraph, that is, the inputs which are not listed in sgv.inputs.
+  This handler defaults to a transform which keep the same input if the
+  source and destination graphs are the same, otherwise use placeholders.
+transform_original_op_hanlder: handle the transform of original_op. This
+  handler defaults to transforming original_op only if they are in the
+  subgraph, otherwise they are ignored.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.Transformer.new_name(name)` {#Transformer.new_name}
+
+Compute a destination name from a source name.
+
+##### Args:
+
+
+*  <b>`name`</b>: the name to be "transformed".
+
+##### Returns:
+
+  the transformed name.
+
+##### Raises:
+
+
+*  <b>`ValueError`</b>: if the source scope is used (that is, not an empty string)
+    and the source name does not belong to the source scope.
+
+
+
+- - -
+
+### `tf.contrib.graph_editor.bypass(sgv)` {#bypass}
+
+Bypass the given subgraph by connecting its inputs to its outputs.
+
+##### Args:
+
+
+*  <b>`sgv`</b>: the subgraph view to be bypassed. This argument is converted to a
+    subgraph using the same rules than the function subgraph.make_view.
+
+##### Returns:
+
+  A new subgraph view of the bypassed subgraph.
+    Note that sgv is also modified in place.
+
+##### Raises:
+
+
+*  <b>`StandardError`</b>: if sgv cannot be converted to a SubGraphView using
+    the same rules than the function subgraph.make_view.
+
+
+- - -
+
+### `tf.contrib.graph_editor.connect(sgv0, sgv1, disconnect_first=False)` {#connect}
+
+Connect the outputs of sgv0 to the inputs of sgv1.
+
+##### Args:
+
+
+*  <b>`sgv0`</b>: the first subgraph to have its outputs swapped. This argument is
+    converted to a subgraph using the same rules as the function
+    subgraph.make_view.
+*  <b>`sgv1`</b>: the second subgraph to have its outputs swapped. This argument is
+    converted to a subgraph using the same rules as the function
+    subgraph.make_view.
+*  <b>`disconnect_first`</b>: if True the current outputs of sgv0 are disconnected.
+
+##### Returns:
+
+  Two new subgraph views (now connected). sgv0 and svg1 are also modified
+    in place.
+
+##### Raises:
+
+
+*  <b>`StandardError`</b>: if sgv0 or sgv1 cannot be converted to a SubGraphView using
+    the same rules than the function subgraph.make_view.
+
+
+- - -
+
+### `tf.contrib.graph_editor.detach(sgv, control_inputs=False, control_outputs=None, control_ios=None)` {#detach}
+
+Detach both the inputs and the outputs of a subgraph view.
+
+##### Args:
+
+
+*  <b>`sgv`</b>: the subgraph view to be detached. This argument is converted to a
+    subgraph using the same rules as the function subgraph.make_view.
+*  <b>`control_inputs`</b>: A boolean indicating whether control inputs are enabled.
+*  <b>`control_outputs`</b>: An instance of util.ControlOutputs or None. If not None,
+    control outputs are enabled.
+*  <b>`control_ios`</b>: An instance of util.ControlOutputs or None. If not None, both
+    control inputs and control outputs are enabled. This is equivalent to set
+    control_inputs to True and control_outputs to the util.ControlOutputs
+    instance.
+
+##### Returns:
+
+  A new subgraph view of the detached subgraph.
+    Note that sgv is also modified in place.
+
+##### Raises:
+
+
+*  <b>`StandardError`</b>: if sgv cannot be converted to a SubGraphView using
+    the same rules than the function subgraph.make_view.
+
+
+- - -
+
+### `tf.contrib.graph_editor.detach_inputs(sgv, control_inputs=False)` {#detach_inputs}
+
+Detach the inputs of a subgraph view.
+
+##### Args:
+
+
+*  <b>`sgv`</b>: the subgraph view to be detached. This argument is converted to a
+    subgraph using the same rules as the function subgraph.make_view.
+*  <b>`control_inputs`</b>: if True control_inputs are also detached.
+
+##### Returns:
+
+  A new subgraph view of the detached subgraph.
+    Note that sgv is also modified in place.
+
+##### Raises:
+
+
+*  <b>`StandardError`</b>: if sgv cannot be converted to a SubGraphView using
+    the same rules than the function subgraph.make_view.
+
+
+- - -
+
+### `tf.contrib.graph_editor.detach_outputs(sgv, control_outputs=None)` {#detach_outputs}
+
+Detach the outputa of a subgraph view.
+
+##### Args:
+
+
+*  <b>`sgv`</b>: the subgraph view to be detached. This argument is converted to a
+    subgraph using the same rules as the function subgraph.make_view.
+*  <b>`control_outputs`</b>: a util.ControlOutputs instance or None. If not None the
+    control outputs are also detached.
+
+##### Returns:
+
+  A new subgraph view of the detached subgraph.
+    Note that sgv is also modified in place.
+
+##### Raises:
+
+
+*  <b>`StandardError`</b>: if sgv cannot be converted to a SubGraphView using
+    the same rules than the function subgraph.make_view.
+
+
+- - -
+
+### `class tf.contrib.graph_editor.matcher` {#matcher}
+
+Graph match class.
+- - -
+
+#### `tf.contrib.graph_editor.matcher.__init__(positive_filter)` {#matcher.__init__}
+
+Graph match constructor.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.matcher.control_input_ops(*args)` {#matcher.control_input_ops}
+
+Add input matches.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.matcher.input_ops(*args)` {#matcher.input_ops}
+
+Add input matches.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.matcher.output_ops(*args)` {#matcher.output_ops}
+
+Add output matches.
+
+
+
+- - -
+
+### `tf.contrib.graph_editor.ph(dtype, shape=None, scope=None)` {#ph}
+
+Create a tf.placeholder for the Graph Editor.
+
+Note that the correct graph scope must be set by the calling function.
+The placeholder is named using the function placeholder_name (with no
+tensor argument).
+
+##### Args:
+
+
+*  <b>`dtype`</b>: the tensor type.
+*  <b>`shape`</b>: the tensor shape (optional).
+*  <b>`scope`</b>: absolute scope within which to create the placeholder. None
+    means that the scope of t is preserved. "" means the root scope.
+
+##### Returns:
+
+  A newly created tf.placeholder.
+
+
+- - -
+
+### `tf.contrib.graph_editor.reroute_a2b(sgv0, sgv1)` {#reroute_a2b}
+
+Re-route the inputs and outputs of sgv0 to sgv1 (see _reroute).
+
+
+- - -
+
+### `tf.contrib.graph_editor.reroute_a2b_inputs(sgv0, sgv1)` {#reroute_a2b_inputs}
+
+Re-route all the inputs of sgv0 to sgv1 (see reroute_inputs).
+
+
+- - -
+
+### `tf.contrib.graph_editor.reroute_a2b_outputs(sgv0, sgv1)` {#reroute_a2b_outputs}
+
+Re-route all the outputs of sgv0 to sgv1 (see _reroute_outputs).
+
+
+- - -
+
+### `tf.contrib.graph_editor.reroute_b2a(sgv0, sgv1)` {#reroute_b2a}
+
+Re-route the inputs and outputs of sgv1 to sgv0 (see _reroute).
+
+
+- - -
+
+### `tf.contrib.graph_editor.reroute_b2a_inputs(sgv0, sgv1)` {#reroute_b2a_inputs}
+
+Re-route all the inputs of sgv1 to sgv0 (see reroute_inputs).
+
+
+- - -
+
+### `tf.contrib.graph_editor.reroute_b2a_outputs(sgv0, sgv1)` {#reroute_b2a_outputs}
+
+Re-route all the outputs of sgv1 to sgv0 (see _reroute_outputs).
+
+
+- - -
+
+### `tf.contrib.graph_editor.select_ops(*args, **kwargs)` {#select_ops}
+
+Helper to select operations.
+
+##### Args:
+
+
+*  <b>`*args`</b>: list of 1) regular expressions (compiled or not) or  2) (array of)
+    tf.Operation. tf.Tensor instances are silently ignored.
+*  <b>`**kwargs`</b>: 'graph': tf.Graph in which to perform the regex query.This is
+    required when using regex.
+    'positive_filter': an elem if selected only if positive_filter(elem) is
+      True. This is optional.
+    'restrict_ops_regex': a regular expression is ignored if it doesn't start
+      with the substring "(?#ops)".
+
+##### Returns:
+
+  list of tf.Operation
+
+##### Raises:
+
+
+*  <b>`TypeError`</b>: if the optional keyword argument graph is not a tf.Graph
+    or if an argument in args is not an (array of) tf.Operation
+    or an (array of) tf.Tensor (silently ignored) or a string
+    or a regular expression.
+*  <b>`ValueError`</b>: if one of the keyword arguments is unexpected or if a regular
+    expression is used without passing a graph as a keyword argument.
+
+
+- - -
+
+### `tf.contrib.graph_editor.select_ts(*args, **kwargs)` {#select_ts}
+
+Helper to select tensors.
+
+##### Args:
+
+
+*  <b>`*args`</b>: list of 1) regular expressions (compiled or not) or  2) (array of)
+    tf.Tensor. tf.Operation instances are silently ignored.
+*  <b>`**kwargs`</b>: 'graph': tf.Graph in which to perform the regex query.This is
+    required when using regex.
+    'positive_filter': an elem if selected only if positive_filter(elem) is
+      True. This is optional.
+    'restrict_ts_regex': a regular expression is ignored if it doesn't start
+      with the substring "(?#ts)".
+
+##### Returns:
+
+  list of tf.Tensor
+
+##### Raises:
+
+
+*  <b>`TypeError`</b>: if the optional keyword argument graph is not a tf.Graph
+    or if an argument in args is not an (array of) tf.Tensor
+    or an (array of) tf.Operation (silently ignored) or a string
+    or a regular expression.
+*  <b>`ValueError`</b>: if one of the keyword arguments is unexpected or if a regular
+    expression is used without passing a graph as a keyword argument.
+
+
+- - -
+
+### `tf.contrib.graph_editor.sgv(*args, **kwargs)` {#sgv}
+
+Create a SubGraphView from selected operations and passthrough tensors.
+
+##### Args:
+
+
+*  <b>`*args`</b>: list of 1) regular expressions (compiled or not) or  2) (array of)
+    tf.Operation 3) (array of) tf.Tensor. Those objects will be converted
+    into a list of operations and a list of candidate for passthrough tensors.
+*  <b>`**kwargs`</b>: keyword graph is used 1) to check that the ops and ts are from
+    the correct graph 2) for regular expression query
+
+##### Returns:
+
+  A subgraph view.
+
+##### Raises:
+
+
+*  <b>`TypeError`</b>: if the optional keyword argument graph is not a tf.Graph
+    or if an argument in args is not an (array of) tf.Tensor
+    or an (array of) tf.Operation or a string or a regular expression.
+*  <b>`ValueError`</b>: if one of the keyword arguments is unexpected.
+
+
+- - -
+
+### `tf.contrib.graph_editor.sgv_scope(scope, graph)` {#sgv_scope}
+
+Make a subgraph from a name scope.
+
+##### Args:
+
+
+*  <b>`scope`</b>: the name of the scope.
+*  <b>`graph`</b>: the tf.Graph.
+
+##### Returns:
+
+  A subgraph view representing the given scope.
+
+
+- - -
+
+### `tf.contrib.graph_editor.swap(sgv0, sgv1)` {#swap}
+
+Swap the inputs and outputs of sgv1 to sgv0 (see _reroute).
+
+
+- - -
+
+### `tf.contrib.graph_editor.swap_inputs(sgv0, sgv1)` {#swap_inputs}
+
+Swap all the inputs of sgv0 and sgv1 (see reroute_inputs).
+
+
+- - -
+
+### `tf.contrib.graph_editor.swap_outputs(sgv0, sgv1)` {#swap_outputs}
+
+Swap all the outputs of sgv0 and sgv1 (see _reroute_outputs).
+
+
+- - -
+
+### `tf.contrib.graph_editor.ts(*args, **kwargs)` {#ts}
+
+Helper to select tensors.
+
+##### Args:
+
+
+*  <b>`*args`</b>: list of 1) regular expressions (compiled or not) or  2) (array of)
+    tf.Tensor. tf.Operation instances are silently ignored.
+*  <b>`**kwargs`</b>: 'graph': tf.Graph in which to perform the regex query.This is
+    required when using regex.
+    'positive_filter': an elem if selected only if positive_filter(elem) is
+      True. This is optional.
+    'restrict_ts_regex': a regular expression is ignored if it doesn't start
+      with the substring "(?#ts)".
+
+##### Returns:
+
+  list of tf.Tensor
+
+##### Raises:
+
+
+*  <b>`TypeError`</b>: if the optional keyword argument graph is not a tf.Graph
+    or if an argument in args is not an (array of) tf.Tensor
+    or an (array of) tf.Operation (silently ignored) or a string
+    or a regular expression.
+*  <b>`ValueError`</b>: if one of the keyword arguments is unexpected or if a regular
+    expression is used without passing a graph as a keyword argument.
+
+
diff --git a/tensorflow/g3doc/api_docs/python/contrib.learn.md b/tensorflow/g3doc/api_docs/python/contrib.learn.md
index 65f6768e2d8..a7214b2242a 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.learn.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.learn.md
@@ -31,9 +31,9 @@ Initializes a BaseEstimator instance.
 ##### Args:
 
 
-*  <b>`model_dir`</b>: Directory to save model parameters, graph and etc. This can also
-    be used to load checkpoints from the directory into a estimator to continue
-    training a previously saved model.
+*  <b>`model_dir`</b>: Directory to save model parameters, graph and etc. This can
+    also be used to load checkpoints from the directory into a estimator to
+    continue training a previously saved model.
 *  <b>`config`</b>: A RunConfig instance.
 
 
@@ -41,56 +41,7 @@ Initializes a BaseEstimator instance.
 
 #### `tf.contrib.learn.BaseEstimator.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#BaseEstimator.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
@@ -104,37 +55,7 @@ for which this evaluation was performed.
 
 #### `tf.contrib.learn.BaseEstimator.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#BaseEstimator.fit}
 
-Trains a model given training data `x` predictions and `y` targets.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`steps`</b>: Number of steps for which to train model. If `None`, train forever.
-    If set, `max_steps` must be `None`.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`. Must be `None` if `input_fn` is provided.
-*  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
-    inside the training loop.
-*  <b>`max_steps`</b>: Number of total steps for which to train model. If `None`,
-    train forever. If set, `steps` must be `None`.
-
-    Two calls to `fit(steps=100)` means 200 training
-    iterations. On the other hand, two calls to `fit(max_steps=100)` means
-    that the second call will not do any iteration since first call did
-    all 100 steps.
-
-##### Returns:
-
-  `self`, for chaining.
+See `Trainable`.
 
 ##### Raises:
 
@@ -340,9 +261,9 @@ Constructs an Estimator instance.
              to configure Estimators from hyper parameter tunning.
 
 
-*  <b>`model_dir`</b>: Directory to save model parameters, graph and etc. This can also
-    be used to load checkpoints from the directory into a estimator to continue
-    training a previously saved model.
+*  <b>`model_dir`</b>: Directory to save model parameters, graph and etc. This can
+    also be used to load checkpoints from the directory into a estimator to
+    continue training a previously saved model.
 *  <b>`config`</b>: Configuration object.
 *  <b>`params`</b>: `dict` of hyper parameters that will be passed into `model_fn`.
           Keys are names of parameters, values are basic python types.
@@ -357,56 +278,7 @@ Constructs an Estimator instance.
 
 #### `tf.contrib.learn.Estimator.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#Estimator.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
@@ -420,37 +292,7 @@ for which this evaluation was performed.
 
 #### `tf.contrib.learn.Estimator.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#Estimator.fit}
 
-Trains a model given training data `x` predictions and `y` targets.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`steps`</b>: Number of steps for which to train model. If `None`, train forever.
-    If set, `max_steps` must be `None`.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`. Must be `None` if `input_fn` is provided.
-*  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
-    inside the training loop.
-*  <b>`max_steps`</b>: Number of total steps for which to train model. If `None`,
-    train forever. If set, `steps` must be `None`.
-
-    Two calls to `fit(steps=100)` means 200 training
-    iterations. On the other hand, two calls to `fit(max_steps=100)` means
-    that the second call will not do any iteration since first call did
-    all 100 steps.
-
-##### Returns:
-
-  `self`, for chaining.
+See `Trainable`.
 
 ##### Raises:
 
@@ -667,56 +509,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.TensorFlowClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowClassifier.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
@@ -1030,56 +823,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.DNNClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#DNNClassifier.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
@@ -1093,37 +837,7 @@ for which this evaluation was performed.
 
 #### `tf.contrib.learn.DNNClassifier.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#DNNClassifier.fit}
 
-Trains a model given training data `x` predictions and `y` targets.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`steps`</b>: Number of steps for which to train model. If `None`, train forever.
-    If set, `max_steps` must be `None`.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`. Must be `None` if `input_fn` is provided.
-*  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
-    inside the training loop.
-*  <b>`max_steps`</b>: Number of total steps for which to train model. If `None`,
-    train forever. If set, `steps` must be `None`.
-
-    Two calls to `fit(steps=100)` means 200 training
-    iterations. On the other hand, two calls to `fit(max_steps=100)` means
-    that the second call will not do any iteration since first call did
-    all 100 steps.
-
-##### Returns:
-
-  `self`, for chaining.
+See `Trainable`.
 
 ##### Raises:
 
@@ -1447,56 +1161,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.DNNRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#DNNRegressor.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
@@ -1510,37 +1175,7 @@ for which this evaluation was performed.
 
 #### `tf.contrib.learn.DNNRegressor.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#DNNRegressor.fit}
 
-Trains a model given training data `x` predictions and `y` targets.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`steps`</b>: Number of steps for which to train model. If `None`, train forever.
-    If set, `max_steps` must be `None`.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`. Must be `None` if `input_fn` is provided.
-*  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
-    inside the training loop.
-*  <b>`max_steps`</b>: Number of total steps for which to train model. If `None`,
-    train forever. If set, `steps` must be `None`.
-
-    Two calls to `fit(steps=100)` means 200 training
-    iterations. On the other hand, two calls to `fit(max_steps=100)` means
-    that the second call will not do any iteration since first call did
-    all 100 steps.
-
-##### Returns:
-
-  `self`, for chaining.
+See `Trainable`.
 
 ##### Raises:
 
@@ -1766,56 +1401,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.TensorFlowDNNClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowDNNClassifier.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
@@ -2040,56 +1626,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.TensorFlowDNNRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowDNNRegressor.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
@@ -2655,9 +2192,9 @@ Construct a `LinearClassifier` estimator object.
 *  <b>`feature_columns`</b>: An iterable containing all the feature columns used by
     the model. All items in the set should be instances of classes derived
     from `FeatureColumn`.
-*  <b>`model_dir`</b>: Directory to save model parameters, graph and etc. This can also
-    be used to load checkpoints from the directory into a estimator to continue
-    training a previously saved model.
+*  <b>`model_dir`</b>: Directory to save model parameters, graph and etc. This can
+    also be used to load checkpoints from the directory into a estimator
+    to continue training a previously saved model.
 *  <b>`n_classes`</b>: number of target classes. Default is binary classification.
 *  <b>`weight_column_name`</b>: A string defining feature column name representing
     weights. It is used to down weight or boost examples during training. It
@@ -2703,56 +2240,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.LinearClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#LinearClassifier.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
@@ -2766,37 +2254,7 @@ for which this evaluation was performed.
 
 #### `tf.contrib.learn.LinearClassifier.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#LinearClassifier.fit}
 
-Trains a model given training data `x` predictions and `y` targets.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`steps`</b>: Number of steps for which to train model. If `None`, train forever.
-    If set, `max_steps` must be `None`.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`. Must be `None` if `input_fn` is provided.
-*  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
-    inside the training loop.
-*  <b>`max_steps`</b>: Number of total steps for which to train model. If `None`,
-    train forever. If set, `steps` must be `None`.
-
-    Two calls to `fit(steps=100)` means 200 training
-    iterations. On the other hand, two calls to `fit(max_steps=100)` means
-    that the second call will not do any iteration since first call did
-    all 100 steps.
-
-##### Returns:
-
-  `self`, for chaining.
+See `Trainable`.
 
 ##### Raises:
 
@@ -3056,9 +2514,9 @@ Construct a `LinearRegressor` estimator object.
 *  <b>`feature_columns`</b>: An iterable containing all the feature columns used by
     the model. All items in the set should be instances of classes derived
     from `FeatureColumn`.
-*  <b>`model_dir`</b>: Directory to save model parameters, graph, etc. This can also
-    be used to load checkpoints from the directory into a estimator to continue
-    training a previously saved model.
+*  <b>`model_dir`</b>: Directory to save model parameters, graph, etc. This can
+    also be used to load checkpoints from the directory into a estimator
+    to continue training a previously saved model.
 *  <b>`weight_column_name`</b>: A string defining feature column name representing
     weights. It is used to down weight or boost examples during training. It
     will be multiplied by the loss of the example.
@@ -3103,56 +2561,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.LinearRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#LinearRegressor.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
@@ -3166,37 +2575,7 @@ for which this evaluation was performed.
 
 #### `tf.contrib.learn.LinearRegressor.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#LinearRegressor.fit}
 
-Trains a model given training data `x` predictions and `y` targets.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`steps`</b>: Number of steps for which to train model. If `None`, train forever.
-    If set, `max_steps` must be `None`.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`. Must be `None` if `input_fn` is provided.
-*  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
-    inside the training loop.
-*  <b>`max_steps`</b>: Number of total steps for which to train model. If `None`,
-    train forever. If set, `steps` must be `None`.
-
-    Two calls to `fit(steps=100)` means 200 training
-    iterations. On the other hand, two calls to `fit(max_steps=100)` means
-    that the second call will not do any iteration since first call did
-    all 100 steps.
-
-##### Returns:
-
-  `self`, for chaining.
+See `Trainable`.
 
 ##### Raises:
 
@@ -3422,56 +2801,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.TensorFlowLinearClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowLinearClassifier.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
@@ -3696,56 +3026,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.TensorFlowLinearRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowLinearRegressor.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
@@ -4615,56 +3896,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.TensorFlowRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowRegressor.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
@@ -4870,7 +4102,7 @@ Perform various training, evaluation, and inference actions on a graph.
 This class specifies the specific configurations for the run.
 - - -
 
-#### `tf.contrib.learn.RunConfig.__init__(master='', task=0, num_ps_replicas=0, num_cores=4, log_device_placement=False, gpu_memory_fraction=1, tf_random_seed=42, save_summary_steps=100, save_checkpoints_secs=60, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000, save_checkpoints_steps=1000)` {#RunConfig.__init__}
+#### `tf.contrib.learn.RunConfig.__init__(master='', task=0, num_ps_replicas=0, num_cores=4, log_device_placement=False, gpu_memory_fraction=1, tf_random_seed=42, save_summary_steps=100, save_checkpoints_secs=60, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000)` {#RunConfig.__init__}
 
 Constructor.
 
@@ -4895,7 +4127,6 @@ Constructor.
 *  <b>`keep_checkpoint_every_n_hours`</b>: Number of hours between each checkpoint
     to be saved. The default value of 10,000 hours effectively disables
     the feature.
-*  <b>`save_checkpoints_steps`</b>: Number of steps between each checkpoint saving.
 
 
 
diff --git a/tensorflow/g3doc/api_docs/python/contrib.learn.monitors.md b/tensorflow/g3doc/api_docs/python/contrib.learn.monitors.md
index 56b7879acff..04e0ba140e8 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.learn.monitors.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.learn.monitors.md
@@ -469,38 +469,32 @@ Returns the values captured so far.
 Saves checkpoints every N steps.
 - - -
 
-#### `tf.contrib.learn.monitors.CheckpointSaver.__init__(every_n_steps, saver, checkpoint_dir, checkpoint_basename='model.ckpt', first_n_steps=-1)` {#CheckpointSaver.__init__}
+#### `tf.contrib.learn.monitors.CheckpointSaver.__init__(checkpoint_dir, save_secs=None, save_steps=None, saver=None, checkpoint_basename='model.ckpt', scaffold=None)` {#CheckpointSaver.__init__}
 
 Initialize CheckpointSaver monitor.
 
 ##### Args:
 
 
-*  <b>`every_n_steps`</b>: `int`, save every N steps.
-*  <b>`saver`</b>: `Saver` object, used for saving.
 *  <b>`checkpoint_dir`</b>: `str`, base directory for the checkpoint files.
+*  <b>`save_secs`</b>: `int`, save every N secs.
+*  <b>`save_steps`</b>: `int`, save every N steps.
+*  <b>`saver`</b>: `Saver` object, used for saving.
 *  <b>`checkpoint_basename`</b>: `str`, base name for the checkpoint files.
-*  <b>`first_n_steps`</b>: `int`, if positive, save every step during the
-    first `first_n_steps` steps.
+*  <b>`scaffold`</b>: `Scaffold`, use to get saver object.
+
+##### Raises:
+
+
+*  <b>`ValueError`</b>: If both `save_steps` and `save_secs` are not `None`.
+*  <b>`ValueError`</b>: If both `save_steps` and `save_secs` are `None`.
 
 
 - - -
 
 #### `tf.contrib.learn.monitors.CheckpointSaver.begin(max_steps=None)` {#CheckpointSaver.begin}
 
-Called at the beginning of training.
 
-When called, the default graph is the one we are executing.
-
-##### Args:
-
-
-*  <b>`max_steps`</b>: `int`, the maximum global step this training will run until.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: if we've already begun a run.
 
 
 - - -
@@ -544,55 +538,6 @@ End epoch.
 *  <b>`ValueError`</b>: if we've not begun an epoch, or `epoch` number does not match.
 
 
-- - -
-
-#### `tf.contrib.learn.monitors.CheckpointSaver.every_n_post_step(step, session)` {#CheckpointSaver.every_n_post_step}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.monitors.CheckpointSaver.every_n_step_begin(step)` {#CheckpointSaver.every_n_step_begin}
-
-Callback before every n'th step begins.
-
-##### Args:
-
-
-*  <b>`step`</b>: `int`, the current value of the global step.
-
-##### Returns:
-
-  A `list` of tensors that will be evaluated at this step.
-
-
-- - -
-
-#### `tf.contrib.learn.monitors.CheckpointSaver.every_n_step_end(step, outputs)` {#CheckpointSaver.every_n_step_end}
-
-Callback after every n'th step finished.
-
-This callback provides access to the tensors/ops evaluated at this step,
-including the additional tensors for which evaluation was requested in
-`step_begin`.
-
-In addition, the callback has the opportunity to stop training by returning
-`True`. This is useful for early stopping, for example.
-
-##### Args:
-
-
-*  <b>`step`</b>: `int`, the current value of the global step.
-*  <b>`outputs`</b>: `dict` mapping `string` values representing tensor names to
-    the value resulted from running these tensors. Values may be either
-    scalars, for scalar tensors, or Numpy `array`, for non-scalar tensors.
-
-##### Returns:
-
-  `bool`. True if training should stop.
-
-
 - - -
 
 #### `tf.contrib.learn.monitors.CheckpointSaver.post_step(step, session)` {#CheckpointSaver.post_step}
@@ -628,33 +573,24 @@ A setter called automatically by the target estimator.
 
 #### `tf.contrib.learn.monitors.CheckpointSaver.step_begin(step)` {#CheckpointSaver.step_begin}
 
-Overrides `BaseMonitor.step_begin`.
 
-When overriding this method, you must call the super implementation.
-
-##### Args:
-
-
-*  <b>`step`</b>: `int`, the current value of the global step.
-
-##### Returns:
-
-  A `list`, the result of every_n_step_begin, if that was called this step,
-  or an empty list otherwise.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: if called more than once during a step.
 
 
 - - -
 
 #### `tf.contrib.learn.monitors.CheckpointSaver.step_end(step, output)` {#CheckpointSaver.step_end}
 
-Overrides `BaseMonitor.step_end`.
+Callback after training step finished.
 
-When overriding this method, you must call the super implementation.
+This callback provides access to the tensors/ops evaluated at this step,
+including the additional tensors for which evaluation was requested in
+`step_begin`.
+
+In addition, the callback has the opportunity to stop training by returning
+`True`. This is useful for early stopping, for example.
+
+Note that this method is not called if the call to `Session.run()` that
+followed the last call to `step_begin()` failed.
 
 ##### Args:
 
@@ -666,8 +602,12 @@ When overriding this method, you must call the super implementation.
 
 ##### Returns:
 
-  `bool`, the result of every_n_step_end, if that was called this step,
-  or `False` otherwise.
+  `bool`. True if training should stop.
+
+##### Raises:
+
+
+*  <b>`ValueError`</b>: if we've not begun a step, or `step` number does not match.
 
 
 
@@ -2173,7 +2113,7 @@ A setter called automatically by the target estimator.
 Saves summaries every N steps.
 - - -
 
-#### `tf.contrib.learn.monitors.SummarySaver.__init__(summary_op, save_steps=100, output_dir=None, summary_writer=None)` {#SummarySaver.__init__}
+#### `tf.contrib.learn.monitors.SummarySaver.__init__(summary_op, save_steps=100, output_dir=None, summary_writer=None, scaffold=None)` {#SummarySaver.__init__}
 
 Initializes a `SummarySaver` monitor.
 
@@ -2188,6 +2128,7 @@ Initializes a `SummarySaver` monitor.
       if no `summary_writer` is supplied.
 *  <b>`summary_writer`</b>: `SummaryWriter`. If `None` and an `output_dir` was passed,
       one will be created accordingly.
+*  <b>`scaffold`</b>: `Scaffold` to get summary_op if it's not provided.
 
 
 - - -
diff --git a/tensorflow/g3doc/api_docs/python/contrib.rnn.md b/tensorflow/g3doc/api_docs/python/contrib.rnn.md
index 201e23c66d3..34277d2b093 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.rnn.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.rnn.md
@@ -33,7 +33,7 @@ Initialize the basic LSTM cell.
 
 *  <b>`num_units`</b>: int, The number of units in the LSTM cell.
 *  <b>`forget_bias`</b>: float, The bias added to forget gates (see above).
-*  <b>`use_peephole`</b>: Whether to use peephole connectios or not.
+*  <b>`use_peephole`</b>: Whether to use peephole connections or not.
 
 
 - - -
diff --git a/tensorflow/g3doc/api_docs/python/control_flow_ops.md b/tensorflow/g3doc/api_docs/python/control_flow_ops.md
index 579633aa3b8..9a92c60b850 100644
--- a/tensorflow/g3doc/api_docs/python/control_flow_ops.md
+++ b/tensorflow/g3doc/api_docs/python/control_flow_ops.md
@@ -361,6 +361,9 @@ to your graph.
 
 Returns the truth value of x AND y element-wise.
 
+*NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
@@ -396,6 +399,9 @@ Returns the truth value of NOT x element-wise.
 
 Returns the truth value of x OR y element-wise.
 
+*NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
@@ -427,6 +433,9 @@ operators to your graph.
 
 Returns the truth value of (x == y) element-wise.
 
+*NOTE*: `Equal` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
@@ -445,6 +454,9 @@ Returns the truth value of (x == y) element-wise.
 
 Returns the truth value of (x != y) element-wise.
 
+*NOTE*: `NotEqual` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
@@ -463,6 +475,9 @@ Returns the truth value of (x != y) element-wise.
 
 Returns the truth value of (x < y) element-wise.
 
+*NOTE*: `Less` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
@@ -481,6 +496,9 @@ Returns the truth value of (x < y) element-wise.
 
 Returns the truth value of (x <= y) element-wise.
 
+*NOTE*: `LessEqual` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
@@ -499,6 +517,9 @@ Returns the truth value of (x <= y) element-wise.
 
 Returns the truth value of (x > y) element-wise.
 
+*NOTE*: `Greater` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
@@ -517,6 +538,9 @@ Returns the truth value of (x > y) element-wise.
 
 Returns the truth value of (x >= y) element-wise.
 
+*NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functional_ops.md b/tensorflow/g3doc/api_docs/python/functional_ops.md
index 0de41334647..68366b1a83e 100644
--- a/tensorflow/g3doc/api_docs/python/functional_ops.md
+++ b/tensorflow/g3doc/api_docs/python/functional_ops.md
@@ -16,7 +16,7 @@ map-reduce programming patterns.
 
 - - -
 
-### `tf.map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, swap_memory=False, name=None)` {#map_fn}
+### `tf.map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, swap_memory=False, infer_shape=True, name=None)` {#map_fn}
 
 map on the list of tensors unpacked from `elems` on dimension 0.
 
@@ -58,6 +58,7 @@ nested) tuple of types matching the output of `fn`.
     in parallel.
 *  <b>`back_prop`</b>: (optional) True enables support for back propagation.
 *  <b>`swap_memory`</b>: (optional) True enables GPU-CPU memory swapping.
+*  <b>`infer_shape`</b>: (optional) False disables tests for consistent output shapes.
 *  <b>`name`</b>: (optional) Name prefix for the returned tensors.
 
 ##### Returns:
@@ -191,7 +192,7 @@ of the result tensor is `fn(initializer, values[0]).shape`.
 
 - - -
 
-### `tf.scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True, swap_memory=False, name=None)` {#scan}
+### `tf.scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True, swap_memory=False, infer_shape=True, name=None)` {#scan}
 
 scan on the list of tensors unpacked from `elems` on dimension 0.
 
@@ -243,6 +244,7 @@ For example, if `elems` is `(t1, [t2, t3])` and `initializer` is
     in parallel.
 *  <b>`back_prop`</b>: (optional) True enables support for back propagation.
 *  <b>`swap_memory`</b>: (optional) True enables GPU-CPU memory swapping.
+*  <b>`infer_shape`</b>: (optional) False disables tests for consistent output shapes.
 *  <b>`name`</b>: (optional) Name prefix for the returned tensors.
 
 ##### Returns:
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.VariableScope.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.VariableScope.md
index 1b8931d726b..60254402eaa 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.VariableScope.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.VariableScope.md
@@ -13,9 +13,10 @@ Attributes:
   partitioner: callable or `None`: the partitioner passed to `get_variable`.
   custom_getter: default custom getter passed to get_variable.
   name_scope: The name passed to `tf.name_scope`.
+  dtype: default type passed to get_variable (defaults to DT_FLOAT).
 - - -
 
-#### `tf.VariableScope.__init__(reuse, name='', initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, name_scope='')` {#VariableScope.__init__}
+#### `tf.VariableScope.__init__(reuse, name='', initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, name_scope='', dtype=tf.float32)` {#VariableScope.__init__}
 
 Creates a new VariableScope with the given properties.
 
@@ -36,7 +37,14 @@ Creates a new VariableScope with the given properties.
 
 - - -
 
-#### `tf.VariableScope.get_variable(var_store, name, shape=None, dtype=tf.float32, initializer=None, regularizer=None, trainable=True, collections=None, caching_device=None, partitioner=None, validate_shape=True, custom_getter=None)` {#VariableScope.get_variable}
+#### `tf.VariableScope.dtype` {#VariableScope.dtype}
+
+
+
+
+- - -
+
+#### `tf.VariableScope.get_variable(var_store, name, shape=None, dtype=None, initializer=None, regularizer=None, trainable=True, collections=None, caching_device=None, partitioner=None, validate_shape=True, custom_getter=None)` {#VariableScope.get_variable}
 
 Gets an existing variable with this name or create a new one.
 
@@ -104,6 +112,13 @@ Set caching_device for this scope.
 Set custom getter for this scope.
 
 
+- - -
+
+#### `tf.VariableScope.set_dtype(dtype)` {#VariableScope.set_dtype}
+
+Set data type for this scope.
+
+
 - - -
 
 #### `tf.VariableScope.set_initializer(initializer)` {#VariableScope.set_initializer}
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.cholesky.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.cholesky.md
index 4032b80d8e0..61e781319d8 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.cholesky.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.cholesky.md
@@ -1,6 +1,6 @@
 ### `tf.cholesky(input, name=None)` {#cholesky}
 
-Calculates the Cholesky decomposition of a square matrix.
+Computes the Cholesky decomposition of a square matrix.
 
 The input has to be symmetric and positive definite. Only the lower-triangular
 part of the input will be used for this operation. The upper-triangular part
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Bernoulli.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Bernoulli.md
index 1c16241d89a..79adadc72c2 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Bernoulli.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Bernoulli.md
@@ -2,10 +2,6 @@ Bernoulli distribution.
 
 The Bernoulli distribution is parameterized by p, the probability of a
 positive event.
-
-Note, the following methods of the base class aren't implemented:
-  * cdf
-  * log_cdf
 - - -
 
 #### `tf.contrib.distributions.Bernoulli.__init__(logits=None, p=None, dtype=tf.int32, validate_args=True, allow_nan_stats=False, name='Bernoulli')` {#Bernoulli.__init__}
@@ -25,10 +21,10 @@ Construct Bernoulli distributions.
 *  <b>`dtype`</b>: dtype for samples.
 *  <b>`validate_args`</b>: Whether to assert that `0 <= p <= 1`. If not validate_args,
    `log_pmf` may return nans.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: A name for this distribution.
 
 ##### Raises:
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Dirichlet.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Dirichlet.md
index 05da054e766..508fa43b59c 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Dirichlet.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Dirichlet.md
@@ -68,16 +68,16 @@ Initialize a batch of Dirichlet distributions.
 ##### Args:
 
 
-*  <b>`alpha`</b>: Positive `float` or `double` tensor with shape broadcastable to
+*  <b>`alpha`</b>: Positive floating point tensor with shape broadcastable to
     `[N1,..., Nm, k]` `m >= 0`.  Defines this as a batch of `N1 x ... x Nm`
      different `k` class Dirichlet distributions.
 *  <b>`validate_args`</b>: Whether to assert valid values for parameters `alpha` and
-    `x` in `prob` and `log_prob`.  If False, correct behavior is not
+    `x` in `prob` and `log_prob`.  If `False`, correct behavior is not
     guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to prefix Ops created by this distribution class.
 
 
@@ -233,7 +233,7 @@ Log of the probability mass function.
 ##### Args:
 
 
-*  <b>`x`</b>: Non-negative `float` or `double`, tensor whose shape can
+*  <b>`x`</b>: Non-negative tensor with dtype `dtype` and whose shape can
     be broadcast with `self.alpha`.  For fixed leading dimensions, the last
     dimension represents counts for the corresponding Dirichlet distribution
     in `self.alpha`. `x` is only legal if it sums up to one.
@@ -302,7 +302,7 @@ The probability mass function.
 ##### Args:
 
 
-*  <b>`x`</b>: Non-negative `float`, `double` tensor whose shape can
+*  <b>`x`</b>: Non-negative tensor with dtype `dtype` and whose shape can
     be broadcast with `self.alpha`.  For fixed leading dimensions, the last
     dimension represents x for the corresponding Dirichlet distribution in
     `self.alpha` and `self.beta`. `x` is only legal if it sums up to one.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.MultivariateNormalCholesky.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.MultivariateNormalCholesky.md
index 3fca9098d2a..82e42910610 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.MultivariateNormalCholesky.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.MultivariateNormalCholesky.md
@@ -56,7 +56,7 @@ factors, such that the covariance of each batch member is `chol chol^T`.
 ##### Args:
 
 
-*  <b>`mu`</b>: `(N+1)-D`  `float` or `double` tensor with shape `[N1,...,Nb, k]`,
+*  <b>`mu`</b>: `(N+1)-D` floating point tensor with shape `[N1,...,Nb, k]`,
     `b >= 0`.
 *  <b>`chol`</b>: `(N+2)-D` `Tensor` with same `dtype` as `mu` and shape
     `[N1,...,Nb, k, k]`.  The upper triangular part is ignored (treated as
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.detach_inputs.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.detach_inputs.md
new file mode 100644
index 00000000000..fdf95a1b8f1
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.detach_inputs.md
@@ -0,0 +1,22 @@
+### `tf.contrib.graph_editor.detach_inputs(sgv, control_inputs=False)` {#detach_inputs}
+
+Detach the inputs of a subgraph view.
+
+##### Args:
+
+
+*  <b>`sgv`</b>: the subgraph view to be detached. This argument is converted to a
+    subgraph using the same rules as the function subgraph.make_view.
+*  <b>`control_inputs`</b>: if True control_inputs are also detached.
+
+##### Returns:
+
+  A new subgraph view of the detached subgraph.
+    Note that sgv is also modified in place.
+
+##### Raises:
+
+
+*  <b>`StandardError`</b>: if sgv cannot be converted to a SubGraphView using
+    the same rules than the function subgraph.make_view.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.reroute_a2b_outputs.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.reroute_a2b_outputs.md
new file mode 100644
index 00000000000..0bf41935968
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.reroute_a2b_outputs.md
@@ -0,0 +1,4 @@
+### `tf.contrib.graph_editor.reroute_a2b_outputs(sgv0, sgv1)` {#reroute_a2b_outputs}
+
+Re-route all the outputs of sgv0 to sgv1 (see _reroute_outputs).
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.select_ops.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.select_ops.md
new file mode 100644
index 00000000000..44660ef243a
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.select_ops.md
@@ -0,0 +1,30 @@
+### `tf.contrib.graph_editor.select_ops(*args, **kwargs)` {#select_ops}
+
+Helper to select operations.
+
+##### Args:
+
+
+*  <b>`*args`</b>: list of 1) regular expressions (compiled or not) or  2) (array of)
+    tf.Operation. tf.Tensor instances are silently ignored.
+*  <b>`**kwargs`</b>: 'graph': tf.Graph in which to perform the regex query.This is
+    required when using regex.
+    'positive_filter': an elem if selected only if positive_filter(elem) is
+      True. This is optional.
+    'restrict_ops_regex': a regular expression is ignored if it doesn't start
+      with the substring "(?#ops)".
+
+##### Returns:
+
+  list of tf.Operation
+
+##### Raises:
+
+
+*  <b>`TypeError`</b>: if the optional keyword argument graph is not a tf.Graph
+    or if an argument in args is not an (array of) tf.Operation
+    or an (array of) tf.Tensor (silently ignored) or a string
+    or a regular expression.
+*  <b>`ValueError`</b>: if one of the keyword arguments is unexpected or if a regular
+    expression is used without passing a graph as a keyword argument.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.swap_inputs.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.swap_inputs.md
new file mode 100644
index 00000000000..bd18c89d6b2
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.swap_inputs.md
@@ -0,0 +1,4 @@
+### `tf.contrib.graph_editor.swap_inputs(sgv0, sgv1)` {#swap_inputs}
+
+Swap all the inputs of sgv0 and sgv1 (see reroute_inputs).
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md
index 6492f54565b..14aad5b0ccb 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md
@@ -54,9 +54,9 @@ Construct a `LinearRegressor` estimator object.
 *  <b>`feature_columns`</b>: An iterable containing all the feature columns used by
     the model. All items in the set should be instances of classes derived
     from `FeatureColumn`.
-*  <b>`model_dir`</b>: Directory to save model parameters, graph, etc. This can also
-    be used to load checkpoints from the directory into a estimator to continue
-    training a previously saved model.
+*  <b>`model_dir`</b>: Directory to save model parameters, graph, etc. This can
+    also be used to load checkpoints from the directory into a estimator
+    to continue training a previously saved model.
 *  <b>`weight_column_name`</b>: A string defining feature column name representing
     weights. It is used to down weight or boost examples during training. It
     will be multiplied by the loss of the example.
@@ -101,56 +101,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.LinearRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#LinearRegressor.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
@@ -164,37 +115,7 @@ for which this evaluation was performed.
 
 #### `tf.contrib.learn.LinearRegressor.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#LinearRegressor.fit}
 
-Trains a model given training data `x` predictions and `y` targets.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`steps`</b>: Number of steps for which to train model. If `None`, train forever.
-    If set, `max_steps` must be `None`.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`. Must be `None` if `input_fn` is provided.
-*  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
-    inside the training loop.
-*  <b>`max_steps`</b>: Number of total steps for which to train model. If `None`,
-    train forever. If set, `steps` must be `None`.
-
-    Two calls to `fit(steps=100)` means 200 training
-    iterations. On the other hand, two calls to `fit(max_steps=100)` means
-    that the second call will not do any iteration since first call did
-    all 100 steps.
-
-##### Returns:
-
-  `self`, for chaining.
+See `Trainable`.
 
 ##### Raises:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.cumprod.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.cumprod.md
index a226ce07373..7381350be38 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.cumprod.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.cumprod.md
@@ -2,13 +2,15 @@
 
 Compute the cumulative product of the tensor `x` along `axis`.
 
-By default, this op performs an inclusive cumprod, which means that the first
+By default, this op performs an inclusive cumprod, which means that the
+first
 element of the input is identical to the first element of the output:
 ```prettyprint
 tf.cumprod([a, b, c]) ==> [a, a * b, a * b * c]
 ```
 
-By setting the `exclusive` kwarg to `True`, an exclusive cumprod is performed
+By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
+performed
 instead:
 ```prettyprint
 tf.cumprod([a, b, c], exclusive=True) ==> [0, a, a * b]
@@ -30,8 +32,8 @@ tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 0]
 
 
 *  <b>`x`</b>: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-   `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-   `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+     `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+     `complex128`, `qint8`, `quint8`, `qint32`, `half`.
 *  <b>`axis`</b>: A `Tensor` of type `int32` (default: 0).
 *  <b>`reverse`</b>: A `bool` (default: False).
 *  <b>`name`</b>: A name for the operation (optional).
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.mod.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.mod.md
index 5bfe1058a77..86978890b5a 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.mod.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.mod.md
@@ -2,6 +2,9 @@
 
 Returns element-wise remainder of division.
 
+*NOTE*: `Mod` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.mul.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.mul.md
index 3d6fa568645..2efd16e8915 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.mul.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.mul.md
@@ -2,6 +2,9 @@
 
 Returns x * y element-wise.
 
+*NOTE*: `Mul` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.not_equal.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.not_equal.md
index 9c187922232..5ed8df49d5c 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.not_equal.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.not_equal.md
@@ -2,6 +2,9 @@
 
 Returns the truth value of (x != y) element-wise.
 
+*NOTE*: `NotEqual` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.batch_matrix_inverse.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.batch_matrix_inverse.md
index 231056a05c2..6b51df6aec7 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.batch_matrix_inverse.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.batch_matrix_inverse.md
@@ -1,6 +1,6 @@
 ### `tf.batch_matrix_inverse(input, adjoint=None, name=None)` {#batch_matrix_inverse}
 
-Calculates the inverse of square invertible matrices or their adjoints
+Computes the inverse of square invertible matrices or their adjoints
 
 (conjugate transposes).
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.batch_self_adjoint_eigvals.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.batch_self_adjoint_eigvals.md
new file mode 100644
index 00000000000..77cdaf3ec3c
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.batch_self_adjoint_eigvals.md
@@ -0,0 +1,16 @@
+### `tf.batch_self_adjoint_eigvals(tensor, name=None)` {#batch_self_adjoint_eigvals}
+
+Computes the eigenvalues of a batch of self-adjoint matrices.
+
+##### Args:
+
+
+*  <b>`tensor`</b>: `Tensor` of shape `[..., N, N]`.
+*  <b>`name`</b>: string, optional name of the operation.
+
+##### Returns:
+
+
+*  <b>`e`</b>: Eigenvalues. Shape is `[..., N]`. The vector `e[..., :]` contains the `N`
+    eigenvalues of `tensor[..., :, :]`.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.MultivariateNormalDiag.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.MultivariateNormalDiag.md
index ea3e42eb2f3..8d26e98d154 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.MultivariateNormalDiag.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.MultivariateNormalDiag.md
@@ -56,7 +56,7 @@ The mean of `X_i` is `mu[i]`, and the standard deviation is `diag_stdev[i]`.
 ##### Args:
 
 
-*  <b>`mu`</b>: Rank `N + 1` `float` or `double` tensor with shape `[N1,...,Nb, k]`,
+*  <b>`mu`</b>: Rank `N + 1` floating point tensor with shape `[N1,...,Nb, k]`,
     `b >= 0`.
 *  <b>`diag_stdev`</b>: Rank `N + 1` `Tensor` with same `dtype` and shape as `mu`,
     representing the standard deviations.  Must be positive.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.StudentT.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.StudentT.md
index 0b00a17938d..c43058d8870 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.StudentT.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.StudentT.md
@@ -57,19 +57,19 @@ broadcasting (e.g. `df + mu + sigma` is a valid operation).
 ##### Args:
 
 
-*  <b>`df`</b>: `float` or `double` tensor, the degrees of freedom of the
+*  <b>`df`</b>: Floating point tensor, the degrees of freedom of the
     distribution(s). `df` must contain only positive values.
-*  <b>`mu`</b>: `float` or `double` tensor, the means of the distribution(s).
-*  <b>`sigma`</b>: `float` or `double` tensor, the scaling factor for the
+*  <b>`mu`</b>: Floating point tensor, the means of the distribution(s).
+*  <b>`sigma`</b>: Floating point tensor, the scaling factor for the
     distribution(s). `sigma` must contain only positive values.
     Note that `sigma` is not the standard deviation of this distribution.
 *  <b>`validate_args`</b>: Whether to assert that `df > 0, sigma > 0`. If
-    `validate_args` is False and inputs are invalid, correct behavior is not
-    guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+    `validate_args` is `False` and inputs are invalid, correct behavior is
+    not guaranteed.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to give Ops created by the initializer.
 
 ##### Raises:
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.graph_editor.ops.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.graph_editor.ops.md
new file mode 100644
index 00000000000..d579ac9a46e
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.graph_editor.ops.md
@@ -0,0 +1,30 @@
+### `tf.contrib.graph_editor.ops(*args, **kwargs)` {#ops}
+
+Helper to select operations.
+
+##### Args:
+
+
+*  <b>`*args`</b>: list of 1) regular expressions (compiled or not) or  2) (array of)
+    tf.Operation. tf.Tensor instances are silently ignored.
+*  <b>`**kwargs`</b>: 'graph': tf.Graph in which to perform the regex query.This is
+    required when using regex.
+    'positive_filter': an elem if selected only if positive_filter(elem) is
+      True. This is optional.
+    'restrict_ops_regex': a regular expression is ignored if it doesn't start
+      with the substring "(?#ops)".
+
+##### Returns:
+
+  list of tf.Operation
+
+##### Raises:
+
+
+*  <b>`TypeError`</b>: if the optional keyword argument graph is not a tf.Graph
+    or if an argument in args is not an (array of) tf.Operation
+    or an (array of) tf.Tensor (silently ignored) or a string
+    or a regular expression.
+*  <b>`ValueError`</b>: if one of the keyword arguments is unexpected or if a regular
+    expression is used without passing a graph as a keyword argument.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.graph_editor.reroute_a2b_inputs.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.graph_editor.reroute_a2b_inputs.md
new file mode 100644
index 00000000000..0f82675ef90
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.graph_editor.reroute_a2b_inputs.md
@@ -0,0 +1,4 @@
+### `tf.contrib.graph_editor.reroute_a2b_inputs(sgv0, sgv1)` {#reroute_a2b_inputs}
+
+Re-route all the inputs of sgv0 to sgv1 (see reroute_inputs).
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md
index eb2c56ad076..c3cbf1d862c 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md
@@ -71,9 +71,9 @@ Construct a `LinearClassifier` estimator object.
 *  <b>`feature_columns`</b>: An iterable containing all the feature columns used by
     the model. All items in the set should be instances of classes derived
     from `FeatureColumn`.
-*  <b>`model_dir`</b>: Directory to save model parameters, graph and etc. This can also
-    be used to load checkpoints from the directory into a estimator to continue
-    training a previously saved model.
+*  <b>`model_dir`</b>: Directory to save model parameters, graph and etc. This can
+    also be used to load checkpoints from the directory into a estimator
+    to continue training a previously saved model.
 *  <b>`n_classes`</b>: number of target classes. Default is binary classification.
 *  <b>`weight_column_name`</b>: A string defining feature column name representing
     weights. It is used to down weight or boost examples during training. It
@@ -119,56 +119,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.LinearClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#LinearClassifier.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
@@ -182,37 +133,7 @@ for which this evaluation was performed.
 
 #### `tf.contrib.learn.LinearClassifier.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#LinearClassifier.fit}
 
-Trains a model given training data `x` predictions and `y` targets.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`steps`</b>: Number of steps for which to train model. If `None`, train forever.
-    If set, `max_steps` must be `None`.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`. Must be `None` if `input_fn` is provided.
-*  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
-    inside the training loop.
-*  <b>`max_steps`</b>: Number of total steps for which to train model. If `None`,
-    train forever. If set, `steps` must be `None`.
-
-    Two calls to `fit(steps=100)` means 200 training
-    iterations. On the other hand, two calls to `fit(max_steps=100)` means
-    that the second call will not do any iteration since first call did
-    all 100 steps.
-
-##### Returns:
-
-  `self`, for chaining.
+See `Trainable`.
 
 ##### Raises:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.greater_equal.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.greater_equal.md
index 9d68429c36c..d6ce057c133 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.greater_equal.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.greater_equal.md
@@ -2,6 +2,9 @@
 
 Returns the truth value of (x >= y) element-wise.
 
+*NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.map_fn.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.map_fn.md
index 40a4332531b..dd98fd9dd8a 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.map_fn.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.map_fn.md
@@ -1,4 +1,4 @@
-### `tf.map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, swap_memory=False, name=None)` {#map_fn}
+### `tf.map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, swap_memory=False, infer_shape=True, name=None)` {#map_fn}
 
 map on the list of tensors unpacked from `elems` on dimension 0.
 
@@ -40,6 +40,7 @@ nested) tuple of types matching the output of `fn`.
     in parallel.
 *  <b>`back_prop`</b>: (optional) True enables support for back propagation.
 *  <b>`swap_memory`</b>: (optional) True enables GPU-CPU memory swapping.
+*  <b>`infer_shape`</b>: (optional) False disables tests for consistent output shapes.
 *  <b>`name`</b>: (optional) Name prefix for the returned tensors.
 
 ##### Returns:
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.batch_matrix_determinant.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.batch_matrix_determinant.md
index d55bf96f187..a30b74e35cc 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.batch_matrix_determinant.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.batch_matrix_determinant.md
@@ -1,6 +1,6 @@
 ### `tf.batch_matrix_determinant(input, name=None)` {#batch_matrix_determinant}
 
-Calculates the determinants for a batch of square matrices.
+Computes the determinants for a batch of square matrices.
 
 The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
 form square matrices. The output is a tensor containing the determinants
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Categorical.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Categorical.md
index 052af1eb55e..a207a1112ec 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Categorical.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Categorical.md
@@ -2,11 +2,6 @@ Categorical distribution.
 
 The categorical distribution is parameterized by the log-probabilities
 of a set of classes.
-
-Note, the following methods of the base class aren't implemented:
-  * mean
-  * cdf
-  * log_cdf
 - - -
 
 #### `tf.contrib.distributions.Categorical.__init__(logits, dtype=tf.int32, validate_args=True, allow_nan_stats=False, name='Categorical')` {#Categorical.__init__}
@@ -22,10 +17,10 @@ Initialize Categorical distributions using class log-probabilities.
       indexes into the classes.
 *  <b>`dtype`</b>: The type of the event samples (default: int32).
 *  <b>`validate_args`</b>: Unused in this distribution.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: A name for this distribution (optional).
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Chi2.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Chi2.md
index 2f692a15f9c..f01b075d05a 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Chi2.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Chi2.md
@@ -15,15 +15,15 @@ Construct Chi2 distributions with parameter `df`.
 ##### Args:
 
 
-*  <b>`df`</b>: `float` or `double` tensor, the degrees of freedom of the
+*  <b>`df`</b>: Floating point tensor, the degrees of freedom of the
     distribution(s).  `df` must contain only positive values.
 *  <b>`validate_args`</b>: Whether to assert that `df > 0`, and that `x > 0` in the
-    methods `prob(x)` and `log_prob(x)`. If `validate_args` is False
+    methods `prob(x)` and `log_prob(x)`. If `validate_args` is `False`
     and the inputs are invalid, correct behavior is not guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to prepend to all ops created by this distribution.
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Uniform.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Uniform.md
index 9862309eed4..9eea17257d9 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Uniform.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Uniform.md
@@ -31,14 +31,14 @@ u1 = Uniform(3.0, [5.0, 6.0, 7.0])  # 3 distributions
 ##### Args:
 
 
-*  <b>`a`</b>: `float` or `double` tensor, the minimum endpoint.
-*  <b>`b`</b>: `float` or `double` tensor, the maximum endpoint. Must be > `a`.
-*  <b>`validate_args`</b>: Whether to assert that `a > b`. If `validate_args` is False
-    and inputs are invalid, correct behavior is not guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`a`</b>: Floating point tensor, the minimum endpoint.
+*  <b>`b`</b>: Floating point tensor, the maximum endpoint. Must be > `a`.
+*  <b>`validate_args`</b>: Whether to assert that `a > b`. If `validate_args` is
+    `False` and inputs are invalid, correct behavior is not guaranteed.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to prefix Ops created by this distribution class.
 
 ##### Raises:
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.framework.deprecated_arg_values.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.framework.deprecated_arg_values.md
new file mode 100644
index 00000000000..285ea14f96e
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.framework.deprecated_arg_values.md
@@ -0,0 +1,35 @@
+### `tf.contrib.framework.deprecated_arg_values(date, instructions, **deprecated_kwargs)` {#deprecated_arg_values}
+
+Decorator for marking specific function argument values as deprecated.
+
+This decorator logs a deprecation warning whenever the decorated function is
+called with the deprecated argument values. It has the following format:
+
+  Calling <function> (from <module>) with <arg>=<value> is deprecated and
+  will be removed after <date>. Instructions for updating:
+    <instructions>
+
+<function> will include the class name if it is a method.
+
+It also edits the docstring of the function: ' (deprecated arguments)' is
+appended to the first line of the docstring and a deprecation notice is
+prepended to the rest of the docstring.
+
+##### Args:
+
+
+*  <b>`date`</b>: String. The date the function is scheduled to be removed. Must be
+    ISO 8601 (YYYY-MM-DD).
+*  <b>`instructions`</b>: String. Instructions on how to update code using the
+    deprecated function.
+*  <b>`**deprecated_kwargs`</b>: The deprecated argument values.
+
+##### Returns:
+
+  Decorated function or method.
+
+##### Raises:
+
+
+*  <b>`ValueError`</b>: If date is not in ISO 8601 format, or instructions are empty.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.graph_editor.SubGraphView.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.graph_editor.SubGraphView.md
new file mode 100644
index 00000000000..bf2ecc56456
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.graph_editor.SubGraphView.md
@@ -0,0 +1,391 @@
+A subgraph view on an existing tf.Graph.
+
+An instance of this class is a subgraph view on an existing tf.Graph.
+"subgraph" means that it can represent part of the whole tf.Graph.
+"view" means that it only provides a passive observation and do not to act
+on the tf.Graph. Note that in this documentation, the term "subgraph" is often
+used as substitute to "subgraph view".
+
+A subgraph contains:
+- a list of input tensors, accessible via the "inputs" property.
+- a list of output tensors, accessible via the "outputs" property.
+- and the operations in between, accessible via the "ops" property.
+
+An subgraph can be seen as a function F(i0, i1, ...) -> o0, o1, ... It is a
+function which takes as input some input tensors and returns as output some
+output tensors. The computation that the function performs is encoded in the
+operations of the subgraph.
+
+The tensors (input or output) can be of two kinds:
+- connected: a connected tensor connects to at least one operation contained
+in the subgraph. One example is a subgraph representing a single operation
+and its inputs and outputs: all the input and output tensors of the op
+are "connected".
+- passthrough: a passthrough tensor does not connect to any operation
+contained in the subgraph. One example is a subgraph representing a
+single tensor: this tensor is passthrough. By default a passthrough tensor is
+present both in the input and output tensors of the subgraph. It can however
+be remapped to only appear as an input (or output) only.
+
+The input and output tensors can be remapped. For instance, some input tensor
+can be ommited. For instance, a subgraph representing an operation with two
+inputs can be remapped to only take one input. Note that this does not change
+at all the underlying tf.Graph (remember, it is a view). It means that
+the other input is being ignored, or is being treated as "given".
+The analogy with functions can be extended like this: F(x,y) is the original
+function. Remapping the inputs from [x, y] to just [x] means that the subgraph
+now represent the function F_y(x) (y is "given").
+
+The output tensors can also be remapped. For instance, some output tensor can
+be ommited. Other output tensor can be duplicated as well. As mentioned
+before, this does not change at all the underlying tf.Graph.
+The analogy with functions can be extended like this: F(...)->x,y is the
+original function. Remapping the outputs from [x, y] to just [y,y] means that
+the subgraph now represent the function M(F(...)) where M is the function
+M(a,b)->b,b.
+
+It is useful to describe three other kind of tensors:
+- internal: an internal tensor is a tensor connecting operations contained
+in the subgraph. One example in the subgraph representing the two operations
+A and B connected sequentially: -> A -> B ->. The middle arrow is an internal
+tensor.
+- actual input: an input tensor of the subgraph, regardless of whether it is
+  listed in "inputs" or not (masked-out).
+- actual output: an output tensor of the subgraph, regardless of whether it is
+  listed in "outputs" or not (masked-out).
+- hidden input: an actual input which has been masked-out using an
+  input remapping. In other word, a hidden input is a non-internal tensor
+  not listed as a input tensor and one of whose consumers belongs to
+  the subgraph.
+- hidden output: a actual output which has been masked-out using an output
+  remapping. In other word, a hidden output is a non-internal tensor
+  not listed as an output and one of whose generating operations belongs to
+  the subgraph.
+
+Here are some usefull guarantees about an instance of a SubGraphView:
+- the input (or output) tensors are not internal.
+- the input (or output) tensors are either "connected" or "passthrough".
+- the passthrough tensors are not connected to any of the operation of
+the subgraph.
+
+Note that there is no guarantee that an operation in a subgraph contributes
+at all to its inputs or outputs. For instance, remapping both the inputs and
+outputs to empty lists will produce a subgraph which still contains all the
+original operations. However, the remove_unused_ops function can be used to
+make a new subgraph view whose operations are connected to at least one of
+the input or output tensors.
+
+An instance of this class is meant to be a lightweight object which is not
+modified in-place by the user. Rather, the user can create new modified
+instances of a given subgraph. In that sense, the class SubGraphView is meant
+to be used like an immutable python object.
+
+A common problem when using views is that they can get out-of-sync with the
+data they observe (in this case, a tf.Graph). This is up to the user to insure
+that this doesn't happen. To keep on the safe sife, it is recommended that
+the life time of subgraph views are kept very short. One way to achieve this
+is to use subgraphs within a "with make_sgv(...) as sgv:" Python context.
+
+To alleviate the out-of-sync problem, some functions are granted the right to
+modified subgraph in place. This is typically the case of graph manipulation
+functions which, given some subgraphs as arguments, can modify the underlying
+tf.Graph. Since this modification is likely to render the subgraph view
+invalid, those functions can modify the argument in place to reflect the
+change. For instance, calling the function swap_inputs(svg0, svg1) will modify
+svg0 and svg1 in place to reflect the fact that their inputs have now being
+swapped.
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.__init__(inside_ops=(), passthrough_ts=())` {#SubGraphView.__init__}
+
+Create a subgraph containing the given ops and the "passthrough" tensors.
+
+##### Args:
+
+
+*  <b>`inside_ops`</b>: an object convertible to a list of tf.Operation. This list
+    defines all the operations in the subgraph.
+*  <b>`passthrough_ts`</b>: an object convertible to a list of tf.Tensor. This list
+    define all the "passthrough" tensors. A passthrough tensor is a tensor
+    which goes directly from the input of the subgraph to it output, without
+    any intermediate operations. All the non passthrough tensors are
+    silently ignored.
+
+##### Raises:
+
+
+*  <b>`TypeError`</b>: if inside_ops cannot be converted to a list of tf.Operation or
+    if passthrough_ts cannot be converted to a list of tf.Tensor.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.connected_inputs` {#SubGraphView.connected_inputs}
+
+The connected input tensors of this subgraph view.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.connected_outputs` {#SubGraphView.connected_outputs}
+
+The connected output tensors of this subgraph view.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.consumers()` {#SubGraphView.consumers}
+
+Return a Python set of all the consumers of this subgraph view.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.copy()` {#SubGraphView.copy}
+
+Return a copy of itself.
+
+Note that this class is a "view", copying it only create another view and
+does not copy the underlying part of the tf.Graph.
+
+##### Returns:
+
+  a new instance identical to the original one.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.find_op_by_name(op_name)` {#SubGraphView.find_op_by_name}
+
+Return the op named op_name.
+
+##### Args:
+
+
+*  <b>`op_name`</b>: the name to search for
+
+##### Returns:
+
+  The op named op_name.
+
+##### Raises:
+
+
+*  <b>`ValueError`</b>: if the op_name could not be found.
+*  <b>`AssertionError`</b>: if the name was found multiple time.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.graph` {#SubGraphView.graph}
+
+The underlying tf.Graph.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.input_index(t)` {#SubGraphView.input_index}
+
+Find the input index corresponding to the given input tensor t.
+
+##### Args:
+
+
+*  <b>`t`</b>: the input tensor of this subgraph view.
+
+##### Returns:
+
+  the index in the self.inputs list.
+
+##### Raises:
+
+
+*  <b>`Error`</b>: if t in not an input tensor.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.inputs` {#SubGraphView.inputs}
+
+The input tensors of this subgraph view.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.is_passthrough(t)` {#SubGraphView.is_passthrough}
+
+Check whether a tensor is passthrough.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.op(op_id)` {#SubGraphView.op}
+
+Get an op by its index.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.ops` {#SubGraphView.ops}
+
+The operations in this subgraph view.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.output_index(t)` {#SubGraphView.output_index}
+
+Find the output index corresponding to given output tensor t.
+
+##### Args:
+
+
+*  <b>`t`</b>: the output tensor of this subgraph view.
+
+##### Returns:
+
+  the index in the self.outputs list.
+
+##### Raises:
+
+
+*  <b>`Error`</b>: if t in not an output tensor.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.outputs` {#SubGraphView.outputs}
+
+The output tensors of this subgraph view.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.passthroughs` {#SubGraphView.passthroughs}
+
+The passthrough tensors, going straight from input to output.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.remap(new_input_indices=None, new_output_indices=None)` {#SubGraphView.remap}
+
+Remap the inputs and outputs of the subgraph.
+
+Note that this is only modifying the view: the underlying tf.Graph is not
+affected.
+
+##### Args:
+
+
+*  <b>`new_input_indices`</b>: an iterable of integers representing a mapping between
+    the old inputs and the new ones. This mapping can be under-complete and
+    must be without repetitions.
+*  <b>`new_output_indices`</b>: an iterable of integers representing a mapping between
+    the old outputs and the new ones. This mapping can be under-complete and
+    can have repetitions.
+
+##### Returns:
+
+  A new modified instance of the original subgraph view with remapped
+    inputs and outputs.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.remap_default(remove_input_map=True, remove_output_map=True)` {#SubGraphView.remap_default}
+
+Remap the inputs and/or outputs to the default mapping.
+
+##### Args:
+
+
+*  <b>`remove_input_map`</b>: if True the input map is reset to the default one.
+*  <b>`remove_output_map`</b>: if True the output map is reset to the default one.
+
+##### Returns:
+
+  A new modified instance of the original subgraph view with its
+    input and/or output mapping reset to the default one.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.remap_inputs(new_input_indices)` {#SubGraphView.remap_inputs}
+
+Remap the inputs of the subgraph.
+
+If the inputs of the original subgraph are [t0, t1, t2], remapping to [2,0]
+will create a new instance whose inputs is [t2, t0].
+
+Note that this is only modifying the view: the underlying tf.Graph is not
+affected.
+
+##### Args:
+
+
+*  <b>`new_input_indices`</b>: an iterable of integers representing a mapping between
+    the old inputs and the new ones. This mapping can be under-complete and
+    must be without repetitions.
+
+##### Returns:
+
+  A new modified instance of the original subgraph view with remapped
+    inputs.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.remap_outputs(new_output_indices)` {#SubGraphView.remap_outputs}
+
+Remap the output of the subgraph.
+
+If the output of the original subgraph are [t0, t1, t2], remapping to
+[1,1,0] will create a new instance whose outputs is [t1, t1, t0].
+
+Note that this is only modifying the view: the underlying tf.Graph is not
+affected.
+
+##### Args:
+
+
+*  <b>`new_output_indices`</b>: an iterable of integers representing a mapping between
+    the old outputs and the new ones. This mapping can be under-complete and
+    can have repetitions.
+
+##### Returns:
+
+  A new modified instance of the original subgraph view with remapped
+    outputs.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.remap_outputs_make_unique()` {#SubGraphView.remap_outputs_make_unique}
+
+Remap the outputs so that all the tensors appears only once.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.remap_outputs_to_consumers()` {#SubGraphView.remap_outputs_to_consumers}
+
+Remap the outputs to match the number of consumers.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.SubGraphView.remove_unused_ops(control_inputs=True)` {#SubGraphView.remove_unused_ops}
+
+Remove unused ops.
+
+##### Args:
+
+
+*  <b>`control_inputs`</b>: if True, control inputs are used to detect used ops.
+
+##### Returns:
+
+  A new subgraph view which only contains used operations.
+
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.graph_editor.copy.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.graph_editor.copy.md
new file mode 100644
index 00000000000..0d3ac62e34e
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.graph_editor.copy.md
@@ -0,0 +1,24 @@
+### `tf.contrib.graph_editor.copy(sgv, dst_graph=None, dst_scope='', src_scope='')` {#copy}
+
+Copy a subgraph.
+
+##### Args:
+
+
+*  <b>`sgv`</b>: the source subgraph-view. This argument is converted to a subgraph
+    using the same rules than the function subgraph.make_view.
+*  <b>`dst_graph`</b>: the destination graph.
+*  <b>`dst_scope`</b>: the destination scope.
+*  <b>`src_scope`</b>: the source scope.
+
+##### Returns:
+
+  the subgraph view of the copied subgraph.
+
+##### Raises:
+
+
+*  <b>`TypeError`</b>: if dst_graph is not a tf.Graph.
+*  <b>`StandardError`</b>: if sgv cannot be converted to a SubGraphView using
+    the same rules than the function subgraph.make_view.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.BaseEstimator.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.BaseEstimator.md
index ba7fd7805d5..ca3154b76f5 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.BaseEstimator.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.BaseEstimator.md
@@ -16,9 +16,9 @@ Initializes a BaseEstimator instance.
 ##### Args:
 
 
-*  <b>`model_dir`</b>: Directory to save model parameters, graph and etc. This can also
-    be used to load checkpoints from the directory into a estimator to continue
-    training a previously saved model.
+*  <b>`model_dir`</b>: Directory to save model parameters, graph and etc. This can
+    also be used to load checkpoints from the directory into a estimator to
+    continue training a previously saved model.
 *  <b>`config`</b>: A RunConfig instance.
 
 
@@ -26,56 +26,7 @@ Initializes a BaseEstimator instance.
 
 #### `tf.contrib.learn.BaseEstimator.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#BaseEstimator.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
@@ -89,37 +40,7 @@ for which this evaluation was performed.
 
 #### `tf.contrib.learn.BaseEstimator.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#BaseEstimator.fit}
 
-Trains a model given training data `x` predictions and `y` targets.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`steps`</b>: Number of steps for which to train model. If `None`, train forever.
-    If set, `max_steps` must be `None`.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`. Must be `None` if `input_fn` is provided.
-*  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
-    inside the training loop.
-*  <b>`max_steps`</b>: Number of total steps for which to train model. If `None`,
-    train forever. If set, `steps` must be `None`.
-
-    Two calls to `fit(steps=100)` means 200 training
-    iterations. On the other hand, two calls to `fit(max_steps=100)` means
-    that the second call will not do any iteration since first call did
-    all 100 steps.
-
-##### Returns:
-
-  `self`, for chaining.
+See `Trainable`.
 
 ##### Raises:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.TensorFlowDNNRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.TensorFlowDNNRegressor.md
index ad2b7626ebe..f13720e198d 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.TensorFlowDNNRegressor.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.TensorFlowDNNRegressor.md
@@ -31,56 +31,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.TensorFlowDNNRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowDNNRegressor.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.minimum.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.minimum.md
index bff13483f4d..9bcd03f6e78 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.minimum.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.minimum.md
@@ -1,6 +1,9 @@
 ### `tf.minimum(x, y, name=None)` {#minimum}
 
-Returns the min of x and y (i.e. x < y ? x : y) element-wise, broadcasts.
+Returns the min of x and y (i.e. x < y ? x : y) element-wise.
+
+*NOTE*: `Minimum` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
 
 ##### Args:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.batch_self_adjoint_eig.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.batch_self_adjoint_eig.md
index 19d6c5319f0..fe05ec127a2 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.batch_self_adjoint_eig.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.batch_self_adjoint_eig.md
@@ -1,22 +1,22 @@
-### `tf.batch_self_adjoint_eig(input, name=None)` {#batch_self_adjoint_eig}
+### `tf.batch_self_adjoint_eig(tensor, name=None)` {#batch_self_adjoint_eig}
 
-Calculates the Eigen Decomposition of a batch of square self-adjoint matrices.
+Computes the eigen decomposition of a batch of self-adjoint matrices.
 
-The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-form square matrices, with the same constraints as the single matrix
-SelfAdjointEig.
-
-The result is a '[..., M+1, M] matrix with [..., 0,:] containing the
-eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
+Computes the eigenvalues and eigenvectors of the innermost N-by-N matrices
+in `tensor` such that
+`tensor[...,:,:] * v[..., :,i] = e(..., i) * v[...,:,i]`, for i=0...N-1.
 
 ##### Args:
 
 
-*  <b>`input`</b>: A `Tensor`. Must be one of the following types: `float64`, `float32`.
-    Shape is `[..., M, M]`.
-*  <b>`name`</b>: A name for the operation (optional).
+*  <b>`tensor`</b>: `Tensor` of shape `[..., N, N]`.
+*  <b>`name`</b>: string, optional name of the operation.
 
 ##### Returns:
 
-  A `Tensor`. Has the same type as `input`. Shape is `[..., M+1, M]`.
+
+*  <b>`e`</b>: Eigenvalues. Shape is `[..., N]`.
+*  <b>`v`</b>: Eigenvectors. Shape is `[..., N, N]`. The columns of the inner most
+  matrices
+    contain eigenvectors of the corresponding matrices in `tensor`
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Binomial.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Binomial.md
new file mode 100644
index 00000000000..96d194944e1
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Binomial.md
@@ -0,0 +1,401 @@
+Binomial distribution.
+
+This distribution is parameterized by a vector `p` of probabilities and `n`,
+the total counts.
+
+#### Mathematical details
+
+The Binomial is a distribution over the number of successes in `n` independent
+trials, with each trial having the same probability of success `p`.
+The probability mass function (pmf):
+
+```pmf(k) = n! / (k! * (n - k)!) * (p)^k * (1 - p)^(n - k)```
+
+#### Examples
+
+Create a single distribution, corresponding to 5 coin flips.
+
+```python
+dist = Binomial(n=5., p=.5)
+```
+
+Create a single distribution (using logits), corresponding to 5 coin flips.
+
+```python
+dist = Binomial(n=5., logits=0.)
+```
+
+Creates 3 distributions with the third distribution most likely to have
+successes.
+
+```python
+p = [.2, .3, .8]
+# n will be broadcast to [4., 4., 4.], to match p.
+dist = Binomial(n=4., p=p)
+```
+
+The distribution functions can be evaluated on counts.
+
+```python
+# counts same shape as p.
+counts = [1., 2, 3]
+dist.prob(counts)  # Shape [3]
+
+# p will be broadcast to [[.2, .3, .8], [.2, .3, .8]] to match counts.
+counts = [[1., 2, 1], [2, 2, 4]]
+dist.prob(counts)  # Shape [2, 3]
+
+# p will be broadcast to shape [5, 7, 3] to match counts.
+counts = [[...]]  # Shape [5, 7, 3]
+dist.prob(counts)  # Shape [5, 7, 3]
+```
+- - -
+
+#### `tf.contrib.distributions.Binomial.__init__(n, logits=None, p=None, validate_args=True, allow_nan_stats=False, name='Binomial')` {#Binomial.__init__}
+
+Initialize a batch of Binomial distributions.
+
+##### Args:
+
+
+*  <b>`n`</b>: Non-negative floating point tensor with shape broadcastable to
+    `[N1,..., Nm]` with `m >= 0` and the same dtype as `p` or `logits`.
+    Defines this as a batch of `N1 x ... x Nm` different Binomial
+    distributions. Its components should be equal to integer values.
+*  <b>`logits`</b>: Floating point tensor representing the log-odds of a
+    positive event with shape broadcastable to `[N1,..., Nm]` `m >= 0`, and
+    the same dtype as `n`. Each entry represents logits for the probability
+    of success for independent Binomial distributions.
+*  <b>`p`</b>: Positive floating point tensor with shape broadcastable to
+    `[N1,..., Nm]` `m >= 0`, `p in [0, 1]`. Each entry represents the
+    probability of success for independent Binomial distributions.
+*  <b>`validate_args`</b>: Whether to assert valid values for parameters `n` and `p`,
+    and `x` in `prob` and `log_prob`.  If `False`, correct behavior is not
+    guaranteed.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
+*  <b>`name`</b>: The name to prefix Ops created by this distribution class.
+
+
+*  <b>`Examples`</b>: 
+
+```python
+# Define 1-batch of a binomial distribution.
+dist = Binomial(n=2., p=.9)
+
+# Define a 2-batch.
+dist = Binomial(n=[4., 5], p=[.1, .3])
+```
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.allow_nan_stats` {#Binomial.allow_nan_stats}
+
+Boolean describing behavior when a stat is undefined for batch member.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.batch_shape(name='batch_shape')` {#Binomial.batch_shape}
+
+Batch dimensions of this instance as a 1-D int32 `Tensor`.
+
+The product of the dimensions of the `batch_shape` is the number of
+independent distributions of this kind the instance represents.
+
+##### Args:
+
+
+*  <b>`name`</b>: name to give to the op
+
+##### Returns:
+
+  `Tensor` `batch_shape`
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.cdf(value, name='cdf')` {#Binomial.cdf}
+
+Cumulative distribution function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.dtype` {#Binomial.dtype}
+
+dtype of samples from this distribution.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.entropy(name='entropy')` {#Binomial.entropy}
+
+Entropy of the distribution in nats.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.event_shape(name='event_shape')` {#Binomial.event_shape}
+
+Shape of a sample from a single distribution as a 1-D int32 `Tensor`.
+
+##### Args:
+
+
+*  <b>`name`</b>: name to give to the op
+
+##### Returns:
+
+  `Tensor` `event_shape`
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.get_batch_shape()` {#Binomial.get_batch_shape}
+
+`TensorShape` available at graph construction time.
+
+Same meaning as `batch_shape`. May be only partially defined.
+
+##### Returns:
+
+  batch shape
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.get_event_shape()` {#Binomial.get_event_shape}
+
+`TensorShape` available at graph construction time.
+
+Same meaning as `event_shape`. May be only partially defined.
+
+##### Returns:
+
+  event shape
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.is_continuous` {#Binomial.is_continuous}
+
+
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.is_reparameterized` {#Binomial.is_reparameterized}
+
+
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.log_cdf(value, name='log_cdf')` {#Binomial.log_cdf}
+
+Log CDF.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.log_pdf(value, name='log_pdf')` {#Binomial.log_pdf}
+
+Log of the probability density function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.log_pmf(value, name='log_pmf')` {#Binomial.log_pmf}
+
+Log of the probability mass function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.log_prob(counts, name='log_prob')` {#Binomial.log_prob}
+
+`Log(P[counts])`, computed for every batch member.
+
+For each batch member of counts `k`, `P[counts]` is the probability that
+after sampling `n` draws from this Binomial distribution, the number of
+successes is `k`.  Note that different sequences of draws can result in the
+same counts, thus the probability includes a combinatorial coefficient.
+
+##### Args:
+
+
+*  <b>`counts`</b>: Non-negative tensor with dtype `dtype` and whose shape can be
+    broadcast with `self.p` and `self.n`. `counts` is only legal if it is
+    less than or equal to `n` and its components are equal to integer
+    values.
+*  <b>`name`</b>: Name to give this Op, defaults to "log_prob".
+
+##### Returns:
+
+  Log probabilities for each record, shape `[N1,...,Nm]`.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.logits` {#Binomial.logits}
+
+Log-odds.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.mean(name='mean')` {#Binomial.mean}
+
+Mean of the distribution.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.mode(name='mode')` {#Binomial.mode}
+
+Mode of the distribution.
+
+Note that when `(n + 1) * p` is an integer, there are actually two modes.
+Namely, `(n + 1) * p` and `(n + 1) * p - 1` are both modes. Here we return
+only the larger of the two modes.
+
+##### Args:
+
+
+*  <b>`name`</b>: The name for this op.
+
+##### Returns:
+
+  The mode of the Binomial distribution.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.n` {#Binomial.n}
+
+Number of trials.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.name` {#Binomial.name}
+
+Name to prepend to all ops.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.p` {#Binomial.p}
+
+Probability of success.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.pdf(value, name='pdf')` {#Binomial.pdf}
+
+The probability density function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.pmf(value, name='pmf')` {#Binomial.pmf}
+
+The probability mass function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.prob(counts, name='prob')` {#Binomial.prob}
+
+`P[counts]`, computed for every batch member.
+
+
+For each batch member of counts `k`, `P[counts]` is the probability that
+after sampling `n` draws from this Binomial distribution, the number of
+successes is `k`.  Note that different sequences of draws can result in the
+same counts, thus the probability includes a combinatorial coefficient.
+
+##### Args:
+
+
+*  <b>`counts`</b>: Non-negative tensor with dtype `dtype` and whose shape can be
+    broadcast with `self.p` and `self.n`. `counts` is only legal if it is
+    less than or equal to `n` and its components are equal to integer
+    values.
+*  <b>`name`</b>: Name to give this Op, defaults to "prob".
+
+##### Returns:
+
+  Probabilities for each record, shape `[N1,...,Nm]`.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.sample(sample_shape=(), seed=None, name='sample')` {#Binomial.sample}
+
+Generate samples of the specified shape for each batched distribution.
+
+Note that a call to `sample()` without arguments will generate a single
+sample per batched distribution.
+
+##### Args:
+
+
+*  <b>`sample_shape`</b>: `int32` `Tensor` or tuple or list. Shape of the generated
+    samples.
+*  <b>`seed`</b>: Python integer seed for RNG
+*  <b>`name`</b>: name to give to the op.
+
+##### Returns:
+
+
+*  <b>`samples`</b>: a `Tensor` of dtype `self.dtype` and shape
+      `sample_shape + self.batch_shape + self.event_shape`.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.sample_n(n, seed=None, name='sample_n')` {#Binomial.sample_n}
+
+Generate `n` samples.
+
+##### Args:
+
+
+*  <b>`n`</b>: scalar. Number of samples to draw from each distribution.
+*  <b>`seed`</b>: Python integer seed for RNG
+*  <b>`name`</b>: name to give to the op.
+
+##### Returns:
+
+
+*  <b>`samples`</b>: a `Tensor` of shape `(n,) + self.batch_shape + self.event_shape`
+      with values of type `self.dtype`.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.std(name='std')` {#Binomial.std}
+
+Standard deviation of the distribution.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.validate_args` {#Binomial.validate_args}
+
+Boolean describing behavior on invalid input.
+
+
+- - -
+
+#### `tf.contrib.distributions.Binomial.variance(name='variance')` {#Binomial.variance}
+
+Variance of the distribution.
+
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.DirichletMultinomial.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.DirichletMultinomial.md
index f3434ce2990..004dc294dca 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.DirichletMultinomial.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.DirichletMultinomial.md
@@ -74,22 +74,22 @@ Initialize a batch of DirichletMultinomial distributions.
 ##### Args:
 
 
-*  <b>`n`</b>: Non-negative `float` or `double` tensor, whose dtype is the same as
+*  <b>`n`</b>: Non-negative floating point tensor, whose dtype is the same as
     `alpha`. The shape is broadcastable to `[N1,..., Nm]` with `m >= 0`.
     Defines this as a batch of `N1 x ... x Nm` different Dirichlet
-    multinomial distributions. Its components should be equal to integral
+    multinomial distributions. Its components should be equal to integer
     values.
-*  <b>`alpha`</b>: Positive `float` or `double` tensor, whose dtype is the same as
+*  <b>`alpha`</b>: Positive floating point tensor, whose dtype is the same as
     `n` with shape broadcastable to `[N1,..., Nm, k]` `m >= 0`.  Defines
     this as a batch of `N1 x ... x Nm` different `k` class Dirichlet
     multinomial distributions.
 *  <b>`validate_args`</b>: Whether to assert valid values for parameters `alpha` and
-    `n`, and `x` in `prob` and `log_prob`.  If False, correct behavior is
+    `n`, and `x` in `prob` and `log_prob`.  If `False`, correct behavior is
     not guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to prefix Ops created by this distribution class.
 
 
@@ -251,12 +251,11 @@ probability includes a combinatorial coefficient.
 ##### Args:
 
 
-*  <b>`counts`</b>: Non-negative `float` or `double` tensor whose dtype is the same
-    `self` and whose shape can be broadcast with `self.alpha`.  For fixed
-    leading dimensions, the last dimension represents counts for the
-    corresponding Dirichlet Multinomial distribution in `self.alpha`.
-    `counts` is only legal if it sums up to `n` and its components are
-    equal to integral values.
+*  <b>`counts`</b>: Non-negative tensor with dtype `dtype` and whose shape can be
+    broadcast with `self.alpha`.  For fixed leading dimensions, the last
+    dimension represents counts for the corresponding Dirichlet Multinomial
+    distribution in `self.alpha`. `counts` is only legal if it sums up to
+    `n` and its components are equal to integer values.
 *  <b>`name`</b>: Name to give this Op, defaults to "log_prob".
 
 ##### Returns:
@@ -321,12 +320,11 @@ probability includes a combinatorial coefficient.
 ##### Args:
 
 
-*  <b>`counts`</b>: Non-negative `float` or `double` tensor whose dtype is the same
-    `self` and whose shape can be broadcast with `self.alpha`.  For fixed
-    leading dimensions, the last dimension represents counts for the
-    corresponding Dirichlet Multinomial distribution in `self.alpha`.
-    `counts` is only legal if it sums up to `n` and its components are
-    equal to integral values.
+*  <b>`counts`</b>: Non-negative tensor with dtype `dtype` and whose shape can be
+    broadcast with `self.alpha`.  For fixed leading dimensions, the last
+    dimension represents counts for the corresponding Dirichlet Multinomial
+    distribution in `self.alpha`. `counts` is only legal if it sums up to
+    `n` and its components are equal to integer values.
 *  <b>`name`</b>: Name to give this Op, defaults to "prob".
 
 ##### Returns:
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Exponential.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Exponential.md
index e785e49b2d8..745800ba7db 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Exponential.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Exponential.md
@@ -15,15 +15,15 @@ Construct Exponential distribution with parameter `lam`.
 ##### Args:
 
 
-*  <b>`lam`</b>: `float` or `double` tensor, the rate of the distribution(s).
+*  <b>`lam`</b>: Floating point tensor, the rate of the distribution(s).
     `lam` must contain only positive values.
 *  <b>`validate_args`</b>: Whether to assert that `lam > 0`, and that `x > 0` in the
-    methods `prob(x)` and `log_prob(x)`.  If `validate_args` is False
+    methods `prob(x)` and `log_prob(x)`.  If `validate_args` is `False`
     and the inputs are invalid, correct behavior is not guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member. If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to prepend to all ops created by this distribution.
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Gamma.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Gamma.md
index 741d4d8c08d..cc830c5c70d 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Gamma.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Gamma.md
@@ -30,19 +30,19 @@ broadcasting (e.g. `alpha + beta` is a valid operation).
 ##### Args:
 
 
-*  <b>`alpha`</b>: `float` or `double` tensor, the shape params of the
+*  <b>`alpha`</b>: Floating point tensor, the shape params of the
     distribution(s).
     alpha must contain only positive values.
-*  <b>`beta`</b>: `float` or `double` tensor, the inverse scale params of the
+*  <b>`beta`</b>: Floating point tensor, the inverse scale params of the
     distribution(s).
     beta must contain only positive values.
 *  <b>`validate_args`</b>: Whether to assert that `a > 0, b > 0`, and that `x > 0` in
-    the methods `prob(x)` and `log_prob(x)`.  If `validate_args` is False
+    the methods `prob(x)` and `log_prob(x)`.  If `validate_args` is `False`
     and the inputs are invalid, correct behavior is not guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to prepend to all ops created by this distribution.
 
 ##### Raises:
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.InverseGamma.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.InverseGamma.md
index 11b7ce9156c..cf788712cd7 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.InverseGamma.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.InverseGamma.md
@@ -30,18 +30,18 @@ broadcasting (e.g. `alpha + beta` is a valid operation).
 ##### Args:
 
 
-*  <b>`alpha`</b>: `float` or `double` tensor, the shape params of the
+*  <b>`alpha`</b>: Floating point tensor, the shape params of the
     distribution(s).
     alpha must contain only positive values.
-*  <b>`beta`</b>: `float` or `double` tensor, the scale params of the distribution(s).
+*  <b>`beta`</b>: Floating point tensor, the scale params of the distribution(s).
     beta must contain only positive values.
 *  <b>`validate_args`</b>: Whether to assert that `a > 0, b > 0`, and that `x > 0` in
-    the methods `prob(x)` and `log_prob(x)`.  If `validate_args` is False
+    the methods `prob(x)` and `log_prob(x)`.  If `validate_args` is `False`
     and the inputs are invalid, correct behavior is not guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to prepend to all ops created by this distribution.
 
 ##### Raises:
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Multinomial.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Multinomial.md
new file mode 100644
index 00000000000..7ce70d130b5
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Multinomial.md
@@ -0,0 +1,402 @@
+Multinomial distribution.
+
+This distribution is parameterized by a vector `p` of probability
+parameters for `k` classes and `n`, the counts per each class..
+
+#### Mathematical details
+
+The Multinomial is a distribution over k-class count data, meaning
+for each k-tuple of non-negative integer `counts = [n_1,...,n_k]`, we have a
+probability of these draws being made from the distribution.  The distribution
+has hyperparameters `p = (p_1,...,p_k)`, and probability mass
+function (pmf):
+
+```pmf(counts) = n! / (n_1!...n_k!) * (p_1)^n_1*(p_2)^n_2*...(p_k)^n_k```
+
+where above `n = sum_j n_j`, `n!` is `n` factorial.
+
+#### Examples
+
+Create a 3-class distribution, with the 3rd class is most likely to be drawn,
+using logits..
+
+```python
+logits = [-50., -43, 0]
+dist = Multinomial(n=4., logits=logits)
+```
+
+Create a 3-class distribution, with the 3rd class is most likely to be drawn.
+
+```python
+p = [.2, .3, .5]
+dist = Multinomial(n=4., p=p)
+```
+
+The distribution functions can be evaluated on counts.
+
+```python
+# counts same shape as p.
+counts = [1., 0, 3]
+dist.prob(counts)  # Shape []
+
+# p will be broadcast to [[.2, .3, .5], [.2, .3, .5]] to match counts.
+counts = [[1., 2, 1], [2, 2, 0]]
+dist.prob(counts)  # Shape [2]
+
+# p will be broadcast to shape [5, 7, 3] to match counts.
+counts = [[...]]  # Shape [5, 7, 3]
+dist.prob(counts)  # Shape [5, 7]
+```
+
+Create a 2-batch of 3-class distributions.
+
+```python
+p = [[.1, .2, .7], [.3, .3, .4]]  # Shape [2, 3]
+dist = Multinomial(n=[4., 5], p=p)
+
+counts = [[2., 1, 1], [3, 1, 1]]
+dist.prob(counts)  # Shape [2]
+```
+- - -
+
+#### `tf.contrib.distributions.Multinomial.__init__(n, logits=None, p=None, validate_args=True, allow_nan_stats=False, name='Multinomial')` {#Multinomial.__init__}
+
+Initialize a batch of Multinomial distributions.
+
+##### Args:
+
+
+*  <b>`n`</b>: Non-negative floating point tensor with shape broadcastable to
+    `[N1,..., Nm]` with `m >= 0`. Defines this as a batch of
+    `N1 x ... x Nm` different Multinomial distributions.  Its components
+    should be equal to integer values.
+*  <b>`logits`</b>: Floating point tensor representing the log-odds of a
+    positive event with shape broadcastable to `[N1,..., Nm, k], m >= 0`,
+    and the same dtype as `n`. Defines this as a batch of `N1 x ... x Nm`
+    different `k` class Multinomial distributions.
+*  <b>`p`</b>: Positive floating point tensor with shape broadcastable to
+    `[N1,..., Nm, k]` `m >= 0` and same dtype as `n`.  Defines this as
+    a batch of `N1 x ... x Nm` different `k` class Multinomial
+    distributions. `p`'s components in the last portion of its shape should
+    sum up to 1.
+*  <b>`validate_args`</b>: Whether to assert valid values for parameters `n` and `p`,
+    and `x` in `prob` and `log_prob`.  If `False`, correct behavior is not
+    guaranteed.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
+*  <b>`name`</b>: The name to prefix Ops created by this distribution class.
+
+
+*  <b>`Examples`</b>: 
+
+```python
+# Define 1-batch of 2-class multinomial distribution,
+# also known as a Binomial distribution.
+dist = Multinomial(n=2., p=[.1, .9])
+
+# Define a 2-batch of 3-class distributions.
+dist = Multinomial(n=[4., 5], p=[[.1, .3, .6], [.4, .05, .55]])
+```
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.allow_nan_stats` {#Multinomial.allow_nan_stats}
+
+Boolean describing behavior when a stat is undefined for batch member.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.batch_shape(name='batch_shape')` {#Multinomial.batch_shape}
+
+Batch dimensions of this instance as a 1-D int32 `Tensor`.
+
+The product of the dimensions of the `batch_shape` is the number of
+independent distributions of this kind the instance represents.
+
+##### Args:
+
+
+*  <b>`name`</b>: name to give to the op
+
+##### Returns:
+
+  `Tensor` `batch_shape`
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.cdf(value, name='cdf')` {#Multinomial.cdf}
+
+Cumulative distribution function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.dtype` {#Multinomial.dtype}
+
+dtype of samples from this distribution.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.entropy(name='entropy')` {#Multinomial.entropy}
+
+Entropy of the distribution in nats.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.event_shape(name='event_shape')` {#Multinomial.event_shape}
+
+Shape of a sample from a single distribution as a 1-D int32 `Tensor`.
+
+##### Args:
+
+
+*  <b>`name`</b>: name to give to the op
+
+##### Returns:
+
+  `Tensor` `event_shape`
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.get_batch_shape()` {#Multinomial.get_batch_shape}
+
+`TensorShape` available at graph construction time.
+
+Same meaning as `batch_shape`. May be only partially defined.
+
+##### Returns:
+
+  batch shape
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.get_event_shape()` {#Multinomial.get_event_shape}
+
+`TensorShape` available at graph construction time.
+
+Same meaning as `event_shape`. May be only partially defined.
+
+##### Returns:
+
+  event shape
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.is_continuous` {#Multinomial.is_continuous}
+
+
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.is_reparameterized` {#Multinomial.is_reparameterized}
+
+
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.log_cdf(value, name='log_cdf')` {#Multinomial.log_cdf}
+
+Log CDF.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.log_pdf(value, name='log_pdf')` {#Multinomial.log_pdf}
+
+Log of the probability density function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.log_pmf(value, name='log_pmf')` {#Multinomial.log_pmf}
+
+Log of the probability mass function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.log_prob(counts, name='log_prob')` {#Multinomial.log_prob}
+
+`Log(P[counts])`, computed for every batch member.
+
+For each batch of counts `[n_1,...,n_k]`, `P[counts]` is the probability
+that after sampling `n` draws from this Multinomial distribution, the
+number of draws falling in class `j` is `n_j`.  Note that different
+sequences of draws can result in the same counts, thus the probability
+includes a combinatorial coefficient.
+
+##### Args:
+
+
+*  <b>`counts`</b>: Non-negative tensor with dtype `dtype` and whose shape can
+    be broadcast with `self.p` and `self.n`.  For fixed leading dimensions,
+    the last dimension represents counts for the corresponding Multinomial
+    distribution in `self.p`. `counts` is only legal if it sums up to `n`
+    and its components are equal to integer values.
+*  <b>`name`</b>: Name to give this Op, defaults to "log_prob".
+
+##### Returns:
+
+  Log probabilities for each record, shape `[N1,...,Nm]`.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.logits` {#Multinomial.logits}
+
+Log-odds.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.mean(name='mean')` {#Multinomial.mean}
+
+Mean of the distribution.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.mode(name='mode')` {#Multinomial.mode}
+
+Mode of the distribution.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.n` {#Multinomial.n}
+
+Number of trials.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.name` {#Multinomial.name}
+
+Name to prepend to all ops.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.p` {#Multinomial.p}
+
+Event probabilities.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.pdf(value, name='pdf')` {#Multinomial.pdf}
+
+The probability density function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.pmf(value, name='pmf')` {#Multinomial.pmf}
+
+The probability mass function.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.prob(counts, name='prob')` {#Multinomial.prob}
+
+`P[counts]`, computed for every batch member.
+
+For each batch of counts `[n_1,...,n_k]`, `P[counts]` is the probability
+that after sampling `n` draws from this Multinomial distribution, the
+number of draws falling in class `j` is `n_j`.  Note that different
+sequences of draws can result in the same counts, thus the probability
+includes a combinatorial coefficient.
+
+##### Args:
+
+
+*  <b>`counts`</b>: Non-negative tensor with dtype `dtype` and whose shape can
+    be broadcast with `self.p` and `self.n`.  For fixed leading dimensions,
+    the last dimension represents counts for the corresponding Multinomial
+    distribution in `self.p`. `counts` is only legal if it sums up to `n`
+    and its components are equal to integer values.
+*  <b>`name`</b>: Name to give this Op, defaults to "prob".
+
+##### Returns:
+
+  Probabilities for each record, shape `[N1,...,Nm]`.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.sample(sample_shape=(), seed=None, name='sample')` {#Multinomial.sample}
+
+Generate samples of the specified shape for each batched distribution.
+
+Note that a call to `sample()` without arguments will generate a single
+sample per batched distribution.
+
+##### Args:
+
+
+*  <b>`sample_shape`</b>: `int32` `Tensor` or tuple or list. Shape of the generated
+    samples.
+*  <b>`seed`</b>: Python integer seed for RNG
+*  <b>`name`</b>: name to give to the op.
+
+##### Returns:
+
+
+*  <b>`samples`</b>: a `Tensor` of dtype `self.dtype` and shape
+      `sample_shape + self.batch_shape + self.event_shape`.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.sample_n(n, seed=None, name='sample_n')` {#Multinomial.sample_n}
+
+Generate `n` samples.
+
+##### Args:
+
+
+*  <b>`n`</b>: scalar. Number of samples to draw from each distribution.
+*  <b>`seed`</b>: Python integer seed for RNG
+*  <b>`name`</b>: name to give to the op.
+
+##### Returns:
+
+
+*  <b>`samples`</b>: a `Tensor` of shape `(n,) + self.batch_shape + self.event_shape`
+      with values of type `self.dtype`.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.std(name='std')` {#Multinomial.std}
+
+Standard deviation of the distribution.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.validate_args` {#Multinomial.validate_args}
+
+Boolean describing behavior on invalid input.
+
+
+- - -
+
+#### `tf.contrib.distributions.Multinomial.variance(name='variance')` {#Multinomial.variance}
+
+Variance of the distribution.
+
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.md
index 7d96496c430..4c6b99b4c3d 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.md
@@ -83,13 +83,13 @@ D = is diagonal (r x r), optional (defaults to identity).
 ##### Args:
 
 
-*  <b>`mu`</b>: Rank `n + 1` `float` or `double` tensor with shape `[N1,...,Nn, k]`,
+*  <b>`mu`</b>: Rank `n + 1` floating point tensor with shape `[N1,...,Nn, k]`,
     `n >= 0`.  The means.
-*  <b>`diag_large`</b>: Optional rank `n + 1` `float` or `double` tensor, shape
+*  <b>`diag_large`</b>: Optional rank `n + 1` floating point tensor, shape
     `[N1,...,Nn, k]` `n >= 0`.  Defines the diagonal matrix `M`.
-*  <b>`v`</b>: Rank `n + 1` `float` or `double` tensor, shape `[N1,...,Nn, k, r]`
+*  <b>`v`</b>: Rank `n + 1` floating point tensor, shape `[N1,...,Nn, k, r]`
     `n >= 0`.  Defines the matrix `V`.
-*  <b>`diag_small`</b>: Rank `n + 1` `float` or `double` tensor, shape
+*  <b>`diag_small`</b>: Rank `n + 1` floating point tensor, shape
     `[N1,...,Nn, k]` `n >= 0`.  Defines the diagonal matrix `D`.  Default
     is `None`, which means `D` will be the identity matrix.
 *  <b>`validate_args`</b>: Whether to validate input with asserts.  If `validate_args`
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.detach_outputs.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.detach_outputs.md
new file mode 100644
index 00000000000..7ef04022163
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.detach_outputs.md
@@ -0,0 +1,23 @@
+### `tf.contrib.graph_editor.detach_outputs(sgv, control_outputs=None)` {#detach_outputs}
+
+Detach the outputa of a subgraph view.
+
+##### Args:
+
+
+*  <b>`sgv`</b>: the subgraph view to be detached. This argument is converted to a
+    subgraph using the same rules as the function subgraph.make_view.
+*  <b>`control_outputs`</b>: a util.ControlOutputs instance or None. If not None the
+    control outputs are also detached.
+
+##### Returns:
+
+  A new subgraph view of the detached subgraph.
+    Note that sgv is also modified in place.
+
+##### Raises:
+
+
+*  <b>`StandardError`</b>: if sgv cannot be converted to a SubGraphView using
+    the same rules than the function subgraph.make_view.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.matcher.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.matcher.md
new file mode 100644
index 00000000000..242efb37e3f
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.matcher.md
@@ -0,0 +1,29 @@
+Graph match class.
+- - -
+
+#### `tf.contrib.graph_editor.matcher.__init__(positive_filter)` {#matcher.__init__}
+
+Graph match constructor.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.matcher.control_input_ops(*args)` {#matcher.control_input_ops}
+
+Add input matches.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.matcher.input_ops(*args)` {#matcher.input_ops}
+
+Add input matches.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.matcher.output_ops(*args)` {#matcher.output_ops}
+
+Add output matches.
+
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.reroute_b2a.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.reroute_b2a.md
new file mode 100644
index 00000000000..f15af87d5eb
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.reroute_b2a.md
@@ -0,0 +1,4 @@
+### `tf.contrib.graph_editor.reroute_b2a(sgv0, sgv1)` {#reroute_b2a}
+
+Re-route the inputs and outputs of sgv1 to sgv0 (see _reroute).
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.select_ts.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.select_ts.md
new file mode 100644
index 00000000000..22905da75da
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.select_ts.md
@@ -0,0 +1,30 @@
+### `tf.contrib.graph_editor.select_ts(*args, **kwargs)` {#select_ts}
+
+Helper to select tensors.
+
+##### Args:
+
+
+*  <b>`*args`</b>: list of 1) regular expressions (compiled or not) or  2) (array of)
+    tf.Tensor. tf.Operation instances are silently ignored.
+*  <b>`**kwargs`</b>: 'graph': tf.Graph in which to perform the regex query.This is
+    required when using regex.
+    'positive_filter': an elem if selected only if positive_filter(elem) is
+      True. This is optional.
+    'restrict_ts_regex': a regular expression is ignored if it doesn't start
+      with the substring "(?#ts)".
+
+##### Returns:
+
+  list of tf.Tensor
+
+##### Raises:
+
+
+*  <b>`TypeError`</b>: if the optional keyword argument graph is not a tf.Graph
+    or if an argument in args is not an (array of) tf.Tensor
+    or an (array of) tf.Operation (silently ignored) or a string
+    or a regular expression.
+*  <b>`ValueError`</b>: if one of the keyword arguments is unexpected or if a regular
+    expression is used without passing a graph as a keyword argument.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.swap_outputs.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.swap_outputs.md
new file mode 100644
index 00000000000..31ed5df8d41
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.swap_outputs.md
@@ -0,0 +1,4 @@
+### `tf.contrib.graph_editor.swap_outputs(sgv0, sgv1)` {#swap_outputs}
+
+Swap all the outputs of sgv0 and sgv1 (see _reroute_outputs).
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.Estimator.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.Estimator.md
index 9822437283f..d292d350493 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.Estimator.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.Estimator.md
@@ -32,9 +32,9 @@ Constructs an Estimator instance.
              to configure Estimators from hyper parameter tunning.
 
 
-*  <b>`model_dir`</b>: Directory to save model parameters, graph and etc. This can also
-    be used to load checkpoints from the directory into a estimator to continue
-    training a previously saved model.
+*  <b>`model_dir`</b>: Directory to save model parameters, graph and etc. This can
+    also be used to load checkpoints from the directory into a estimator to
+    continue training a previously saved model.
 *  <b>`config`</b>: Configuration object.
 *  <b>`params`</b>: `dict` of hyper parameters that will be passed into `model_fn`.
           Keys are names of parameters, values are basic python types.
@@ -49,56 +49,7 @@ Constructs an Estimator instance.
 
 #### `tf.contrib.learn.Estimator.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#Estimator.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
@@ -112,37 +63,7 @@ for which this evaluation was performed.
 
 #### `tf.contrib.learn.Estimator.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#Estimator.fit}
 
-Trains a model given training data `x` predictions and `y` targets.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`steps`</b>: Number of steps for which to train model. If `None`, train forever.
-    If set, `max_steps` must be `None`.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`. Must be `None` if `input_fn` is provided.
-*  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
-    inside the training loop.
-*  <b>`max_steps`</b>: Number of total steps for which to train model. If `None`,
-    train forever. If set, `steps` must be `None`.
-
-    Two calls to `fit(steps=100)` means 200 training
-    iterations. On the other hand, two calls to `fit(max_steps=100)` means
-    that the second call will not do any iteration since first call did
-    all 100 steps.
-
-##### Returns:
-
-  `self`, for chaining.
+See `Trainable`.
 
 ##### Raises:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.monitors.SummarySaver.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.monitors.SummarySaver.md
index 9d42226216e..cc016557ca4 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.monitors.SummarySaver.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.monitors.SummarySaver.md
@@ -1,7 +1,7 @@
 Saves summaries every N steps.
 - - -
 
-#### `tf.contrib.learn.monitors.SummarySaver.__init__(summary_op, save_steps=100, output_dir=None, summary_writer=None)` {#SummarySaver.__init__}
+#### `tf.contrib.learn.monitors.SummarySaver.__init__(summary_op, save_steps=100, output_dir=None, summary_writer=None, scaffold=None)` {#SummarySaver.__init__}
 
 Initializes a `SummarySaver` monitor.
 
@@ -16,6 +16,7 @@ Initializes a `SummarySaver` monitor.
       if no `summary_writer` is supplied.
 *  <b>`summary_writer`</b>: `SummaryWriter`. If `None` and an `output_dir` was passed,
       one will be created accordingly.
+*  <b>`scaffold`</b>: `Scaffold` to get summary_op if it's not provided.
 
 
 - - -
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.svd.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.svd.md
new file mode 100644
index 00000000000..0c9f0aacf0d
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.svd.md
@@ -0,0 +1,39 @@
+### `tf.svd(matrix, compute_uv=True, full_matrices=False, name=None)` {#svd}
+
+Computes the singular value decomposition of a matrix.
+
+Computes the SVD of `matrix` such that `matrix = u * diag(s) *
+transpose(v)`
+
+```prettyprint
+# a is a matrix.
+# s is a vector of singular values.
+# u is the matrix of left singular vectors.
+# v is a matrix of right singular vectors.
+s, u, v = svd(a)
+s = svd(a, compute_uv=False)
+```
+
+##### Args:
+
+
+*  <b>`matrix`</b>: `Tensor` of shape `[M, N]`. Let `P` be the minimum of `M` and `N`.
+*  <b>`compute_uv`</b>: If `True` then left and right singular vectors will be
+    computed and returned in `u` and `v`, respectively. Otherwise, only the
+    singular values will be computed, which can be significantly faster.
+*  <b>`full_matrices`</b>: If true, compute full-sized `u` and `v`. If false
+    (the default), compute only the leading `P` singular vectors.
+    Ignored if `compute_uv` is `False`.
+*  <b>`name`</b>: string, optional name of the operation.
+
+##### Returns:
+
+
+*  <b>`s`</b>: Singular values. Shape is `[P]`.
+*  <b>`u`</b>: Right singular vectors. If `full_matrices` is `False` (default) then
+    shape is `[M, P]`; if `full_matrices` is `True` then shape is
+    `[M, M]`. Not returned if `compute_uv` is `False`.
+*  <b>`v`</b>: Left singular vectors. If `full_matrices` is `False` (default) then
+    shape is `[N, P]`. If `full_matrices` is `True` then shape is
+    `[N, N]`. Not returned if `compute_uv` is `False`.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.batch_svd.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.batch_svd.md
new file mode 100644
index 00000000000..2555bb57e30
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.batch_svd.md
@@ -0,0 +1,41 @@
+### `tf.batch_svd(tensor, compute_uv=True, full_matrices=False, name=None)` {#batch_svd}
+
+Computes the singular value decompositions of a batch of matrices.
+
+Computes the SVD of each inner matrix in `tensor` such that
+`tensor[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :,
+:])`
+
+```prettyprint
+# a is a tensor.
+# s is a tensor of singular values.
+# u is a tensor of left singular vectors.
+# v is a tensor of right singular vectors.
+s, u, v = batch_svd(a)
+s = batch_svd(a, compute_uv=False)
+```
+
+##### Args:
+
+
+*  <b>`matrix`</b>: `Tensor` of shape `[..., M, N]`. Let `P` be the minimum of `M` and
+    `N`.
+*  <b>`compute_uv`</b>: If `True` then left and right singular vectors will be
+    computed and returned in `u` and `v`, respectively. Otherwise, only the
+    singular values will be computed, which can be significantly faster.
+*  <b>`full_matrices`</b>: If true, compute full-sized `u` and `v`. If false
+    (the default), compute only the leading `P` singular vectors.
+    Ignored if `compute_uv` is `False`.
+*  <b>`name`</b>: string, optional name of the operation.
+
+##### Returns:
+
+
+*  <b>`s`</b>: Singular values. Shape is `[..., P]`.
+*  <b>`u`</b>: Right singular vectors. If `full_matrices` is `False` (default) then
+    shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
+    `[..., M, M]`. Not returned if `compute_uv` is `False`.
+*  <b>`v`</b>: Left singular vectors. If `full_matrices` is `False` (default) then
+    shape is `[..., N, P]`. If `full_matrices` is `True` then shape is
+    `[..., N, N]`. Not returned if `compute_uv` is `False`.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.graph_editor.sgv_scope.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.graph_editor.sgv_scope.md
new file mode 100644
index 00000000000..6362e0d99f0
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.graph_editor.sgv_scope.md
@@ -0,0 +1,14 @@
+### `tf.contrib.graph_editor.sgv_scope(scope, graph)` {#sgv_scope}
+
+Make a subgraph from a name scope.
+
+##### Args:
+
+
+*  <b>`scope`</b>: the name of the scope.
+*  <b>`graph`</b>: the tf.Graph.
+
+##### Returns:
+
+  A subgraph view representing the given scope.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md
index 56b5edb17c0..767756e311f 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md
@@ -120,56 +120,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.DNNClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#DNNClassifier.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
@@ -183,37 +134,7 @@ for which this evaluation was performed.
 
 #### `tf.contrib.learn.DNNClassifier.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#DNNClassifier.fit}
 
-Trains a model given training data `x` predictions and `y` targets.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`steps`</b>: Number of steps for which to train model. If `None`, train forever.
-    If set, `max_steps` must be `None`.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`. Must be `None` if `input_fn` is provided.
-*  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
-    inside the training loop.
-*  <b>`max_steps`</b>: Number of total steps for which to train model. If `None`,
-    train forever. If set, `steps` must be `None`.
-
-    Two calls to `fit(steps=100)` means 200 training
-    iterations. On the other hand, two calls to `fit(max_steps=100)` means
-    that the second call will not do any iteration since first call did
-    all 100 steps.
-
-##### Returns:
-
-  `self`, for chaining.
+See `Trainable`.
 
 ##### Raises:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.RunConfig.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.RunConfig.md
index da8cef18b04..35a71be5f8c 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.RunConfig.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.RunConfig.md
@@ -1,7 +1,7 @@
 This class specifies the specific configurations for the run.
 - - -
 
-#### `tf.contrib.learn.RunConfig.__init__(master='', task=0, num_ps_replicas=0, num_cores=4, log_device_placement=False, gpu_memory_fraction=1, tf_random_seed=42, save_summary_steps=100, save_checkpoints_secs=60, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000, save_checkpoints_steps=1000)` {#RunConfig.__init__}
+#### `tf.contrib.learn.RunConfig.__init__(master='', task=0, num_ps_replicas=0, num_cores=4, log_device_placement=False, gpu_memory_fraction=1, tf_random_seed=42, save_summary_steps=100, save_checkpoints_secs=60, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000)` {#RunConfig.__init__}
 
 Constructor.
 
@@ -26,6 +26,5 @@ Constructor.
 *  <b>`keep_checkpoint_every_n_hours`</b>: Number of hours between each checkpoint
     to be saved. The default value of 10,000 hours effectively disables
     the feature.
-*  <b>`save_checkpoints_steps`</b>: Number of steps between each checkpoint saving.
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.TensorFlowLinearClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.TensorFlowLinearClassifier.md
index c5f0eab6b7d..2c7b221fb22 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.TensorFlowLinearClassifier.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.TensorFlowLinearClassifier.md
@@ -31,56 +31,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.TensorFlowLinearClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowLinearClassifier.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.greater.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.greater.md
index c629a0286f3..99b34aaca47 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.greater.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.greater.md
@@ -2,6 +2,9 @@
 
 Returns the truth value of (x > y) element-wise.
 
+*NOTE*: `Greater` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.self_adjoint_eigvals.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.self_adjoint_eigvals.md
new file mode 100644
index 00000000000..3dc968afa13
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.self_adjoint_eigvals.md
@@ -0,0 +1,15 @@
+### `tf.self_adjoint_eigvals(matrix, name=None)` {#self_adjoint_eigvals}
+
+Computes the eigenvalues a self-adjoint  matrix.
+
+##### Args:
+
+
+*  <b>`matrix`</b>: `Tensor` of shape `[N, N]`.
+*  <b>`name`</b>: string, optional name of the operation.
+
+##### Returns:
+
+
+*  <b>`e`</b>: Eigenvalues of `matrix`. Shape is `[N]`.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.sub.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.sub.md
index 2d1da0f0b98..83dbd7a93c8 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.sub.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.sub.md
@@ -2,6 +2,9 @@
 
 Returns x - y element-wise.
 
+*NOTE*: `Sub` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.add.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.add.md
index 738f0337d30..da82da60762 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.add.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.add.md
@@ -2,7 +2,8 @@
 
 Returns x + y element-wise.
 
-*NOTE*: Add supports broadcasting. AddN does not.
+*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
 
 ##### Args:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.graph_editor.bypass.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.graph_editor.bypass.md
new file mode 100644
index 00000000000..976d579cd64
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.graph_editor.bypass.md
@@ -0,0 +1,21 @@
+### `tf.contrib.graph_editor.bypass(sgv)` {#bypass}
+
+Bypass the given subgraph by connecting its inputs to its outputs.
+
+##### Args:
+
+
+*  <b>`sgv`</b>: the subgraph view to be bypassed. This argument is converted to a
+    subgraph using the same rules than the function subgraph.make_view.
+
+##### Returns:
+
+  A new subgraph view of the bypassed subgraph.
+    Note that sgv is also modified in place.
+
+##### Raises:
+
+
+*  <b>`StandardError`</b>: if sgv cannot be converted to a SubGraphView using
+    the same rules than the function subgraph.make_view.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.graph_editor.reroute_a2b.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.graph_editor.reroute_a2b.md
new file mode 100644
index 00000000000..4a4cecc26c2
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.graph_editor.reroute_a2b.md
@@ -0,0 +1,4 @@
+### `tf.contrib.graph_editor.reroute_a2b(sgv0, sgv1)` {#reroute_a2b}
+
+Re-route the inputs and outputs of sgv0 to sgv1 (see _reroute).
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.graph_editor.reroute_b2a_inputs.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.graph_editor.reroute_b2a_inputs.md
new file mode 100644
index 00000000000..46a82bdad96
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.graph_editor.reroute_b2a_inputs.md
@@ -0,0 +1,4 @@
+### `tf.contrib.graph_editor.reroute_b2a_inputs(sgv0, sgv1)` {#reroute_b2a_inputs}
+
+Re-route all the inputs of sgv1 to sgv0 (see reroute_inputs).
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.TensorFlowDNNClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.TensorFlowDNNClassifier.md
index 8fd75ed89cf..ab811506714 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.TensorFlowDNNClassifier.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.TensorFlowDNNClassifier.md
@@ -31,56 +31,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.TensorFlowDNNClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowDNNClassifier.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.monitors.CheckpointSaver.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.monitors.CheckpointSaver.md
index 07e073ced5a..58db246a0ca 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.monitors.CheckpointSaver.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.monitors.CheckpointSaver.md
@@ -1,38 +1,32 @@
 Saves checkpoints every N steps.
 - - -
 
-#### `tf.contrib.learn.monitors.CheckpointSaver.__init__(every_n_steps, saver, checkpoint_dir, checkpoint_basename='model.ckpt', first_n_steps=-1)` {#CheckpointSaver.__init__}
+#### `tf.contrib.learn.monitors.CheckpointSaver.__init__(checkpoint_dir, save_secs=None, save_steps=None, saver=None, checkpoint_basename='model.ckpt', scaffold=None)` {#CheckpointSaver.__init__}
 
 Initialize CheckpointSaver monitor.
 
 ##### Args:
 
 
-*  <b>`every_n_steps`</b>: `int`, save every N steps.
-*  <b>`saver`</b>: `Saver` object, used for saving.
 *  <b>`checkpoint_dir`</b>: `str`, base directory for the checkpoint files.
+*  <b>`save_secs`</b>: `int`, save every N secs.
+*  <b>`save_steps`</b>: `int`, save every N steps.
+*  <b>`saver`</b>: `Saver` object, used for saving.
 *  <b>`checkpoint_basename`</b>: `str`, base name for the checkpoint files.
-*  <b>`first_n_steps`</b>: `int`, if positive, save every step during the
-    first `first_n_steps` steps.
+*  <b>`scaffold`</b>: `Scaffold`, use to get saver object.
+
+##### Raises:
+
+
+*  <b>`ValueError`</b>: If both `save_steps` and `save_secs` are not `None`.
+*  <b>`ValueError`</b>: If both `save_steps` and `save_secs` are `None`.
 
 
 - - -
 
 #### `tf.contrib.learn.monitors.CheckpointSaver.begin(max_steps=None)` {#CheckpointSaver.begin}
 
-Called at the beginning of training.
 
-When called, the default graph is the one we are executing.
-
-##### Args:
-
-
-*  <b>`max_steps`</b>: `int`, the maximum global step this training will run until.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: if we've already begun a run.
 
 
 - - -
@@ -76,55 +70,6 @@ End epoch.
 *  <b>`ValueError`</b>: if we've not begun an epoch, or `epoch` number does not match.
 
 
-- - -
-
-#### `tf.contrib.learn.monitors.CheckpointSaver.every_n_post_step(step, session)` {#CheckpointSaver.every_n_post_step}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.monitors.CheckpointSaver.every_n_step_begin(step)` {#CheckpointSaver.every_n_step_begin}
-
-Callback before every n'th step begins.
-
-##### Args:
-
-
-*  <b>`step`</b>: `int`, the current value of the global step.
-
-##### Returns:
-
-  A `list` of tensors that will be evaluated at this step.
-
-
-- - -
-
-#### `tf.contrib.learn.monitors.CheckpointSaver.every_n_step_end(step, outputs)` {#CheckpointSaver.every_n_step_end}
-
-Callback after every n'th step finished.
-
-This callback provides access to the tensors/ops evaluated at this step,
-including the additional tensors for which evaluation was requested in
-`step_begin`.
-
-In addition, the callback has the opportunity to stop training by returning
-`True`. This is useful for early stopping, for example.
-
-##### Args:
-
-
-*  <b>`step`</b>: `int`, the current value of the global step.
-*  <b>`outputs`</b>: `dict` mapping `string` values representing tensor names to
-    the value resulted from running these tensors. Values may be either
-    scalars, for scalar tensors, or Numpy `array`, for non-scalar tensors.
-
-##### Returns:
-
-  `bool`. True if training should stop.
-
-
 - - -
 
 #### `tf.contrib.learn.monitors.CheckpointSaver.post_step(step, session)` {#CheckpointSaver.post_step}
@@ -160,33 +105,24 @@ A setter called automatically by the target estimator.
 
 #### `tf.contrib.learn.monitors.CheckpointSaver.step_begin(step)` {#CheckpointSaver.step_begin}
 
-Overrides `BaseMonitor.step_begin`.
 
-When overriding this method, you must call the super implementation.
-
-##### Args:
-
-
-*  <b>`step`</b>: `int`, the current value of the global step.
-
-##### Returns:
-
-  A `list`, the result of every_n_step_begin, if that was called this step,
-  or an empty list otherwise.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: if called more than once during a step.
 
 
 - - -
 
 #### `tf.contrib.learn.monitors.CheckpointSaver.step_end(step, output)` {#CheckpointSaver.step_end}
 
-Overrides `BaseMonitor.step_end`.
+Callback after training step finished.
 
-When overriding this method, you must call the super implementation.
+This callback provides access to the tensors/ops evaluated at this step,
+including the additional tensors for which evaluation was requested in
+`step_begin`.
+
+In addition, the callback has the opportunity to stop training by returning
+`True`. This is useful for early stopping, for example.
+
+Note that this method is not called if the call to `Session.run()` that
+followed the last call to `step_begin()` failed.
 
 ##### Args:
 
@@ -198,7 +134,11 @@ When overriding this method, you must call the super implementation.
 
 ##### Returns:
 
-  `bool`, the result of every_n_step_end, if that was called this step,
-  or `False` otherwise.
+  `bool`. True if training should stop.
+
+##### Raises:
+
+
+*  <b>`ValueError`</b>: if we've not begun a step, or `step` number does not match.
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.div.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.div.md
index 92eba7927a0..61616c0e6b2 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.div.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.div.md
@@ -2,6 +2,9 @@
 
 Returns x / y element-wise.
 
+*NOTE*: `Div` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.logical_and.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.logical_and.md
index dd5b563c8ba..2b5f011ccdc 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.logical_and.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.logical_and.md
@@ -2,6 +2,9 @@
 
 Returns the truth value of x AND y element-wise.
 
+*NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.variable_op_scope.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.variable_op_scope.md
index 709d2375b50..4ed0f567ffa 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.variable_op_scope.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.variable_op_scope.md
@@ -1,4 +1,4 @@
-### `tf.variable_op_scope(values, name_or_scope, default_name=None, initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, reuse=None)` {#variable_op_scope}
+### `tf.variable_op_scope(values, name_or_scope, default_name=None, initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, reuse=None, dtype=None)` {#variable_op_scope}
 
 Returns a context manager for defining an op that creates variables.
 
@@ -42,6 +42,8 @@ def my_op_with_vars(a, b, scope=None):
 *  <b>`custom_getter`</b>: The default custom getter for variables within this scope.
 *  <b>`reuse`</b>: `True` or `None`; if `True`, we go into reuse mode for this scope as
     well as all sub-scopes; if `None`, we just inherit the parent scope reuse.
+*  <b>`dtype`</b>: The default type of variables created in this scope, defaults to the
+    type of the parent scope.
 
 ##### Returns:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Beta.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Beta.md
index aa40420ff83..df1b3d32e6e 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Beta.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Beta.md
@@ -67,20 +67,20 @@ Initialize a batch of Beta distributions.
 ##### Args:
 
 
-*  <b>`a`</b>: Positive `float` or `double` tensor with shape broadcastable to
+*  <b>`a`</b>: Positive floating point tensor with shape broadcastable to
     `[N1,..., Nm]` `m >= 0`.  Defines this as a batch of `N1 x ... x Nm`
      different Beta distributions. This also defines the
      dtype of the distribution.
-*  <b>`b`</b>: Positive `float` or `double` tensor with shape broadcastable to
+*  <b>`b`</b>: Positive floating point tensor with shape broadcastable to
     `[N1,..., Nm]` `m >= 0`.  Defines this as a batch of `N1 x ... x Nm`
      different Beta distributions.
 *  <b>`validate_args`</b>: Whether to assert valid values for parameters `a` and `b`,
-    and `x` in `prob` and `log_prob`.  If False, correct behavior is not
+    and `x` in `prob` and `log_prob`.  If `False`, correct behavior is not
     guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to prefix Ops created by this distribution class.
 
 
@@ -242,7 +242,7 @@ Log of the probability mass function.
 ##### Args:
 
 
-*  <b>`x`</b>: Non-negative `float` or `double`, tensor whose shape can
+*  <b>`x`</b>: Non-negative floating point tensor whose shape can
     be broadcast with `self.a` and `self.b`.  For fixed leading
     dimensions, the last dimension represents counts for the corresponding
     Beta distribution in `self.a` and `self.b`. `x` is only legal if
@@ -312,7 +312,7 @@ The probability mass function.
 ##### Args:
 
 
-*  <b>`x`</b>: Non-negative `float`, `double` tensor whose shape can
+*  <b>`x`</b>: Non-negative floating point tensor whose shape can
     be broadcast with `self.a` and `self.b`.  For fixed leading
     dimensions, the last dimension represents x for the corresponding Beta
     distribution in `self.a` and `self.b`. `x` is only legal if is
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Laplace.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Laplace.md
index 273e23714fe..815e544a063 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Laplace.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Laplace.md
@@ -20,17 +20,17 @@ broadcasting (e.g., `loc / scale` is a valid operation).
 ##### Args:
 
 
-*  <b>`loc`</b>: `float` or `double` tensor which characterizes the location (center)
+*  <b>`loc`</b>: Floating point tensor which characterizes the location (center)
     of the distribution.
-*  <b>`scale`</b>: `float` or `double`, positive-valued tensor which characterzes the
-    spread of the distribution.
+*  <b>`scale`</b>: Positive floating point tensor which characterizes the spread of
+    the distribution.
 *  <b>`validate_args`</b>: Whether to validate input with asserts.  If `validate_args`
     is `False`, and the inputs are invalid, correct behavior is not
     guaranteed.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to give Ops created by the initializer.
 
 ##### Raises:
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.framework.deprecated.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.framework.deprecated.md
index 15924febeed..2daecf41e27 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.framework.deprecated.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.framework.deprecated.md
@@ -2,15 +2,14 @@
 
 Decorator for marking functions or methods deprecated.
 
-This decorator adds a deprecation warning to a function's docstring. It has
-the following format:
+This decorator logs a deprecation warning whenever the decorated function is
+called. It has the following format:
 
   <function> (from <module>) is deprecated and will be removed after <date>.
   Instructions for updating:
   <instructions>
 
-whenever the decorated function is called. <function> will include the class
-name if it is a method.
+<function> will include the class name if it is a method.
 
 It also edits the docstring of the function: ' (deprecated)' is appended
 to the first line of the docstring and a deprecation notice is prepended
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.graph_editor.ts.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.graph_editor.ts.md
new file mode 100644
index 00000000000..9239a5a3dca
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.graph_editor.ts.md
@@ -0,0 +1,30 @@
+### `tf.contrib.graph_editor.ts(*args, **kwargs)` {#ts}
+
+Helper to select tensors.
+
+##### Args:
+
+
+*  <b>`*args`</b>: list of 1) regular expressions (compiled or not) or  2) (array of)
+    tf.Tensor. tf.Operation instances are silently ignored.
+*  <b>`**kwargs`</b>: 'graph': tf.Graph in which to perform the regex query.This is
+    required when using regex.
+    'positive_filter': an elem if selected only if positive_filter(elem) is
+      True. This is optional.
+    'restrict_ts_regex': a regular expression is ignored if it doesn't start
+      with the substring "(?#ts)".
+
+##### Returns:
+
+  list of tf.Tensor
+
+##### Raises:
+
+
+*  <b>`TypeError`</b>: if the optional keyword argument graph is not a tf.Graph
+    or if an argument in args is not an (array of) tf.Tensor
+    or an (array of) tf.Operation (silently ignored) or a string
+    or a regular expression.
+*  <b>`ValueError`</b>: if one of the keyword arguments is unexpected or if a regular
+    expression is used without passing a graph as a keyword argument.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.cumsum.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.cumsum.md
index 64a8312fde0..baa00e57d53 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.cumsum.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.cumsum.md
@@ -30,8 +30,8 @@ tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0]
 
 
 *  <b>`x`</b>: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-   `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-   `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+     `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+     `complex128`, `qint8`, `quint8`, `qint32`, `half`.
 *  <b>`axis`</b>: A `Tensor` of type `int32` (default: 0).
 *  <b>`reverse`</b>: A `bool` (default: False).
 *  <b>`name`</b>: A name for the operation (optional).
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.maximum.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.maximum.md
index 309946f4352..aec816dcbad 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.maximum.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.maximum.md
@@ -1,6 +1,9 @@
 ### `tf.maximum(x, y, name=None)` {#maximum}
 
-Returns the max of x and y (i.e. x > y ? x : y) element-wise, broadcasts.
+Returns the max of x and y (i.e. x > y ? x : y) element-wise.
+
+*NOTE*: `Maximum` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
 
 ##### Args:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.self_adjoint_eig.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.self_adjoint_eig.md
index efbc0cd3be9..48cd3b0575a 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.self_adjoint_eig.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.self_adjoint_eig.md
@@ -1,21 +1,20 @@
-### `tf.self_adjoint_eig(input, name=None)` {#self_adjoint_eig}
+### `tf.self_adjoint_eig(matrix, name=None)` {#self_adjoint_eig}
 
-Calculates the Eigen Decomposition of a square Self-Adjoint matrix.
+Computes the eigen decomposition of a self-adjoint matrix.
 
-Only the lower-triangular part of the input will be used in this case. The
-upper-triangular part will not be read.
-
-The result is a M+1 x M matrix whose first row is the eigenvalues, and
-subsequent rows are eigenvectors.
+Computes the eigenvalues and eigenvectors of an N-by-N matrix `matrix` such
+that `matrix * v[:,i] = e(i) * v[:,i]`, for i=0...N-1.
 
 ##### Args:
 
 
-*  <b>`input`</b>: A `Tensor`. Must be one of the following types: `float64`, `float32`.
-    Shape is `[M, M]`.
-*  <b>`name`</b>: A name for the operation (optional).
+*  <b>`matrix`</b>: `Tensor` of shape `[N, N]`.
+*  <b>`name`</b>: string, optional name of the operation.
 
 ##### Returns:
 
-  A `Tensor`. Has the same type as `input`. Shape is `[M+1, M]`.
+
+*  <b>`e`</b>: Eigenvalues. Shape is `[N]`.
+*  <b>`v`</b>: Eigenvectors. Shape is `[N, N]`. The columns contain the eigenvectors of
+    `matrix`.
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.Coordinator.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.Coordinator.md
index 744e4e233a6..27ae6f13e3a 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.Coordinator.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.Coordinator.md
@@ -120,11 +120,14 @@ After this is called, calls to `should_stop()` will return `False`.
 
 - - -
 
-#### `tf.train.Coordinator.join(threads, stop_grace_period_secs=120)` {#Coordinator.join}
+#### `tf.train.Coordinator.join(threads=None, stop_grace_period_secs=120)` {#Coordinator.join}
 
 Wait for threads to terminate.
 
-Blocks until all `threads` have terminated or `request_stop()` is called.
+This call blocks until a set of threads have terminated.  The set of thread
+is the union of the threads passed in the `threads` argument and the list
+of threads that registered with the coordinator by calling
+`Coordinator.register_thread()`.
 
 After the threads stop, if an `exc_info` was passed to `request_stop`, that
 exception is re-raised.
@@ -138,7 +141,8 @@ that `RuntimeError`.
 ##### Args:
 
 
-*  <b>`threads`</b>: List of `threading.Threads`. The started threads to join.
+*  <b>`threads`</b>: List of `threading.Threads`. The started threads to join in
+    addition to the registered threads.
 *  <b>`stop_grace_period_secs`</b>: Number of seconds given to threads to stop after
     `request_stop()` has been called.
 
@@ -156,6 +160,18 @@ that `RuntimeError`.
 
 
 
+- - -
+
+#### `tf.train.Coordinator.register_thread(thread)` {#Coordinator.register_thread}
+
+Register a thread to join.
+
+##### Args:
+
+
+*  <b>`thread`</b>: A Python thread to join.
+
+
 - - -
 
 #### `tf.train.Coordinator.request_stop(ex=None)` {#Coordinator.request_stop}
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.batch_cholesky.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.batch_cholesky.md
index 487680f50b8..1ce7fca603d 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.batch_cholesky.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.batch_cholesky.md
@@ -1,6 +1,6 @@
 ### `tf.batch_cholesky(input, name=None)` {#batch_cholesky}
 
-Calculates the Cholesky decomposition of a batch of square matrices.
+Computes the Cholesky decomposition of a batch of square matrices.
 
 The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
 form square matrices, with the same constraints as the single matrix Cholesky
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.MultivariateNormalFull.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.MultivariateNormalFull.md
index 8377e7ab9a9..3b1715d88c2 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.MultivariateNormalFull.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.MultivariateNormalFull.md
@@ -48,7 +48,7 @@ User must provide means `mu` and `sigma`, the mean and covariance.
 ##### Args:
 
 
-*  <b>`mu`</b>: `(N+1)-D`  `float` or `double` tensor with shape `[N1,...,Nb, k]`,
+*  <b>`mu`</b>: `(N+1)-D` floating point tensor with shape `[N1,...,Nb, k]`,
     `b >= 0`.
 *  <b>`sigma`</b>: `(N+2)-D` `Tensor` with same `dtype` as `mu` and shape
     `[N1,...,Nb, k, k]`.  Each batch member must be positive definite.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.Normal.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.Normal.md
index 3826c2812f2..159e477f03f 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.Normal.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.Normal.md
@@ -52,15 +52,15 @@ broadcasting (e.g. `mu + sigma` is a valid operation).
 ##### Args:
 
 
-*  <b>`mu`</b>: `float` or `double` tensor, the means of the distribution(s).
-*  <b>`sigma`</b>: `float` or `double` tensor, the stddevs of the distribution(s).
+*  <b>`mu`</b>: Floating point tensor, the means of the distribution(s).
+*  <b>`sigma`</b>: Floating point tensor, the stddevs of the distribution(s).
     sigma must contain only positive values.
 *  <b>`validate_args`</b>: Whether to assert that `sigma > 0`. If `validate_args` is
-    False, correct output is not guaranteed when input is invalid.
-*  <b>`allow_nan_stats`</b>: Boolean, default False.  If False, raise an exception if
-    a statistic (e.g. mean/mode/etc...) is undefined for any batch member.
-    If True, batch members with valid parameters leading to undefined
-    statistics will return NaN for this statistic.
+    `False`, correct output is not guaranteed when input is invalid.
+*  <b>`allow_nan_stats`</b>: Boolean, default `False`.  If `False`, raise an
+    exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+    batch member.  If `True`, batch members with valid parameters leading to
+    undefined statistics will return NaN for this statistic.
 *  <b>`name`</b>: The name to give Ops created by the initializer.
 
 ##### Raises:
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.kl.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.kl.md
index 62f0a904016..014d2792b6b 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.kl.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.kl.md
@@ -7,9 +7,9 @@ Get the KL-divergence KL(dist_a || dist_b).
 
 *  <b>`dist_a`</b>: instance of distributions.Distribution.
 *  <b>`dist_b`</b>: instance of distributions.Distribution.
-*  <b>`allow_nan`</b>: If False (default), a runtime error is raised
+*  <b>`allow_nan`</b>: If `False` (default), a runtime error is raised
     if the KL returns NaN values for any batch entry of the given
-    distributions.  If True, the KL may return a NaN for the given entry.
+    distributions.  If `True`, the KL may return a NaN for the given entry.
 *  <b>`name`</b>: (optional) Name scope to use for created operations.
 
 ##### Returns:
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.graph_editor.ph.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.graph_editor.ph.md
new file mode 100644
index 00000000000..c765240585a
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.graph_editor.ph.md
@@ -0,0 +1,20 @@
+### `tf.contrib.graph_editor.ph(dtype, shape=None, scope=None)` {#ph}
+
+Create a tf.placeholder for the Graph Editor.
+
+Note that the correct graph scope must be set by the calling function.
+The placeholder is named using the function placeholder_name (with no
+tensor argument).
+
+##### Args:
+
+
+*  <b>`dtype`</b>: the tensor type.
+*  <b>`shape`</b>: the tensor shape (optional).
+*  <b>`scope`</b>: absolute scope within which to create the placeholder. None
+    means that the scope of t is preserved. "" means the root scope.
+
+##### Returns:
+
+  A newly created tf.placeholder.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.graph_editor.sgv.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.graph_editor.sgv.md
new file mode 100644
index 00000000000..36b4de6315d
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.graph_editor.sgv.md
@@ -0,0 +1,25 @@
+### `tf.contrib.graph_editor.sgv(*args, **kwargs)` {#sgv}
+
+Create a SubGraphView from selected operations and passthrough tensors.
+
+##### Args:
+
+
+*  <b>`*args`</b>: list of 1) regular expressions (compiled or not) or  2) (array of)
+    tf.Operation 3) (array of) tf.Tensor. Those objects will be converted
+    into a list of operations and a list of candidate for passthrough tensors.
+*  <b>`**kwargs`</b>: keyword graph is used 1) to check that the ops and ts are from
+    the correct graph 2) for regular expression query
+
+##### Returns:
+
+  A subgraph view.
+
+##### Raises:
+
+
+*  <b>`TypeError`</b>: if the optional keyword argument graph is not a tf.Graph
+    or if an argument in args is not an (array of) tf.Tensor
+    or an (array of) tf.Operation or a string or a regular expression.
+*  <b>`ValueError`</b>: if one of the keyword arguments is unexpected.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.learn.TensorFlowClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.learn.TensorFlowClassifier.md
index e9c910ac619..2318f59670f 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.learn.TensorFlowClassifier.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.learn.TensorFlowClassifier.md
@@ -31,56 +31,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.TensorFlowClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowClassifier.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.less.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.less.md
index 8791d0366aa..3a00afa8db5 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.less.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.less.md
@@ -2,6 +2,9 @@
 
 Returns the truth value of (x < y) element-wise.
 
+*NOTE*: `Less` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.logical_or.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.logical_or.md
index be18e65e92e..e04b6a15d2c 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.logical_or.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.logical_or.md
@@ -2,6 +2,9 @@
 
 Returns the truth value of x OR y element-wise.
 
+*NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.variable_scope.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.variable_scope.md
index 72c790f627e..9c89cd4fb34 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.variable_scope.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.variable_scope.md
@@ -1,4 +1,4 @@
-### `tf.variable_scope(name_or_scope, reuse=None, initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None)` {#variable_scope}
+### `tf.variable_scope(name_or_scope, reuse=None, initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, dtype=None)` {#variable_scope}
 
 Returns a context for variable scope.
 
@@ -69,6 +69,8 @@ then all its sub-scopes become reusing as well.
 *  <b>`caching_device`</b>: default caching device for variables within this scope.
 *  <b>`partitioner`</b>: default partitioner for variables within this scope.
 *  <b>`custom_getter`</b>: default custom getter for variables within this scope.
+*  <b>`dtype`</b>: type of variables created in this scope (defaults to the type
+    in the passed scope, or inherited from parent scope).
 
 ##### Returns:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.Transformer.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.Transformer.md
new file mode 100644
index 00000000000..d070c982f1d
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.Transformer.md
@@ -0,0 +1,51 @@
+Transform a subgraph into another one.
+
+By default, the constructor create a transform which copy a subgraph and
+replaces inputs with placeholders. This behavior can be modified by changing
+the handlers.
+- - -
+
+#### `tf.contrib.graph_editor.Transformer.__init__()` {#Transformer.__init__}
+
+Transformer constructor.
+
+The following members can be modified:
+transform_op_handler: handle the transformation of a tf.Operation.
+  This handler defaults to a simple copy.
+assign_collections_handler: handle the assignment of collections.
+  This handler defaults to assigning new collections created under the
+  given name-scope.
+transform_input_handler: handle the transform of the inputs to the given
+  subgraph. This handler defaults to creating placeholders instead of the
+  ops just before the input tensors of the subgraph.
+transform_hidden_input_handler: handle the transform of the hidden inputs of
+  the subgraph, that is, the inputs which are not listed in sgv.inputs.
+  This handler defaults to a transform which keep the same input if the
+  source and destination graphs are the same, otherwise use placeholders.
+transform_original_op_hanlder: handle the transform of original_op. This
+  handler defaults to transforming original_op only if they are in the
+  subgraph, otherwise they are ignored.
+
+
+- - -
+
+#### `tf.contrib.graph_editor.Transformer.new_name(name)` {#Transformer.new_name}
+
+Compute a destination name from a source name.
+
+##### Args:
+
+
+*  <b>`name`</b>: the name to be "transformed".
+
+##### Returns:
+
+  the transformed name.
+
+##### Raises:
+
+
+*  <b>`ValueError`</b>: if the source scope is used (that is, not an empty string)
+    and the source name does not belong to the source scope.
+
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.detach.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.detach.md
new file mode 100644
index 00000000000..e04134d548e
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.detach.md
@@ -0,0 +1,28 @@
+### `tf.contrib.graph_editor.detach(sgv, control_inputs=False, control_outputs=None, control_ios=None)` {#detach}
+
+Detach both the inputs and the outputs of a subgraph view.
+
+##### Args:
+
+
+*  <b>`sgv`</b>: the subgraph view to be detached. This argument is converted to a
+    subgraph using the same rules as the function subgraph.make_view.
+*  <b>`control_inputs`</b>: A boolean indicating whether control inputs are enabled.
+*  <b>`control_outputs`</b>: An instance of util.ControlOutputs or None. If not None,
+    control outputs are enabled.
+*  <b>`control_ios`</b>: An instance of util.ControlOutputs or None. If not None, both
+    control inputs and control outputs are enabled. This is equivalent to set
+    control_inputs to True and control_outputs to the util.ControlOutputs
+    instance.
+
+##### Returns:
+
+  A new subgraph view of the detached subgraph.
+    Note that sgv is also modified in place.
+
+##### Raises:
+
+
+*  <b>`StandardError`</b>: if sgv cannot be converted to a SubGraphView using
+    the same rules than the function subgraph.make_view.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.reroute_b2a_outputs.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.reroute_b2a_outputs.md
new file mode 100644
index 00000000000..b14ea3485b0
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.reroute_b2a_outputs.md
@@ -0,0 +1,4 @@
+### `tf.contrib.graph_editor.reroute_b2a_outputs(sgv0, sgv1)` {#reroute_b2a_outputs}
+
+Re-route all the outputs of sgv1 to sgv0 (see _reroute_outputs).
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.swap.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.swap.md
new file mode 100644
index 00000000000..d6fab641cc7
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.swap.md
@@ -0,0 +1,4 @@
+### `tf.contrib.graph_editor.swap(sgv0, sgv1)` {#swap}
+
+Swap the inputs and outputs of sgv1 to sgv0 (see _reroute).
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.learn.TensorFlowRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.learn.TensorFlowRegressor.md
index 8dcf209b03a..9424e537c40 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.learn.TensorFlowRegressor.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.learn.TensorFlowRegressor.md
@@ -31,56 +31,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.TensorFlowRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowRegressor.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.rnn.LSTMFusedCell.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.rnn.LSTMFusedCell.md
index fec80caecf1..b1e9fde7160 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.rnn.LSTMFusedCell.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.rnn.LSTMFusedCell.md
@@ -19,7 +19,7 @@ Initialize the basic LSTM cell.
 
 *  <b>`num_units`</b>: int, The number of units in the LSTM cell.
 *  <b>`forget_bias`</b>: float, The bias added to forget gates (see above).
-*  <b>`use_peephole`</b>: Whether to use peephole connectios or not.
+*  <b>`use_peephole`</b>: Whether to use peephole connections or not.
 
 
 - - -
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.equal.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.equal.md
index 998db9189ff..332a12f7255 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.equal.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.equal.md
@@ -2,6 +2,9 @@
 
 Returns the truth value of (x == y) element-wise.
 
+*NOTE*: `Equal` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.less_equal.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.less_equal.md
index 65d7eb50842..c8ce84b6691 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.less_equal.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.less_equal.md
@@ -2,6 +2,9 @@
 
 Returns the truth value of (x <= y) element-wise.
 
+*NOTE*: `LessEqual` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.matrix_inverse.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.matrix_inverse.md
index 4172badef50..1edc4a9ec9e 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.matrix_inverse.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.matrix_inverse.md
@@ -1,6 +1,6 @@
 ### `tf.matrix_inverse(input, adjoint=None, name=None)` {#matrix_inverse}
 
-Calculates the inverse of a square invertible matrix or its adjoint (conjugate
+Computes the inverse of a square invertible matrix or its adjoint (conjugate
 
 transpose).
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.dynamic_rnn.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.dynamic_rnn.md
index 34a275c6a1d..623e04e33f0 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.dynamic_rnn.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.dynamic_rnn.md
@@ -14,8 +14,9 @@ The corresponding output is either a single `Tensor` having the same number
 of time steps and batch size, or a (possibly nested) tuple of such tensors,
 matching the nested structure of `cell.output_size`.
 
-The parameter `sequence_length` is required and dynamic calculation is
-automatically performed.
+The parameter `sequence_length` is optional and is used to copy-through state
+and zero-out outputs when past a batch element's sequence length. So it's more
+for correctness than performance, unlike in rnn().
 
 ##### Args:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.graph_editor.connect.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.graph_editor.connect.md
new file mode 100644
index 00000000000..134765ea06b
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.graph_editor.connect.md
@@ -0,0 +1,26 @@
+### `tf.contrib.graph_editor.connect(sgv0, sgv1, disconnect_first=False)` {#connect}
+
+Connect the outputs of sgv0 to the inputs of sgv1.
+
+##### Args:
+
+
+*  <b>`sgv0`</b>: the first subgraph to have its outputs swapped. This argument is
+    converted to a subgraph using the same rules as the function
+    subgraph.make_view.
+*  <b>`sgv1`</b>: the second subgraph to have its outputs swapped. This argument is
+    converted to a subgraph using the same rules as the function
+    subgraph.make_view.
+*  <b>`disconnect_first`</b>: if True the current outputs of sgv0 are disconnected.
+
+##### Returns:
+
+  Two new subgraph views (now connected). sgv0 and svg1 are also modified
+    in place.
+
+##### Raises:
+
+
+*  <b>`StandardError`</b>: if sgv0 or sgv1 cannot be converted to a SubGraphView using
+    the same rules than the function subgraph.make_view.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md
index 4b75dcc7b0d..95d0d145099 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md
@@ -118,56 +118,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.DNNRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#DNNRegressor.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
@@ -181,37 +132,7 @@ for which this evaluation was performed.
 
 #### `tf.contrib.learn.DNNRegressor.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#DNNRegressor.fit}
 
-Trains a model given training data `x` predictions and `y` targets.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`steps`</b>: Number of steps for which to train model. If `None`, train forever.
-    If set, `max_steps` must be `None`.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`. Must be `None` if `input_fn` is provided.
-*  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
-    inside the training loop.
-*  <b>`max_steps`</b>: Number of total steps for which to train model. If `None`,
-    train forever. If set, `steps` must be `None`.
-
-    Two calls to `fit(steps=100)` means 200 training
-    iterations. On the other hand, two calls to `fit(max_steps=100)` means
-    that the second call will not do any iteration since first call did
-    all 100 steps.
-
-##### Returns:
-
-  `self`, for chaining.
+See `Trainable`.
 
 ##### Raises:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.TensorFlowLinearRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.TensorFlowLinearRegressor.md
index 161b7d5fd3f..a8362b03686 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.TensorFlowLinearRegressor.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.TensorFlowLinearRegressor.md
@@ -31,56 +31,7 @@ Returns weights of deep neural network part.
 
 #### `tf.contrib.learn.TensorFlowLinearRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowLinearRegressor.evaluate}
 
-Evaluates given model with provided evaluation data.
-
-Evaluates on the given input data. If `input_fn` is provided, that
-input function should raise an end-of-input exception (`OutOfRangeError` or
-`StopIteration`) after one epoch of the training data has been provided.
-
-By default, the whole evaluation dataset is used. If `steps` is provided,
-only `steps` batches of size `batch_size` are processed.
-
-The return value is a dict containing the metrics specified in `metrics`, as
-well as an entry `global_step` which contains the value of the global step
-for which this evaluation was performed.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`feed_fn`</b>: Function creating a feed dict every time it is called. Called
-    once per iteration.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`, if specified. Must be `None` if `input_fn` is
-    provided.
-*  <b>`steps`</b>: Number of steps for which to evaluate model. If `None`, evaluate
-    until running tensors generated by `metrics` raises an exception.
-*  <b>`metrics`</b>: Dict of metric ops to run. If `None`, the default metric
-    functions are used; if `{}`, no metrics are used. If model has one
-    output (i.e., returning single predction), keys are `str`, e.g.
-    `'accuracy'` - just a name of the metric that will show up in
-    the logs / summaries. Otherwise, keys are tuple of two `str`, e.g.
-    `('accuracy', 'classes')`- name of the metric and name of `Tensor` in
-    the predictions to run this metric on.
-
-    Metric ops should support streaming, e.g., returning
-    update_op and value tensors. See more details in
-    ../../../../metrics/python/metrics/ops/streaming_metrics.py.
-
-*  <b>`name`</b>: Name of the evaluation if user needs to run multiple evaluations on
-    different data sets, such as on training data vs test data.
-
-##### Returns:
-
-  Returns `dict` with evaluation results.
+See `Evaluable`.
 
 ##### Raises:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.get_variable.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.get_variable.md
index bbbc297a94a..feb96eb180e 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.get_variable.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.get_variable.md
@@ -1,4 +1,4 @@
-### `tf.get_variable(name, shape=None, dtype=tf.float32, initializer=None, regularizer=None, trainable=True, collections=None, caching_device=None, partitioner=None, validate_shape=True, custom_getter=None)` {#get_variable}
+### `tf.get_variable(name, shape=None, dtype=None, initializer=None, regularizer=None, trainable=True, collections=None, caching_device=None, partitioner=None, validate_shape=True, custom_getter=None)` {#get_variable}
 
 Gets an existing variable with these parameters or create a new one.
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.matrix_determinant.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.matrix_determinant.md
index a5cd5a7fe68..fcaa1b1c774 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.matrix_determinant.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.matrix_determinant.md
@@ -1,6 +1,6 @@
 ### `tf.matrix_determinant(input, name=None)` {#matrix_determinant}
 
-Calculates the determinant of a square matrix.
+Computes the determinant of a square matrix.
 
 ##### Args:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.rnn_cell.RNNCell.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.rnn_cell.RNNCell.md
index 5ecf4e515fe..ab13073fd5d 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.rnn_cell.RNNCell.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.rnn_cell.RNNCell.md
@@ -1,5 +1,10 @@
 Abstract object representing an RNN cell.
 
+The definition of cell in this package differs from the definition used in the
+literature. In the literature, cell refers to an object with a single scalar
+output. The definition in this package refers to a horizontal array of such
+units.
+
 An RNN cell, in the most abstract setting, is anything that has
 a state and performs some operation that takes a matrix of inputs.
 This operation results in an output matrix with `self.output_size` columns.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.scan.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.scan.md
index 561e5e196f0..2e272685948 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.scan.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.scan.md
@@ -1,4 +1,4 @@
-### `tf.scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True, swap_memory=False, name=None)` {#scan}
+### `tf.scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True, swap_memory=False, infer_shape=True, name=None)` {#scan}
 
 scan on the list of tensors unpacked from `elems` on dimension 0.
 
@@ -50,6 +50,7 @@ For example, if `elems` is `(t1, [t2, t3])` and `initializer` is
     in parallel.
 *  <b>`back_prop`</b>: (optional) True enables support for back propagation.
 *  <b>`swap_memory`</b>: (optional) True enables GPU-CPU memory swapping.
+*  <b>`infer_shape`</b>: (optional) False disables tests for consistent output shapes.
 *  <b>`name`</b>: (optional) Name prefix for the returned tensors.
 
 ##### Returns:
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_merge.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_merge.md
index 38742123d64..b2f9570b2c6 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_merge.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_merge.md
@@ -1,4 +1,4 @@
-### `tf.sparse_merge(sp_ids, sp_values, vocab_size, name=None)` {#sparse_merge}
+### `tf.sparse_merge(sp_ids, sp_values, vocab_size, name=None, already_sorted=False)` {#sparse_merge}
 
 Combines a batch of feature ids and values into a single `SparseTensor`.
 
@@ -18,14 +18,17 @@ The `SparseTensor` returned by this function has the following properties:
 
 For example, consider the following feature vectors:
 
+```python
   vector1 = [-3, 0, 0, 0, 0, 0]
   vector2 = [ 0, 1, 0, 4, 1, 0]
   vector3 = [ 5, 0, 0, 9, 0, 0]
+```
 
 These might be stored sparsely in the following Example protos by storing
 only the feature ids (column number if the vectors are treated as a matrix)
 of the non-zero elements and the corresponding values:
 
+```python
   examples = [Example(features={
                   "ids": Feature(int64_list=Int64List(value=[0])),
                   "values": Feature(float_list=FloatList(value=[-3]))}),
@@ -35,6 +38,7 @@ of the non-zero elements and the corresponding values:
               Example(features={
                   "ids": Feature(int64_list=Int64List(value=[0, 3])),
                   "values": Feature(float_list=FloatList(value=[5, 9]))})]
+```
 
 The result of calling parse_example on these examples will produce a
 dictionary with entries for "ids" and "values". Passing those two objects
@@ -47,9 +51,11 @@ batch, and the second dimension is the column number, i.e., the feature id);
 original matrix, i.e., (3, 6). For our example above, the output will be
 equal to:
 
+```python
   SparseTensor(indices=[[0, 0], [1, 1], [1, 3], [1, 4], [2, 0], [2, 3]],
                values=[-3, 1, 4, 1, 5, 9],
                shape=[3, 6])
+```
 
 ##### Args:
 
@@ -60,6 +66,9 @@ equal to:
 *  <b>`vocab_size`</b>: A scalar `int64` Tensor (or Python int) containing the new size
     of the last dimension, `all(0 <= sp_ids.values < vocab_size)`.
 *  <b>`name`</b>: A name prefix for the returned tensors (optional)
+*  <b>`already_sorted`</b>: A boolean to specify whether the per-batch values in
+   `sp_values` are already sorted. If so skip sorting, False by default
+   (optional).
 
 ##### Returns:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.squared_difference.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.squared_difference.md
index d6bb175669c..19f25f473da 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.squared_difference.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.squared_difference.md
@@ -2,6 +2,9 @@
 
 Returns (x - y)(x - y) element-wise.
 
+*NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
diff --git a/tensorflow/g3doc/api_docs/python/index.md b/tensorflow/g3doc/api_docs/python/index.md
index 448a32d72a5..c3ea11b4d3f 100644
--- a/tensorflow/g3doc/api_docs/python/index.md
+++ b/tensorflow/g3doc/api_docs/python/index.md
@@ -58,6 +58,7 @@
   * [`multinomial`](../../api_docs/python/constant_op.md#multinomial)
   * [`ones`](../../api_docs/python/constant_op.md#ones)
   * [`ones_like`](../../api_docs/python/constant_op.md#ones_like)
+  * [`ops`](../../api_docs/python/constant_op.md#ops)
   * [`random_crop`](../../api_docs/python/constant_op.md#random_crop)
   * [`random_gamma`](../../api_docs/python/constant_op.md#random_gamma)
   * [`random_normal`](../../api_docs/python/constant_op.md#random_normal)
@@ -120,6 +121,7 @@
   * [`boolean_mask`](../../api_docs/python/array_ops.md#boolean_mask)
   * [`cast`](../../api_docs/python/array_ops.md#cast)
   * [`concat`](../../api_docs/python/array_ops.md#concat)
+  * [`copy`](../../api_docs/python/array_ops.md#copy)
   * [`depth_to_space`](../../api_docs/python/array_ops.md#depth_to_space)
   * [`dynamic_partition`](../../api_docs/python/array_ops.md#dynamic_partition)
   * [`dynamic_stitch`](../../api_docs/python/array_ops.md#dynamic_stitch)
@@ -186,6 +188,8 @@
   * [`batch_matrix_transpose`](../../api_docs/python/math_ops.md#batch_matrix_transpose)
   * [`batch_matrix_triangular_solve`](../../api_docs/python/math_ops.md#batch_matrix_triangular_solve)
   * [`batch_self_adjoint_eig`](../../api_docs/python/math_ops.md#batch_self_adjoint_eig)
+  * [`batch_self_adjoint_eigvals`](../../api_docs/python/math_ops.md#batch_self_adjoint_eigvals)
+  * [`batch_svd`](../../api_docs/python/math_ops.md#batch_svd)
   * [`ceil`](../../api_docs/python/math_ops.md#ceil)
   * [`cholesky`](../../api_docs/python/math_ops.md#cholesky)
   * [`cholesky_solve`](../../api_docs/python/math_ops.md#cholesky_solve)
@@ -251,6 +255,7 @@
   * [`segment_prod`](../../api_docs/python/math_ops.md#segment_prod)
   * [`segment_sum`](../../api_docs/python/math_ops.md#segment_sum)
   * [`self_adjoint_eig`](../../api_docs/python/math_ops.md#self_adjoint_eig)
+  * [`self_adjoint_eigvals`](../../api_docs/python/math_ops.md#self_adjoint_eigvals)
   * [`sign`](../../api_docs/python/math_ops.md#sign)
   * [`sin`](../../api_docs/python/math_ops.md#sin)
   * [`sparse_segment_mean`](../../api_docs/python/math_ops.md#sparse_segment_mean)
@@ -261,6 +266,7 @@
   * [`square`](../../api_docs/python/math_ops.md#square)
   * [`squared_difference`](../../api_docs/python/math_ops.md#squared_difference)
   * [`sub`](../../api_docs/python/math_ops.md#sub)
+  * [`svd`](../../api_docs/python/math_ops.md#svd)
   * [`tan`](../../api_docs/python/math_ops.md#tan)
   * [`trace`](../../api_docs/python/math_ops.md#trace)
   * [`transpose`](../../api_docs/python/math_ops.md#transpose)
@@ -598,6 +604,7 @@
   * [`batch_matrix_diag_transform`](../../api_docs/python/contrib.distributions.md#batch_matrix_diag_transform)
   * [`Bernoulli`](../../api_docs/python/contrib.distributions.md#Bernoulli)
   * [`Beta`](../../api_docs/python/contrib.distributions.md#Beta)
+  * [`Binomial`](../../api_docs/python/contrib.distributions.md#Binomial)
   * [`Categorical`](../../api_docs/python/contrib.distributions.md#Categorical)
   * [`Chi2`](../../api_docs/python/contrib.distributions.md#Chi2)
   * [`Dirichlet`](../../api_docs/python/contrib.distributions.md#Dirichlet)
@@ -608,6 +615,7 @@
   * [`InverseGamma`](../../api_docs/python/contrib.distributions.md#InverseGamma)
   * [`kl`](../../api_docs/python/contrib.distributions.md#kl)
   * [`Laplace`](../../api_docs/python/contrib.distributions.md#Laplace)
+  * [`Multinomial`](../../api_docs/python/contrib.distributions.md#Multinomial)
   * [`MultivariateNormalCholesky`](../../api_docs/python/contrib.distributions.md#MultivariateNormalCholesky)
   * [`MultivariateNormalDiag`](../../api_docs/python/contrib.distributions.md#MultivariateNormalDiag)
   * [`MultivariateNormalDiagPlusVDVT`](../../api_docs/python/contrib.distributions.md#MultivariateNormalDiagPlusVDVT)
@@ -636,6 +644,7 @@
   * [`convert_to_tensor_or_sparse_tensor`](../../api_docs/python/contrib.framework.md#convert_to_tensor_or_sparse_tensor)
   * [`create_global_step`](../../api_docs/python/contrib.framework.md#create_global_step)
   * [`deprecated`](../../api_docs/python/contrib.framework.md#deprecated)
+  * [`deprecated_arg_values`](../../api_docs/python/contrib.framework.md#deprecated_arg_values)
   * [`get_global_step`](../../api_docs/python/contrib.framework.md#get_global_step)
   * [`get_graph_from_inputs`](../../api_docs/python/contrib.framework.md#get_graph_from_inputs)
   * [`get_local_variables`](../../api_docs/python/contrib.framework.md#get_local_variables)
@@ -660,6 +669,31 @@
   * [`with_same_shape`](../../api_docs/python/contrib.framework.md#with_same_shape)
   * [`with_shape`](../../api_docs/python/contrib.framework.md#with_shape)
 
+* **[Graph Editor (contrib)](../../api_docs/python/contrib.graph_editor.md)**:
+  * [`bypass`](../../api_docs/python/contrib.graph_editor.md#bypass)
+  * [`connect`](../../api_docs/python/contrib.graph_editor.md#connect)
+  * [`detach`](../../api_docs/python/contrib.graph_editor.md#detach)
+  * [`detach_inputs`](../../api_docs/python/contrib.graph_editor.md#detach_inputs)
+  * [`detach_outputs`](../../api_docs/python/contrib.graph_editor.md#detach_outputs)
+  * [`matcher`](../../api_docs/python/contrib.graph_editor.md#matcher)
+  * [`ph`](../../api_docs/python/contrib.graph_editor.md#ph)
+  * [`reroute_a2b`](../../api_docs/python/contrib.graph_editor.md#reroute_a2b)
+  * [`reroute_a2b_inputs`](../../api_docs/python/contrib.graph_editor.md#reroute_a2b_inputs)
+  * [`reroute_a2b_outputs`](../../api_docs/python/contrib.graph_editor.md#reroute_a2b_outputs)
+  * [`reroute_b2a`](../../api_docs/python/contrib.graph_editor.md#reroute_b2a)
+  * [`reroute_b2a_inputs`](../../api_docs/python/contrib.graph_editor.md#reroute_b2a_inputs)
+  * [`reroute_b2a_outputs`](../../api_docs/python/contrib.graph_editor.md#reroute_b2a_outputs)
+  * [`select_ops`](../../api_docs/python/contrib.graph_editor.md#select_ops)
+  * [`select_ts`](../../api_docs/python/contrib.graph_editor.md#select_ts)
+  * [`sgv`](../../api_docs/python/contrib.graph_editor.md#sgv)
+  * [`sgv_scope`](../../api_docs/python/contrib.graph_editor.md#sgv_scope)
+  * [`SubGraphView`](../../api_docs/python/contrib.graph_editor.md#SubGraphView)
+  * [`swap`](../../api_docs/python/contrib.graph_editor.md#swap)
+  * [`swap_inputs`](../../api_docs/python/contrib.graph_editor.md#swap_inputs)
+  * [`swap_outputs`](../../api_docs/python/contrib.graph_editor.md#swap_outputs)
+  * [`Transformer`](../../api_docs/python/contrib.graph_editor.md#Transformer)
+  * [`ts`](../../api_docs/python/contrib.graph_editor.md#ts)
+
 * **[Layers (contrib)](../../api_docs/python/contrib.layers.md)**:
   * [`apply_regularization`](../../api_docs/python/contrib.layers.md#apply_regularization)
   * [`avg_pool2d`](../../api_docs/python/contrib.layers.md#avg_pool2d)
diff --git a/tensorflow/g3doc/api_docs/python/math_ops.md b/tensorflow/g3doc/api_docs/python/math_ops.md
index 4a9ead85023..51cfff68af4 100644
--- a/tensorflow/g3doc/api_docs/python/math_ops.md
+++ b/tensorflow/g3doc/api_docs/python/math_ops.md
@@ -21,7 +21,8 @@ operators to your graph.
 
 Returns x + y element-wise.
 
-*NOTE*: Add supports broadcasting. AddN does not.
+*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
 
 ##### Args:
 
@@ -41,6 +42,9 @@ Returns x + y element-wise.
 
 Returns x - y element-wise.
 
+*NOTE*: `Sub` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
@@ -59,6 +63,9 @@ Returns x - y element-wise.
 
 Returns x * y element-wise.
 
+*NOTE*: `Mul` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
@@ -77,6 +84,9 @@ Returns x * y element-wise.
 
 Returns x / y element-wise.
 
+*NOTE*: `Div` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
@@ -164,6 +174,9 @@ as well.
 
 Returns element-wise remainder of division.
 
+*NOTE*: `Mod` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
@@ -504,7 +517,10 @@ Returns element-wise largest integer not greater than x.
 
 ### `tf.maximum(x, y, name=None)` {#maximum}
 
-Returns the max of x and y (i.e. x > y ? x : y) element-wise, broadcasts.
+Returns the max of x and y (i.e. x > y ? x : y) element-wise.
+
+*NOTE*: `Maximum` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
 
 ##### Args:
 
@@ -522,7 +538,10 @@ Returns the max of x and y (i.e. x > y ? x : y) element-wise, broadcasts.
 
 ### `tf.minimum(x, y, name=None)` {#minimum}
 
-Returns the min of x and y (i.e. x < y ? x : y) element-wise, broadcasts.
+Returns the min of x and y (i.e. x < y ? x : y) element-wise.
+
+*NOTE*: `Minimum` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
 
 ##### Args:
 
@@ -749,6 +768,9 @@ Computes the complementary error function of `x` element-wise.
 
 Returns (x - y)(x - y) element-wise.
 
+*NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
 ##### Args:
 
 
@@ -1365,7 +1387,7 @@ It is computed as:
 
 ### `tf.matrix_determinant(input, name=None)` {#matrix_determinant}
 
-Calculates the determinant of a square matrix.
+Computes the determinant of a square matrix.
 
 ##### Args:
 
@@ -1384,7 +1406,7 @@ Calculates the determinant of a square matrix.
 
 ### `tf.batch_matrix_determinant(input, name=None)` {#batch_matrix_determinant}
 
-Calculates the determinants for a batch of square matrices.
+Computes the determinants for a batch of square matrices.
 
 The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
 form square matrices. The output is a tensor containing the determinants
@@ -1407,7 +1429,7 @@ for all input submatrices `[..., :, :]`.
 
 ### `tf.matrix_inverse(input, adjoint=None, name=None)` {#matrix_inverse}
 
-Calculates the inverse of a square invertible matrix or its adjoint (conjugate
+Computes the inverse of a square invertible matrix or its adjoint (conjugate
 
 transpose).
 
@@ -1437,7 +1459,7 @@ garbage result.
 
 ### `tf.batch_matrix_inverse(input, adjoint=None, name=None)` {#batch_matrix_inverse}
 
-Calculates the inverse of square invertible matrices or their adjoints
+Computes the inverse of square invertible matrices or their adjoints
 
 (conjugate transposes).
 
@@ -1469,7 +1491,7 @@ garbage result.
 
 ### `tf.cholesky(input, name=None)` {#cholesky}
 
-Calculates the Cholesky decomposition of a square matrix.
+Computes the Cholesky decomposition of a square matrix.
 
 The input has to be symmetric and positive definite. Only the lower-triangular
 part of the input will be used for this operation. The upper-triangular part
@@ -1494,7 +1516,7 @@ input, `L`, so that `input = L L^*`.
 
 ### `tf.batch_cholesky(input, name=None)` {#batch_cholesky}
 
-Calculates the Cholesky decomposition of a batch of square matrices.
+Computes the Cholesky decomposition of a batch of square matrices.
 
 The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
 form square matrices, with the same constraints as the single matrix Cholesky
@@ -1590,56 +1612,6 @@ X[3, :, 2]  # Solution to the linear system A[3, :, :] x = RHS[3, :, 2]
 
 
 
-- - -
-
-### `tf.self_adjoint_eig(input, name=None)` {#self_adjoint_eig}
-
-Calculates the Eigen Decomposition of a square Self-Adjoint matrix.
-
-Only the lower-triangular part of the input will be used in this case. The
-upper-triangular part will not be read.
-
-The result is a M+1 x M matrix whose first row is the eigenvalues, and
-subsequent rows are eigenvectors.
-
-##### Args:
-
-
-*  <b>`input`</b>: A `Tensor`. Must be one of the following types: `float64`, `float32`.
-    Shape is `[M, M]`.
-*  <b>`name`</b>: A name for the operation (optional).
-
-##### Returns:
-
-  A `Tensor`. Has the same type as `input`. Shape is `[M+1, M]`.
-
-
-- - -
-
-### `tf.batch_self_adjoint_eig(input, name=None)` {#batch_self_adjoint_eig}
-
-Calculates the Eigen Decomposition of a batch of square self-adjoint matrices.
-
-The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-form square matrices, with the same constraints as the single matrix
-SelfAdjointEig.
-
-The result is a '[..., M+1, M] matrix with [..., 0,:] containing the
-eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
-
-##### Args:
-
-
-*  <b>`input`</b>: A `Tensor`. Must be one of the following types: `float64`, `float32`.
-    Shape is `[..., M, M]`.
-*  <b>`name`</b>: A name for the operation (optional).
-
-##### Returns:
-
-  A `Tensor`. Has the same type as `input`. Shape is `[..., M+1, M]`.
-
-
-
 - - -
 
 ### `tf.matrix_solve(matrix, rhs, adjoint=None, name=None)` {#matrix_solve}
@@ -1886,6 +1858,179 @@ typically 6-7 times slower than the fast path. If `fast` is `False` then
 
 
 
+- - -
+
+### `tf.self_adjoint_eig(matrix, name=None)` {#self_adjoint_eig}
+
+Computes the eigen decomposition of a self-adjoint matrix.
+
+Computes the eigenvalues and eigenvectors of an N-by-N matrix `matrix` such
+that `matrix * v[:,i] = e(i) * v[:,i]`, for i=0...N-1.
+
+##### Args:
+
+
+*  <b>`matrix`</b>: `Tensor` of shape `[N, N]`.
+*  <b>`name`</b>: string, optional name of the operation.
+
+##### Returns:
+
+
+*  <b>`e`</b>: Eigenvalues. Shape is `[N]`.
+*  <b>`v`</b>: Eigenvectors. Shape is `[N, N]`. The columns contain the eigenvectors of
+    `matrix`.
+
+
+- - -
+
+### `tf.batch_self_adjoint_eig(tensor, name=None)` {#batch_self_adjoint_eig}
+
+Computes the eigen decomposition of a batch of self-adjoint matrices.
+
+Computes the eigenvalues and eigenvectors of the innermost N-by-N matrices
+in `tensor` such that
+`tensor[...,:,:] * v[..., :,i] = e(..., i) * v[...,:,i]`, for i=0...N-1.
+
+##### Args:
+
+
+*  <b>`tensor`</b>: `Tensor` of shape `[..., N, N]`.
+*  <b>`name`</b>: string, optional name of the operation.
+
+##### Returns:
+
+
+*  <b>`e`</b>: Eigenvalues. Shape is `[..., N]`.
+*  <b>`v`</b>: Eigenvectors. Shape is `[..., N, N]`. The columns of the inner most
+  matrices
+    contain eigenvectors of the corresponding matrices in `tensor`
+
+
+- - -
+
+### `tf.self_adjoint_eigvals(matrix, name=None)` {#self_adjoint_eigvals}
+
+Computes the eigenvalues a self-adjoint  matrix.
+
+##### Args:
+
+
+*  <b>`matrix`</b>: `Tensor` of shape `[N, N]`.
+*  <b>`name`</b>: string, optional name of the operation.
+
+##### Returns:
+
+
+*  <b>`e`</b>: Eigenvalues of `matrix`. Shape is `[N]`.
+
+
+- - -
+
+### `tf.batch_self_adjoint_eigvals(tensor, name=None)` {#batch_self_adjoint_eigvals}
+
+Computes the eigenvalues of a batch of self-adjoint matrices.
+
+##### Args:
+
+
+*  <b>`tensor`</b>: `Tensor` of shape `[..., N, N]`.
+*  <b>`name`</b>: string, optional name of the operation.
+
+##### Returns:
+
+
+*  <b>`e`</b>: Eigenvalues. Shape is `[..., N]`. The vector `e[..., :]` contains the `N`
+    eigenvalues of `tensor[..., :, :]`.
+
+
+
+- - -
+
+### `tf.svd(matrix, compute_uv=True, full_matrices=False, name=None)` {#svd}
+
+Computes the singular value decomposition of a matrix.
+
+Computes the SVD of `matrix` such that `matrix = u * diag(s) *
+transpose(v)`
+
+```prettyprint
+# a is a matrix.
+# s is a vector of singular values.
+# u is the matrix of left singular vectors.
+# v is a matrix of right singular vectors.
+s, u, v = svd(a)
+s = svd(a, compute_uv=False)
+```
+
+##### Args:
+
+
+*  <b>`matrix`</b>: `Tensor` of shape `[M, N]`. Let `P` be the minimum of `M` and `N`.
+*  <b>`compute_uv`</b>: If `True` then left and right singular vectors will be
+    computed and returned in `u` and `v`, respectively. Otherwise, only the
+    singular values will be computed, which can be significantly faster.
+*  <b>`full_matrices`</b>: If true, compute full-sized `u` and `v`. If false
+    (the default), compute only the leading `P` singular vectors.
+    Ignored if `compute_uv` is `False`.
+*  <b>`name`</b>: string, optional name of the operation.
+
+##### Returns:
+
+
+*  <b>`s`</b>: Singular values. Shape is `[P]`.
+*  <b>`u`</b>: Right singular vectors. If `full_matrices` is `False` (default) then
+    shape is `[M, P]`; if `full_matrices` is `True` then shape is
+    `[M, M]`. Not returned if `compute_uv` is `False`.
+*  <b>`v`</b>: Left singular vectors. If `full_matrices` is `False` (default) then
+    shape is `[N, P]`. If `full_matrices` is `True` then shape is
+    `[N, N]`. Not returned if `compute_uv` is `False`.
+
+
+- - -
+
+### `tf.batch_svd(tensor, compute_uv=True, full_matrices=False, name=None)` {#batch_svd}
+
+Computes the singular value decompositions of a batch of matrices.
+
+Computes the SVD of each inner matrix in `tensor` such that
+`tensor[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :,
+:])`
+
+```prettyprint
+# a is a tensor.
+# s is a tensor of singular values.
+# u is a tensor of left singular vectors.
+# v is a tensor of right singular vectors.
+s, u, v = batch_svd(a)
+s = batch_svd(a, compute_uv=False)
+```
+
+##### Args:
+
+
+*  <b>`matrix`</b>: `Tensor` of shape `[..., M, N]`. Let `P` be the minimum of `M` and
+    `N`.
+*  <b>`compute_uv`</b>: If `True` then left and right singular vectors will be
+    computed and returned in `u` and `v`, respectively. Otherwise, only the
+    singular values will be computed, which can be significantly faster.
+*  <b>`full_matrices`</b>: If true, compute full-sized `u` and `v`. If false
+    (the default), compute only the leading `P` singular vectors.
+    Ignored if `compute_uv` is `False`.
+*  <b>`name`</b>: string, optional name of the operation.
+
+##### Returns:
+
+
+*  <b>`s`</b>: Singular values. Shape is `[..., P]`.
+*  <b>`u`</b>: Right singular vectors. If `full_matrices` is `False` (default) then
+    shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
+    `[..., M, M]`. Not returned if `compute_uv` is `False`.
+*  <b>`v`</b>: Left singular vectors. If `full_matrices` is `False` (default) then
+    shape is `[..., N, P]`. If `full_matrices` is `True` then shape is
+    `[..., N, N]`. Not returned if `compute_uv` is `False`.
+
+
+
 ## Complex Number Functions
 
 TensorFlow provides several operations that you can use to add complex number
@@ -2603,8 +2748,8 @@ tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0]
 
 
 *  <b>`x`</b>: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-   `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-   `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+     `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+     `complex128`, `qint8`, `quint8`, `qint32`, `half`.
 *  <b>`axis`</b>: A `Tensor` of type `int32` (default: 0).
 *  <b>`reverse`</b>: A `bool` (default: False).
 *  <b>`name`</b>: A name for the operation (optional).
@@ -2620,13 +2765,15 @@ tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0]
 
 Compute the cumulative product of the tensor `x` along `axis`.
 
-By default, this op performs an inclusive cumprod, which means that the first
+By default, this op performs an inclusive cumprod, which means that the
+first
 element of the input is identical to the first element of the output:
 ```prettyprint
 tf.cumprod([a, b, c]) ==> [a, a * b, a * b * c]
 ```
 
-By setting the `exclusive` kwarg to `True`, an exclusive cumprod is performed
+By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
+performed
 instead:
 ```prettyprint
 tf.cumprod([a, b, c], exclusive=True) ==> [0, a, a * b]
@@ -2648,8 +2795,8 @@ tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 0]
 
 
 *  <b>`x`</b>: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-   `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-   `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+     `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+     `complex128`, `qint8`, `quint8`, `qint32`, `half`.
 *  <b>`axis`</b>: A `Tensor` of type `int32` (default: 0).
 *  <b>`reverse`</b>: A `bool` (default: False).
 *  <b>`name`</b>: A name for the operation (optional).
diff --git a/tensorflow/g3doc/api_docs/python/nn.md b/tensorflow/g3doc/api_docs/python/nn.md
index 075f85b2495..4268fdf5d21 100644
--- a/tensorflow/g3doc/api_docs/python/nn.md
+++ b/tensorflow/g3doc/api_docs/python/nn.md
@@ -1539,8 +1539,9 @@ The corresponding output is either a single `Tensor` having the same number
 of time steps and batch size, or a (possibly nested) tuple of such tensors,
 matching the nested structure of `cell.output_size`.
 
-The parameter `sequence_length` is required and dynamic calculation is
-automatically performed.
+The parameter `sequence_length` is optional and is used to copy-through state
+and zero-out outputs when past a batch element's sequence length. So it's more
+for correctness than performance, unlike in rnn().
 
 ##### Args:
 
diff --git a/tensorflow/g3doc/api_docs/python/rnn_cell.md b/tensorflow/g3doc/api_docs/python/rnn_cell.md
index 94b48f5d416..5fcbd27966a 100644
--- a/tensorflow/g3doc/api_docs/python/rnn_cell.md
+++ b/tensorflow/g3doc/api_docs/python/rnn_cell.md
@@ -13,6 +13,11 @@ Module for constructing RNN Cells.
 
 Abstract object representing an RNN cell.
 
+The definition of cell in this package differs from the definition used in the
+literature. In the literature, cell refers to an object with a single scalar
+output. The definition in this package refers to a horizontal array of such
+units.
+
 An RNN cell, in the most abstract setting, is anything that has
 a state and performs some operation that takes a matrix of inputs.
 This operation results in an output matrix with `self.output_size` columns.
diff --git a/tensorflow/g3doc/api_docs/python/sparse_ops.md b/tensorflow/g3doc/api_docs/python/sparse_ops.md
index a1d9d23bea7..e934dc765cd 100644
--- a/tensorflow/g3doc/api_docs/python/sparse_ops.md
+++ b/tensorflow/g3doc/api_docs/python/sparse_ops.md
@@ -350,7 +350,7 @@ The input `SparseTensor` must be in row-major order.
 
 - - -
 
-### `tf.sparse_merge(sp_ids, sp_values, vocab_size, name=None)` {#sparse_merge}
+### `tf.sparse_merge(sp_ids, sp_values, vocab_size, name=None, already_sorted=False)` {#sparse_merge}
 
 Combines a batch of feature ids and values into a single `SparseTensor`.
 
@@ -370,14 +370,17 @@ The `SparseTensor` returned by this function has the following properties:
 
 For example, consider the following feature vectors:
 
+```python
   vector1 = [-3, 0, 0, 0, 0, 0]
   vector2 = [ 0, 1, 0, 4, 1, 0]
   vector3 = [ 5, 0, 0, 9, 0, 0]
+```
 
 These might be stored sparsely in the following Example protos by storing
 only the feature ids (column number if the vectors are treated as a matrix)
 of the non-zero elements and the corresponding values:
 
+```python
   examples = [Example(features={
                   "ids": Feature(int64_list=Int64List(value=[0])),
                   "values": Feature(float_list=FloatList(value=[-3]))}),
@@ -387,6 +390,7 @@ of the non-zero elements and the corresponding values:
               Example(features={
                   "ids": Feature(int64_list=Int64List(value=[0, 3])),
                   "values": Feature(float_list=FloatList(value=[5, 9]))})]
+```
 
 The result of calling parse_example on these examples will produce a
 dictionary with entries for "ids" and "values". Passing those two objects
@@ -399,9 +403,11 @@ batch, and the second dimension is the column number, i.e., the feature id);
 original matrix, i.e., (3, 6). For our example above, the output will be
 equal to:
 
+```python
   SparseTensor(indices=[[0, 0], [1, 1], [1, 3], [1, 4], [2, 0], [2, 3]],
                values=[-3, 1, 4, 1, 5, 9],
                shape=[3, 6])
+```
 
 ##### Args:
 
@@ -412,6 +418,9 @@ equal to:
 *  <b>`vocab_size`</b>: A scalar `int64` Tensor (or Python int) containing the new size
     of the last dimension, `all(0 <= sp_ids.values < vocab_size)`.
 *  <b>`name`</b>: A name prefix for the returned tensors (optional)
+*  <b>`already_sorted`</b>: A boolean to specify whether the per-batch values in
+   `sp_values` are already sorted. If so skip sorting, False by default
+   (optional).
 
 ##### Returns:
 
diff --git a/tensorflow/g3doc/api_docs/python/state_ops.md b/tensorflow/g3doc/api_docs/python/state_ops.md
index 250886db3a7..4b5f4cf353c 100644
--- a/tensorflow/g3doc/api_docs/python/state_ops.md
+++ b/tensorflow/g3doc/api_docs/python/state_ops.md
@@ -1065,7 +1065,7 @@ create variables contingent on certain conditions.
 
 - - -
 
-### `tf.get_variable(name, shape=None, dtype=tf.float32, initializer=None, regularizer=None, trainable=True, collections=None, caching_device=None, partitioner=None, validate_shape=True, custom_getter=None)` {#get_variable}
+### `tf.get_variable(name, shape=None, dtype=None, initializer=None, regularizer=None, trainable=True, collections=None, caching_device=None, partitioner=None, validate_shape=True, custom_getter=None)` {#get_variable}
 
 Gets an existing variable with these parameters or create a new one.
 
@@ -1167,9 +1167,10 @@ Attributes:
   partitioner: callable or `None`: the partitioner passed to `get_variable`.
   custom_getter: default custom getter passed to get_variable.
   name_scope: The name passed to `tf.name_scope`.
+  dtype: default type passed to get_variable (defaults to DT_FLOAT).
 - - -
 
-#### `tf.VariableScope.__init__(reuse, name='', initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, name_scope='')` {#VariableScope.__init__}
+#### `tf.VariableScope.__init__(reuse, name='', initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, name_scope='', dtype=tf.float32)` {#VariableScope.__init__}
 
 Creates a new VariableScope with the given properties.
 
@@ -1190,7 +1191,14 @@ Creates a new VariableScope with the given properties.
 
 - - -
 
-#### `tf.VariableScope.get_variable(var_store, name, shape=None, dtype=tf.float32, initializer=None, regularizer=None, trainable=True, collections=None, caching_device=None, partitioner=None, validate_shape=True, custom_getter=None)` {#VariableScope.get_variable}
+#### `tf.VariableScope.dtype` {#VariableScope.dtype}
+
+
+
+
+- - -
+
+#### `tf.VariableScope.get_variable(var_store, name, shape=None, dtype=None, initializer=None, regularizer=None, trainable=True, collections=None, caching_device=None, partitioner=None, validate_shape=True, custom_getter=None)` {#VariableScope.get_variable}
 
 Gets an existing variable with this name or create a new one.
 
@@ -1258,6 +1266,13 @@ Set caching_device for this scope.
 Set custom getter for this scope.
 
 
+- - -
+
+#### `tf.VariableScope.set_dtype(dtype)` {#VariableScope.set_dtype}
+
+Set data type for this scope.
+
+
 - - -
 
 #### `tf.VariableScope.set_initializer(initializer)` {#VariableScope.set_initializer}
@@ -1282,7 +1297,7 @@ Set regularizer for this scope.
 
 - - -
 
-### `tf.variable_scope(name_or_scope, reuse=None, initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None)` {#variable_scope}
+### `tf.variable_scope(name_or_scope, reuse=None, initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, dtype=None)` {#variable_scope}
 
 Returns a context for variable scope.
 
@@ -1353,6 +1368,8 @@ then all its sub-scopes become reusing as well.
 *  <b>`caching_device`</b>: default caching device for variables within this scope.
 *  <b>`partitioner`</b>: default partitioner for variables within this scope.
 *  <b>`custom_getter`</b>: default custom getter for variables within this scope.
+*  <b>`dtype`</b>: type of variables created in this scope (defaults to the type
+    in the passed scope, or inherited from parent scope).
 
 ##### Returns:
 
@@ -1368,7 +1385,7 @@ then all its sub-scopes become reusing as well.
 
 - - -
 
-### `tf.variable_op_scope(values, name_or_scope, default_name=None, initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, reuse=None)` {#variable_op_scope}
+### `tf.variable_op_scope(values, name_or_scope, default_name=None, initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, reuse=None, dtype=None)` {#variable_op_scope}
 
 Returns a context manager for defining an op that creates variables.
 
@@ -1412,6 +1429,8 @@ def my_op_with_vars(a, b, scope=None):
 *  <b>`custom_getter`</b>: The default custom getter for variables within this scope.
 *  <b>`reuse`</b>: `True` or `None`; if `True`, we go into reuse mode for this scope as
     well as all sub-scopes; if `None`, we just inherit the parent scope reuse.
+*  <b>`dtype`</b>: The default type of variables created in this scope, defaults to the
+    type of the parent scope.
 
 ##### Returns:
 
diff --git a/tensorflow/g3doc/api_docs/python/train.md b/tensorflow/g3doc/api_docs/python/train.md
index 792fb2bcb78..e080769f41b 100644
--- a/tensorflow/g3doc/api_docs/python/train.md
+++ b/tensorflow/g3doc/api_docs/python/train.md
@@ -851,7 +851,7 @@ learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                            100000, 0.96, staircase=True)
 # Passing global_step to minimize() will increment it at each step.
 learning_step = (
-    tf.GradientDescentOptimizer(learning_rate)
+    tf.train.GradientDescentOptimizer(learning_rate)
     .minimize(...my loss..., global_step=global_step)
 )
 ```
@@ -1251,11 +1251,14 @@ After this is called, calls to `should_stop()` will return `False`.
 
 - - -
 
-#### `tf.train.Coordinator.join(threads, stop_grace_period_secs=120)` {#Coordinator.join}
+#### `tf.train.Coordinator.join(threads=None, stop_grace_period_secs=120)` {#Coordinator.join}
 
 Wait for threads to terminate.
 
-Blocks until all `threads` have terminated or `request_stop()` is called.
+This call blocks until a set of threads have terminated.  The set of thread
+is the union of the threads passed in the `threads` argument and the list
+of threads that registered with the coordinator by calling
+`Coordinator.register_thread()`.
 
 After the threads stop, if an `exc_info` was passed to `request_stop`, that
 exception is re-raised.
@@ -1269,7 +1272,8 @@ that `RuntimeError`.
 ##### Args:
 
 
-*  <b>`threads`</b>: List of `threading.Threads`. The started threads to join.
+*  <b>`threads`</b>: List of `threading.Threads`. The started threads to join in
+    addition to the registered threads.
 *  <b>`stop_grace_period_secs`</b>: Number of seconds given to threads to stop after
     `request_stop()` has been called.
 
@@ -1287,6 +1291,18 @@ that `RuntimeError`.
 
 
 
+- - -
+
+#### `tf.train.Coordinator.register_thread(thread)` {#Coordinator.register_thread}
+
+Register a thread to join.
+
+##### Args:
+
+
+*  <b>`thread`</b>: A Python thread to join.
+
+
 - - -
 
 #### `tf.train.Coordinator.request_stop(ex=None)` {#Coordinator.request_stop}
diff --git a/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md b/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md
index 0d733ce9941..8183cdf0247 100644
--- a/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md
+++ b/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md
@@ -201,4 +201,4 @@ For in depth information on how to use the *graph* tab to visualize your graph,
 see [TensorBoard: Graph Visualization](../../how_tos/graph_viz/index.md).
 
 For more usage information on TensorBoard in general, see the [TensorBoard
-Readme](../../../tensorboard/README.md).
+README](https://www.tensorflow.org/code/tensorflow/tensorboard/README.md).
diff --git a/tensorflow/g3doc/tutorials/index.md b/tensorflow/g3doc/tutorials/index.md
index a489d977c8f..c634a6f6add 100644
--- a/tensorflow/g3doc/tutorials/index.md
+++ b/tensorflow/g3doc/tutorials/index.md
@@ -63,6 +63,12 @@ model and a deep neural net to harness the advantages of each type of model.
 
 [View Tutorial](../tutorials/wide_and_deep/index.md)
 
+### Logging and Monitoring Basics with tf.contrib.learn
+
+This tutorial shows you how to use TensorFlow’s logging capabilities and the
+Monitor API to audit the in-progress training of a neural network.
+
+[View Tutorial](../tutorials/monitors/index.md)
 
 ## TensorFlow Serving
 
diff --git a/tensorflow/g3doc/tutorials/leftnav_files b/tensorflow/g3doc/tutorials/leftnav_files
index 9c80a6c6e1b..75ef57f59fa 100644
--- a/tensorflow/g3doc/tutorials/leftnav_files
+++ b/tensorflow/g3doc/tutorials/leftnav_files
@@ -7,6 +7,7 @@ tflearn/index.md
 linear/overview.md
 wide/index.md
 wide_and_deep/index.md
+monitors/index.md
 ### TensorFlow Serving
 tfserve/index.md
 ### Image Processing
diff --git a/tensorflow/g3doc/tutorials/linear/overview.md b/tensorflow/g3doc/tutorials/linear/overview.md
index f8fd1ab0de8..aafa1585760 100644
--- a/tensorflow/g3doc/tutorials/linear/overview.md
+++ b/tensorflow/g3doc/tutorials/linear/overview.md
@@ -174,11 +174,11 @@ that value.
 indicating how to represent and transform the data. But they do not provide
 the data itself. You provide the data through an input function.
 
-The input function must return a dictionary of tensors. Each key corresponds
-to the name of a `FeatureColumn`. Each key's value is a tensor containing the
+The input function must return a dictionary of tensors. Each key corresponds to
+the name of a `FeatureColumn`. Each key's value is a tensor containing the
 values of that feature for all data instances. See `input_fn` in the [linear
-models tutorial code](
-https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py?l=160)
+models tutorial code]
+(https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py)
 for an example of an input function.
 
 The input function is passed to the `fit()` and `evaluate()` calls that
diff --git a/tensorflow/g3doc/tutorials/recurrent/index.md b/tensorflow/g3doc/tutorials/recurrent/index.md
index 52155633329..82b159c20ab 100644
--- a/tensorflow/g3doc/tutorials/recurrent/index.md
+++ b/tensorflow/g3doc/tutorials/recurrent/index.md
@@ -155,8 +155,9 @@ the second and so on.
 We have a class called `MultiRNNCell` that makes the implementation seamless:
 
 ```python
-lstm = rnn_cell.BasicLSTMCell(lstm_size)
-stacked_lstm = rnn_cell.MultiRNNCell([lstm] * number_of_layers)
+lstm = rnn_cell.BasicLSTMCell(lstm_size, state_is_tuple=False)
+stacked_lstm = rnn_cell.MultiRNNCell([lstm] * number_of_layers,
+    state_is_tuple=False)
 
 initial_state = state = stacked_lstm.zero_state(batch_size, tf.float32)
 for i in range(num_steps):
diff --git a/tensorflow/models/rnn/ptb/ptb_word_lm.py b/tensorflow/models/rnn/ptb/ptb_word_lm.py
index 5fea073820a..a8b54a3e9f3 100644
--- a/tensorflow/models/rnn/ptb/ptb_word_lm.py
+++ b/tensorflow/models/rnn/ptb/ptb_word_lm.py
@@ -148,11 +148,15 @@ class PTBModel(object):
     tvars = tf.trainable_variables()
     grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                       config.max_grad_norm)
-    optimizer = tf.train.GradientDescentOptimizer(self.lr)
+    optimizer = tf.train.GradientDescentOptimizer(self._lr)
     self._train_op = optimizer.apply_gradients(zip(grads, tvars))
 
+    self._new_lr = tf.placeholder(
+        tf.float32, shape=[], name="new_learning_rate")
+    self._lr_update = tf.assign(self._lr, self._new_lr)
+
   def assign_lr(self, session, lr_value):
-    session.run(tf.assign(self.lr, lr_value))
+    session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
 
   @property
   def input_data(self):
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index fce04e50f7c..5e2621cca8f 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1057,6 +1057,27 @@ cuda_py_tests(
     ],
 )
 
+py_library(
+    name = "net_lib",
+    testonly = 1,
+    srcs = ["util/net_lib.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pywrap_tensorflow",
+    ],
+)
+
+py_tests(
+    name = "net_lib_test",
+    size = "small",
+    srcs = [
+        "util/net_lib_test.py",
+    ],
+    additional_deps = [
+        ":net_lib",
+    ],
+)
+
 tf_cuda_library(
     name = "tf_session_helper",
     srcs = ["client/tf_session_helper.cc"],
@@ -1083,6 +1104,7 @@ tf_py_wrap_cc(
     swig_includes = [
         "client/device_lib.i",
         "client/events_writer.i",
+        "client/net_lib.i",
         "client/quantize_training.i",
         "client/tf_session.i",
         "framework/python_op_gen.i",
@@ -1148,6 +1170,14 @@ py_test(
     ],
 )
 
+cuda_py_test(
+    name = "localhost_cluster_performance_test",
+    size = "medium",
+    srcs = [
+        "training/localhost_cluster_performance_test.py",
+    ],
+)
+
 py_library(
     name = "timeline",
     srcs = ["client/timeline.py"],
@@ -1246,7 +1276,6 @@ cuda_py_test(
     name = "special_math_ops_test",
     size = "small",
     srcs = ["ops/special_math_ops_test.py"],
-    tags = ["notsan"],
 )
 
 cuda_py_tests(
@@ -1269,7 +1298,6 @@ cuda_py_tests(
         "//tensorflow/core:image_testdata",
     ],
     shard_count = 5,
-    tags = ["notsan"],
 )
 
 cuda_py_tests(
@@ -1283,6 +1311,7 @@ cuda_py_tests(
             "training/session_manager_test.py",
             "training/supervisor_test.py",
             "training/saver_large_variable_test.py",
+            "training/localhost_cluster_performance_test.py",
         ],
     ),
     additional_deps = [
diff --git a/tensorflow/python/client/net_lib.i b/tensorflow/python/client/net_lib.i
new file mode 100644
index 00000000000..333e2abbc59
--- /dev/null
+++ b/tensorflow/python/client/net_lib.i
@@ -0,0 +1,30 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+%include "tensorflow/python/platform/base.i"
+
+%{
+#include "tensorflow/core/platform/net.h"
+%}
+
+%ignoreall
+
+%unignore tensorflow;
+%unignore tensorflow::internal;
+%unignore tensorflow::internal::PickUnusedPortOrDie;
+
+%include "tensorflow/core/platform/net.h"
+
+%unignoreall
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index d8c5737fbb2..0c2edcb2279 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -333,16 +333,18 @@ class _DictFetchMapper(_FetchMapper):
 class _FetchHandler(object):
   """Handler for structured fetches.
 
-  Given a graph and a user-provided structure for fetches, this class takes
-  care of generating a list of tensor names to fetch and op names to run for a
-  low level `run()` call.
+  Given a graph, a user-provided structure for fetches, and a feed dict, this
+  class takes care of generating a list of tensor names to fetch and op names
+  to run for a low level `run()` call.
 
   Given the results of the low level run call, this class can also rebuild a
   result structure matching the user-provided structure for fetches, but
   containing the corresponding results.
   """
+  # TODO(touts): Make this class also take care of destructuring the feed
+  # dict instead of doing it in the callers.
 
-  def __init__(self, graph, fetches):
+  def __init__(self, graph, fetches, feeds):
     """Creates a fetch handler.
 
     Args:
@@ -350,11 +352,13 @@ class _FetchHandler(object):
         and to convert all fetches to tensors or ops as needed.
       fetches: An arbitrary fetch structure: singleton, list, tuple,
         namedtuple, or dict.
+      feeds: A feed dict where keys are fully resolved tensor names.
     """
     with graph.as_default():
       self._fetch_mapper = _FetchMapper.for_fetch(fetches)
     self._fetches = []
     self._targets = []
+    self._feeds = feeds
     self._ops = []
     self._fetch_handles = {}
     for fetch in self._fetch_mapper.unique_fetches():
@@ -370,6 +374,7 @@ class _FetchHandler(object):
       # Remember the fetch if it is for a tensor handle.
       if isinstance(fetch, ops.Tensor) and fetch.op.type == 'GetSessionHandle':
         self._fetch_handles[fetch_name] = fetch.op.inputs[0].dtype
+    self._final_fetches = [x for x in self._fetches if x not in feeds]
 
   def _assert_fetchable(self, graph, op):
     if not graph.is_fetchable(op):
@@ -382,7 +387,7 @@ class _FetchHandler(object):
     Returns:
       A list of strings.
     """
-    return self._fetches
+    return self._final_fetches
 
   def targets(self):
     """Return the unique names of ops to run.
@@ -413,19 +418,26 @@ class _FetchHandler(object):
         containing tensors or None (for fetched ops).
     """
     full_values = []
-    assert len(self._fetches) == len(tensor_values)
+    assert len(self._final_fetches) == len(tensor_values)
     i = 0
+    j = 0
     for is_op in self._ops:
       if is_op:
         full_values.append(None)
       else:
+        # If the fetch was in the feeds, use the fed value, otherwise
+        # use the returned value.
+        value = self._feeds.get(self._fetches[i])
+        if value is None:
+          value = tensor_values[j]
+          j += 1
         dtype = self._fetch_handles.get(self._fetches[i])
         if dtype:
-          full_values.append(session_ops.TensorHandle(
-              tensor_values[i], dtype, session))
+          full_values.append(session_ops.TensorHandle(value, dtype, session))
         else:
-          full_values.append(tensor_values[i])
+          full_values.append(value)
         i += 1
+    assert j == len(tensor_values)
     return self._fetch_mapper.build_results(full_values)
 
 
@@ -749,6 +761,7 @@ class BaseSession(SessionInterface):
     Raises:
       tf.errors.OpError: Or one of its subclasses on error.
     """
+    # TODO(touts): Support feeding and fetching the same tensor.
     return self._run(handle, fetches, feed_dict, None, None)
 
   def partial_run_setup(self, fetches, feeds=None):
@@ -786,9 +799,6 @@ class BaseSession(SessionInterface):
       raise RuntimeError('The Session graph is empty.  Add operations to the '
                          'graph before calling run().')
 
-    # Validate and process fetches.
-    fetch_handler = _FetchHandler(self._graph, fetches)
-
     # Create request.
     feed_list = []
 
@@ -808,6 +818,10 @@ class BaseSession(SessionInterface):
           e.args = (e.message,)
           raise e
 
+    # Validate and process fetches.
+    # TODO(touts): Support feeding and fetching the same tensor.
+    fetch_handler = _FetchHandler(self._graph, fetches, {})
+
     # Set up a graph with feeds and fetches for partial run.
     def _setup_fn(session, feed_list, fetch_list, target_list):
       self._extend_graph()
@@ -834,9 +848,6 @@ class BaseSession(SessionInterface):
       raise RuntimeError('The Session graph is empty.  Add operations to the '
                          'graph before calling run().')
 
-    # Create a fetch handler to take care of the structure of fetches.
-    fetch_handler = _FetchHandler(self._graph, fetches)
-
     # Create request.
     feed_dict_string = {}
     feed_map = {}
@@ -880,6 +891,9 @@ class BaseSession(SessionInterface):
           feed_dict_string[subfeed_name] = np_val
           feed_map[subfeed_name] = (subfeed_t, subfeed_val)
 
+    # Create a fetch handler to take care of the structure of fetches.
+    fetch_handler = _FetchHandler(self._graph, fetches, feed_dict_string)
+
     # Run request and get response.
     # We need to keep the movers alive for the following _do_run().
     # These movers are no longer needed when _do_run() completes, and
@@ -887,9 +901,13 @@ class BaseSession(SessionInterface):
     # TODO(yuanbyu, keveman): Revisit whether we should just treat feeding
     # of a handle from a different device as an error.
     movers = self._update_with_movers(feed_dict_string, feed_map)
-    results = self._do_run(handle, fetch_handler.targets(),
-                           fetch_handler.fetches(), feed_dict_string, options,
-                           run_metadata)
+    final_fetches = fetch_handler.fetches()
+    final_targets = fetch_handler.targets()
+    if final_fetches or final_targets:
+      results = self._do_run(handle, final_targets, final_fetches,
+                             feed_dict_string, options, run_metadata)
+    else:
+      results = []
     return fetch_handler.build_results(self, results)
 
   # Captures the name of a node in an error status.
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index 424ba665fa2..462afc39e0e 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -505,6 +505,11 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
+      # Feed with tuple, fetch sp directly
+      sp_out = s.run(sp, {sp: (indices, values, shape)})
+      self.assertAllEqual(sp_out.indices, indices)
+      self.assertAllEqual(sp_out.values, values)
+      self.assertAllEqual(sp_out.shape, shape)
       # Feed with SparseTensorValue
       indices_out, values_out, shape_out = s.run(
           [sp_indices, sp_values, sp_shape],
@@ -517,6 +522,11 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(sp2_out.indices, indices)
       self.assertAllEqual(sp2_out.values, values)
       self.assertAllEqual(sp2_out.shape, shape)
+      # Feed SparseTensorValue and fetch sp directly.
+      sp_out = s.run(sp, {sp: ops.SparseTensorValue(indices, values, shape)})
+      self.assertAllEqual(sp_out.indices, indices)
+      self.assertAllEqual(sp_out.values, values)
+      self.assertAllEqual(sp_out.shape, shape)
 
   def testFeedSparsePlaceholder(self):
     with session.Session() as s:
@@ -1036,7 +1046,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(a2_val, [[1.0, 1.0]])
 
   def testFeedAndFetch(self):
-    with session.Session():
+    with session.Session() as sess:
       for dtype in [dtypes.float16,
                     dtypes.float32,
                     dtypes.float64,
@@ -1066,7 +1076,15 @@ class SessionTest(test_util.TensorFlowTestCase):
             np_array = np_array.astype(np_dtype)
 
           self.assertAllEqual(np_array,
-                              out_t.eval(feed_dict={feed_t: np_array}))
+                              sess.run(out_t, feed_dict={feed_t: np_array}))
+          # Check that we can also get the feed back.
+          self.assertAllEqual(np_array,
+                              sess.run(feed_t, feed_dict={feed_t: np_array}))
+          # Also check that we can get both back.
+          out_v, feed_v = sess.run([out_t, feed_t],
+                                   feed_dict={feed_t: np_array})
+          self.assertAllEqual(np_array, out_v)
+          self.assertAllEqual(np_array, feed_v)
 
   def testFeedError(self):
     with session.Session() as sess:
@@ -1108,7 +1126,7 @@ class SessionTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(c.eval(), c_list)
 
   def testStringFeed(self):
-    with session.Session():
+    with session.Session() as sess:
       for shape in [(32, 4, 128), (37,), (2, 0, 6), (0, 0, 0)]:
         size = 1
         for s in shape:
@@ -1117,7 +1135,12 @@ class SessionTest(test_util.TensorFlowTestCase):
                           dtype=np.object).reshape(shape)
         feed_t = array_ops.placeholder(dtype=dtypes.string, shape=shape)
         c = array_ops.identity(feed_t)
-        self.assertAllEqual(c.eval(feed_dict={feed_t: c_list}), c_list)
+        self.assertAllEqual(sess.run(c, feed_dict={feed_t: c_list}), c_list)
+        self.assertAllEqual(sess.run(feed_t, feed_dict={feed_t: c_list}),
+                            c_list)
+        c_v, feed_v = sess.run([c, feed_t], feed_dict={feed_t: c_list})
+        self.assertAllEqual(c_v, c_list)
+        self.assertAllEqual(feed_v, c_list)
 
   def testStringFeedWithNullCharacters(self):
     with session.Session():
@@ -1351,14 +1374,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(ValueError, 'may not be fed'):
         sess.run(reshaped_tensor, feed_dict={new_shape: [3, 7]})
 
-  def testRunWithNoTargetsIsAnError(self):
-    with session.Session() as sess:
-      _ = constant_op.constant(5.0)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          'Must specify at least one target to fetch or execute.'):
-        sess.run([])
-
   def testInferShapesFalse(self):
     with ops.Graph().as_default(), ops.device('/cpu:0'):
       a = constant_op.constant([[1, 2]])
diff --git a/tensorflow/python/framework/gen_docs_combined.py b/tensorflow/python/framework/gen_docs_combined.py
index 49d9cec7c19..63557302103 100644
--- a/tensorflow/python/framework/gen_docs_combined.py
+++ b/tensorflow/python/framework/gen_docs_combined.py
@@ -60,6 +60,7 @@ def get_module_to_name():
       tf.contrib.distributions: "tf.contrib.distributions",
       tf.contrib.ffmpeg: "tf.contrib.ffmpeg",
       tf.contrib.framework: "tf.contrib.framework",
+      tf.contrib.graph_editor: "tf.contrib.graph_editor",
       tf.contrib.layers: "tf.contrib.layers",
       tf.contrib.learn: "tf.contrib.learn",
       tf.contrib.learn.monitors: (
@@ -119,7 +120,7 @@ def all_libraries(module_to_name, members, documented):
       library("tensor_array_ops", "TensorArray Operations", prefix=PREFIX_TEXT),
       library("session_ops", "Tensor Handle Operations", prefix=PREFIX_TEXT),
       library("image", "Images", tf.image, exclude_symbols=["ResizeMethod"],
-               prefix=PREFIX_TEXT),
+              prefix=PREFIX_TEXT),
       library("sparse_ops",
               "Sparse Tensors",
               exclude_symbols=["serialize_sparse", "serialize_many_sparse",
@@ -167,6 +168,8 @@ def all_libraries(module_to_name, members, documented):
               tf.contrib.distributions),
       library("contrib.ffmpeg", "FFmpeg (contrib)", ffmpeg),
       library("contrib.framework", "Framework (contrib)", tf.contrib.framework),
+      library("contrib.graph_editor", "Graph Editor (contrib)",
+              tf.contrib.graph_editor),
       library("contrib.layers", "Layers (contrib)", tf.contrib.layers),
       library("contrib.learn", "Learn (contrib)", tf.contrib.learn),
       library("contrib.learn.monitors", "Monitors (contrib)",
@@ -177,7 +180,7 @@ def all_libraries(module_to_name, members, documented):
       library("contrib.util", "Utilities (contrib)", tf.contrib.util),
       library("contrib.copy_graph", "Copying Graph Elements (contrib)",
               tf.contrib.copy_graph),
-    ]
+  ]
 
 _hidden_symbols = ["Event", "LogMessage", "Summary", "SessionLog", "xrange",
                    "HistogramProto", "ConfigProto", "NodeDef", "GraphDef",
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 630c72fa5c6..3342617cad5 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -502,23 +502,26 @@ class TensorFlowTestCase(googletest.TestCase):
                                      expected_err_re_or_predicate):
     """Returns a context manager to enclose code expected to raise an exception.
 
+    If the exception is an OpError, the op stack is also included in the message
+    predicate search.
+
     Args:
       exception_type: The expected type of exception that should be raised.
       expected_err_re_or_predicate: If this is callable, it should be a function
-        of one argument that inspects the passed-in OpError exception and
+        of one argument that inspects the passed-in exception and
         returns True (success) or False (please fail the test). Otherwise, the
         error message is expected to match this regular expression partially.
 
     Returns:
       A context manager to surround code that is expected to raise an
-      errors.OpError exception.
+      exception.
     """
     if callable(expected_err_re_or_predicate):
       predicate = expected_err_re_or_predicate
     else:
       def predicate(e):
-        err_str = e.message
-        op = e.op
+        err_str = e.message if isinstance(e, errors.OpError) else str(e)
+        op = e.op if isinstance(e, errors.OpError) else None
         while op is not None:
           err_str += "\nCaused by: " + op.name
           op = op._original_op
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 7e11f17211b..8532fe3ecf0 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -11,6 +11,7 @@ package(
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "py_tests")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
@@ -32,7 +33,6 @@ py_tests(
         "decode_png_op_test.py",
         "decode_raw_op_test.py",
         "determinant_op_test.py",
-        "diag_op_test.py",
         "edit_distance_op_test.py",
         "fifo_queue_test.py",
         "identity_op_py_test.py",
@@ -71,6 +71,13 @@ py_tests(
     ],
 )
 
+cuda_py_tests(
+    name = "cast_op_test",
+    size = "small",
+    srcs = ["cast_op_test.py"],
+    tags = ["noasan"],
+)
+
 cuda_py_test(
     name = "dense_update_ops_no_tsan_test",
     size = "small",
@@ -78,6 +85,13 @@ cuda_py_test(
     tags = ["notsan"],
 )
 
+tf_py_test(
+    name = "diag_op_test",
+    size = "medium",
+    srcs = ["diag_op_test.py"],
+    shard_count = 2,
+)
+
 py_tests(
     name = "reader_ops_test",
     size = "small",
@@ -87,13 +101,6 @@ py_tests(
     ],
 )
 
-cuda_py_tests(
-    name = "cast_op_test",
-    size = "small",
-    srcs = ["cast_op_test.py"],
-    tags = ["noasan"],
-)
-
 cuda_py_tests(
     name = "kernel_tests",
     size = "small",
@@ -108,14 +115,12 @@ cuda_py_tests(
         "constant_op_test.py",
         "control_flow_ops_py_test.py",
         "conv1d_test.py",
-        "conv2d_backprop_filter_grad_test.py",
         "conv2d_transpose_test.py",
         "conv3d_backprop_filter_v2_grad_test.py",
         "cross_grad_test.py",
         "denormal_test.py",
         "dense_update_ops_test.py",
         "depthtospace_op_test.py",
-        "depthwise_conv_op_test.py",
         "division_past_test.py",
         "dynamic_partition_op_test.py",
         "dynamic_stitch_op_test.py",
@@ -135,7 +140,6 @@ cuda_py_tests(
         "pack_op_test.py",
         "pad_op_test.py",
         "padding_fifo_queue_test.py",
-        "pooling_ops_3d_test.py",
         "py_func_test.py",
         "random_crop_test.py",
         "random_ops_test.py",
@@ -177,10 +181,14 @@ cuda_py_tests(
     name = "medium_kernel_tests",
     size = "medium",
     srcs = [
+        "atrous_conv2d_test.py",
+        "conv2d_backprop_filter_grad_test.py",
         "conv3d_transpose_test.py",
         "conv_ops_test.py",
+        "depthwise_conv_op_test.py",  # http://b/30603882
         "division_future_test.py",
         "fft_ops_test.py",
+        "pooling_ops_3d_test.py",  # http://b/30600785
         "pooling_ops_test.py",
         "random_gamma_test.py",
         "rnn_test.py",
@@ -209,6 +217,7 @@ cuda_py_tests(
         "cwise_ops_test.py",
         "embedding_ops_test.py",
         "linalg_grad_test.py",
+        "svd_op_test.py",
     ],
     shard_count = 50,
     tags = ["notap"],  # b/30226163
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 9d0025eafad..8a6ba3615a1 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -278,12 +278,10 @@ class StridedSliceChecker(object):
     self.x_np = np.array(x)
 
   def __getitem__(self, spec):
-    # TODO(aselle): When NewSliceHelper is installed, we can switch this back
-    # op = self.x[spec]
-    op = array_ops._NewSliceHelper(self.x, spec)
+    op = self.x.__getitem__(spec)
 
     tensor = op.eval()
-    self.test.assertAllEqual(self.x_np[spec], tensor)
+    self.test.assertAllEqual(self.x_np.__getitem__(spec), tensor)
     self.test.assertAllEqual(tensor.shape, op.get_shape())
     return tensor
 
@@ -399,9 +397,7 @@ class StridedSliceShapeChecker(object):
     self.x = x
 
   def __getitem__(self, spec):
-    # TODO(aselle): When NewSliceHelper is installed, we can switch this back
-    # op = self.x[spec]
-    op = array_ops._NewSliceHelper(self.x, spec)
+    op = self.x.__getitem__(spec)
     return op.get_shape()
 
 
@@ -455,8 +451,8 @@ class GradSliceChecker(object):
     self.varnp = varnp
 
   def __getitem__(self, spec):
-    slice_var = array_ops._NewSliceHelper(self.var, spec)
-    slice_val = array_ops._NewSliceHelper(self.val, spec)
+    slice_var = self.var[spec]
+    slice_val = self.val[spec]
 
     # compute analytic 2nd derivative
     analytic_grad2 = 2 * slice_val
@@ -549,7 +545,7 @@ class BenchmarkSlice(object):
     self.tensor = tensor
 
   def __getitem__(self, x):
-    return array_ops._NewSliceHelper(self.tensor, x)
+    return self.tensor[x]
 
 
 class StridedSliceBenchmark(tf.test.Benchmark):
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 00372831df6..159305f78bb 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -716,10 +716,11 @@ class ControlFlowTest(tf.test.TestCase):
   def testWhileWithControl_3(self):
     with self.test_session() as sess:
       b = tf.placeholder(tf.bool)
-      c = tf.constant(0)
+      c = tf.constant(1)
+      x0 = tf.constant(0)
       with tf.control_dependencies([b]):
-        c = tf.while_loop(lambda x: x < 10, lambda x: x + 1, [c])
-      self.assertEqual(10, sess.run(c, {b: True}))
+        r = tf.while_loop(lambda x: x < 10, lambda x: x + c, [x0])
+      self.assertEqual(10, sess.run(r, {b: True}))
 
   def testWhileWithControl_4(self):
     with self.test_session() as sess:
@@ -1245,6 +1246,27 @@ class ControlFlowTest(tf.test.TestCase):
       r = tf.gradients([rx], x)
       self.assertAllClose(64.0, r[0].eval())
 
+  def testWhileGrad_OneOutputWithControlDependencyOnSecond(self):
+    with self.test_session():
+      i = tf.constant(0, name="i")
+      x = tf.constant(1.0, name="x")
+      y = tf.constant(1.0, name="y")
+      c = lambda i, *_: tf.less(i, 1, name="cond_less")
+      def b(i, xi, yi):
+        # return (i + 1, xi, xi + yi)
+        return (tf.add(i, 1, name="inc"),
+                tf.identity(xi, name="xi"),
+                tf.add(xi, yi, name="xi_plus_yi"))
+
+      _, x_f, y_f = tf.while_loop(c, b, [i, x, y])
+      with tf.control_dependencies([x_f]):
+        y_f_d = tf.identity(y_f, name="y_f_d")
+
+      self.assertAllClose(2.0, y_f_d.eval())  # y_f_d = 1.0 + 1.0
+      g = tf.gradients([y_f_d], [x])[0]
+      self.assertTrue(g is not None)
+      self.assertAllClose(1.0, g.eval())  # y_f_d = x + 1.0, dy_f_d/dx = 1.0
+
   def _testNestedWhileGrad_Simple(self, use_gpu):
     with self.test_session(use_gpu=use_gpu):
       v = tf.constant(1.0)
diff --git a/tensorflow/python/kernel_tests/depthtospace_op_test.py b/tensorflow/python/kernel_tests/depthtospace_op_test.py
index cd603932838..3d6ae377fe1 100644
--- a/tensorflow/python/kernel_tests/depthtospace_op_test.py
+++ b/tensorflow/python/kernel_tests/depthtospace_op_test.py
@@ -26,10 +26,9 @@ import tensorflow as tf
 class DepthToSpaceTest(tf.test.TestCase):
 
   def _testOne(self, inputs, block_size, outputs):
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        x_tf = tf.depth_to_space(tf.to_float(inputs), block_size)
-        self.assertAllEqual(x_tf.eval(), outputs)
+    with self.test_session():
+      x_tf = tf.depth_to_space(tf.to_float(inputs), block_size)
+      self.assertAllEqual(x_tf.eval(), outputs)
 
   def testBasic(self):
     x_np = [[[[1, 2, 3, 4]]]]
diff --git a/tensorflow/python/kernel_tests/diag_op_test.py b/tensorflow/python/kernel_tests/diag_op_test.py
index ae521c05f0a..bdc83ea6328 100644
--- a/tensorflow/python/kernel_tests/diag_op_test.py
+++ b/tensorflow/python/kernel_tests/diag_op_test.py
@@ -319,21 +319,21 @@ class DiagTest(tf.test.TestCase):
                   [[5.5 + 5.5j, 6.6 + 6.6j], [7.7 + 7.7j, 8.8 + 8.8j]]],
                   dtype = np.complex64)
     expected_ans = np.array(
-        [[[[[[1.1 + 1.1j, 0 + 0j], [0 + 0j, 0 + 0j]], 
+        [[[[[[1.1 + 1.1j, 0 + 0j], [0 + 0j, 0 + 0j]],
             [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]],
-           [[[0 + 0j, 2.2 + 2.2j], [0 + 0j, 0 + 0j]], 
+           [[[0 + 0j, 2.2 + 2.2j], [0 + 0j, 0 + 0j]],
                [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]]],
-          [[[[0 + 0j, 0 + 0j], [3.3 + 3.3j, 0 + 0j]], 
+          [[[[0 + 0j, 0 + 0j], [3.3 + 3.3j, 0 + 0j]],
               [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]],
-           [[[0 + 0j, 0 + 0j], [0 + 0j, 4.4 + 4.4j]], 
+           [[[0 + 0j, 0 + 0j], [0 + 0j, 4.4 + 4.4j]],
                [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]]]],
-         [[[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], 
+         [[[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
              [[5.5 + 5.5j, 0 + 0j], [0 + 0j, 0 + 0j]]],
-           [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], 
+           [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
                [[0 + 0j, 6.6 + 6.6j], [0 + 0j, 0 + 0j]]]],
-          [[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], 
+          [[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
               [[0 + 0j, 0 + 0j], [7.7 + 7.7j, 0 + 0j]]],
-           [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], 
+           [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
                [[0 + 0j, 0 + 0j], [0 + 0j, 8.8 + 8.8j]]]]]],
            dtype = np.complex64)
     self.diagOp(x, np.complex64, expected_ans)
diff --git a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
index 1edea3f1f25..54433420be6 100644
--- a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
+++ b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
@@ -40,16 +40,15 @@ class ExtractImagePatches(tf.test.TestCase):
     strides = [1] + strides + [1]
     rates = [1] + rates + [1]
 
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        out_tensor = tf.extract_image_patches(
-            tf.constant(image),
-            ksizes=ksizes,
-            strides=strides,
-            rates=rates,
-            padding=padding,
-            name="im2col")
-        self.assertAllClose(patches, out_tensor.eval())
+    with self.test_session():
+      out_tensor = tf.extract_image_patches(
+          tf.constant(image),
+          ksizes=ksizes,
+          strides=strides,
+          rates=rates,
+          padding=padding,
+          name="im2col")
+      self.assertAllClose(patches, out_tensor.eval())
 
   def testKsize1x1Stride1x1Rate1x1(self):
     """Verifies that for 1x1 kernel the output equals the input."""
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index 403d86b8f4c..e73d61d2617 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -367,5 +367,22 @@ class FunctionalOpsTest(tf.test.TestCase):
     y = tf.scan(fn, x, initializer=initializer)
     self.assertIs(None, y.get_shape().dims)
 
+  def testScanVaryingShape(self):
+    with self.test_session() as sess:
+      x = tf.placeholder(dtype=tf.float32, shape=[None, 2])
+      x_t = tf.transpose(x)
+      # scan over dimension 0 (with shape None)
+      result = tf.scan(lambda a, x: a + x, x)
+      # scanned over transposed dimension 0 (with shape 2)
+      result_t = tf.scan(lambda a, x: a + x, x_t, infer_shape=False)
+      # ensure gradients can be calculated
+      result_grad = tf.gradients(result, [x])[0]
+      result_t_grad = tf.gradients(result_t, [x_t])[0]
+
+      # smoke test to ensure they all evaluate
+      sess.run([result, result_t, result_grad, result_t_grad],
+               feed_dict={x: [[1.0, 2.0]]})
+
+
 if __name__ == "__main__":
   tf.test.main()
diff --git a/tensorflow/python/kernel_tests/pad_op_test.py b/tensorflow/python/kernel_tests/pad_op_test.py
index b88ba668f82..5b94583a8aa 100644
--- a/tensorflow/python/kernel_tests/pad_op_test.py
+++ b/tensorflow/python/kernel_tests/pad_op_test.py
@@ -61,9 +61,9 @@ class PadOpTest(tf.test.TestCase):
             [[1, 1], [1, 2]],
             mode="symmetric"))
 
-  def _testPad(self, np_inputs, paddings, mode, use_gpu=False):
+  def _testPad(self, np_inputs, paddings, mode):
     np_val = self._npPad(np_inputs, paddings, mode=mode)
-    with self.test_session(use_gpu=use_gpu):
+    with self.test_session():
       tf_val = tf.pad(np_inputs, paddings, mode=mode)
       out = tf_val.eval()
     self.assertAllEqual(np_val, out)
@@ -86,8 +86,8 @@ class PadOpTest(tf.test.TestCase):
 
   def _testAll(self, np_inputs, paddings):
     for mode in ("CONSTANT", "REFLECT", "SYMMETRIC"):
-      self._testPad(np_inputs, paddings, mode=mode, use_gpu=False)
-      self._testPad(np_inputs, paddings, mode=mode, use_gpu=True)
+      self._testPad(np_inputs, paddings, mode=mode)
+      self._testPad(np_inputs, paddings, mode=mode)
       if np_inputs.dtype == np.float32:
         self._testGradient(np_inputs, paddings, mode=mode)
 
@@ -189,12 +189,11 @@ class PadOpTest(tf.test.TestCase):
   def testScalars(self):
     paddings = np.zeros((0, 2), dtype=np.int32)
     inp = np.asarray(7)
-    for use_gpu in False, True:
-      with self.test_session(use_gpu=use_gpu):
-        tf_val = tf.pad(inp, paddings)
-        out = tf_val.eval()
-      self.assertAllEqual(inp, out)
-      self.assertShapeEqual(inp, tf_val)
+    with self.test_session():
+      tf_val = tf.pad(inp, paddings)
+      out = tf_val.eval()
+    self.assertAllEqual(inp, out)
+    self.assertShapeEqual(inp, tf_val)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/parsing_ops_test.py b/tensorflow/python/kernel_tests/parsing_ops_test.py
index 58f6da9f976..52d3c0dde1a 100644
--- a/tensorflow/python/kernel_tests/parsing_ops_test.py
+++ b/tensorflow/python/kernel_tests/parsing_ops_test.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Tests for tensorflow.ops.parsing_ops."""
 
 from __future__ import absolute_import
@@ -46,13 +45,13 @@ def flatten(list_of_lists):
 
 def flatten_values_tensors_or_sparse(tensors_list):
   """Flatten each SparseTensor object into 3 Tensors for session.run()."""
-  return list(flatten([[v.indices, v.values, v.shape]
-                       if isinstance(v, tf.SparseTensor) else [v]
-                       for v in tensors_list]))
+  return list(
+      flatten([[v.indices, v.values, v.shape] if isinstance(v, tf.SparseTensor)
+               else [v] for v in tensors_list]))
 
 
-def _compare_output_to_expected(
-    tester, dict_tensors, expected_tensors, flat_output):
+def _compare_output_to_expected(tester, dict_tensors, expected_tensors,
+                                flat_output):
   tester.assertEqual(set(dict_tensors.keys()), set(expected_tensors.keys()))
 
   i = 0  # Index into the flattened output of session.run()
@@ -74,11 +73,11 @@ def _compare_output_to_expected(
 
 class ParseExampleTest(tf.test.TestCase):
 
-  def _test(
-      self, kwargs, expected_values=None, expected_err=None):
+  def _test(self, kwargs, expected_values=None, expected_err=None):
     with self.test_session() as sess:
       if expected_err:
-        with self.assertRaisesRegexp(expected_err[0], expected_err[1]):
+        with self.assertRaisesWithPredicateMatch(
+            expected_err[0], expected_err[1]):
           out = tf.parse_example(**kwargs)
           sess.run(flatten_values_tensors_or_sparse(out.values()))
       else:
@@ -92,9 +91,8 @@ class ParseExampleTest(tf.test.TestCase):
       # Check shapes; if serialized is a Tensor we need its size to
       # properly check.
       serialized = kwargs["serialized"]
-      batch_size = (
-          serialized.eval().size if isinstance(serialized, tf.Tensor)
-          else np.asarray(serialized).size)
+      batch_size = (serialized.eval().size if isinstance(serialized, tf.Tensor)
+                    else np.asarray(serialized).size)
       for k, f in kwargs["features"].items():
         if isinstance(f, tf.FixedLenFeature) and f.shape is not None:
           self.assertEqual(
@@ -115,9 +113,12 @@ class ParseExampleTest(tf.test.TestCase):
     c_default = np.random.rand(2).astype(np.float32)
 
     expected_st_a = (  # indices, values, shape
-        np.empty((0, 2), dtype=np.int64),  # indices
-        np.empty((0,), dtype=np.int64),  # sp_a is DT_INT64
-        np.array([2, 0], dtype=np.int64))  # batch == 2, max_elems = 0
+        np.empty(
+            (0, 2), dtype=np.int64),  # indices
+        np.empty(
+            (0,), dtype=np.int64),  # sp_a is DT_INT64
+        np.array(
+            [2, 0], dtype=np.int64))  # batch == 2, max_elems = 0
 
     expected_output = {
         sparse_name: expected_st_a,
@@ -126,38 +127,63 @@ class ParseExampleTest(tf.test.TestCase):
         c_name: np.array(2 * [c_default]),
     }
 
-    self._test({
-        "example_names": np.empty((0,), dtype=bytes),
-        "serialized": tf.convert_to_tensor(["", ""]),
-        "features": {
-            sparse_name: tf.VarLenFeature(tf.int64),
-            a_name: tf.FixedLenFeature((1, 3), tf.int64, default_value=a_default),
-            b_name: tf.FixedLenFeature((3, 3), tf.string, default_value=b_default),
-            c_name: tf.FixedLenFeature((2,), tf.float32, default_value=c_default),
-        }
-    }, expected_output)
+    self._test(
+        {
+            "example_names": np.empty(
+                (0,), dtype=bytes),
+            "serialized": tf.convert_to_tensor(["", ""]),
+            "features": {
+                sparse_name: tf.VarLenFeature(tf.int64),
+                a_name: tf.FixedLenFeature(
+                    (1, 3), tf.int64, default_value=a_default),
+                b_name: tf.FixedLenFeature(
+                    (3, 3), tf.string, default_value=b_default),
+                c_name: tf.FixedLenFeature(
+                    (2,), tf.float32, default_value=c_default),
+            }
+        },
+        expected_output)
 
   def testEmptySerializedWithoutDefaultsShouldFail(self):
-    self._test({
-        "example_names": ["in1", "in2"],
-        "serialized": ["", ""],
-        "features": {
-            "st_a": tf.VarLenFeature(tf.int64),
-            "a": tf.FixedLenFeature((1, 3), tf.int64, default_value=[0, 42, 0]),
-            "b": tf.FixedLenFeature(
-                (3, 3), tf.string,
-                default_value=np.random.rand(3, 3).astype(bytes)),
-            # Feature "c" is missing a default, this gap will cause failure.
-            "c": tf.FixedLenFeature((2,), dtype=tf.float32),
-        }
-    }, expected_err=(tf.OpError, "Name: in1, Feature: c is required"))
+    input_features = {
+        "st_a": tf.VarLenFeature(tf.int64),
+        "a": tf.FixedLenFeature(
+            (1, 3), tf.int64, default_value=[0, 42, 0]),
+        "b": tf.FixedLenFeature(
+            (3, 3),
+            tf.string,
+            default_value=np.random.rand(3, 3).astype(bytes)),
+        # Feature "c" is missing a default, this gap will cause failure.
+        "c": tf.FixedLenFeature(
+            (2,), dtype=tf.float32),
+    }
+
+    # Edge case where the key is there but the feature value is empty
+    original = example(features=features({
+        "c": feature()
+    }))
+    self._test(
+        {
+            "example_names": ["in1"],
+            "serialized": [original.SerializeToString()],
+            "features": input_features,
+        },
+        expected_err=(tf.OpError, "Name: in1, Feature: c is required"))
+
+    # Standard case of missing key and value.
+    self._test(
+        {
+            "example_names": ["in1", "in2"],
+            "serialized": ["", ""],
+            "features": input_features,
+        },
+        expected_err=(tf.OpError, "Name: in1, Feature: c is required"))
 
   def testDenseNotMatchingShapeShouldFail(self):
     original = [
         example(features=features({
             "a": float_feature([1, 1, 3]),
-        })),
-        example(features=features({
+        })), example(features=features({
             "a": float_feature([-1, -1]),
         }))
     ]
@@ -165,27 +191,27 @@ class ParseExampleTest(tf.test.TestCase):
     names = ["passing", "failing"]
     serialized = [m.SerializeToString() for m in original]
 
-    self._test({
-        "example_names": names,
-        "serialized": tf.convert_to_tensor(serialized),
-        "features": {"a": tf.FixedLenFeature((1, 3), tf.float32)}
-    }, expected_err=(
-        tf.OpError, "Name: failing, Key: a, Index: 1.  Number of float val"))
+    self._test(
+        {
+            "example_names": names,
+            "serialized": tf.convert_to_tensor(serialized),
+            "features": {"a": tf.FixedLenFeature((1, 3), tf.float32)}
+        },
+        expected_err=(tf.OpError,
+                      "Name: failing, Key: a, Index: 1.  Number of float val"))
 
   def testDenseDefaultNoShapeShouldFail(self):
-    original = [
-        example(features=features({
-            "a": float_feature([1, 1, 3]),
-        })),
-    ]
+    original = [example(features=features({"a": float_feature([1, 1, 3]),})),]
 
     serialized = [m.SerializeToString() for m in original]
 
-    self._test({
-        "example_names": ["failing"],
-        "serialized": tf.convert_to_tensor(serialized),
-        "features": {"a": tf.FixedLenFeature(None, tf.float32)}
-    }, expected_err=(ValueError, "Missing shape for feature a"))
+    self._test(
+        {
+            "example_names": ["failing"],
+            "serialized": tf.convert_to_tensor(serialized),
+            "features": {"a": tf.FixedLenFeature(None, tf.float32)}
+        },
+        expected_err=(ValueError, "Missing shape for feature a"))
 
   def testSerializedContainingSparse(self):
     original = [
@@ -207,14 +233,16 @@ class ParseExampleTest(tf.test.TestCase):
     serialized = [m.SerializeToString() for m in original]
 
     expected_st_c = (  # indices, values, shape
-        np.array([[0, 0], [0, 1], [3, 0], [3, 1], [3, 2]], dtype=np.int64),
-        np.array([3.0, 4.0, 1.0, 2.0, -1.0], dtype=np.float32),
-        np.array([4, 3], dtype=np.int64))  # batch == 2, max_elems = 3
+        np.array(
+            [[0, 0], [0, 1], [3, 0], [3, 1], [3, 2]], dtype=np.int64), np.array(
+                [3.0, 4.0, 1.0, 2.0, -1.0], dtype=np.float32), np.array(
+                    [4, 3], dtype=np.int64))  # batch == 2, max_elems = 3
 
     expected_st_d = (  # indices, values, shape
-        np.array([[3, 0]], dtype=np.int64),
-        np.array(["hi"], dtype=bytes),
-        np.array([4, 1], dtype=np.int64))  # batch == 2, max_elems = 1
+        np.array(
+            [[3, 0]], dtype=np.int64), np.array(
+                ["hi"], dtype=bytes), np.array(
+                    [4, 1], dtype=np.int64))  # batch == 2, max_elems = 1
 
     expected_output = {
         "st_c": expected_st_c,
@@ -236,8 +264,7 @@ class ParseExampleTest(tf.test.TestCase):
         example(features=features({
             aname: float_feature([1, 1]),
             bname: bytes_feature([b"b0_str"]),
-        })),
-        example(features=features({
+        })), example(features=features({
             aname: float_feature([-1, -1]),
             bname: bytes_feature([b"b1"]),
         }))
@@ -248,24 +275,28 @@ class ParseExampleTest(tf.test.TestCase):
     expected_output = {
         aname: np.array(
             [[1, 1], [-1, -1]], dtype=np.float32).reshape(2, 1, 2, 1),
-        bname: np.array(["b0_str", "b1"], dtype=bytes).reshape(2, 1, 1, 1, 1),
+        bname: np.array(
+            ["b0_str", "b1"], dtype=bytes).reshape(2, 1, 1, 1, 1),
     }
 
     # No defaults, values required
-    self._test({
-        "serialized": tf.convert_to_tensor(serialized),
-        "features": {
-            aname: tf.FixedLenFeature((1, 2, 1), dtype=tf.float32),
-            bname: tf.FixedLenFeature((1, 1, 1, 1), dtype=tf.string),
-        }
-    }, expected_output)
+    self._test(
+        {
+            "serialized": tf.convert_to_tensor(serialized),
+            "features": {
+                aname: tf.FixedLenFeature(
+                    (1, 2, 1), dtype=tf.float32),
+                bname: tf.FixedLenFeature(
+                    (1, 1, 1, 1), dtype=tf.string),
+            }
+        },
+        expected_output)
 
   def testSerializedContainingDenseScalar(self):
     original = [
         example(features=features({
             "a": float_feature([1]),
-        })),
-        example(features=features({}))
+        })), example(features=features({}))
     ]
 
     serialized = [m.SerializeToString() for m in original]
@@ -274,12 +305,15 @@ class ParseExampleTest(tf.test.TestCase):
         "a": np.array([[1], [-1]], dtype=np.float32)  # 2x1 (column vector)
     }
 
-    self._test({
-        "serialized": tf.convert_to_tensor(serialized),
-        "features": {
-            "a": tf.FixedLenFeature((1,), dtype=tf.float32, default_value=-1),
-        }
-    }, expected_output)
+    self._test(
+        {
+            "serialized": tf.convert_to_tensor(serialized),
+            "features": {
+                "a": tf.FixedLenFeature(
+                    (1,), dtype=tf.float32, default_value=-1),
+            }
+        },
+        expected_output)
 
   def testSerializedContainingDenseWithDefaults(self):
     original = [
@@ -288,37 +322,46 @@ class ParseExampleTest(tf.test.TestCase):
         })),
         example(features=features({
             "b": bytes_feature([b"b1"]),
-        }))
+        })),
+        example(features=features({
+            "b": feature()
+        })),
     ]
 
     serialized = [m.SerializeToString() for m in original]
 
     expected_output = {
-        "a": np.array([[1, 1], [3, -3]], dtype=np.float32).reshape(2, 1, 2, 1),
-        "b": np.array(["tmp_str", "b1"], dtype=bytes).reshape(2, 1, 1, 1, 1),
+        "a": np.array(
+            [[1, 1], [3, -3], [3, -3]], dtype=np.float32).reshape(3, 1, 2, 1),
+        "b": np.array(
+            ["tmp_str", "b1", "tmp_str"], dtype=bytes).reshape(3, 1, 1, 1, 1),
     }
 
-    self._test({
-        "serialized": tf.convert_to_tensor(serialized),
-        "features": {
-            "a": tf.FixedLenFeature(
-                (1, 2, 1), dtype=tf.float32, default_value=[3.0, -3.0]),
-            "b": tf.FixedLenFeature(
-                (1, 1, 1, 1), dtype=tf.string, default_value="tmp_str"),
-        }
-    }, expected_output)
+    self._test(
+        {
+            "serialized": tf.convert_to_tensor(serialized),
+            "features": {
+                "a": tf.FixedLenFeature(
+                    (1, 2, 1), dtype=tf.float32, default_value=[3.0, -3.0]),
+                "b": tf.FixedLenFeature(
+                    (1, 1, 1, 1), dtype=tf.string, default_value="tmp_str"),
+            }
+        },
+        expected_output)
 
   def testSerializedContainingSparseAndDenseWithNoDefault(self):
     expected_st_a = (  # indices, values, shape
-        np.empty((0, 2), dtype=np.int64),  # indices
-        np.empty((0,), dtype=np.int64),  # sp_a is DT_INT64
-        np.array([2, 0], dtype=np.int64))  # batch == 2, max_elems = 0
+        np.empty(
+            (0, 2), dtype=np.int64),  # indices
+        np.empty(
+            (0,), dtype=np.int64),  # sp_a is DT_INT64
+        np.array(
+            [2, 0], dtype=np.int64))  # batch == 2, max_elems = 0
 
     original = [
         example(features=features({
             "c": float_feature([3, 4])
-        })),
-        example(features=features({
+        })), example(features=features({
             "c": float_feature([1, 2])
         }))
     ]
@@ -332,20 +375,25 @@ class ParseExampleTest(tf.test.TestCase):
         "st_a": expected_st_a,
         "a": np.array(2 * [[a_default]]),
         "b": np.array(2 * [b_default]),
-        "c": np.array([[3, 4], [1, 2]], dtype=np.float32),
+        "c": np.array(
+            [[3, 4], [1, 2]], dtype=np.float32),
     }
 
-    self._test({
-        "example_names": names,
-        "serialized": tf.convert_to_tensor(serialized),
-        "features": {
-            "st_a": tf.VarLenFeature(tf.int64),
-            "a": tf.FixedLenFeature((1, 3), tf.int64, default_value=a_default),
-            "b": tf.FixedLenFeature((3, 3), tf.string, default_value=b_default),
-            # Feature "c" must be provided, since it has no default_value.
-            "c": tf.FixedLenFeature((2,), tf.float32),
-        }
-    }, expected_output)
+    self._test(
+        {
+            "example_names": names,
+            "serialized": tf.convert_to_tensor(serialized),
+            "features": {
+                "st_a": tf.VarLenFeature(tf.int64),
+                "a": tf.FixedLenFeature(
+                    (1, 3), tf.int64, default_value=a_default),
+                "b": tf.FixedLenFeature(
+                    (3, 3), tf.string, default_value=b_default),
+                # Feature "c" must be provided, since it has no default_value.
+                "c": tf.FixedLenFeature((2,), tf.float32),
+            }
+        },
+        expected_output)
 
 
 class ParseSingleExampleTest(tf.test.TestCase):
@@ -353,7 +401,8 @@ class ParseSingleExampleTest(tf.test.TestCase):
   def _test(self, kwargs, expected_values=None, expected_err=None):
     with self.test_session() as sess:
       if expected_err:
-        with self.assertRaisesRegexp(expected_err[0], expected_err[1]):
+        with self.assertRaisesWithPredicateMatch(
+            expected_err[0], expected_err[1]):
           out = tf.parse_single_example(**kwargs)
           sess.run(flatten_values_tensors_or_sparse(out.values()))
       else:
@@ -374,16 +423,17 @@ class ParseSingleExampleTest(tf.test.TestCase):
           self.assertEqual(tuple(out[k].shape.get_shape().as_list()), (1,))
 
   def testSingleExampleWithSparseAndDense(self):
-    original = example(features=features(
-        {"c": float_feature([3, 4]),
-         "st_a": float_feature([3.0, 4.0])}))
+    original = example(features=features({"c": float_feature([3, 4]),
+                                          "st_a": float_feature([3.0, 4.0])}))
 
     serialized = original.SerializeToString()
 
-    expected_st_a = (
-        np.array([[0], [1]], dtype=np.int64),  # indices
-        np.array([3.0, 4.0], dtype=np.float32),  # values
-        np.array([2], dtype=np.int64))  # shape: max_values = 2
+    expected_st_a = (np.array(
+        [[0], [1]], dtype=np.int64),  # indices
+                     np.array(
+                         [3.0, 4.0], dtype=np.float32),  # values
+                     np.array(
+                         [2], dtype=np.int64))  # shape: max_values = 2
 
     a_default = [1, 2, 3]
     b_default = np.random.rand(3, 3).astype(bytes)
@@ -391,20 +441,25 @@ class ParseSingleExampleTest(tf.test.TestCase):
         "st_a": expected_st_a,
         "a": [a_default],
         "b": b_default,
-        "c": np.array([3, 4], dtype=np.float32),
+        "c": np.array(
+            [3, 4], dtype=np.float32),
     }
 
-    self._test({
-        "example_names": tf.convert_to_tensor("in1"),
-        "serialized": tf.convert_to_tensor(serialized),
-        "features": {
-            "st_a": tf.VarLenFeature(tf.float32),
-            "a": tf.FixedLenFeature((1, 3), tf.int64, default_value=a_default),
-            "b": tf.FixedLenFeature((3, 3), tf.string, default_value=b_default),
-            # Feature "c" must be provided, since it has no default_value.
-            "c": tf.FixedLenFeature((2,), tf.float32),
-        }
-    }, expected_output)
+    self._test(
+        {
+            "example_names": tf.convert_to_tensor("in1"),
+            "serialized": tf.convert_to_tensor(serialized),
+            "features": {
+                "st_a": tf.VarLenFeature(tf.float32),
+                "a": tf.FixedLenFeature(
+                    (1, 3), tf.int64, default_value=a_default),
+                "b": tf.FixedLenFeature(
+                    (3, 3), tf.string, default_value=b_default),
+                # Feature "c" must be provided, since it has no default_value.
+                "c": tf.FixedLenFeature((2,), tf.float32),
+            }
+        },
+        expected_output)
 
 
 class ParseSequenceExampleTest(tf.test.TestCase):
@@ -413,26 +468,31 @@ class ParseSequenceExampleTest(tf.test.TestCase):
     value = sequence_example(
         context=features({
             "global_feature": float_feature([1, 2, 3]),
-            }),
+        }),
         feature_lists=feature_lists({
             "repeated_feature_2_frames": feature_list([
                 bytes_feature([b"a", b"b", b"c"]),
-                bytes_feature([b"a", b"d", b"e"])]),
+                bytes_feature([b"a", b"d", b"e"])
+            ]),
             "repeated_feature_3_frames": feature_list([
-                int64_feature([3, 4, 5, 6, 7]),
-                int64_feature([-1, 0, 0, 0, 0]),
-                int64_feature([1, 2, 3, 4, 5])])
-            }))
+                int64_feature([3, 4, 5, 6, 7]), int64_feature([-1, 0, 0, 0, 0]),
+                int64_feature([1, 2, 3, 4, 5])
+            ])
+        }))
     value.SerializeToString()  # Smoke test
 
-  def _test(self, kwargs, expected_context_values=None,
-            expected_feat_list_values=None, expected_err=None):
+  def _test(self,
+            kwargs,
+            expected_context_values=None,
+            expected_feat_list_values=None,
+            expected_err=None):
     expected_context_values = expected_context_values or {}
     expected_feat_list_values = expected_feat_list_values or {}
 
     with self.test_session() as sess:
       if expected_err:
-        with self.assertRaisesRegexp(expected_err[0], expected_err[1]):
+        with self.assertRaisesWithPredicateMatch(
+            expected_err[0], expected_err[1]):
           c_out, fl_out = tf.parse_single_sequence_example(**kwargs)
           if c_out:
             sess.run(flatten_values_tensors_or_sparse(c_out.values()))
@@ -442,16 +502,16 @@ class ParseSequenceExampleTest(tf.test.TestCase):
         # Returns dicts w/ Tensors and SparseTensors.
         context_out, feat_list_out = tf.parse_single_sequence_example(**kwargs)
         context_result = sess.run(
-            flatten_values_tensors_or_sparse(
-                context_out.values())) if context_out else []
+            flatten_values_tensors_or_sparse(context_out.values(
+            ))) if context_out else []
         feat_list_result = sess.run(
-            flatten_values_tensors_or_sparse(
-                feat_list_out.values())) if feat_list_out else []
+            flatten_values_tensors_or_sparse(feat_list_out.values(
+            ))) if feat_list_out else []
         # Check values.
-        _compare_output_to_expected(
-            self, context_out, expected_context_values, context_result)
-        _compare_output_to_expected(
-            self, feat_list_out, expected_feat_list_values, feat_list_result)
+        _compare_output_to_expected(self, context_out, expected_context_values,
+                                    context_result)
+        _compare_output_to_expected(self, feat_list_out,
+                                    expected_feat_list_values, feat_list_result)
 
       # Check shapes; if serialized is a Tensor we need its size to
       # properly check.
@@ -469,16 +529,18 @@ class ParseSequenceExampleTest(tf.test.TestCase):
                 tuple(context_out[k].shape.get_shape().as_list()), (1,))
 
   def testSequenceExampleWithSparseAndDenseContext(self):
-    original = sequence_example(context=features(
-        {"c": float_feature([3, 4]),
-         "st_a": float_feature([3.0, 4.0])}))
+    original = sequence_example(context=features({"c": float_feature([3, 4]),
+                                                  "st_a": float_feature(
+                                                      [3.0, 4.0])}))
 
     serialized = original.SerializeToString()
 
-    expected_st_a = (
-        np.array([[0], [1]], dtype=np.int64),  # indices
-        np.array([3.0, 4.0], dtype=np.float32),  # values
-        np.array([2], dtype=np.int64))  # shape: num_features = 2
+    expected_st_a = (np.array(
+        [[0], [1]], dtype=np.int64),  # indices
+                     np.array(
+                         [3.0, 4.0], dtype=np.float32),  # values
+                     np.array(
+                         [2], dtype=np.int64))  # shape: num_features = 2
 
     a_default = [1, 2, 3]
     b_default = np.random.rand(3, 3).astype(bytes)
@@ -486,20 +548,25 @@ class ParseSequenceExampleTest(tf.test.TestCase):
         "st_a": expected_st_a,
         "a": [a_default],
         "b": b_default,
-        "c": np.array([3, 4], dtype=np.float32),
+        "c": np.array(
+            [3, 4], dtype=np.float32),
     }
 
-    self._test({
-        "example_name": "in1",
-        "serialized": tf.convert_to_tensor(serialized),
-        "context_features": {
-            "st_a": tf.VarLenFeature(tf.float32),
-            "a": tf.FixedLenFeature((1, 3), tf.int64, default_value=a_default),
-            "b": tf.FixedLenFeature((3, 3), tf.string, default_value=b_default),
-            # Feature "c" must be provided, since it has no default_value.
-            "c": tf.FixedLenFeature((2,), tf.float32),
-        }
-    }, expected_context_values=expected_context_output)
+    self._test(
+        {
+            "example_name": "in1",
+            "serialized": tf.convert_to_tensor(serialized),
+            "context_features": {
+                "st_a": tf.VarLenFeature(tf.float32),
+                "a": tf.FixedLenFeature(
+                    (1, 3), tf.int64, default_value=a_default),
+                "b": tf.FixedLenFeature(
+                    (3, 3), tf.string, default_value=b_default),
+                # Feature "c" must be provided, since it has no default_value.
+                "c": tf.FixedLenFeature((2,), tf.float32),
+            }
+        },
+        expected_context_values=expected_context_output)
 
   def testSequenceExampleWithMultipleSizeFeatureLists(self):
     original = sequence_example(feature_lists=feature_lists({
@@ -507,229 +574,274 @@ class ParseSequenceExampleTest(tf.test.TestCase):
             int64_feature([-1, 0, 1]),
             int64_feature([2, 3, 4]),
             int64_feature([5, 6, 7]),
-            int64_feature([8, 9, 10]),]),
+            int64_feature([8, 9, 10]),
+        ]),
         "b": feature_list([
-            bytes_feature([b"r00", b"r01", b"r10", b"r11"])]),
+            bytes_feature([b"r00", b"r01", b"r10", b"r11"])
+        ]),
         "c": feature_list([
-            float_feature([3, 4]),
-            float_feature([-1, 2])]),
-        }))
+            float_feature([3, 4]), float_feature([-1, 2])
+        ]),
+    }))
 
     serialized = original.SerializeToString()
 
     expected_feature_list_output = {
-        "a": np.array([  # outer dimension is time.
-            [[-1, 0, 1]],  # inside are 1x3 matrices
-            [[2, 3, 4]],
-            [[5, 6, 7]],
-            [[8, 9, 10]]], dtype=np.int64),
-        "b": np.array([  # outer dimension is time, inside are 2x2 matrices
-            [[b"r00", b"r01"], [b"r10", b"r11"]]], dtype=bytes),
-        "c": np.array([  # outer dimension is time, inside are 2-vectors
-            [3, 4],
-            [-1, 2]], dtype=np.float32),
-        "d": np.empty(shape=(0, 5), dtype=np.float32),  # empty_allowed_missing
-        }
+        "a": np.array(
+            [  # outer dimension is time.
+                [[-1, 0, 1]],  # inside are 1x3 matrices
+                [[2, 3, 4]],
+                [[5, 6, 7]],
+                [[8, 9, 10]]
+            ],
+            dtype=np.int64),
+        "b": np.array(
+            [  # outer dimension is time, inside are 2x2 matrices
+                [[b"r00", b"r01"], [b"r10", b"r11"]]
+            ],
+            dtype=bytes),
+        "c": np.array(
+            [  # outer dimension is time, inside are 2-vectors
+                [3, 4], [-1, 2]
+            ],
+            dtype=np.float32),
+        "d": np.empty(
+            shape=(0, 5), dtype=np.float32),  # empty_allowed_missing
+    }
 
-    self._test({
-        "example_name": "in1",
-        "serialized": tf.convert_to_tensor(serialized),
-        "sequence_features": {
-            "a": tf.FixedLenSequenceFeature((1, 3), tf.int64),
-            "b": tf.FixedLenSequenceFeature((2, 2), tf.string),
-            "c": tf.FixedLenSequenceFeature((2,), tf.float32),
-            "d": tf.FixedLenSequenceFeature((5,), tf.float32, allow_missing=True),
-        }
-    }, expected_feat_list_values=expected_feature_list_output)
+    self._test(
+        {
+            "example_name": "in1",
+            "serialized": tf.convert_to_tensor(serialized),
+            "sequence_features": {
+                "a": tf.FixedLenSequenceFeature((1, 3), tf.int64),
+                "b": tf.FixedLenSequenceFeature((2, 2), tf.string),
+                "c": tf.FixedLenSequenceFeature((2,), tf.float32),
+                "d": tf.FixedLenSequenceFeature(
+                    (5,), tf.float32, allow_missing=True),
+            }
+        },
+        expected_feat_list_values=expected_feature_list_output)
 
   def testSequenceExampleWithoutDebugName(self):
     original = sequence_example(feature_lists=feature_lists({
         "a": feature_list([
-            int64_feature([3, 4]),
-            int64_feature([1, 0])]),
+            int64_feature([3, 4]), int64_feature([1, 0])
+        ]),
         "st_a": feature_list([
-            float_feature([3.0, 4.0]),
-            float_feature([5.0]),
-            float_feature([])]),
+            float_feature([3.0, 4.0]), float_feature([5.0]), float_feature([])
+        ]),
         "st_b": feature_list([
-            bytes_feature([b"a"]),
-            bytes_feature([]),
-            bytes_feature([]),
-            bytes_feature([b"b", b"c"])])}))
+            bytes_feature([b"a"]), bytes_feature([]), bytes_feature([]),
+            bytes_feature([b"b", b"c"])
+        ])
+    }))
 
     serialized = original.SerializeToString()
 
     expected_st_a = (
-        np.array([[0, 0], [0, 1], [1, 0]], dtype=np.int64),  # indices
-        np.array([3.0, 4.0, 5.0], dtype=np.float32),  # values
-        np.array([3, 2], dtype=np.int64))  # shape: num_time = 3, max_feat = 2
+        np.array(
+            [[0, 0], [0, 1], [1, 0]], dtype=np.int64),  # indices
+        np.array(
+            [3.0, 4.0, 5.0], dtype=np.float32),  # values
+        np.array(
+            [3, 2], dtype=np.int64))  # shape: num_time = 3, max_feat = 2
 
     expected_st_b = (
-        np.array([[0, 0], [3, 0], [3, 1]], dtype=np.int64),  # indices
-        np.array(["a", "b", "c"], dtype="|S"),  # values
-        np.array([4, 2], dtype=np.int64))  # shape: num_time = 4, max_feat = 2
+        np.array(
+            [[0, 0], [3, 0], [3, 1]], dtype=np.int64),  # indices
+        np.array(
+            ["a", "b", "c"], dtype="|S"),  # values
+        np.array(
+            [4, 2], dtype=np.int64))  # shape: num_time = 4, max_feat = 2
 
     expected_st_c = (
-        np.empty((0, 2), dtype=np.int64),  # indices
-        np.empty((0,), dtype=np.int64),  # values
-        np.array([0, 0], dtype=np.int64))  # shape: num_time = 0, max_feat = 0
+        np.empty(
+            (0, 2), dtype=np.int64),  # indices
+        np.empty(
+            (0,), dtype=np.int64),  # values
+        np.array(
+            [0, 0], dtype=np.int64))  # shape: num_time = 0, max_feat = 0
 
     expected_feature_list_output = {
-        "a": np.array([[3, 4], [1, 0]], dtype=np.int64),
+        "a": np.array(
+            [[3, 4], [1, 0]], dtype=np.int64),
         "st_a": expected_st_a,
         "st_b": expected_st_b,
         "st_c": expected_st_c,
     }
 
-    self._test({
-        "serialized": tf.convert_to_tensor(serialized),
-        "sequence_features": {
-            "st_a": tf.VarLenFeature(tf.float32),
-            "st_b": tf.VarLenFeature(tf.string),
-            "st_c": tf.VarLenFeature(tf.int64),
-            "a": tf.FixedLenSequenceFeature((2,), tf.int64),
-        }
-    }, expected_feat_list_values=expected_feature_list_output)
+    self._test(
+        {
+            "serialized": tf.convert_to_tensor(serialized),
+            "sequence_features": {
+                "st_a": tf.VarLenFeature(tf.float32),
+                "st_b": tf.VarLenFeature(tf.string),
+                "st_c": tf.VarLenFeature(tf.int64),
+                "a": tf.FixedLenSequenceFeature((2,), tf.int64),
+            }
+        },
+        expected_feat_list_values=expected_feature_list_output)
 
   def testSequenceExampleWithSparseAndDenseFeatureLists(self):
     original = sequence_example(feature_lists=feature_lists({
         "a": feature_list([
-            int64_feature([3, 4]),
-            int64_feature([1, 0])]),
+            int64_feature([3, 4]), int64_feature([1, 0])
+        ]),
         "st_a": feature_list([
-            float_feature([3.0, 4.0]),
-            float_feature([5.0]),
-            float_feature([])]),
+            float_feature([3.0, 4.0]), float_feature([5.0]), float_feature([])
+        ]),
         "st_b": feature_list([
-            bytes_feature([b"a"]),
-            bytes_feature([]),
-            bytes_feature([]),
-            bytes_feature([b"b", b"c"])])}))
+            bytes_feature([b"a"]), bytes_feature([]), bytes_feature([]),
+            bytes_feature([b"b", b"c"])
+        ])
+    }))
 
     serialized = original.SerializeToString()
 
     expected_st_a = (
-        np.array([[0, 0], [0, 1], [1, 0]], dtype=np.int64),  # indices
-        np.array([3.0, 4.0, 5.0], dtype=np.float32),  # values
-        np.array([3, 2], dtype=np.int64))  # shape: num_time = 3, max_feat = 2
+        np.array(
+            [[0, 0], [0, 1], [1, 0]], dtype=np.int64),  # indices
+        np.array(
+            [3.0, 4.0, 5.0], dtype=np.float32),  # values
+        np.array(
+            [3, 2], dtype=np.int64))  # shape: num_time = 3, max_feat = 2
 
     expected_st_b = (
-        np.array([[0, 0], [3, 0], [3, 1]], dtype=np.int64),  # indices
-        np.array(["a", "b", "c"], dtype="|S"),  # values
-        np.array([4, 2], dtype=np.int64))  # shape: num_time = 4, max_feat = 2
+        np.array(
+            [[0, 0], [3, 0], [3, 1]], dtype=np.int64),  # indices
+        np.array(
+            ["a", "b", "c"], dtype="|S"),  # values
+        np.array(
+            [4, 2], dtype=np.int64))  # shape: num_time = 4, max_feat = 2
 
     expected_st_c = (
-        np.empty((0, 2), dtype=np.int64),  # indices
-        np.empty((0,), dtype=np.int64),  # values
-        np.array([0, 0], dtype=np.int64))  # shape: num_time = 0, max_feat = 0
+        np.empty(
+            (0, 2), dtype=np.int64),  # indices
+        np.empty(
+            (0,), dtype=np.int64),  # values
+        np.array(
+            [0, 0], dtype=np.int64))  # shape: num_time = 0, max_feat = 0
 
     expected_feature_list_output = {
-        "a": np.array([[3, 4], [1, 0]], dtype=np.int64),
+        "a": np.array(
+            [[3, 4], [1, 0]], dtype=np.int64),
         "st_a": expected_st_a,
         "st_b": expected_st_b,
         "st_c": expected_st_c,
     }
 
-    self._test({
-        "example_name": "in1",
-        "serialized": tf.convert_to_tensor(serialized),
-        "sequence_features": {
-            "st_a": tf.VarLenFeature(tf.float32),
-            "st_b": tf.VarLenFeature(tf.string),
-            "st_c": tf.VarLenFeature(tf.int64),
-            "a": tf.FixedLenSequenceFeature((2,), tf.int64),
-        }
-    }, expected_feat_list_values=expected_feature_list_output)
+    self._test(
+        {
+            "example_name": "in1",
+            "serialized": tf.convert_to_tensor(serialized),
+            "sequence_features": {
+                "st_a": tf.VarLenFeature(tf.float32),
+                "st_b": tf.VarLenFeature(tf.string),
+                "st_c": tf.VarLenFeature(tf.int64),
+                "a": tf.FixedLenSequenceFeature((2,), tf.int64),
+            }
+        },
+        expected_feat_list_values=expected_feature_list_output)
 
   def testSequenceExampleListWithInconsistentDataFails(self):
     original = sequence_example(feature_lists=feature_lists({
         "a": feature_list([
-            int64_feature([-1, 0]),
-            float_feature([2, 3])])
-        }))
+            int64_feature([-1, 0]), float_feature([2, 3])
+        ])
+    }))
 
     serialized = original.SerializeToString()
 
-    self._test({
-        "example_name": "in1",
-        "serialized": tf.convert_to_tensor(serialized),
-        "sequence_features": {"a": tf.FixedLenSequenceFeature((2,), tf.int64)}
-    }, expected_err=(
-        tf.OpError,
-        "Feature list: a, Index: 1."
-        "  Data types don't match. Expected type: int64"))
+    self._test(
+        {
+            "example_name": "in1",
+            "serialized": tf.convert_to_tensor(serialized),
+            "sequence_features": {"a": tf.FixedLenSequenceFeature(
+                (2,), tf.int64)}
+        },
+        expected_err=(tf.OpError, "Feature list: a, Index: 1."
+                      "  Data types don't match. Expected type: int64"))
 
   def testSequenceExampleListWithWrongDataTypeFails(self):
     original = sequence_example(feature_lists=feature_lists({
         "a": feature_list([
-            float_feature([2, 3])])
-        }))
+            float_feature([2, 3])
+        ])
+    }))
 
     serialized = original.SerializeToString()
 
-    self._test({
-        "example_name": "in1",
-        "serialized": tf.convert_to_tensor(serialized),
-        "sequence_features": {"a": tf.FixedLenSequenceFeature((2,), tf.int64)}
-    }, expected_err=(
-        tf.OpError,
-        "Feature list: a, Index: 0.  Data types don't match."
-        " Expected type: int64"))
+    self._test(
+        {
+            "example_name": "in1",
+            "serialized": tf.convert_to_tensor(serialized),
+            "sequence_features": {"a": tf.FixedLenSequenceFeature(
+                (2,), tf.int64)}
+        },
+        expected_err=(tf.OpError,
+                      "Feature list: a, Index: 0.  Data types don't match."
+                      " Expected type: int64"))
 
   def testSequenceExampleListWithWrongSparseDataTypeFails(self):
     original = sequence_example(feature_lists=feature_lists({
         "a": feature_list([
-            int64_feature([3, 4]),
-            int64_feature([1, 2]),
-            float_feature([2.0, 3.0])])
-        }))
+            int64_feature([3, 4]), int64_feature([1, 2]),
+            float_feature([2.0, 3.0])
+        ])
+    }))
 
     serialized = original.SerializeToString()
 
-    self._test({
-        "example_name": "in1",
-        "serialized": tf.convert_to_tensor(serialized),
-        "sequence_features": {"a": tf.FixedLenSequenceFeature((2,), tf.int64)}
-    }, expected_err=(
-        tf.OpError,
-        "Name: in1, Feature list: a, Index: 2."
-        "  Data types don't match. Expected type: int64"
-        "  Feature is: float_list"))
+    self._test(
+        {
+            "example_name": "in1",
+            "serialized": tf.convert_to_tensor(serialized),
+            "sequence_features": {"a": tf.FixedLenSequenceFeature(
+                (2,), tf.int64)}
+        },
+        expected_err=(tf.OpError, "Name: in1, Feature list: a, Index: 2."
+                      "  Data types don't match. Expected type: int64"
+                      "  Feature is: float_list"))
 
   def testSequenceExampleListWithWrongShapeFails(self):
     original = sequence_example(feature_lists=feature_lists({
         "a": feature_list([
-            int64_feature([2, 3]),
-            int64_feature([2, 3, 4])]),
-        }))
+            int64_feature([2, 3]), int64_feature([2, 3, 4])
+        ]),
+    }))
 
     serialized = original.SerializeToString()
 
-    self._test({
-        "example_name": "in1",
-        "serialized": tf.convert_to_tensor(serialized),
-        "sequence_features": {"a": tf.FixedLenSequenceFeature((2,), tf.int64)}
-    }, expected_err=(
-        tf.OpError,
-        r"Name: in1, Key: a, Index: 1."
-        r"  Number of int64 values != expected."
-        r"  values size: 3 but output shape: \[2\]"))
+    self._test(
+        {
+            "example_name": "in1",
+            "serialized": tf.convert_to_tensor(serialized),
+            "sequence_features": {"a": tf.FixedLenSequenceFeature(
+                (2,), tf.int64)}
+        },
+        expected_err=(tf.OpError, r"Name: in1, Key: a, Index: 1."
+                      r"  Number of int64 values != expected."
+                      r"  values size: 3 but output shape: \[2\]"))
 
   def testSequenceExampleWithMissingFeatureListFails(self):
     original = sequence_example(feature_lists=feature_lists({}))
 
     # Test fails because we didn't add:
     #  feature_list_dense_defaults = {"a": None}
-    self._test({
-        "example_name": "in1",
-        "serialized": tf.convert_to_tensor(original.SerializeToString()),
-        "sequence_features": {"a": tf.FixedLenSequenceFeature((2,), tf.int64)}
-    }, expected_err=(
-        tf.OpError,
-        "Name: in1, Feature list 'a' is required but could not be found."
-        "  Did you mean to include it in"
-        " feature_list_dense_missing_assumed_empty or"
-        " feature_list_dense_defaults?"))
+    self._test(
+        {
+            "example_name": "in1",
+            "serialized": tf.convert_to_tensor(original.SerializeToString()),
+            "sequence_features": {"a": tf.FixedLenSequenceFeature(
+                (2,), tf.int64)}
+        },
+        expected_err=(
+            tf.OpError,
+            "Name: in1, Feature list 'a' is required but could not be found."
+            "  Did you mean to include it in"
+            " feature_list_dense_missing_assumed_empty or"
+            " feature_list_dense_defaults?"))
 
 
 class DecodeJSONExampleTest(tf.test.TestCase):
@@ -740,14 +852,15 @@ class DecodeJSONExampleTest(tf.test.TestCase):
 
       json_tensor = tf.constant(
           [json_format.MessageToJson(m) for m in examples.flatten()],
-          shape=examples.shape, dtype=tf.string)
+          shape=examples.shape,
+          dtype=tf.string)
       binary_tensor = tf.decode_json_example(json_tensor)
       binary_val = sess.run(binary_tensor)
 
       if examples.shape:
         self.assertShapeEqual(binary_val, json_tensor)
-        for input_example, output_binary in zip(np.array(examples).flatten(),
-                                                binary_val.flatten()):
+        for input_example, output_binary in zip(
+            np.array(examples).flatten(), binary_val.flatten()):
           output_example = tf.train.Example()
           output_example.ParseFromString(output_binary)
           self.assertProtoEquals(input_example, output_example)
diff --git a/tensorflow/python/kernel_tests/scan_ops_test.py b/tensorflow/python/kernel_tests/scan_ops_test.py
index 1197b49a5fd..4db5cf51c4e 100644
--- a/tensorflow/python/kernel_tests/scan_ops_test.py
+++ b/tensorflow/python/kernel_tests/scan_ops_test.py
@@ -59,9 +59,9 @@ class CumsumTest(tf.test.TestCase):
   valid_dtypes = [np.int32, np.int64, np.float16, np.float32,
                   np.float64, np.complex64, np.complex128]
 
-  def _compare(self, x, axis, exclusive, reverse, use_gpu=False):
+  def _compare(self, x, axis, exclusive, reverse):
     np_out = handle_options(np.cumsum, x, axis, exclusive, reverse)
-    with self.test_session(use_gpu=use_gpu):
+    with self.test_session():
       tf_out = tf.cumsum(x, axis, exclusive, reverse).eval()
 
     self.assertAllClose(np_out, tf_out)
@@ -69,8 +69,7 @@ class CumsumTest(tf.test.TestCase):
   def _compareAll(self, x, axis):
     for exclusive in [True, False]:
       for reverse in [True, False]:
-        for use_gpu in [True, False]:
-          self._compare(x, axis, exclusive, reverse, use_gpu)
+        self._compare(x, axis, exclusive, reverse)
 
   def test1D(self):
     for dtype in self.valid_dtypes:
@@ -144,9 +143,9 @@ class CumprodTest(tf.test.TestCase):
   valid_dtypes = [np.int32, np.int64, np.float16, np.float32,
                   np.float64, np.complex64, np.complex128]
 
-  def _compare(self, x, axis, exclusive, reverse, use_gpu=False):
+  def _compare(self, x, axis, exclusive, reverse):
     np_out = handle_options(np.cumprod, x, axis, exclusive, reverse)
-    with self.test_session(use_gpu=use_gpu):
+    with self.test_session():
       tf_out = tf.cumprod(x, axis, exclusive, reverse).eval()
 
     self.assertAllClose(np_out, tf_out)
@@ -154,8 +153,7 @@ class CumprodTest(tf.test.TestCase):
   def _compareAll(self, x, axis):
     for exclusive in [True, False]:
       for reverse in [True, False]:
-        for use_gpu in [True, False]:
-          self._compare(x, axis, exclusive, reverse, use_gpu)
+        self._compare(x, axis, exclusive, reverse)
 
 
   def test1D(self):
diff --git a/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py b/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
index 714b86fbfc7..3d08c2afbbe 100644
--- a/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
+++ b/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,106 +12,118 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
-"""Tests for tensorflow.ops.tf.self_adjoint_eig."""
+"""Tests for tensorflow.ops.math_ops.matrix_inverse."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-
 import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
 
 
-class SelfAdjointEigOpTest(tf.test.TestCase):
-
-  def _testEigs(self, x, d, tf_ans, use_gpu=False):
-    np_eig_val, np_eig_vec = np.linalg.eig(x)
-
-    # First check the eigenvalues
-    self.assertAllClose(sorted(np_eig_val), sorted(tf_ans[0, :]))
-
-    # need to make things canonical. This test may still fail in case there are
-    # two equal eigenvalues, so that there is indeterminacy in the eigenvectors.
-    # For now, assume that we will only test matrices with distinct eigenvalues.
-    np_arg = np.argsort(np_eig_val)
-    tf_arg = np.argsort(tf_ans[0, :])
-
-    np_eig_vecs_sorted = np.array([np_eig_vec[:, i] for i in np_arg]).T
-    tf_eig_vecs_sorted = np.array([tf_ans[1:, i] for i in tf_arg]).T
-    np_eig_vecs_signed_sorted = np.array([np_eig_vecs_sorted[:, i] *
-                                          np.sign(np_eig_vecs_sorted[0, i])
-                                          for i in xrange(d)]).T
-    tf_eig_vecs_signed_sorted = np.array([tf_eig_vecs_sorted[:, i] *
-                                          np.sign(tf_eig_vecs_sorted[0, i])
-                                          for i in xrange(d)]).T
-    self.assertAllClose(np_eig_vecs_signed_sorted, tf_eig_vecs_signed_sorted)
-
-  def _compareSelfAdjointEig(self, x, use_gpu=False):
-    with self.test_session() as sess:
-      tf_eig = tf.self_adjoint_eig(tf.constant(x))
-      tf_eig_out = sess.run([tf_eig])[0]
-
-    d, _ = x.shape
-    self.assertEqual([d+1, d], tf_eig.get_shape().dims)
-    self._testEigs(x, d, tf_eig_out, use_gpu)
-
-  def _compareBatchSelfAdjointEigRank3(self, x, use_gpu=False):
-    with self.test_session() as sess:
-      tf_eig = tf.batch_self_adjoint_eig(tf.constant(x))
-      tf_out = sess.run([tf_eig])[0]
-    dlist = x.shape
-    d = dlist[-2]
-
-    self.assertEqual([d+1, d], tf_eig.get_shape().dims[-2:])
-    # not testing the values.
-    self.assertEqual(dlist[0], tf_eig.get_shape().dims[0])
-
-    for i in xrange(dlist[0]):
-      self._testEigs(x[i], d, tf_out[i])
-
-  def _compareBatchSelfAdjointEigRank2(self, x, use_gpu=False):
-    with self.test_session() as sess:
-      tf_eig = tf.batch_self_adjoint_eig(tf.constant(x))
-      tf_out = sess.run([tf_eig])[0]
-    dlist = x.shape
-    d = dlist[-2]
-
-    self.assertEqual(len(tf_eig.get_shape()), 2)
-    self.assertEqual([d+1, d], tf_eig.get_shape().dims[-2:])
-    self._testEigs(x, d, tf_out)
-
-  def testBasic(self):
-    self._compareSelfAdjointEig(
-        np.array([[3., 0., 1.], [0., 2., -2.], [1., -2., 3.]]))
-
-  def testBatch(self):
-    simple_array = np.array([[[1., 0.], [0., 5.]]])  # shape (1, 2, 2)
-    simple_array_2d = simple_array[0]  # shape (2, 2)
-    self._compareBatchSelfAdjointEigRank3(simple_array)
-    self._compareBatchSelfAdjointEigRank3(
-        np.vstack((simple_array, simple_array)))
-    self._compareBatchSelfAdjointEigRank2(simple_array_2d)
-    odd_sized_array = np.array([[[3., 0., 1.], [0., 2., -2.], [1., -2., 3.]]])
-    self._compareBatchSelfAdjointEigRank3(
-        np.vstack((odd_sized_array, odd_sized_array)))
-
-    # Generate random positive-definite matrices.
-    matrices = np.random.rand(10, 5, 5)
-    for i in xrange(10):
-      matrices[i] = np.dot(matrices[i].T, matrices[i])
-    self._compareBatchSelfAdjointEigRank3(matrices)
-
-  def testNonSquareMatrix(self):
-    with self.assertRaises(ValueError):
-      tf.self_adjoint_eig(tf.constant(np.array([[1., 2., 3.], [3., 4., 5.]])))
+class SelfAdjointEigTest(tf.test.TestCase):
 
   def testWrongDimensions(self):
-    tensor3 = tf.constant([1., 2.])
+    # The input to self_adjoint_eig should be 2-dimensional tensor.
+    scalar = tf.constant(1.)
     with self.assertRaises(ValueError):
-      tf.self_adjoint_eig(tensor3)
+      tf.self_adjoint_eig(scalar)
+    vector = tf.constant([1., 2.])
+    with self.assertRaises(ValueError):
+      tf.self_adjoint_eig(vector)
+    tensor = tf.constant([[[1., 2.], [3., 4.]], [[1., 2.], [3., 4.]]])
+    with self.assertRaises(ValueError):
+      tf.self_adjoint_eig(tensor)
+
+    # The input to batch_batch_self_adjoint_eig should be a tensor of
+    # at least rank 2.
+    scalar = tf.constant(1.)
+    with self.assertRaises(ValueError):
+      tf.batch_self_adjoint_eig(scalar)
+    vector = tf.constant([1., 2.])
+    with self.assertRaises(ValueError):
+      tf.batch_self_adjoint_eig(vector)
 
 
-if __name__ == "__main__":
+def SortEigenDecomposition(e, v):
+  if v.ndim < 2:
+    return e, v
+  else:
+    perm = np.argsort(e, -1)
+    return np.take(e, perm, -1), np.take(v, perm, -1)
+
+
+def _GetSelfAdjointEigTest(dtype_, shape_):
+
+  def CompareEigenVectors(self, x, y, atol):
+    # Eigenvectors are only unique up to sign so we normalize the signs first.
+    signs = np.sign(np.sum(np.divide(x, y), -2, keepdims=True))
+    x *= signs
+    self.assertAllClose(x, y, atol)
+
+  def CompareEigenDecompositions(self, x_e, x_v, y_e, y_v, atol):
+    num_batches = int(np.prod(x_e.shape[:-1]))
+    n = x_e.shape[-1]
+    x_e = np.reshape(x_e, [num_batches] + [n])
+    x_v = np.reshape(x_v, [num_batches] + [n, n])
+    y_e = np.reshape(y_e, [num_batches] + [n])
+    y_v = np.reshape(y_v, [num_batches] + [n, n])
+    for i in range(num_batches):
+      x_ei, x_vi = SortEigenDecomposition(x_e[i, :], x_v[i, :, :])
+      y_ei, y_vi = SortEigenDecomposition(y_e[i, :], y_v[i, :, :])
+      self.assertAllClose(x_ei, y_ei, atol=atol)
+      CompareEigenVectors(self, x_vi, y_vi, atol)
+
+  def Test(self):
+    np.random.seed(1)
+    n = shape_[-1]
+    batch_shape = shape_[:-2]
+    a = np.random.uniform(
+        low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(dtype_)
+    a += a.T
+    a = np.tile(a, batch_shape + (1, 1))
+    if dtype_ == np.float32:
+      atol = 1e-4
+    else:
+      atol = 1e-14
+    for compute_v in False, True:
+      np_e, np_v = np.linalg.eig(a)
+      with self.test_session():
+        if compute_v:
+          if a.ndim == 2:
+            op = tf.self_adjoint_eig
+          else:
+            op = tf.batch_self_adjoint_eig
+          tf_e, tf_v = op(tf.constant(a))
+
+          # Check that V*diag(E)*V^T is close to A.
+          a_ev = tf.batch_matmul(
+              tf.batch_matmul(tf_v, tf.batch_matrix_diag(tf_e)),
+              tf_v,
+              adj_y=True)
+          self.assertAllClose(a_ev.eval(), a, atol=atol)
+
+          # Compare to numpy.linalg.eig.
+          CompareEigenDecompositions(self, np_e, np_v, tf_e.eval(), tf_v.eval(),
+                                     atol)
+        else:
+          if a.ndim == 2:
+            op = tf.self_adjoint_eigvals
+          else:
+            op = tf.batch_self_adjoint_eigvals
+          tf_e = op(tf.constant(a))
+          self.assertAllClose(
+              np.sort(np_e, -1), np.sort(tf_e.eval(), -1), atol=atol)
+
+  return Test
+
+
+if __name__ == '__main__':
+  for dtype in np.float32, np.float64:
+    for size in 1, 2, 5, 10:
+      for batch_dims in [(), (3,)] + [(3, 2)] * (max(size, size) < 10):
+        shape = batch_dims + (size, size)
+        name = '%s_%s' % (dtype.__name__, '_'.join(map(str, shape)))
+        setattr(SelfAdjointEigTest, 'testSelfAdjointEig_' + name,
+                _GetSelfAdjointEigTest(dtype, shape))
   tf.test.main()
diff --git a/tensorflow/python/kernel_tests/seq2seq_test.py b/tensorflow/python/kernel_tests/seq2seq_test.py
index 58af5c42bd8..c9a8203b5d9 100644
--- a/tensorflow/python/kernel_tests/seq2seq_test.py
+++ b/tensorflow/python/kernel_tests/seq2seq_test.py
@@ -263,6 +263,32 @@ class Seq2SeqTest(tf.test.TestCase):
         res = sess.run([mem])
         self.assertEqual((2, 2), res[0].shape)
 
+  def testAttentionDecoderStateIsTuple(self):
+    with self.test_session() as sess:
+      with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)):
+        cell = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=True)
+        cell = tf.nn.rnn_cell.MultiRNNCell(cells=[cell] * 2,
+                                           state_is_tuple=True)
+        inp = [tf.constant(0.5, shape=[2, 2])] * 2
+        enc_outputs, enc_state = tf.nn.rnn(cell, inp, dtype=tf.float32)
+        attn_states = tf.concat(1, [tf.reshape(e, [-1, 1, cell.output_size])
+                                    for e in enc_outputs])
+        dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3
+        dec, mem = tf.nn.seq2seq.attention_decoder(
+            dec_inp, enc_state,
+            attn_states, cell, output_size=4)
+        sess.run([tf.initialize_all_variables()])
+        res = sess.run(dec)
+        self.assertEqual(3, len(res))
+        self.assertEqual((2, 4), res[0].shape)
+
+        res = sess.run([mem])
+        self.assertEqual(2, len(res[0]))
+        self.assertEqual((2, 2), res[0][0].c.shape)
+        self.assertEqual((2, 2), res[0][0].h.shape)
+        self.assertEqual((2, 2), res[0][1].c.shape)
+        self.assertEqual((2, 2), res[0][1].h.shape)
+
   def testEmbeddingAttentionDecoder(self):
     with self.test_session() as sess:
       with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)):
diff --git a/tensorflow/python/kernel_tests/spacetobatch_op_test.py b/tensorflow/python/kernel_tests/spacetobatch_op_test.py
index f90abb95e8a..f3ff2d517af 100644
--- a/tensorflow/python/kernel_tests/spacetobatch_op_test.py
+++ b/tensorflow/python/kernel_tests/spacetobatch_op_test.py
@@ -27,16 +27,15 @@ class SpaceToBatchTest(tf.test.TestCase):
   """Tests input-output pairs for the SpaceToBatch and BatchToSpace ops."""
 
   def _testPad(self, inputs, paddings, block_size, outputs):
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        # outputs = space_to_batch(inputs)
-        x_tf = tf.space_to_batch(
-            tf.to_float(inputs), paddings, block_size=block_size)
-        self.assertAllEqual(x_tf.eval(), outputs)
-        # inputs = batch_to_space(outputs)
-        x_tf = tf.batch_to_space(
-            tf.to_float(outputs), paddings, block_size=block_size)
-        self.assertAllEqual(x_tf.eval(), inputs)
+    with self.test_session():
+      # outputs = space_to_batch(inputs)
+      x_tf = tf.space_to_batch(
+          tf.to_float(inputs), paddings, block_size=block_size)
+      self.assertAllEqual(x_tf.eval(), outputs)
+      # inputs = batch_to_space(outputs)
+      x_tf = tf.batch_to_space(
+          tf.to_float(outputs), paddings, block_size=block_size)
+      self.assertAllEqual(x_tf.eval(), inputs)
 
   def _testOne(self, inputs, block_size, outputs):
     paddings = np.zeros((2, 2), dtype=np.int32)
diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py
index d0f31d14137..29b57e80944 100644
--- a/tensorflow/python/kernel_tests/sparse_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_ops_test.py
@@ -119,13 +119,16 @@ class SparseToIndicatorTest(test_util.TensorFlowTestCase):
 class SparseMergeTest(test_util.TensorFlowTestCase):
 
   def _SparseTensor_3x50(self, indices_dtype, values_dtype):
+    # NOTE: This input is intentionally not sorted to validate the
+    # already_sorted flag below.
     ind = np.array([
         [0, 0],
-        [1, 0], [1, 1], [1, 2],
-        [2, 0], [2, 1]])
+        [1, 0], [1, 2],
+        [2, 0], [2, 1],
+        [1, 1]])
     # NB: these are not sorted
-    indices = np.array([0, 13, 10, 14, 32, 33])
-    values = np.array([-3, 4, 1, 1, 5, 9])
+    indices = np.array([0, 13, 10, 33, 32, 14])
+    values = np.array([-3, 4, 1, 9, 5, 1])
     shape = np.array([3, 3])
     indices = ops.SparseTensor(
         constant_op.constant(ind, dtypes.int64),
@@ -137,6 +140,28 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
         constant_op.constant(shape, dtypes.int64))
     return indices, values
 
+  def _AssertResultsSorted(self, output, vocab_size):
+    self.assertAllEqual(
+        output.indices,
+        [[0, 0], [1, 10], [1, 13], [1, 14], [2, 32], [2, 33]])
+    self.assertAllEqual(
+        output.values,
+        [-3, 1, 4, 1, 5, 9])
+    self.assertAllEqual(
+        output.shape,
+        [3, vocab_size])
+
+  def _AssertResultsNotSorted(self, output, vocab_size):
+    self.assertAllEqual(
+        output.indices,
+        [[0, 0], [1, 13], [1, 10], [2, 33], [2, 32], [1, 14]])
+    self.assertAllEqual(
+        output.values,
+        [-3, 4, 1, 9, 5, 1])
+    self.assertAllEqual(
+        output.shape,
+        [3, vocab_size])
+
   def testInt32AndFloat32(self):
     vocab_size = 50
     with self.test_session(use_gpu=False) as sess:
@@ -144,15 +169,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
       output = sess.run(sp_output)
-      self.assertAllEqual(
-          output.indices,
-          [[0, 0], [1, 10], [1, 13], [1, 14], [2, 32], [2, 33]])
-      self.assertAllEqual(
-          output.values,
-          [-3, 1, 4, 1, 5, 9])
-      self.assertAllEqual(
-          output.shape,
-          [3, vocab_size])
+      self._AssertResultsSorted(output, vocab_size)
 
   def testInt64AndFloat32(self):
     vocab_size = 50
@@ -161,15 +178,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
       output = sess.run(sp_output)
-      self.assertAllEqual(
-          output.indices,
-          [[0, 0], [1, 10], [1, 13], [1, 14], [2, 32], [2, 33]])
-      self.assertAllEqual(
-          output.values,
-          [-3, 1, 4, 1, 5, 9])
-      self.assertAllEqual(
-          output.shape,
-          [3, vocab_size])
+      self._AssertResultsSorted(output, vocab_size)
 
   def testInt64AndFloat64(self):
     vocab_size = 50
@@ -178,15 +187,37 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
       output = sess.run(sp_output)
-      self.assertAllEqual(
-          output.indices,
-          [[0, 0], [1, 10], [1, 13], [1, 14], [2, 32], [2, 33]])
-      self.assertAllEqual(
-          output.values,
-          [-3, 1, 4, 1, 5, 9])
-      self.assertAllEqual(
-          output.shape,
-          [3, vocab_size])
+      self._AssertResultsSorted(output, vocab_size)
+
+  def testInt32AndFloat32NonCanonicalOrder(self):
+    vocab_size = 50
+    with self.test_session(use_gpu=False) as sess:
+      indices, values = self._SparseTensor_3x50(dtypes.int32, dtypes.float32)
+      sp_output = sparse_ops.sparse_merge(
+          indices, values, vocab_size, already_sorted=True)
+
+      output = sess.run(sp_output)
+      self._AssertResultsNotSorted(output, vocab_size)
+
+  def testInt64AndFloat32NonCanonicalOrder(self):
+    vocab_size = 50
+    with self.test_session(use_gpu=False) as sess:
+      indices, values = self._SparseTensor_3x50(dtypes.int64, dtypes.float32)
+      sp_output = sparse_ops.sparse_merge(
+          indices, values, vocab_size, already_sorted=True)
+
+      output = sess.run(sp_output)
+      self._AssertResultsNotSorted(output, vocab_size)
+
+  def testInt64AndFloat64NonCanonicalOrder(self):
+    vocab_size = 50
+    with self.test_session(use_gpu=False) as sess:
+      indices, values = self._SparseTensor_3x50(dtypes.int64, dtypes.float64)
+      sp_output = sparse_ops.sparse_merge(
+          indices, values, vocab_size, already_sorted=True)
+
+      output = sess.run(sp_output)
+      self._AssertResultsNotSorted(output, vocab_size)
 
 
 class SparseRetainTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/python/kernel_tests/svd_op_test.py b/tensorflow/python/kernel_tests/svd_op_test.py
new file mode 100644
index 00000000000..6c2d8369799
--- /dev/null
+++ b/tensorflow/python/kernel_tests/svd_op_test.py
@@ -0,0 +1,112 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.math_ops.matrix_inverse."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+
+
+class SvdOpTest(tf.test.TestCase):
+
+  def testWrongDimensions(self):
+    # The input to svd should be 2-dimensional tensor.
+    scalar = tf.constant(1.)
+    with self.assertRaises(ValueError):
+      tf.svd(scalar)
+    vector = tf.constant([1., 2.])
+    with self.assertRaises(ValueError):
+      tf.svd(vector)
+    tensor = tf.constant([[[1., 2.], [3., 4.]], [[1., 2.], [3., 4.]]])
+    with self.assertRaises(ValueError):
+      tf.svd(tensor)
+
+    # The input to batch_svd should be a tensor of at least rank 2.
+    scalar = tf.constant(1.)
+    with self.assertRaises(ValueError):
+      tf.batch_svd(scalar)
+    vector = tf.constant([1., 2.])
+    with self.assertRaises(ValueError):
+      tf.batch_svd(vector)
+
+
+def _GetSvdOpTest(dtype_, shape_):
+
+  def _CompareSingularVectors(self, x, y, atol):
+    # Singular vectors are only unique up to sign (complex phase factor for
+    # complex matrices), so we normalize the signs first.
+    signs = np.sign(np.sum(np.divide(x, y), -2, keepdims=True))
+    x *= signs
+    self.assertAllClose(x, y, atol=atol)
+
+  def Test(self):
+    np.random.seed(1)
+    x = np.random.uniform(
+        low=-1.0, high=1.0, size=np.prod(shape_)).reshape(shape_).astype(dtype_)
+    if dtype_ == np.float32:
+      atol = 1e-4
+    else:
+      atol = 1e-14
+    for compute_uv in False, True:
+      for full_matrices in False, True:
+        with self.test_session():
+          if x.ndim == 2:
+            if compute_uv:
+              tf_s, tf_u, tf_v = tf.svd(tf.constant(x),
+                                        compute_uv=compute_uv,
+                                        full_matrices=full_matrices)
+            else:
+              tf_s = tf.svd(tf.constant(x),
+                            compute_uv=compute_uv,
+                            full_matrices=full_matrices)
+          else:
+            if compute_uv:
+              tf_s, tf_u, tf_v = tf.batch_svd(
+                  tf.constant(x),
+                  compute_uv=compute_uv,
+                  full_matrices=full_matrices)
+            else:
+              tf_s = tf.batch_svd(
+                  tf.constant(x),
+                  compute_uv=compute_uv,
+                  full_matrices=full_matrices)
+          if compute_uv:
+            np_u, np_s, np_v = np.linalg.svd(x,
+                                             compute_uv=compute_uv,
+                                             full_matrices=full_matrices)
+          else:
+            np_s = np.linalg.svd(x,
+                                 compute_uv=compute_uv,
+                                 full_matrices=full_matrices)
+          self.assertAllClose(np_s, tf_s.eval(), atol=atol)
+          if compute_uv:
+            _CompareSingularVectors(self, np_u, tf_u.eval(), atol)
+            _CompareSingularVectors(self, np.swapaxes(np_v, -2, -1),
+                                    tf_v.eval(), atol)
+
+  return Test
+
+
+if __name__ == '__main__':
+  for dtype in np.float32, np.float64:
+    for m in 1, 2, 5, 10:
+      for n in 1, 2, 5, 10:
+        for batch_dims in [(), (3,)] + [(3, 2)] * (max(m, n) < 10):
+          shape = batch_dims + (m, n)
+          name = '%s_%s' % (dtype.__name__, '_'.join(map(str, shape)))
+          setattr(SvdOpTest, 'testSvd_' + name, _GetSvdOpTest(dtype, shape))
+  tf.test.main()
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index b5f4288871e..f6282439ae4 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -69,6 +69,16 @@ class VariableScopeTest(tf.test.TestCase):
           sess.run(tf.initialize_variables([w]))
           self.assertAllClose(w.eval(), 0.3)
 
+  def testVarScopeDType(self):
+    with self.test_session():
+      with tf.variable_scope("tower") as tower:
+        with tf.variable_scope("foo", dtype=tf.float16):
+          v = tf.get_variable("v", [])
+          self.assertEqual(v.dtype, tf.float16_ref)
+        with tf.variable_scope(tower, dtype=tf.float16):
+          w = tf.get_variable("w", [])
+          self.assertEqual(w.dtype, tf.float16_ref)
+
   def testInitFromNonTensorValue(self):
     with self.test_session() as sess:
       v = tf.get_variable("v", initializer=4, dtype=tf.int32)
diff --git a/tensorflow/python/kernel_tests/zero_division_test.py b/tensorflow/python/kernel_tests/zero_division_test.py
index 9a0d28bf2e3..e635aff84d5 100644
--- a/tensorflow/python/kernel_tests/zero_division_test.py
+++ b/tensorflow/python/kernel_tests/zero_division_test.py
@@ -25,31 +25,30 @@ import tensorflow as tf
 class ZeroDivisionTest(tf.test.TestCase):
 
   def testZeros(self):
-    for use_gpu in False, True:
-      with self.test_session(use_gpu=use_gpu):
-        for dtype in tf.uint8, tf.int16, tf.int32, tf.int64:
-          zero = tf.constant(0, dtype=dtype)
-          one = tf.constant(1, dtype=dtype)
-          bads = [one // zero]
-          if dtype in (tf.int32, tf.int64):
-            bads.append(one % zero)
-          for bad in bads:
-            try:
-              result = bad.eval()
-            except tf.OpError as e:
-              # Ideally, we'd get a nice exception.  In theory, this should only
-              # happen on CPU, but 32 bit integer GPU division is actually on
-              # CPU due to a placer bug.
-              # TODO(irving): Make stricter once the placer bug is fixed.
-              self.assertIn('Integer division by zero', str(e))
-            else:
-              # On the GPU, integer division by zero produces all bits set.
-              # But apparently on some GPUs "all bits set" for 64 bit division
-              # means 32 bits set, so we allow 0xffffffff as well.  This isn't
-              # very portable, so we may need to expand this list if other GPUs
-              # do different things.
-              self.assertTrue(use_gpu)
-              self.assertIn(result, (-1, 0xff, 0xffffffff))
+    with self.test_session():
+      for dtype in tf.uint8, tf.int16, tf.int32, tf.int64:
+        zero = tf.constant(0, dtype=dtype)
+        one = tf.constant(1, dtype=dtype)
+        bads = [one // zero]
+        if dtype in (tf.int32, tf.int64):
+          bads.append(one % zero)
+        for bad in bads:
+          try:
+            result = bad.eval()
+          except tf.OpError as e:
+            # Ideally, we'd get a nice exception.  In theory, this should only
+            # happen on CPU, but 32 bit integer GPU division is actually on
+            # CPU due to a placer bug.
+            # TODO(irving): Make stricter once the placer bug is fixed.
+            self.assertIn('Integer division by zero', str(e))
+          else:
+            # On the GPU, integer division by zero produces all bits set.
+            # But apparently on some GPUs "all bits set" for 64 bit division
+            # means 32 bits set, so we allow 0xffffffff as well.  This isn't
+            # very portable, so we may need to expand this list if other GPUs
+            # do different things.
+            self.assertTrue(tf.test.is_gpu_available())
+            self.assertIn(result, (-1, 0xff, 0xffffffff))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/lib/io/file_io.i b/tensorflow/python/lib/io/file_io.i
index 12ab8566e96..4e1c2aba69d 100644
--- a/tensorflow/python/lib/io/file_io.i
+++ b/tensorflow/python/lib/io/file_io.i
@@ -21,8 +21,10 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/io/match.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_statistics.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
 %}
 
@@ -82,7 +84,7 @@ void CreateDir(const string& dirname, TF_Status* out_status) {
 
 void CopyFile(const string& oldpath, const string& newpath, bool overwrite,
               TF_Status* out_status) {
-  // If overwrite is false and the newpath file exists then its an error.
+  // If overwrite is false and the newpath file exists then it's an error.
   if (!overwrite && FileExists(newpath)) {
     TF_SetStatus(out_status, TF_ALREADY_EXISTS, "file already exists");
     return;
@@ -142,6 +144,17 @@ bool IsDirectory(const string& dirname, TF_Status* out_status) {
   }
   return false;
 }
+
+using tensorflow::FileStatistics;
+
+void Stat(const string& filename, FileStatistics* stats,
+          TF_Status* out_status) {
+  tensorflow::Status status = tensorflow::Env::Default()->Stat(filename,
+                                                               stats);
+  if (!status.ok()) {
+    Set_TF_Status_from_Status(out_status, status);
+  }
+}
 %}
 
 // Wrap the above functions.
@@ -159,3 +172,8 @@ void RenameFile(const string& oldname, const string& newname, bool overwrite,
                 TF_Status* out_status);
 void DeleteRecursively(const string& dirname, TF_Status* out_status);
 bool IsDirectory(const string& dirname, TF_Status* out_status);
+void Stat(const string& filename, tensorflow::FileStatistics* stats,
+          TF_Status* out_status);
+
+%include "tensorflow/core/lib/io/path.h"
+%include "tensorflow/core/platform/file_statistics.h"
diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py
index a0ec199d3a0..9467e4b3456 100644
--- a/tensorflow/python/lib/io/file_io.py
+++ b/tensorflow/python/lib/io/file_io.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """File IO methods that wrap the C++ FileSystem API.
 
 The C++ FileSystem API is SWIG wrapped in file_io.i. These functions call those
@@ -22,6 +21,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
@@ -43,8 +44,8 @@ def read_file_to_string(filename):
 
 def write_string_to_file(filename, file_content):
   with errors.raise_exception_on_not_ok_status() as status:
-    pywrap_tensorflow.WriteStringToFile(compat.as_bytes(filename),
-                                        compat.as_bytes(file_content), status)
+    pywrap_tensorflow.WriteStringToFile(
+        compat.as_bytes(filename), compat.as_bytes(file_content), status)
 
 
 def get_matching_files(filename):
@@ -61,22 +62,21 @@ def recursive_create_dir(dirname):
   with errors.raise_exception_on_not_ok_status() as status:
     dirs = dirname.split('/')
     for i in range(len(dirs)):
-      partial_dir = '/'.join(dirs[0:i+1])
+      partial_dir = '/'.join(dirs[0:i + 1])
       if partial_dir and not file_exists(partial_dir):
         pywrap_tensorflow.CreateDir(compat.as_bytes(partial_dir), status)
 
 
 def copy(oldpath, newpath, overwrite=False):
   with errors.raise_exception_on_not_ok_status() as status:
-    pywrap_tensorflow.CopyFile(compat.as_bytes(oldpath),
-                               compat.as_bytes(newpath), overwrite, status)
+    pywrap_tensorflow.CopyFile(
+        compat.as_bytes(oldpath), compat.as_bytes(newpath), overwrite, status)
 
 
 def rename(oldname, newname, overwrite=False):
   with errors.raise_exception_on_not_ok_status() as status:
-    return pywrap_tensorflow.RenameFile(compat.as_bytes(oldname),
-                                        compat.as_bytes(newname), overwrite,
-                                        status)
+    return pywrap_tensorflow.RenameFile(
+        compat.as_bytes(oldname), compat.as_bytes(newname), overwrite, status)
 
 
 def delete_recursively(dirname):
@@ -87,3 +87,74 @@ def delete_recursively(dirname):
 def is_directory(dirname):
   with errors.raise_exception_on_not_ok_status() as status:
     return pywrap_tensorflow.IsDirectory(compat.as_bytes(dirname), status)
+
+
+def list_directory(dirname):
+  """Returns a list of entries contained within a directory.
+
+  The list is in arbitrary order. It does not contain the special entries "."
+  and "..".
+
+  Args:
+    dirname: string, path to a directory
+
+  Raises:
+    NotFoundError if directory doesn't exist
+
+  Returns:
+    [filename1, filename2, ... filenameN]
+  """
+  if not is_directory(dirname):
+    raise errors.NotFoundError(None, None, 'Could not find directory')
+  file_list = get_matching_files(os.path.join(compat.as_str_any(dirname), '*'))
+  return [compat.as_bytes(pywrap_tensorflow.Basename(compat.as_bytes(filename)))
+          for filename in file_list]
+
+
+def walk(top, in_order=True):
+  """Recursive directory tree generator for directories.
+
+  Args:
+    top: string, a Directory name
+    in_order: bool, Traverse in order if True, post order if False.
+
+  Errors that happen while listing directories are ignored.
+
+  Yields:
+    # Each yield is a 3-tuple:  the pathname of a directory, followed
+    # by lists of all its subdirectories and leaf files.
+    (dirname, [subdirname, subdirname, ...], [filename, filename, ...])
+  """
+  top = compat.as_bytes(top)
+  try:
+    listing = list_directory(top)
+  except errors.NotFoundError:
+    return
+
+  files = []
+  subdirs = []
+  for item in listing:
+    full_path = os.path.join(top, item)
+    if is_directory(full_path):
+      subdirs.append(item)
+    else:
+      files.append(item)
+
+  here = (top, subdirs, files)
+
+  if in_order:
+    yield here
+
+  for subdir in subdirs:
+    for subitem in walk(os.path.join(top, subdir), in_order):
+      yield subitem
+
+  if not in_order:
+    yield here
+
+
+def stat(filename):
+  file_statistics = pywrap_tensorflow.FileStatistics()
+  with errors.raise_exception_on_not_ok_status() as status:
+    pywrap_tensorflow.Stat(compat.as_bytes(filename), file_statistics, status)
+    return file_statistics
diff --git a/tensorflow/python/lib/io/file_io_test.py b/tensorflow/python/lib/io/file_io_test.py
index b47b687a2ac..1b95d1b403a 100644
--- a/tensorflow/python/lib/io/file_io_test.py
+++ b/tensorflow/python/lib/io/file_io_test.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
-
 """Testing File IO operations in file_io.py."""
 from __future__ import absolute_import
 from __future__ import division
@@ -24,6 +23,7 @@ import tensorflow as tf
 
 from tensorflow.python.framework import errors
 from tensorflow.python.lib.io import file_io
+from tensorflow.python.util import compat
 
 
 class FileIoTest(tf.test.TestCase):
@@ -60,9 +60,9 @@ class FileIoTest(tf.test.TestCase):
       file_path = os.path.join(dir_path, name)
       file_io.write_string_to_file(file_path, "testing")
     expected_match = [os.path.join(dir_path, name) for name in files]
-    self.assertItemsEqual(file_io.get_matching_files(os.path.join(dir_path,
-                                                                  "file*.txt")),
-                          expected_match)
+    self.assertItemsEqual(
+        file_io.get_matching_files(os.path.join(dir_path, "file*.txt")),
+        expected_match)
     file_io.delete_recursively(dir_path)
     self.assertFalse(file_io.file_exists(os.path.join(dir_path, "file3.txt")))
 
@@ -144,5 +144,117 @@ class FileIoTest(tf.test.TestCase):
     # False for a file.
     self.assertFalse(file_io.is_directory(file_path))
 
+  def testListDirectory(self):
+    dir_path = os.path.join(self._base_dir, "test_dir")
+    file_io.create_dir(dir_path)
+    files = [b"file1.txt", b"file2.txt", b"file3.txt"]
+    for name in files:
+      file_path = os.path.join(dir_path, compat.as_str_any(name))
+      file_io.write_string_to_file(file_path, "testing")
+    subdir_path = os.path.join(dir_path, "sub_dir")
+    file_io.create_dir(subdir_path)
+    subdir_file_path = os.path.join(subdir_path, "file4.txt")
+    file_io.write_string_to_file(subdir_file_path, "testing")
+    dir_list = file_io.list_directory(dir_path)
+    self.assertItemsEqual(files + [b"sub_dir"], dir_list)
+
+  def testListDirectoryFailure(self):
+    dir_path = os.path.join(self._base_dir, "test_dir")
+    with self.assertRaises(errors.NotFoundError):
+      file_io.list_directory(dir_path)
+
+  def _setupWalkDirectories(self, dir_path):
+    # Creating a file structure as follows
+    # test_dir -> file: file1.txt; dirs: subdir1_1, subdir1_2, subdir1_3
+    # subdir1_1 -> file: file3.txt
+    # subdir1_2 -> dir: subdir2
+    file_io.create_dir(dir_path)
+    file_io.write_string_to_file(os.path.join(dir_path, "file1.txt"), "testing")
+    sub_dirs1 = ["subdir1_1", "subdir1_2", "subdir1_3"]
+    for name in sub_dirs1:
+      file_io.create_dir(os.path.join(dir_path, name))
+    file_io.write_string_to_file(
+        os.path.join(dir_path, "subdir1_1/file2.txt"), "testing")
+    file_io.create_dir(os.path.join(dir_path, "subdir1_2/subdir2"))
+
+  def testWalkInOrder(self):
+    dir_path = os.path.join(self._base_dir, "test_dir")
+    self._setupWalkDirectories(dir_path)
+    # Now test the walk (in_order = True)
+    all_dirs = []
+    all_subdirs = []
+    all_files = []
+    for (w_dir, w_subdirs, w_files) in file_io.walk(dir_path, in_order=True):
+      all_dirs.append(w_dir)
+      all_subdirs.append(w_subdirs)
+      all_files.append(w_files)
+    self.assertItemsEqual(all_dirs, [compat.as_bytes(dir_path)] + [
+        compat.as_bytes(os.path.join(dir_path, item))
+        for item in ["subdir1_1", "subdir1_2", "subdir1_2/subdir2", "subdir1_3"]
+    ])
+    self.assertEqual(compat.as_bytes(dir_path), all_dirs[0])
+    self.assertLess(
+        all_dirs.index(compat.as_bytes(os.path.join(dir_path, "subdir1_2"))),
+        all_dirs.index(
+            compat.as_bytes(os.path.join(dir_path, "subdir1_2/subdir2"))))
+    self.assertItemsEqual(all_subdirs[1:5], [[], [b"subdir2"], [], []])
+    self.assertItemsEqual(all_subdirs[0],
+                          [b"subdir1_1", b"subdir1_2", b"subdir1_3"])
+    self.assertItemsEqual(all_files, [[b"file1.txt"], [b"file2.txt"], [], [],
+                                      []])
+    self.assertLess(
+        all_files.index([b"file1.txt"]), all_files.index([b"file2.txt"]))
+
+  def testWalkPostOrder(self):
+    dir_path = os.path.join(self._base_dir, "test_dir")
+    self._setupWalkDirectories(dir_path)
+    # Now test the walk (in_order = False)
+    all_dirs = []
+    all_subdirs = []
+    all_files = []
+    for (w_dir, w_subdirs, w_files) in file_io.walk(dir_path, in_order=False):
+      all_dirs.append(w_dir)
+      all_subdirs.append(w_subdirs)
+      all_files.append(w_files)
+    self.assertItemsEqual(all_dirs, [
+        compat.as_bytes(os.path.join(dir_path, item))
+        for item in ["subdir1_1", "subdir1_2/subdir2", "subdir1_2", "subdir1_3"]
+    ] + [compat.as_bytes(dir_path)])
+    self.assertEqual(compat.as_bytes(dir_path), all_dirs[4])
+    self.assertLess(
+        all_dirs.index(
+            compat.as_bytes(os.path.join(dir_path, "subdir1_2/subdir2"))),
+        all_dirs.index(compat.as_bytes(os.path.join(dir_path, "subdir1_2"))))
+    self.assertItemsEqual(all_subdirs[0:4], [[], [], [b"subdir2"], []])
+    self.assertItemsEqual(all_subdirs[4],
+                          [b"subdir1_1", b"subdir1_2", b"subdir1_3"])
+    self.assertItemsEqual(all_files, [[b"file2.txt"], [], [], [],
+                                      [b"file1.txt"]])
+    self.assertLess(
+        all_files.index([b"file2.txt"]), all_files.index([b"file1.txt"]))
+
+  def testWalkFailure(self):
+    dir_path = os.path.join(self._base_dir, "test_dir")
+    # Try walking a directory that wasn't created.
+    all_dirs = []
+    all_subdirs = []
+    all_files = []
+    for (w_dir, w_subdirs, w_files) in file_io.walk(dir_path, in_order=False):
+      all_dirs.append(w_dir)
+      all_subdirs.append(w_subdirs)
+      all_files.append(w_files)
+    self.assertItemsEqual(all_dirs, [])
+    self.assertItemsEqual(all_subdirs, [])
+    self.assertItemsEqual(all_files, [])
+
+  def testStat(self):
+    file_path = os.path.join(self._base_dir, "temp_file")
+    file_io.write_string_to_file(file_path, "testing")
+    file_statistics = file_io.stat(file_path)
+    os_statistics = os.stat(file_path)
+    self.assertEquals(7, file_statistics.length)
+    self.assertEqual(
+        int(os_statistics.st_mtime), int(file_statistics.mtime_nsec / 1e9))
+
 if __name__ == "__main__":
   tf.test.main()
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 2a3fad66cb7..9931de7bd6b 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -197,7 +197,7 @@ def zeros_initializer(shape, dtype=dtypes.float32):
   return zeros(shape, dtype)
 
 
-def _NewSliceHelper(tensor, slice_spec):
+def _SliceHelper(tensor, slice_spec):
   """Overload for Tensor.__getitem__.
 
   This operation extracts the specified region from the tensor.
@@ -264,85 +264,24 @@ def _NewSliceHelper(tensor, slice_spec):
       shrink_axis_mask |= (1 << index)
     index += 1
 
-  return strided_slice(tensor,
-                       pack(begin),
-                       pack(end),
-                       pack(strides),
-                       begin_mask=begin_mask,
-                       end_mask=end_mask,
-                       shrink_axis_mask=shrink_axis_mask,
-                       new_axis_mask=new_axis_mask,
-                       ellipsis_mask=ellipsis_mask)
+  # pack possibly involves often involves no tensors, so we must use op_scope
+  # correct graph
+  with ops.op_scope([tensor] + begin + end + strides, None,
+                    "strided_slice") as name:
+    begin_pack, end_pack, strides_pack = pack(begin), pack(end), pack(strides)
+    return strided_slice(tensor,
+                         begin_pack,
+                         end_pack,
+                         strides_pack,
+                         begin_mask=begin_mask,
+                         end_mask=end_mask,
+                         shrink_axis_mask=shrink_axis_mask,
+                         new_axis_mask=new_axis_mask,
+                         ellipsis_mask=ellipsis_mask,
+                         name=name)
 
 
 # pylint: disable=undefined-variable,protected-access
-def _SliceHelper(tensor, slice_spec):
-  """Overload for Tensor.__getitem__.
-
-  Currently the size of the slice must be statically known in each dimension,
-  i.e. the "stop" of the slice must not be omitted.
-
-  TODO(mrry): Support slices where the sizes are not specified.
-  TODO(mrry): Support negative indices in slices with numpy/Python semantics.
-
-  Args:
-    tensor: An ops.Tensor object.
-    slice_spec: The arguments to Tensor.__getitem__.
-
-  Returns:
-    The appropriate slice of "tensor", based on "slice_spec".
-
-  Raises:
-    ValueError: If a slice range is negative size.
-    TypeError: If the slice indices aren't int, slice, or Ellipsis.
-  """
-  if not isinstance(slice_spec, (list, tuple)):
-    slice_spec = [slice_spec]
-  indices = []
-  sizes = []
-  squeeze_dims = []
-  for dim, s in enumerate(slice_spec):
-    if isinstance(s, _baseslice):
-      if s.step not in (None, 1):
-        raise NotImplementedError(
-            "Steps other than 1 are not currently supported")
-      start = s.start if s.start is not None else 0
-      if start < 0:
-        raise NotImplementedError(
-            "Negative start indices are not currently supported")
-      indices.append(start)
-      if s.stop is not None and s.stop < 0:
-        raise NotImplementedError(
-            "Negative stop indices are not currently supported")
-      # NOTE(mrry): If the stop is not specified, Python substitutes
-      #   sys.maxsize, which is typically (2 ** 63) - 1. Since Slice currently
-      #   supports signed DT_INT32 arguments, we use -1 to specify that all
-      #   elements should be captured.
-      if s.stop is None or s.stop == sys.maxsize:
-        sizes.append(-1)
-      else:
-        if start > s.stop:
-          raise ValueError("Stop must be at least start")
-        sizes.append(s.stop - start)
-    elif s is Ellipsis:
-      raise NotImplementedError("Ellipsis is not currently supported")
-    else:
-      try:
-        s = int(s)
-      except TypeError:
-        raise TypeError("Bad slice index %s of type %s" % (s, type(s)))
-      if s < 0:
-        raise NotImplementedError("Negative indices are currently unsupported")
-      indices.append(s)
-      sizes.append(1)
-      squeeze_dims.append(dim)
-  sliced = slice(tensor, indices, sizes)
-  if squeeze_dims:
-    return squeeze(sliced, squeeze_dims=squeeze_dims)
-  else:
-    return sliced
-
-
 def slice(input_, begin, size, name=None):
   """Extracts a slice from a tensor.
 
@@ -491,8 +430,6 @@ def strided_slice(input_,
                                      new_axis_mask=new_axis_mask,
                                      shrink_axis_mask=shrink_axis_mask)
 
-# TODO(aselle): When gradient is added and performance verified switch
-# ops.Tensor._override_operator("__getitem__", _NewSliceHelper)
 ops.Tensor._override_operator("__getitem__", _SliceHelper)
 
 
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index eee3b3e2d4e..ae3770416f3 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -348,9 +348,11 @@ def merge(inputs, name=None):
     A tuple containing the chosen input tensor and its index in `inputs`.
 
   Raises:
-    ValueError: If inputs are IndexedSlices and some but not all have a
-      dense_shape property.
+    ValueError: If any of the inputs is None, or inputs are IndexedSlices and
+      some but not all have a dense_shape property.
   """
+  if any([inp is None for inp in inputs]):
+    raise ValueError("At least one of the merge inputs is None: %s" % inputs)
   with ops.op_scope(inputs, name, "Merge") as name:
     inputs = [ops.convert_to_tensor_or_indexed_slices(inp, as_ref=True)
               for inp in inputs]
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index 43630c2a726..c50fbcd25d0 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -209,7 +209,7 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
 
 
 def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True,
-           swap_memory=False, name=None):
+           swap_memory=False, infer_shape=True, name=None):
   """map on the list of tensors unpacked from `elems` on dimension 0.
 
   The simplest version of `map` repeatedly applies the callable `fn` to a
@@ -248,6 +248,7 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True,
       in parallel.
     back_prop: (optional) True enables support for back propagation.
     swap_memory: (optional) True enables GPU-CPU memory swapping.
+    infer_shape: (optional) False disables tests for consistent output shapes.
     name: (optional) Name prefix for the returned tensors.
 
   Returns:
@@ -335,7 +336,7 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True,
     accs_ta = [
         tensor_array_ops.TensorArray(dtype=dt, size=n,
                                      dynamic_size=False,
-                                     infer_shape=True)
+                                     infer_shape=infer_shape)
         for dt in dtype_flat]
 
     def compute(i, tas):
@@ -380,7 +381,7 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True,
 
 
 def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
-         swap_memory=False, name=None):
+         swap_memory=False, infer_shape=True, name=None):
   """scan on the list of tensors unpacked from `elems` on dimension 0.
 
   The simplest version of `scan` repeatedly applies the callable `fn` to a
@@ -429,6 +430,7 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
       in parallel.
     back_prop: (optional) True enables support for back propagation.
     swap_memory: (optional) True enables GPU-CPU memory swapping.
+    infer_shape: (optional) False disables tests for consistent output shapes.
     name: (optional) Name prefix for the returned tensors.
 
   Returns:
@@ -523,7 +525,7 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     accs_ta = [
         tensor_array_ops.TensorArray(dtype=init.dtype, size=n,
                                      dynamic_size=False,
-                                     infer_shape=True)
+                                     infer_shape=infer_shape)
         for init in a_flat]
 
     if initializer is None:
diff --git a/tensorflow/python/ops/gradients.py b/tensorflow/python/ops/gradients.py
index efd0826e566..27b7f044039 100644
--- a/tensorflow/python/ops/gradients.py
+++ b/tensorflow/python/ops/gradients.py
@@ -192,9 +192,6 @@ def _PendingCount(graph, to_ops, from_ops, colocate_gradients_with_ops):
     for x in op.inputs:
       if between_ops[x.op._id]:
         pending_count[x.op._id] += 1
-    for x in op.control_inputs:
-      if between_ops[x._id]:
-        pending_count[x._id] += 1
 
   return pending_count, loop_state
 
@@ -361,6 +358,7 @@ def gradients(ys,
     grad_ys = [None] * len(ys)
   else:
     grad_ys = _AsList(grad_ys)
+
   with ops.op_scope(ys + xs + grad_ys, name, "gradients"):
     ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
     xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x")
@@ -512,10 +510,6 @@ def gradients(ys,
                    control_flow_ops.IsLoopSwitch(x.op))
         if ready:
           queue.append(x.op)
-      for x in op.control_inputs:
-        pending_count[x._id] -= 1
-        if pending_count[x._id] is 0:
-          queue.append(x)
       # pylint: enable=protected-access
 
   if loop_state:
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 0c8824d10a9..30e2b494b35 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import itertools
 import math
 import os
 
@@ -34,6 +33,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import image_ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.platform import googletest
+from tensorflow.python.platform import test
 
 
 class RGBToHSVTest(test_util.TensorFlowTestCase):
@@ -68,11 +68,10 @@ class RGBToHSVTest(test_util.TensorFlowTestCase):
     data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
     for nptype in [np.float32, np.float64]:
       rgb_np = np.array(data, dtype=nptype).reshape([2, 2, 3]) / 255.
-      for use_gpu in [True, False]:
-        with self.test_session(use_gpu=use_gpu):
-          hsv = image_ops.rgb_to_hsv(rgb_np)
-          rgb = image_ops.hsv_to_rgb(hsv)
-          rgb_tf = rgb.eval()
+      with self.test_session():
+        hsv = image_ops.rgb_to_hsv(rgb_np)
+        rgb = image_ops.hsv_to_rgb(hsv)
+        rgb_tf = rgb.eval()
       self.assertAllClose(rgb_tf, rgb_np)
 
 
@@ -234,65 +233,59 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
 
   def testIdempotentLeftRight(self):
     x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        x_tf = constant_op.constant(x_np, shape=x_np.shape)
-        y = image_ops.flip_left_right(image_ops.flip_left_right(x_tf))
-        y_tf = y.eval()
-        self.assertAllEqual(y_tf, x_np)
+    with self.test_session():
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.flip_left_right(image_ops.flip_left_right(x_tf))
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, x_np)
 
   def testLeftRight(self):
     x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[3, 2, 1], [3, 2, 1]], dtype=np.uint8).reshape([2, 3, 1])
 
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        x_tf = constant_op.constant(x_np, shape=x_np.shape)
-        y = image_ops.flip_left_right(x_tf)
-        y_tf = y.eval()
-        self.assertAllEqual(y_tf, y_np)
+    with self.test_session():
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.flip_left_right(x_tf)
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, y_np)
 
   def testIdempotentUpDown(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
 
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        x_tf = constant_op.constant(x_np, shape=x_np.shape)
-        y = image_ops.flip_up_down(image_ops.flip_up_down(x_tf))
-        y_tf = y.eval()
-        self.assertAllEqual(y_tf, x_np)
+    with self.test_session():
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.flip_up_down(image_ops.flip_up_down(x_tf))
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, x_np)
 
   def testUpDown(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
 
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        x_tf = constant_op.constant(x_np, shape=x_np.shape)
-        y = image_ops.flip_up_down(x_tf)
-        y_tf = y.eval()
-        self.assertAllEqual(y_tf, y_np)
+    with self.test_session():
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.flip_up_down(x_tf)
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, y_np)
 
   def testIdempotentTranspose(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
 
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        x_tf = constant_op.constant(x_np, shape=x_np.shape)
-        y = image_ops.transpose_image(image_ops.transpose_image(x_tf))
-        y_tf = y.eval()
-        self.assertAllEqual(y_tf, x_np)
+    with self.test_session():
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.transpose_image(image_ops.transpose_image(x_tf))
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, x_np)
 
   def testTranspose(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[1, 4], [2, 5], [3, 6]], dtype=np.uint8).reshape([3, 2, 1])
 
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        x_tf = constant_op.constant(x_np, shape=x_np.shape)
-        y = image_ops.transpose_image(x_tf)
-        y_tf = y.eval()
-        self.assertAllEqual(y_tf, y_np)
+    with self.test_session():
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.transpose_image(x_tf)
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, y_np)
 
   def testPartialShapes(self):
     p_unknown_rank = array_ops.placeholder(dtypes.uint8)
@@ -323,17 +316,16 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
 
   def testRot90GroupOrder(self):
     image = np.arange(24, dtype=np.uint8).reshape([2, 4, 3])
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        rotated = image
-        for _ in xrange(4):
-          rotated = image_ops.rot90(rotated)
-        self.assertAllEqual(image, rotated.eval())
+    with self.test_session():
+      rotated = image
+      for _ in xrange(4):
+        rotated = image_ops.rot90(rotated)
+      self.assertAllEqual(image, rotated.eval())
 
   def testRot90NumpyEquivalence(self):
     image = np.arange(24, dtype=np.uint8).reshape([2, 4, 3])
-    for use_gpu, k in itertools.product([False, True], range(4)):
-      with self.test_session(use_gpu=use_gpu):
+    for k in range(4):
+      with self.test_session():
         y_np = np.rot90(image, k=k)
         y_tf = image_ops.rot90(image, k=k)
         self.assertAllEqual(y_np, y_tf.eval())
@@ -377,12 +369,11 @@ class RandomFlipTest(test_util.TensorFlowTestCase):
 class AdjustContrastTest(test_util.TensorFlowTestCase):
 
   def _testContrast(self, x_np, y_np, contrast_factor):
-    for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu):
-        x = constant_op.constant(x_np, shape=x_np.shape)
-        y = image_ops.adjust_contrast(x, contrast_factor)
-        y_tf = y.eval()
-        self.assertAllClose(y_tf, y_np, 1e-6)
+    with self.test_session():
+      x = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.adjust_contrast(x, contrast_factor)
+      y_tf = y.eval()
+      self.assertAllClose(y_tf, y_np, 1e-6)
 
   def testDoubleContrastUint8(self):
     x_shape = [1, 2, 2, 3]
@@ -975,12 +966,12 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
   TYPES = [np.uint8, np.int8, np.int16, np.int32, np.int64,
            np.float32, np.float64]
 
-  def availableGPUModes(self, opt, nptype):
+  def shouldRunOnGPU(self, opt, nptype):
     if opt == image_ops.ResizeMethod.NEAREST_NEIGHBOR \
             and nptype in [np.float32, np.float64]:
-      return [True, False]
+      return True
     else:
-      return [False]
+      return False
 
   def testNoOp(self):
     img_shape = [1, 6, 4, 1]
@@ -1000,8 +991,8 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
       img_np = np.array(data, dtype=nptype).reshape(img_shape)
 
       for opt in self.OPTIONS:
-        for use_gpu in self.availableGPUModes(opt, nptype):
-          with self.test_session(use_gpu=use_gpu) as sess:
+        if test.is_gpu_available() and self.shouldRunOnGPU(opt, nptype):
+          with self.test_session() as sess:
             image = constant_op.constant(img_np, shape=img_shape)
             y = image_ops.resize_images(image, target_height, target_width, opt)
             yshape = array_ops.shape(y)
@@ -1097,8 +1088,8 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
         img_np = np.array(data, dtype=nptype).reshape(img_shape)
 
         for opt in self.OPTIONS:
-          for use_gpu in self.availableGPUModes(opt, nptype):
-            with self.test_session(use_gpu=use_gpu):
+          if test.is_gpu_available() and self.shouldRunOnGPU(opt, nptype):
+            with self.test_session():
               image = constant_op.constant(img_np, shape=img_shape)
               y = image_ops.resize_images(image, target_height, target_width, opt)
               expected = np.array(expected_data).reshape(target_shape)
@@ -1140,8 +1131,8 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
           image_ops.ResizeMethod.BILINEAR,
           image_ops.ResizeMethod.NEAREST_NEIGHBOR,
           image_ops.ResizeMethod.AREA]:
-        for use_gpu in self.availableGPUModes(opt, nptype):
-          with self.test_session(use_gpu=use_gpu):
+        if test.is_gpu_available() and self.shouldRunOnGPU(opt, nptype):
+          with self.test_session():
             img_np = np.array(data, dtype=nptype).reshape(img_shape)
             image = constant_op.constant(img_np, shape=img_shape)
             y = image_ops.resize_images(image, target_height, target_width, opt)
@@ -1207,25 +1198,29 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
 
 
   def testCompareNearestNeighbor(self):
-    input_shape = [1, 5, 6, 3]
-    target_height = 8
-    target_width = 12
-    for nptype in [np.float32, np.float64]:
-      for align_corners in [True, False]:
-        img_np = np.arange(0, np.prod(input_shape), dtype=nptype).reshape(input_shape)
-        with self.test_session(use_gpu=True):
-          image = constant_op.constant(img_np, shape=input_shape)
-          out_op = image_ops.resize_images(image, target_height, target_width,
-                                           image_ops.ResizeMethod.NEAREST_NEIGHBOR,
-                                           align_corners=align_corners)
-          gpu_val = out_op.eval()
-        with self.test_session(use_gpu=False):
-          image = constant_op.constant(img_np, shape=input_shape)
-          out_op = image_ops.resize_images(image, target_height, target_width,
-                                           image_ops.ResizeMethod.NEAREST_NEIGHBOR,
-                                           align_corners=align_corners)
-          cpu_val = out_op.eval()
-        self.assertAllClose(cpu_val, gpu_val, rtol=1e-5, atol=1e-5)
+    if test.is_gpu_available():
+      input_shape = [1, 5, 6, 3]
+      target_height = 8
+      target_width = 12
+      for nptype in [np.float32, np.float64]:
+        for align_corners in [True, False]:
+          img_np = np.arange(
+              0, np.prod(input_shape), dtype=nptype).reshape(input_shape)
+          with self.test_session(use_gpu=True):
+            image = constant_op.constant(img_np, shape=input_shape)
+            out_op = image_ops.resize_images(
+                image, target_height, target_width,
+                image_ops.ResizeMethod.NEAREST_NEIGHBOR,
+                align_corners=align_corners)
+            gpu_val = out_op.eval()
+          with self.test_session(use_gpu=False):
+            image = constant_op.constant(img_np, shape=input_shape)
+            out_op = image_ops.resize_images(
+                image, target_height, target_width,
+                image_ops.ResizeMethod.NEAREST_NEIGHBOR,
+                align_corners=align_corners)
+            cpu_val = out_op.eval()
+          self.assertAllClose(cpu_val, gpu_val, rtol=1e-5, atol=1e-5)
 
 
 class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/python/ops/linalg_grad.py b/tensorflow/python/ops/linalg_grad.py
index 67fadc12cdc..7c102390432 100644
--- a/tensorflow/python/ops/linalg_grad.py
+++ b/tensorflow/python/ops/linalg_grad.py
@@ -32,6 +32,12 @@ from tensorflow.python.ops import math_ops
 
 ops.NoGradient("CholeskyGrad")
 ops.NoGradient("BatchCholeskyGrad")
+ops.NoGradient("SelfAdjointEig")
+ops.NoGradient("BatchSelfAdjointEig")
+ops.NoGradient("SelfAdjointEigV2")
+ops.NoGradient("BatchSelfAdjointEigV2")
+ops.NoGradient("Svd")
+ops.NoGradient("BatchSvd")
 
 
 @ops.RegisterGradient("MatrixInverse")
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index 0e76f772caf..9d11cbfc873 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -27,27 +27,32 @@ from tensorflow.python.ops.gen_linalg_ops import *
 # pylint: enable=wildcard-import
 
 
+def _UnchangedSquareHelper(input_shape):
+  """Helper for {Batch}UnchangedSquare."""
+  # The matrices in the batch must be square.
+  input_shape[-1].assert_is_compatible_with(input_shape[-2])
+  return [input_shape]
+
+
 @ops.RegisterShape("Cholesky")
 @ops.RegisterShape("CholeskyGrad")
 @ops.RegisterShape("MatrixInverse")
 def _UnchangedSquare(op):
-  input_shape = op.inputs[0].get_shape().with_rank(2)
-  # The matrix must be square.
-  input_shape[0].assert_is_compatible_with(input_shape[1])
-  return [input_shape]
+  """Shape function for matrix ops with output equal to input shape."""
+  return _UnchangedSquareHelper(op.inputs[0].get_shape().with_rank(2))
 
 
 @ops.RegisterShape("BatchCholesky")
 @ops.RegisterShape("BatchCholeskyGrad")
 @ops.RegisterShape("BatchMatrixInverse")
 def _BatchUnchangedSquare(op):
-  input_shape = op.inputs[0].get_shape().with_rank_at_least(2)
-  # The matrices in the batch must be square.
-  input_shape[-1].assert_is_compatible_with(input_shape[-2])
-  return [input_shape]
+  """Shape function for batch matrix ops with output equal to input shape."""
+  return _UnchangedSquareHelper(op.inputs[0].get_shape().with_rank_at_least(2))
+
 
 @ops.RegisterShape("MatrixDeterminant")
 def _MatrixDeterminantShape(op):
+  """Shape function for determinant op."""
   input_shape = op.inputs[0].get_shape().with_rank(2)
   # The matrix must be square.
   input_shape[0].assert_is_compatible_with(input_shape[1])
@@ -59,6 +64,7 @@ def _MatrixDeterminantShape(op):
 
 @ops.RegisterShape("BatchMatrixDeterminant")
 def _BatchMatrixDeterminantShape(op):
+  """Shape function for batch determinant op."""
   input_shape = op.inputs[0].get_shape().with_rank_at_least(2)
   # The matrices in the batch must be square.
   input_shape[-1].assert_is_compatible_with(input_shape[-2])
@@ -70,6 +76,7 @@ def _BatchMatrixDeterminantShape(op):
 
 @ops.RegisterShape("SelfAdjointEig")
 def _SelfAdjointEigShape(op):
+  """Shape function for self-adjoint eigensolver op."""
   input_shape = op.inputs[0].get_shape().with_rank(2)
   # The matrix must be square.
   input_shape[0].assert_is_compatible_with(input_shape[1])
@@ -80,6 +87,7 @@ def _SelfAdjointEigShape(op):
 
 @ops.RegisterShape("BatchSelfAdjointEig")
 def _BatchSelfAdjointEigShape(op):
+  """Shape function for batch self-adjoint eigensolver op."""
   input_shape = op.inputs[0].get_shape().with_rank_at_least(2)
   # The matrices in the batch must be square.
   input_shape[-1].assert_is_compatible_with(input_shape[-2])
@@ -89,48 +97,113 @@ def _BatchSelfAdjointEigShape(op):
   return [out_shape]
 
 
+def _SelfAdjointEigV2ShapeHelper(op, input_shape):
+  """Shape inference helper for {Batch}SelfAdjointEigV2."""
+  batch_shape = input_shape[:-2]
+  n = input_shape[-1].merge_with(input_shape[-2])
+  compute_v = op.get_attr("compute_v")
+  if compute_v:
+    return [batch_shape.concatenate([n]), batch_shape.concatenate([n, n])]
+  else:
+    return [batch_shape.concatenate([n]), [0]]
+
+
+@ops.RegisterShape("SelfAdjointEigV2")
+def _SelfAdjointEigShapeV2(op):
+  """Shape function for SelfAdjointEigV2."""
+  return _SelfAdjointEigV2ShapeHelper(op, op.inputs[0].get_shape().with_rank(2))
+
+
+@ops.RegisterShape("BatchSelfAdjointEigV2")
+def _BatchSelfAdjointEigV2Shape(op):
+  """Shape function for BatchSelfAdjointEigV2."""
+  return _SelfAdjointEigV2ShapeHelper(
+      op, op.inputs[0].get_shape().with_rank_at_least(2))
+
+
+def _SvdShapeHelper(input_shape, op):
+  """Shape inference helper for {Batch}SVD op."""
+  unknown = tensor_shape.unknown_shape()
+  if input_shape.ndims is not None:
+    return [unknown, unknown, unknown]
+  compute_uv = op.get_attr("compute_uv")
+  full_matrices = op.get_attr("full_matrices")
+  m = input_shape[-2]
+  n = input_shape[-1]
+  p = min(m, n)
+  batch_shape = input_shape[:-2]
+  s_shape = batch_shape.concatenate([p])
+  if compute_uv:
+    if full_matrices:
+      u_shape = batch_shape.concatenate([m, m])
+      v_shape = batch_shape.concatenate([n, n])
+    else:
+      u_shape = batch_shape.concatenate([m, p])
+      v_shape = batch_shape.concatenate([n, p])
+  else:
+    u_shape = [0]
+    v_shape = [0]
+  return [s_shape, u_shape, v_shape]
+
+
+@ops.RegisterShape("Svd")
+def _SvdShape(op):
+  """Shape function for SVD op."""
+  return _SvdShapeHelper(op.inputs[0].get_shape().with_rank(2), op)
+
+
+@ops.RegisterShape("BatchSvd")
+def _BatchSvdShape(op):
+  """Shape function for batch SVD op."""
+  return _SvdShapeHelper(op.inputs[0].get_shape().with_rank_at_least(2), op)
+
+
+def _SquareMatrixSolveShapeHelper(lhs_shape, rhs_shape):
+  """Shape inference helper function for square matrix solver ops."""
+  # The matrix must be square.
+  lhs_shape[-1].assert_is_compatible_with(lhs_shape[-2])
+  # The matrix and right-hand side must have the same number of rows.
+  lhs_shape[-2].assert_is_compatible_with(rhs_shape[-2])
+  return [rhs_shape]
+
+
 @ops.RegisterShape("MatrixSolve")
 @ops.RegisterShape("MatrixTriangularSolve")
 def _SquareMatrixSolveShape(op):
-  lhs_shape = op.inputs[0].get_shape().with_rank(2)
-  rhs_shape = op.inputs[1].get_shape().with_rank(2)
-  # The matrix must be square.
-  lhs_shape[0].assert_is_compatible_with(lhs_shape[1])
-  # The matrix and right-hand side must have the same number of rows.
-  lhs_shape[0].assert_is_compatible_with(rhs_shape[0])
-  return [rhs_shape]
+  """Shape function for square matrix solver ops."""
+  return _SquareMatrixSolveShapeHelper(op.inputs[0].get_shape().with_rank(2),
+                                       op.inputs[1].get_shape().with_rank(2))
 
 
 @ops.RegisterShape("BatchMatrixSolve")
 @ops.RegisterShape("BatchMatrixTriangularSolve")
 def _BatchSquareMatrixSolveShape(op):
-  lhs_shape = op.inputs[0].get_shape().with_rank_at_least(2)
-  rhs_shape = op.inputs[1].get_shape().with_rank_at_least(2)
-  # The matrices must be square.
-  lhs_shape[-1].assert_is_compatible_with(lhs_shape[-2])
-  # The matrices and right-hand sides in the batch must have the same number of
-  # rows.
+  """Shape function for batch square matrix solver ops."""
+  return _SquareMatrixSolveShapeHelper(
+      op.inputs[0].get_shape().with_rank_at_least(2),
+      op.inputs[1].get_shape().with_rank_at_least(2))
+
+
+def _MatrixSolveLsShapeHelper(lhs_shape, rhs_shape):
+  """Shape inference helper function for least squares matrix solver ops."""
+  # The matrices and right-hand sides must have the same number of rows.
   lhs_shape[-2].assert_is_compatible_with(rhs_shape[-2])
-  return [rhs_shape]
+  return [lhs_shape[:-2].concatenate([lhs_shape[-1], rhs_shape[-1]])]
 
 
 @ops.RegisterShape("MatrixSolveLs")
 def _MatrixSolveLsShape(op):
-  lhs_shape = op.inputs[0].get_shape().with_rank(2)
-  rhs_shape = op.inputs[1].get_shape().with_rank(2)
-  # The matrix and right-hand side must have the same number of rows.
-  lhs_shape[0].assert_is_compatible_with(rhs_shape[0])
-  return [[lhs_shape[1], rhs_shape[1]]]
+  """Shape function for least-squares matrix solver op."""
+  return _MatrixSolveLsShapeHelper(op.inputs[0].get_shape().with_rank(2),
+                                   op.inputs[1].get_shape().with_rank(2))
 
 
 @ops.RegisterShape("BatchMatrixSolveLs")
 def _BatchMatrixSolveLsShape(op):
-  lhs_shape = op.inputs[0].get_shape().with_rank_at_least(2)
-  rhs_shape = op.inputs[1].get_shape().with_rank_at_least(2)
-  # The matrices and right-hand sides in the batch must have the same number of
-  # rows.
-  lhs_shape[-2].assert_is_compatible_with(rhs_shape[-2])
-  return [lhs_shape[:-2].concatenate([lhs_shape[-1], rhs_shape[-1]])]
+  """Shape function for batch least-squares matrix solver op."""
+  return _MatrixSolveLsShapeHelper(
+      op.inputs[0].get_shape().with_rank_at_least(2),
+      op.inputs[1].get_shape().with_rank_at_least(2))
 
 
 # Names below are lower_case.
@@ -331,4 +404,163 @@ def batch_matrix_solve_ls(matrix,
                                               fast=fast,
                                               name=name)
 
+
+def self_adjoint_eig(matrix, name=None):
+  """Computes the eigen decomposition of a self-adjoint matrix.
+
+  Computes the eigenvalues and eigenvectors of an N-by-N matrix `matrix` such
+  that `matrix * v[:,i] = e(i) * v[:,i]`, for i=0...N-1.
+
+  Args:
+    matrix: `Tensor` of shape `[N, N]`.
+    name: string, optional name of the operation.
+
+  Returns:
+    e: Eigenvalues. Shape is `[N]`.
+    v: Eigenvectors. Shape is `[N, N]`. The columns contain the eigenvectors of
+      `matrix`.
+  """
+  e, v = gen_linalg_ops.self_adjoint_eig_v2(matrix, compute_v=True, name=name)
+  return e, v
+
+
+def batch_self_adjoint_eig(tensor, name=None):
+  """Computes the eigen decomposition of a batch of self-adjoint matrices.
+
+  Computes the eigenvalues and eigenvectors of the innermost N-by-N matrices
+  in `tensor` such that
+  `tensor[...,:,:] * v[..., :,i] = e(..., i) * v[...,:,i]`, for i=0...N-1.
+
+  Args:
+    tensor: `Tensor` of shape `[..., N, N]`.
+    name: string, optional name of the operation.
+
+  Returns:
+    e: Eigenvalues. Shape is `[..., N]`.
+    v: Eigenvectors. Shape is `[..., N, N]`. The columns of the inner most
+    matrices
+      contain eigenvectors of the corresponding matrices in `tensor`
+  """
+  e, v = gen_linalg_ops.batch_self_adjoint_eig_v2(
+      tensor, compute_v=True, name=name)
+  return e, v
+
+
+def self_adjoint_eigvals(matrix, name=None):
+  """Computes the eigenvalues a self-adjoint  matrix.
+
+  Args:
+    matrix: `Tensor` of shape `[N, N]`.
+    name: string, optional name of the operation.
+
+  Returns:
+    e: Eigenvalues of `matrix`. Shape is `[N]`.
+  """
+  e, _ = gen_linalg_ops.self_adjoint_eig_v2(matrix, compute_v=False, name=name)
+  return e
+
+
+def batch_self_adjoint_eigvals(tensor, name=None):
+  """Computes the eigenvalues of a batch of self-adjoint matrices.
+
+  Args:
+    tensor: `Tensor` of shape `[..., N, N]`.
+    name: string, optional name of the operation.
+
+  Returns:
+    e: Eigenvalues. Shape is `[..., N]`. The vector `e[..., :]` contains the `N`
+      eigenvalues of `tensor[..., :, :]`.
+  """
+  e, _ = gen_linalg_ops.batch_self_adjoint_eig_v2(
+      tensor, compute_v=False, name=name)
+  return e
+
+
+def svd(matrix, compute_uv=True, full_matrices=False, name=None):
+  """Computes the singular value decomposition of a matrix.
+
+  Computes the SVD of `matrix` such that `matrix = u * diag(s) *
+  transpose(v)`
+
+  ```prettyprint
+  # a is a matrix.
+  # s is a vector of singular values.
+  # u is the matrix of left singular vectors.
+  # v is a matrix of right singular vectors.
+  s, u, v = svd(a)
+  s = svd(a, compute_uv=False)
+  ```
+
+  Args:
+    matrix: `Tensor` of shape `[M, N]`. Let `P` be the minimum of `M` and `N`.
+    compute_uv: If `True` then left and right singular vectors will be
+      computed and returned in `u` and `v`, respectively. Otherwise, only the
+      singular values will be computed, which can be significantly faster.
+    full_matrices: If true, compute full-sized `u` and `v`. If false
+      (the default), compute only the leading `P` singular vectors.
+      Ignored if `compute_uv` is `False`.
+    name: string, optional name of the operation.
+
+  Returns:
+    s: Singular values. Shape is `[P]`.
+    u: Right singular vectors. If `full_matrices` is `False` (default) then
+      shape is `[M, P]`; if `full_matrices` is `True` then shape is
+      `[M, M]`. Not returned if `compute_uv` is `False`.
+    v: Left singular vectors. If `full_matrices` is `False` (default) then
+      shape is `[N, P]`. If `full_matrices` is `True` then shape is
+      `[N, N]`. Not returned if `compute_uv` is `False`.
+  """
+  s, u, v = gen_linalg_ops.svd(matrix,
+                               compute_uv=compute_uv,
+                               full_matrices=full_matrices)
+  if compute_uv:
+    return s, u, v
+  else:
+    return s
+
+
+def batch_svd(tensor, compute_uv=True, full_matrices=False, name=None):
+  """Computes the singular value decompositions of a batch of matrices.
+
+  Computes the SVD of each inner matrix in `tensor` such that
+  `tensor[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :,
+  :])`
+
+  ```prettyprint
+  # a is a tensor.
+  # s is a tensor of singular values.
+  # u is a tensor of left singular vectors.
+  # v is a tensor of right singular vectors.
+  s, u, v = batch_svd(a)
+  s = batch_svd(a, compute_uv=False)
+  ```
+
+  Args:
+    matrix: `Tensor` of shape `[..., M, N]`. Let `P` be the minimum of `M` and
+      `N`.
+    compute_uv: If `True` then left and right singular vectors will be
+      computed and returned in `u` and `v`, respectively. Otherwise, only the
+      singular values will be computed, which can be significantly faster.
+    full_matrices: If true, compute full-sized `u` and `v`. If false
+      (the default), compute only the leading `P` singular vectors.
+      Ignored if `compute_uv` is `False`.
+    name: string, optional name of the operation.
+
+  Returns:
+    s: Singular values. Shape is `[..., P]`.
+    u: Right singular vectors. If `full_matrices` is `False` (default) then
+      shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
+      `[..., M, M]`. Not returned if `compute_uv` is `False`.
+    v: Left singular vectors. If `full_matrices` is `False` (default) then
+      shape is `[..., N, P]`. If `full_matrices` is `True` then shape is
+      `[..., N, N]`. Not returned if `compute_uv` is `False`.
+  """
+  s, u, v = gen_linalg_ops.batch_svd(
+      tensor, compute_uv=compute_uv, full_matrices=full_matrices)
+  if compute_uv:
+    return s, u, v
+  else:
+    return s
+
+
 # pylint: enable=invalid-name
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index cd7e92401d2..981218bd8ba 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -98,9 +98,6 @@ functions on matrices to your graph.
 @@cholesky_solve
 @@batch_cholesky_solve
 
-@@self_adjoint_eig
-@@batch_self_adjoint_eig
-
 @@matrix_solve
 @@batch_matrix_solve
 
@@ -110,6 +107,14 @@ functions on matrices to your graph.
 @@matrix_solve_ls
 @@batch_matrix_solve_ls
 
+@@self_adjoint_eig
+@@batch_self_adjoint_eig
+@@self_adjoint_eigvals
+@@batch_self_adjoint_eigvals
+
+@@svd
+@@batch_svd
+
 ## Complex Number Functions
 
 TensorFlow provides several operations that you can use to add complex number
@@ -1598,91 +1603,93 @@ def tanh(x, name=None):
 
 
 def cumsum(x, axis=0, exclusive=False, reverse=False, name=None):
-    """Compute the cumulative sum of the tensor `x` along `axis`.
+  """Compute the cumulative sum of the tensor `x` along `axis`.
 
-    By default, this op performs an inclusive cumsum, which means that the first
-    element of the input is identical to the first element of the output:
-    ```prettyprint
-    tf.cumsum([a, b, c]) ==> [a, a + b, a + b + c]
-    ```
+  By default, this op performs an inclusive cumsum, which means that the first
+  element of the input is identical to the first element of the output:
+  ```prettyprint
+  tf.cumsum([a, b, c]) ==> [a, a + b, a + b + c]
+  ```
 
-    By setting the `exclusive` kwarg to `True`, an exclusive cumsum is performed
-    instead:
-    ```prettyprint
-    tf.cumsum([a, b, c], exclusive=True) ==> [0, a, a + b]
-    ```
+  By setting the `exclusive` kwarg to `True`, an exclusive cumsum is performed
+  instead:
+  ```prettyprint
+  tf.cumsum([a, b, c], exclusive=True) ==> [0, a, a + b]
+  ```
 
-    By setting the `reverse` kwarg to `True`, the cumsum is performed in the
-    opposite direction:
-    ```prettyprint
-    tf.cumsum([a, b, c], reverse=True) ==> [a + b + c, b + c, c]
-    ```
-    This is more efficient than using separate `tf.reverse` ops.
+  By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+  opposite direction:
+  ```prettyprint
+  tf.cumsum([a, b, c], reverse=True) ==> [a + b + c, b + c, c]
+  ```
+  This is more efficient than using separate `tf.reverse` ops.
 
-    The `reverse` and `exclusive` kwargs can also be combined:
-    ```prettyprint
-    tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0]
-    ```
+  The `reverse` and `exclusive` kwargs can also be combined:
+  ```prettyprint
+  tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0]
+  ```
 
-    Args:
-      x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+  Args:
+    x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
        `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
        `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-      axis: A `Tensor` of type `int32` (default: 0).
-      reverse: A `bool` (default: False).
-      name: A name for the operation (optional).
+       axis: A `Tensor` of type `int32` (default: 0).
+       reverse: A `bool` (default: False).
+       name: A name for the operation (optional).
 
-    Returns:
-      A `Tensor`. Has the same type as `x`.
-    """
-    with ops.op_scope([x], name, "Cumsum") as name:
-      x = ops.convert_to_tensor(x, name="x")
-      return gen_math_ops.cumsum(x, axis, exclusive=exclusive,
-                                 reverse=reverse, name=name)
+  Returns:
+    A `Tensor`. Has the same type as `x`.
+  """
+  with ops.op_scope([x], name, "Cumsum") as name:
+    x = ops.convert_to_tensor(x, name="x")
+    return gen_math_ops.cumsum(
+        x, axis, exclusive=exclusive, reverse=reverse, name=name)
 
 
 def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
-    """Compute the cumulative product of the tensor `x` along `axis`.
+  """Compute the cumulative product of the tensor `x` along `axis`.
 
-    By default, this op performs an inclusive cumprod, which means that the first
-    element of the input is identical to the first element of the output:
-    ```prettyprint
-    tf.cumprod([a, b, c]) ==> [a, a * b, a * b * c]
-    ```
+  By default, this op performs an inclusive cumprod, which means that the
+  first
+  element of the input is identical to the first element of the output:
+  ```prettyprint
+  tf.cumprod([a, b, c]) ==> [a, a * b, a * b * c]
+  ```
 
-    By setting the `exclusive` kwarg to `True`, an exclusive cumprod is performed
-    instead:
-    ```prettyprint
-    tf.cumprod([a, b, c], exclusive=True) ==> [0, a, a * b]
-    ```
+  By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
+  performed
+  instead:
+  ```prettyprint
+  tf.cumprod([a, b, c], exclusive=True) ==> [0, a, a * b]
+  ```
 
-    By setting the `reverse` kwarg to `True`, the cumprod is performed in the
-    opposite direction:
-    ```prettyprint
-    tf.cumprod([a, b, c], reverse=True) ==> [a * b * c, b * c, c]
-    ```
-    This is more efficient than using separate `tf.reverse` ops.
+  By setting the `reverse` kwarg to `True`, the cumprod is performed in the
+  opposite direction:
+  ```prettyprint
+  tf.cumprod([a, b, c], reverse=True) ==> [a * b * c, b * c, c]
+  ```
+  This is more efficient than using separate `tf.reverse` ops.
 
-    The `reverse` and `exclusive` kwargs can also be combined:
-    ```prettyprint
-    tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 0]
-    ```
+  The `reverse` and `exclusive` kwargs can also be combined:
+  ```prettyprint
+  tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 0]
+  ```
 
-    Args:
-      x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+  Args:
+    x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
        `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
        `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-      axis: A `Tensor` of type `int32` (default: 0).
-      reverse: A `bool` (default: False).
-      name: A name for the operation (optional).
+    axis: A `Tensor` of type `int32` (default: 0).
+    reverse: A `bool` (default: False).
+    name: A name for the operation (optional).
 
-    Returns:
-      A `Tensor`. Has the same type as `x`.
-    """
-    with ops.op_scope([x], name, "Cumprod") as name:
-      x = ops.convert_to_tensor(x, name="x")
-      return gen_math_ops.cumprod(x, axis, exclusive=exclusive,
-                                  reverse=reverse, name=name)
+  Returns:
+    A `Tensor`. Has the same type as `x`.
+  """
+  with ops.op_scope([x], name, "Cumprod") as name:
+    x = ops.convert_to_tensor(x, name="x")
+    return gen_math_ops.cumprod(
+        x, axis, exclusive=exclusive, reverse=reverse, name=name)
 
 
 ops.RegisterShape("Abs")(common_shapes.unchanged_shape)
diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py
index a69a72564b2..9ed801dcc5a 100644
--- a/tensorflow/python/ops/nn.py
+++ b/tensorflow/python/ops/nn.py
@@ -749,27 +749,19 @@ def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
     * the (possibly shifted) sum of squares of the elements in the array.
     * the shift by which the mean must be corrected or None if `shift` is None.
   """
-  with ops.op_scope([x, axes, shift], name, "sufficient_statistics"):
+  axes = list(set(axes))
+  with ops.op_scope([x, shift], name, "sufficient_statistics"):
     x = ops.convert_to_tensor(x, name="x")
     x_shape = x.get_shape()
     if x_shape.is_fully_defined():
       counts = 1
-      m_shape = []
-      for d in xrange(x_shape.ndims):
-        dim = x_shape[d].value
-        if d in set(axes):
-          counts *= dim
-          dim = 1
-        m_shape.append(dim)
+      for d in axes:
+        counts *= x_shape[d].value
       counts = constant_op.constant(counts, dtype=x.dtype)
     else:  # shape needs to be inferred at runtime.
-      x_shape = array_ops.shape(x)
-      select_axes = sparse_ops.sparse_to_dense(axes, array_ops.shape(x_shape),
-                                               True, False)
-      m_shape = math_ops.select(select_axes, array_ops.ones_like(x_shape),
-                                x_shape)
+      x_dims = array_ops.gather(array_ops.shape(x), axes)
       counts = math_ops.cast(
-          math_ops.reduce_prod(x_shape / m_shape), x.dtype, name="count")
+          math_ops.reduce_prod(x_dims), x.dtype, name="count")
     if shift is not None:
       shift = ops.convert_to_tensor(shift, name="shift")
       m_ss = math_ops.sub(x, shift)
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index 48d9cab3e40..4f6f8daf621 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -704,8 +704,9 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
   of time steps and batch size, or a (possibly nested) tuple of such tensors,
   matching the nested structure of `cell.output_size`.
 
-  The parameter `sequence_length` is required and dynamic calculation is
-  automatically performed.
+  The parameter `sequence_length` is optional and is used to copy-through state
+  and zero-out outputs when past a batch element's sequence length. So it's more
+  for correctness than performance, unlike in rnn().
 
   Args:
     cell: An instance of RNNCell.
diff --git a/tensorflow/python/ops/seq2seq.py b/tensorflow/python/ops/seq2seq.py
index 99d8daf04c9..8605811b474 100644
--- a/tensorflow/python/ops/seq2seq.py
+++ b/tensorflow/python/ops/seq2seq.py
@@ -560,6 +560,13 @@ def attention_decoder(decoder_inputs, initial_state, attention_states, cell,
     def attention(query):
       """Put attention masks on hidden using hidden_features and query."""
       ds = []  # Results of attention reads will be stored here.
+      if nest.is_sequence(query):  # If the query is a tuple, flatten it.
+        query_list = nest.flatten(query)
+        for q in query_list:  # Check that ndims == 2 if specified.
+          ndims = q.get_shape().ndims
+          if ndims:
+            assert ndims == 2
+        query = array_ops.concat(1, query_list)
       for a in xrange(num_heads):
         with variable_scope.variable_scope("Attention_%d" % a):
           y = linear(query, attention_vec_size, True)
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 139a70fbb2b..7da35219e2a 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -775,7 +775,8 @@ def sparse_to_indicator(sp_input, vocab_size, name=None):
                                   name=name)
 
 
-def sparse_merge(sp_ids, sp_values, vocab_size, name=None):
+def sparse_merge(sp_ids, sp_values, vocab_size, name=None,
+                 already_sorted=False):
   """Combines a batch of feature ids and values into a single `SparseTensor`.
 
   The most common use case for this function occurs when feature ids and
@@ -794,14 +795,17 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None):
 
   For example, consider the following feature vectors:
 
+  ```python
     vector1 = [-3, 0, 0, 0, 0, 0]
     vector2 = [ 0, 1, 0, 4, 1, 0]
     vector3 = [ 5, 0, 0, 9, 0, 0]
+  ```
 
   These might be stored sparsely in the following Example protos by storing
   only the feature ids (column number if the vectors are treated as a matrix)
   of the non-zero elements and the corresponding values:
 
+  ```python
     examples = [Example(features={
                     "ids": Feature(int64_list=Int64List(value=[0])),
                     "values": Feature(float_list=FloatList(value=[-3]))}),
@@ -811,6 +815,7 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None):
                 Example(features={
                     "ids": Feature(int64_list=Int64List(value=[0, 3])),
                     "values": Feature(float_list=FloatList(value=[5, 9]))})]
+  ```
 
   The result of calling parse_example on these examples will produce a
   dictionary with entries for "ids" and "values". Passing those two objects
@@ -823,9 +828,11 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None):
   original matrix, i.e., (3, 6). For our example above, the output will be
   equal to:
 
+  ```python
     SparseTensor(indices=[[0, 0], [1, 1], [1, 3], [1, 4], [2, 0], [2, 3]],
                  values=[-3, 1, 4, 1, 5, 9],
                  shape=[3, 6])
+  ```
 
   Args:
     sp_ids: A `SparseTensor` with `values` property of type `int32`
@@ -834,6 +841,9 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None):
     vocab_size: A scalar `int64` Tensor (or Python int) containing the new size
       of the last dimension, `all(0 <= sp_ids.values < vocab_size)`.
     name: A name prefix for the returned tensors (optional)
+    already_sorted: A boolean to specify whether the per-batch values in
+     `sp_values` are already sorted. If so skip sorting, False by default
+     (optional).
 
   Returns:
     A `SparseTensor` compactly representing a batch of feature ids and values,
@@ -868,7 +878,8 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None):
         [array_ops.slice(sp_ids.shape, [0], array_ops.expand_dims(rank - 1, 0)),
          math_ops.cast(array_ops.pack([vocab_size]), dtypes.int64)])
 
-    return sparse_reorder(ops.SparseTensor(new_indices, new_values, new_shape))
+    result = ops.SparseTensor(new_indices, new_values, new_shape)
+    return result if already_sorted else sparse_reorder(result)
 
 
 def sparse_retain(sp_input, to_retain):
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 4517b6d9978..cfb8dc125ea 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -573,11 +573,19 @@ class VariableScope(object):
     partitioner: callable or `None`: the partitioner passed to `get_variable`.
     custom_getter: default custom getter passed to get_variable.
     name_scope: The name passed to `tf.name_scope`.
+    dtype: default type passed to get_variable (defaults to DT_FLOAT).
   """
 
-  def __init__(self, reuse, name="", initializer=None, regularizer=None,
-               caching_device=None, partitioner=None, custom_getter=None,
-               name_scope=""):
+  def __init__(self,
+               reuse,
+               name="",
+               initializer=None,
+               regularizer=None,
+               caching_device=None,
+               partitioner=None,
+               custom_getter=None,
+               name_scope="",
+               dtype=dtypes.float32):
     """Creates a new VariableScope with the given properties."""
     self._name = name
     self._initializer = initializer
@@ -587,6 +595,7 @@ class VariableScope(object):
     self._partitioner = partitioner
     self._custom_getter = custom_getter
     self._name_scope = name_scope
+    self._dtype = dtype
 
   @property
   def name(self):
@@ -604,6 +613,10 @@ class VariableScope(object):
   def initializer(self):
     return self._initializer
 
+  @property
+  def dtype(self):
+    return self._dtype
+
   @property
   def regularizer(self):
     return self._regularizer
@@ -628,6 +641,10 @@ class VariableScope(object):
     """Set initializer for this scope."""
     self._initializer = initializer
 
+  def set_dtype(self, dtype):
+    """Set data type for this scope."""
+    self._dtype = dtype
+
   def set_regularizer(self, regularizer):
     """Set regularizer for this scope."""
     self._regularizer = regularizer
@@ -644,10 +661,18 @@ class VariableScope(object):
     """Set custom getter for this scope."""
     self._custom_getter = custom_getter
 
-  def get_variable(self, var_store, name, shape=None, dtype=dtypes.float32,
-                   initializer=None, regularizer=None,
-                   trainable=True, collections=None, caching_device=None,
-                   partitioner=None, validate_shape=True,
+  def get_variable(self,
+                   var_store,
+                   name,
+                   shape=None,
+                   dtype=None,
+                   initializer=None,
+                   regularizer=None,
+                   trainable=True,
+                   collections=None,
+                   caching_device=None,
+                   partitioner=None,
+                   validate_shape=True,
                    custom_getter=None):
     """Gets an existing variable with this name or create a new one."""
     if initializer is None:
@@ -660,6 +685,8 @@ class VariableScope(object):
       partitioner = self._partitioner
     if custom_getter is None:
       custom_getter = self._custom_getter
+    if dtype is None:
+      dtype = self._dtype
 
     full_name = self.name + "/" + name if self.name else name
     # Variable names only depend on variable_scope (full_name here),
@@ -672,12 +699,18 @@ class VariableScope(object):
           partitioner=partitioner, validate_shape=validate_shape,
           custom_getter=custom_getter)
 
-  def _get_partitioned_variable(
-      self, var_store, name,
-      shape=None, dtype=dtypes.float32,
-      initializer=None, regularizer=None,
-      trainable=True, collections=None, caching_device=None,
-      partitioner=None, validate_shape=True):
+  def _get_partitioned_variable(self,
+                                var_store,
+                                name,
+                                shape=None,
+                                dtype=None,
+                                initializer=None,
+                                regularizer=None,
+                                trainable=True,
+                                collections=None,
+                                caching_device=None,
+                                partitioner=None,
+                                validate_shape=True):
     """Gets an existing variable with this name or create a new one."""
     if initializer is None:
       initializer = self._initializer
@@ -687,6 +720,9 @@ class VariableScope(object):
       caching_device = self._caching_device
     if partitioner is None:
       partitioner = self._partitioner
+    if dtype is None:
+      dtype = self._dtype
+
     if self._custom_getter is not None:
       raise ValueError(
           "Private access to _get_partitioned_variable is not allowed when "
@@ -743,9 +779,16 @@ def _get_default_variable_store():
   return store
 
 
-def get_variable(name, shape=None, dtype=dtypes.float32, initializer=None,
-                 regularizer=None, trainable=True, collections=None,
-                 caching_device=None, partitioner=None, validate_shape=True,
+def get_variable(name,
+                 shape=None,
+                 dtype=None,
+                 initializer=None,
+                 regularizer=None,
+                 trainable=True,
+                 collections=None,
+                 caching_device=None,
+                 partitioner=None,
+                 validate_shape=True,
                  custom_getter=None):
   """Gets an existing variable with these parameters or create a new one.
 
@@ -830,10 +873,16 @@ def get_variable(name, shape=None, dtype=dtypes.float32, initializer=None,
       custom_getter=custom_getter)
 
 
-def _get_partitioned_variable(
-    name, shape=None, dtype=dtypes.float32, initializer=None,
-    regularizer=None, trainable=True, collections=None,
-    caching_device=None, partitioner=None, validate_shape=True):
+def _get_partitioned_variable(name,
+                              shape=None,
+                              dtype=None,
+                              initializer=None,
+                              regularizer=None,
+                              trainable=True,
+                              collections=None,
+                              caching_device=None,
+                              partitioner=None,
+                              validate_shape=True):
   """Gets or creates a sharded variable list with these parameters.
 
   The `partitioner` must be a callable that accepts a fully defined
@@ -915,10 +964,15 @@ def _get_partitioned_variable(
 
 
 @contextlib.contextmanager
-def _pure_variable_scope(name_or_scope, reuse=None, initializer=None,
-                         regularizer=None, caching_device=None,
-                         partitioner=None, custom_getter=None,
-                         old_name_scope=None):
+def _pure_variable_scope(name_or_scope,
+                         reuse=None,
+                         initializer=None,
+                         regularizer=None,
+                         caching_device=None,
+                         partitioner=None,
+                         custom_getter=None,
+                         old_name_scope=None,
+                         dtype=dtypes.float32):
   """Creates a context for the variable_scope, see `variable_scope` for docs.
 
   Note: this does not create a name scope.
@@ -933,6 +987,7 @@ def _pure_variable_scope(name_or_scope, reuse=None, initializer=None,
     partitioner: default partitioner for variables within this scope.
     custom_getter: default custom getter for variables within this scope.
     old_name_scope: the original name scope when re-entering a variable scope.
+    dtype: type of the variables within this scope (defaults to `DT_FLOAT`).
 
   Yields:
     A scope that can be to captured and reused.
@@ -967,6 +1022,7 @@ def _pure_variable_scope(name_or_scope, reuse=None, initializer=None,
           regularizer=name_or_scope.regularizer,
           caching_device=name_or_scope.caching_device,
           partitioner=name_or_scope.partitioner,
+          dtype=name_or_scope.dtype,
           custom_getter=name_or_scope.custom_getter,
           name_scope=name_scope)
       if initializer is not None:
@@ -979,6 +1035,8 @@ def _pure_variable_scope(name_or_scope, reuse=None, initializer=None,
         default_varscope[0].set_partitioner(partitioner)
       if custom_getter is not None:
         default_varscope[0].set_custom_getter(custom_getter)
+      if dtype is not None:
+        default_varscope[0].set_dtype(dtype)
       yield default_varscope[0]
     else:
       # Handler for the case when we just prolong current variable scope.
@@ -986,11 +1044,13 @@ def _pure_variable_scope(name_or_scope, reuse=None, initializer=None,
       #   reuse and initializer (except if the user provided values to set).
       reuse = reuse or old.reuse  # Re-using is inherited by sub-scopes.
       default_varscope[0] = VariableScope(
-          reuse, name=new_name,
+          reuse,
+          name=new_name,
           initializer=old.initializer,
           regularizer=old.regularizer,
           caching_device=old.caching_device,
           partitioner=old.partitioner,
+          dtype=old.dtype,
           custom_getter=old.custom_getter,
           name_scope=old_name_scope or name_or_scope)
       if initializer is not None:
@@ -1003,6 +1063,8 @@ def _pure_variable_scope(name_or_scope, reuse=None, initializer=None,
         default_varscope[0].set_partitioner(partitioner)
       if custom_getter is not None:
         default_varscope[0].set_custom_getter(custom_getter)
+      if dtype is not None:
+        default_varscope[0].set_dtype(dtype)
       yield default_varscope[0]
   finally:
     var_store.close_variable_subscopes(new_name)
@@ -1024,9 +1086,14 @@ def _get_unique_variable_scope(prefix):
 
 # pylint: disable=g-doc-return-or-yield
 @contextlib.contextmanager
-def variable_scope(name_or_scope, reuse=None, initializer=None,
-                   regularizer=None, caching_device=None, partitioner=None,
-                   custom_getter=None):
+def variable_scope(name_or_scope,
+                   reuse=None,
+                   initializer=None,
+                   regularizer=None,
+                   caching_device=None,
+                   partitioner=None,
+                   custom_getter=None,
+                   dtype=None):
   """Returns a context for variable scope.
 
   Variable scope allows to create new variables and to share already created
@@ -1094,6 +1161,8 @@ def variable_scope(name_or_scope, reuse=None, initializer=None,
     caching_device: default caching device for variables within this scope.
     partitioner: default partitioner for variables within this scope.
     custom_getter: default custom getter for variables within this scope.
+    dtype: type of variables created in this scope (defaults to the type
+      in the passed scope, or inherited from parent scope).
 
   Returns:
     A scope that can be to captured and reused.
@@ -1117,25 +1186,42 @@ def variable_scope(name_or_scope, reuse=None, initializer=None,
       else:
         old_name_scope = name_or_scope.original_name_scope
       with _pure_variable_scope(
-          name_or_scope, reuse=reuse, initializer=initializer,
-          regularizer=regularizer, caching_device=caching_device,
-          partitioner=partitioner, custom_getter=custom_getter,
-          old_name_scope=old_name_scope) as vs:
+          name_or_scope,
+          reuse=reuse,
+          initializer=initializer,
+          regularizer=regularizer,
+          caching_device=caching_device,
+          partitioner=partitioner,
+          custom_getter=custom_getter,
+          old_name_scope=old_name_scope,
+          dtype=dtype) as vs:
         yield vs
   else:
     # This can only happen if someone is entering the root variable scope.
     with _pure_variable_scope(
-        name_or_scope, reuse=reuse, initializer=initializer,
-        regularizer=regularizer, caching_device=caching_device,
-        partitioner=partitioner, custom_getter=custom_getter) as vs:
+        name_or_scope,
+        reuse=reuse,
+        initializer=initializer,
+        regularizer=regularizer,
+        caching_device=caching_device,
+        partitioner=partitioner,
+        custom_getter=custom_getter,
+        dtype=dtype) as vs:
       yield vs
 
 
 # pylint: disable=g-doc-return-or-yield
 @contextlib.contextmanager
-def variable_op_scope(values, name_or_scope, default_name=None,
-                      initializer=None, regularizer=None, caching_device=None,
-                      partitioner=None, custom_getter=None, reuse=None):
+def variable_op_scope(values,
+                      name_or_scope,
+                      default_name=None,
+                      initializer=None,
+                      regularizer=None,
+                      caching_device=None,
+                      partitioner=None,
+                      custom_getter=None,
+                      reuse=None,
+                      dtype=None):
   """Returns a context manager for defining an op that creates variables.
 
   This context manager validates that the given `values` are from the
@@ -1176,6 +1262,8 @@ def variable_op_scope(values, name_or_scope, default_name=None,
     custom_getter: The default custom getter for variables within this scope.
     reuse: `True` or `None`; if `True`, we go into reuse mode for this scope as
       well as all sub-scopes; if `None`, we just inherit the parent scope reuse.
+    dtype: The default type of variables created in this scope, defaults to the
+      type of the parent scope.
 
   Returns:
     A context manager for use in defining a Python op.
@@ -1191,9 +1279,14 @@ def variable_op_scope(values, name_or_scope, default_name=None,
   with g.as_default():
     if name_or_scope:
       with variable_scope(
-          name_or_scope, reuse=reuse, initializer=initializer,
-          regularizer=regularizer, caching_device=caching_device,
-          partitioner=partitioner, custom_getter=custom_getter) as vs:
+          name_or_scope,
+          reuse=reuse,
+          initializer=initializer,
+          regularizer=regularizer,
+          caching_device=caching_device,
+          partitioner=partitioner,
+          custom_getter=custom_getter,
+          dtype=dtype) as vs:
         yield vs
     else:
       if reuse:
@@ -1201,10 +1294,14 @@ def variable_op_scope(values, name_or_scope, default_name=None,
       with ops.name_scope(default_name) as scope:
         unique_default_name = _get_unique_variable_scope(default_name)
         with _pure_variable_scope(
-            unique_default_name, initializer=initializer,
-            regularizer=regularizer, caching_device=caching_device,
-            partitioner=partitioner, custom_getter=custom_getter,
-            old_name_scope=scope) as vs:
+            unique_default_name,
+            initializer=initializer,
+            regularizer=regularizer,
+            caching_device=caching_device,
+            partitioner=partitioner,
+            custom_getter=custom_getter,
+            old_name_scope=scope,
+            dtype=dtype) as vs:
           yield vs
 
 
diff --git a/tensorflow/python/platform/base.i b/tensorflow/python/platform/base.i
index 99aae3b2416..df40491ed30 100644
--- a/tensorflow/python/platform/base.i
+++ b/tensorflow/python/platform/base.i
@@ -146,6 +146,7 @@ std::vector<type>* OUTPUT (std::vector<type> temp),
 %enddef
 
 _LIST_OUTPUT_TYPEMAP(string, _SwigBytes_FromString);
+_LIST_OUTPUT_TYPEMAP(long long, PyLong_FromLongLong);
 _LIST_OUTPUT_TYPEMAP(unsigned long long, PyLong_FromUnsignedLongLong);
 
 %typemap(in) uint64 {
@@ -178,6 +179,7 @@ _LIST_OUTPUT_TYPEMAP(unsigned long long, PyLong_FromUnsignedLongLong);
 %enddef
 
 _COPY_TYPEMAPS(unsigned long long, uint64);
+_COPY_TYPEMAPS(long long, int64);
 
 // SWIG macros for explicit API declaration.
 // Usage:
diff --git a/tensorflow/python/platform/benchmark.py b/tensorflow/python/platform/benchmark.py
index 1ba89db5628..23c03c38b13 100644
--- a/tensorflow/python/platform/benchmark.py
+++ b/tensorflow/python/platform/benchmark.py
@@ -164,6 +164,7 @@ class Benchmark(six.with_metaclass(_BenchmarkRegistrar, object)):
       wall_time: (optional) Total wall time in seconds
       throughput: (optional) Throughput (in MB/s)
       extras: (optional) Dict mapping string keys to additional benchmark info.
+        Values may be either floats or values that are convertible to strings.
       name: (optional) Override the BenchmarkEntry name with `name`.
         Otherwise it is inferred from the top-level method name.
     """
@@ -189,7 +190,8 @@ class TensorFlowBenchmark(Benchmark):
                        burn_iters=2,
                        min_iters=10,
                        store_trace=False,
-                       name=None):
+                       name=None,
+                       extras=None):
     """Run an op or tensor in the given session.  Report the results.
 
     Args:
@@ -205,6 +207,8 @@ class TensorFlowBenchmark(Benchmark):
         in the extras field "full_trace_chrome_format".
       name: (optional) Override the BenchmarkEntry name with `name`.
         Otherwise it is inferred from the top-level method name.
+      extras: (optional) Dict mapping string keys to additional benchmark info.
+        Values may be either floats or values that are convertible to strings.
     """
     for _ in range(burn_iters):
       sess.run(op_or_tensor, feed_dict=feed_dict)
@@ -218,7 +222,7 @@ class TensorFlowBenchmark(Benchmark):
       delta = end_time - start_time
       deltas[i] = delta
 
-    extras = {}
+    extras = extras if extras is not None else {}
     if store_trace:
       run_options = config_pb2.RunOptions(
           trace_level=config_pb2.RunOptions.FULL_TRACE)
diff --git a/tensorflow/python/tensorflow.i b/tensorflow/python/tensorflow.i
index ffd211152b9..ef82a009f92 100644
--- a/tensorflow/python/tensorflow.i
+++ b/tensorflow/python/tensorflow.i
@@ -28,6 +28,7 @@ limitations under the License.
 
 %include "tensorflow/python/client/tf_session.i"
 %include "tensorflow/python/client/device_lib.i"
+%include "tensorflow/python/client/net_lib.i"
 %include "tensorflow/python/client/quantize_training.i"
 
 %include "tensorflow/python/lib/io/file_io.i"
diff --git a/tensorflow/python/training/coordinator.py b/tensorflow/python/training/coordinator.py
index a7fc169d18a..ea583006cc8 100644
--- a/tensorflow/python/training/coordinator.py
+++ b/tensorflow/python/training/coordinator.py
@@ -150,6 +150,11 @@ class Coordinator(object):
     self._exc_info_to_raise = None
     # True if we have called join() already.
     self._joined = False
+    # Set of threads registered for joining when join() is called.  These
+    # threads will be joined in addition to the threads passed to the join()
+    # call.  It's ok if threads are both registered and passed to the join()
+    # call.
+    self._registered_threads = set()
 
   def _filter_exception(self, ex):
     """Check if the exception indicated in 'ex' should be ignored.
@@ -305,10 +310,22 @@ class Coordinator(object):
     """
     return self._stop_event.wait(timeout)
 
-  def join(self, threads, stop_grace_period_secs=120):
+  def register_thread(self, thread):
+    """Register a thread to join.
+
+    Args:
+      thread: A Python thread to join.
+    """
+    with self._lock:
+      self._registered_threads.add(thread)
+
+  def join(self, threads=None, stop_grace_period_secs=120):
     """Wait for threads to terminate.
 
-    Blocks until all `threads` have terminated or `request_stop()` is called.
+    This call blocks until a set of threads have terminated.  The set of thread
+    is the union of the threads passed in the `threads` argument and the list
+    of threads that registered with the coordinator by calling
+    `Coordinator.register_thread()`.
 
     After the threads stop, if an `exc_info` was passed to `request_stop`, that
     exception is re-raised.
@@ -320,7 +337,8 @@ class Coordinator(object):
     that `RuntimeError`.
 
     Args:
-      threads: List of `threading.Threads`. The started threads to join.
+      threads: List of `threading.Threads`. The started threads to join in
+        addition to the registered threads.
       stop_grace_period_secs: Number of seconds given to threads to stop after
         `request_stop()` has been called.
 
@@ -328,6 +346,13 @@ class Coordinator(object):
       RuntimeError: If any thread is still alive after `request_stop()`
         is called and the grace period expires.
     """
+    # Threads registered after this call will not be joined.
+    with self._lock:
+      if threads is None:
+        threads = self._registered_threads
+      else:
+        threads = self._registered_threads.union(set(threads))
+
     # Wait for all threads to stop or for request_stop() to be called.
     while any(t.is_alive() for t in threads) and not self.wait_for_stop(1.0):
       pass
@@ -353,6 +378,7 @@ class Coordinator(object):
     # Terminate with an exception if appropriate.
     with self._lock:
       self._joined = True
+      self._registered_threads = set()
       if self._exc_info_to_raise:
         six.reraise(*self._exc_info_to_raise)
       elif stragglers:
@@ -411,6 +437,7 @@ class LooperThread(threading.Thread):
     elif args or kwargs:
       raise ValueError("'args' and 'kwargs' argument require that you also "
                        "pass 'target'")
+    self._coord.register_thread(self)
 
   @staticmethod
   def loop(coord, timer_interval_secs, target, args=None, kwargs=None):
diff --git a/tensorflow/python/training/coordinator_test.py b/tensorflow/python/training/coordinator_test.py
index 764307fd7d2..d67fb459d83 100644
--- a/tensorflow/python/training/coordinator_test.py
+++ b/tensorflow/python/training/coordinator_test.py
@@ -47,7 +47,9 @@ def RaiseInNUsingContextHandler(coord, n_secs, ex):
     raise ex
 
 
-def SleepABit(n_secs):
+def SleepABit(n_secs, coord=None):
+  if coord:
+    coord.register_thread(threading.current_thread())
   time.sleep(n_secs)
 
 
@@ -80,6 +82,33 @@ class CoordinatorTest(tf.test.TestCase):
     for t in threads:
       t.start()
     coord.join(threads)
+    for t in threads:
+      self.assertFalse(t.is_alive())
+
+  def testJoinAllRegistered(self):
+    coord = tf.train.Coordinator()
+    threads = [
+        threading.Thread(target=SleepABit, args=(0.01, coord)),
+        threading.Thread(target=SleepABit, args=(0.02, coord)),
+        threading.Thread(target=SleepABit, args=(0.01, coord))]
+    for t in threads:
+      t.start()
+    coord.join()
+    for t in threads:
+      self.assertFalse(t.is_alive())
+
+  def testJoinSomeRegistered(self):
+    coord = tf.train.Coordinator()
+    threads = [
+        threading.Thread(target=SleepABit, args=(0.01, coord)),
+        threading.Thread(target=SleepABit, args=(0.02)),
+        threading.Thread(target=SleepABit, args=(0.01, coord))]
+    for t in threads:
+      t.start()
+    # threads[1] is not registred we must pass it in.
+    coord.join(threads[1:1])
+    for t in threads:
+      self.assertFalse(t.is_alive())
 
   def testJoinGraceExpires(self):
     def TestWithGracePeriod(stop_grace_period):
diff --git a/tensorflow/python/training/localhost_cluster_performance_test.py b/tensorflow/python/training/localhost_cluster_performance_test.py
new file mode 100644
index 00000000000..a679cd36a25
--- /dev/null
+++ b/tensorflow/python/training/localhost_cluster_performance_test.py
@@ -0,0 +1,133 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests and benchmarks for creating RPC clusters on localhost."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.python.util import net_lib
+
+
+def create_local_cluster(num_workers, num_ps, protocol="grpc"):
+  """Create local GRPC servers and return their servers."""
+  worker_ports = [net_lib.pick_unused_port_or_die() for _ in range(num_workers)]
+  ps_ports = [net_lib.pick_unused_port_or_die() for _ in range(num_ps)]
+  cluster_dict = {
+      "worker": ["localhost:%s" % port for port in worker_ports],
+      "ps": ["localhost:%s" % port for port in ps_ports]}
+  cs = tf.train.ClusterSpec(cluster_dict)
+
+  workers = [
+      tf.train.Server(
+          cs, job_name="worker", protocol=protocol, task_index=ix, start=True)
+      for ix in range(num_workers)]
+  ps_servers = [
+      tf.train.Server(
+          cs, job_name="ps", protocol=protocol, task_index=ix, start=True)
+      for ix in range(num_ps)]
+
+  return workers, ps_servers
+
+
+class CreateLocalClusterTest(tf.test.TestCase):
+
+  def testCreateLocalCluster(self):
+    workers, _ = create_local_cluster(num_workers=2, num_ps=2)
+    worker_sessions = [tf.Session(w.target) for w in workers]
+    with tf.device("/job:ps/task:0"):
+      var0 = tf.Variable(0.0)
+    with tf.device("/job:ps/task:1"):
+      var1 = tf.Variable(1.0)
+    worker_sessions[0].run([var0.initializer, var1.initializer])
+    with tf.device("/job:ps/task:0"):
+      var2 = tf.Variable(2.0)
+    with tf.device("/job:ps/task:1"):
+      var3 = tf.Variable(3.0)
+    worker_sessions[1].run([var2.initializer, var3.initializer])
+
+    # Read values back in the opposite session
+    self.assertAllEqual(0.0, var0.eval(session=worker_sessions[1]))
+    self.assertAllEqual(1.0, var1.eval(session=worker_sessions[1]))
+    self.assertAllEqual(2.0, var2.eval(session=worker_sessions[0]))
+    self.assertAllEqual(3.0, var3.eval(session=worker_sessions[0]))
+
+
+class CreateLocalClusterBenchmark(tf.test.Benchmark):
+
+  def benchmarkCreateLocalCluster(self):
+    deltas = []
+    iters = 50
+    for _ in range(iters):
+      start_time = time.time()
+      create_local_cluster(num_workers=1, num_ps=10)
+      end_time = time.time()
+      deltas.append(end_time - start_time)
+
+    median_deltas = np.median(deltas)
+    print(
+        "\n\nbenchmark_create_local_cluster_1_worker_10_ps.  "
+        "iterations: %d, median wall time: %g\n\n" % (iters, median_deltas))
+    self.report_benchmark(
+        iters=iters,
+        wall_time=median_deltas,
+        name="benchmark_create_local_cluster_1_worker_10_ps")
+
+
+class PartitionedVariablesBenchmark(tf.test.Benchmark):
+
+  def benchmark_create_1000_partitions_with_100_parameter_servers(self):
+    workers, _ = create_local_cluster(num_workers=1, num_ps=100)
+    worker_sessions = [tf.Session(w.target) for w in workers]
+    worker = worker_sessions[0]
+    partition_sizes = (1, 512, 1024*32, 1024*128)
+
+    partitioned = []
+
+    for partition_size in partition_sizes:
+      # max_shard_bytes is 4, shape is 1000*partition_size float32s which should
+      # partition into 1000 shards, each containing partition_size float32s.
+      print("Building partitioned variable with %d floats per partition"
+            % partition_size)
+      with tf.device(tf.train.replica_device_setter(ps_tasks=100)):
+        partitioned_ix = tf.get_variable(
+            "partitioned_%d" % partition_size,
+            shape=[1000 * partition_size],
+            dtype=tf.float32,
+            # Each partition to have exactly N float32s
+            partitioner=tf.variable_axis_size_partitioner(
+                max_shard_bytes=4 * partition_size))
+        # Concatenates along axis 0
+        partitioned.append(tf.convert_to_tensor(partitioned_ix))
+
+    tf.initialize_all_variables().run(session=worker)
+
+    for ix, partition_size in enumerate(partition_sizes):
+      print("Running benchmark having partitions with %d floats"
+            % partition_size)
+      self.run_op_benchmark(
+          worker,
+          partitioned[ix],
+          name=("read_concat_1000_partitions_from_"
+                "100_parameter_servers_partsize_%d_floats" % partition_size))
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensorflow/python/training/queue_runner.py b/tensorflow/python/training/queue_runner.py
index d31aca36f60..db3ee9d5280 100644
--- a/tensorflow/python/training/queue_runner.py
+++ b/tensorflow/python/training/queue_runner.py
@@ -176,6 +176,8 @@ class QueueRunner(object):
       coord: Optional Coordinator object for reporting errors and checking
         for stop conditions.
     """
+    if coord:
+      coord.register_thread(threading.current_thread())
     decremented = False
     try:
       while True:
@@ -218,6 +220,7 @@ class QueueRunner(object):
       cancel_op: The Operation to run.
       coord: Coordinator.
     """
+    coord.register_thread(threading.current_thread())
     coord.wait_for_stop()
     try:
       sess.run(cancel_op)
diff --git a/tensorflow/python/training/queue_runner_test.py b/tensorflow/python/training/queue_runner_test.py
index a5bc6bb4adb..6487e32892d 100644
--- a/tensorflow/python/training/queue_runner_test.py
+++ b/tensorflow/python/training/queue_runner_test.py
@@ -122,7 +122,7 @@ class QueueRunnerTest(tf.test.TestCase):
       threads = qr.create_threads(sess, coord)
       for t in threads:
         t.start()
-      coord.join(threads)
+      coord.join()
       self.assertEqual(0, len(qr.exceptions_raised))
       # The variable should be 0.
       self.assertEqual(0, var.eval())
@@ -137,7 +137,7 @@ class QueueRunnerTest(tf.test.TestCase):
         t.start()
       # The exception should be re-raised when joining.
       with self.assertRaisesRegexp(ValueError, "Operation not in the graph"):
-        coord.join(threads)
+        coord.join()
 
   def testGracePeriod(self):
     with self.test_session() as sess:
@@ -147,14 +147,14 @@ class QueueRunnerTest(tf.test.TestCase):
       dequeue = queue.dequeue()
       qr = tf.train.QueueRunner(queue, [enqueue])
       coord = tf.train.Coordinator()
-      threads = qr.create_threads(sess, coord, start=True)
+      qr.create_threads(sess, coord, start=True)
       # Dequeue one element and then request stop.
       dequeue.op.run()
       time.sleep(0.02)
       coord.request_stop()
       # We should be able to join because the RequestStop() will cause
       # the queue to be closed and the enqueue to terminate.
-      coord.join(threads, stop_grace_period_secs=0.05)
+      coord.join(stop_grace_period_secs=0.05)
 
   def testIgnoreMultiStarts(self):
     with self.test_session() as sess:
@@ -171,7 +171,7 @@ class QueueRunnerTest(tf.test.TestCase):
       new_threads = qr.create_threads(sess, coord=coord)
       self.assertEqual([], new_threads)
       coord.request_stop()
-      coord.join(threads, stop_grace_period_secs=0.5)
+      coord.join(stop_grace_period_secs=0.5)
 
   def testThreads(self):
     with self.test_session() as sess:
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index 7eb1e7e519d..68904bb89e7 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -595,7 +595,7 @@ class MaxToKeepTest(tf.test.TestCase):
       self.assertEqual([], save2.last_checkpoints)
       self.assertTrue(gfile.Exists(s2))
 
-  def testNoMetaGrap(self):
+  def testNoMetaGraph(self):
     save_dir = _TestDir("no_meta_graph")
 
     with self.test_session() as sess:
diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py
index 35505b82870..a3ee383758b 100644
--- a/tensorflow/python/training/supervisor.py
+++ b/tensorflow/python/training/supervisor.py
@@ -296,7 +296,6 @@ class Supervisor(object):
     self._graph = graph
     self._is_chief = is_chief
     self._coord = coordinator.Coordinator()
-    self._started_threads = []
     self._recovery_wait_secs = recovery_wait_secs
     self._stop_grace_secs = stop_grace_secs
     self._init_fn = init_fn
@@ -636,8 +635,6 @@ class Supervisor(object):
       threads.append(SVTimerCheckpointThread(self, sess))
     for t in threads:
       t.start()
-    self._started_threads.extend(threads)
-
     return threads
 
   def prepare_or_wait_for_session(self, master="", config=None,
@@ -712,7 +709,6 @@ class Supervisor(object):
     for qr in queue_runners:
       threads.extend(qr.create_threads(sess, coord=self._coord, daemon=True,
                                        start=True))
-    self._started_threads.extend(threads)
     return threads
 
   def loop(self, timer_interval_secs, target, args=None, kwargs=None):
@@ -737,7 +733,6 @@ class Supervisor(object):
     looper = coordinator.LooperThread(self._coord, timer_interval_secs,
                                       target=target, args=args, kwargs=kwargs)
     looper.start()
-    self._started_threads.append(looper)
     return looper
 
   def stop(self, threads=None, close_summary_writer=True):
@@ -755,16 +750,12 @@ class Supervisor(object):
         `True` if the summary writer was created by the supervisor, `False`
         otherwise.
     """
-    join_threads = []
-    join_threads.extend(self._started_threads)
-    if threads is not None:
-      join_threads.extend(threads)
     self._coord.request_stop()
     try:
       # coord.join() re-raises the first reported exception; the "finally"
       # block ensures that we clean up whether or not an exception was
       # reported.
-      self._coord.join(join_threads,
+      self._coord.join(threads,
                        stop_grace_period_secs=self._stop_grace_secs)
     finally:
       # Close the writer last, in case one of the running threads was using it.
@@ -775,8 +766,6 @@ class Supervisor(object):
         self._summary_writer.close()
         self._graph_added_to_summary = False
 
-      self._started_threads = []
-
   def request_stop(self, ex=None):
     """Request that the coordinator stop the threads.
 
diff --git a/tensorflow/python/util/net_lib.py b/tensorflow/python/util/net_lib.py
new file mode 100644
index 00000000000..98a3149fdba
--- /dev/null
+++ b/tensorflow/python/util/net_lib.py
@@ -0,0 +1,28 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A Python interface for creating TensorFlow tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six  # pylint: disable=unused-import
+
+from tensorflow.python import pywrap_tensorflow
+
+
+def pick_unused_port_or_die():
+  """Find an unused port on localhost."""
+  return pywrap_tensorflow.PickUnusedPortOrDie()
diff --git a/tensorflow/python/util/net_lib_test.py b/tensorflow/python/util/net_lib_test.py
new file mode 100644
index 00000000000..1e2ad53cdae
--- /dev/null
+++ b/tensorflow/python/util/net_lib_test.py
@@ -0,0 +1,39 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for the SWIG-wrapped test lib."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from tensorflow.python.util import net_lib
+
+
+class TestLibTest(tf.test.TestCase):
+
+  def testPickUnusedPortOrDie(self):
+    port0 = net_lib.pick_unused_port_or_die()
+    port1 = net_lib.pick_unused_port_or_die()
+    self.assertGreater(port0, 0)
+    self.assertLess(port0, 65536)
+    self.assertGreater(port1, 0)
+    self.assertLess(port1, 65536)
+    self.assertNotEqual(port0, port1)
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensorflow/tensorboard/README.md b/tensorflow/tensorboard/README.md
index 49a1656cddc..48d5ce4ccbc 100644
--- a/tensorflow/tensorboard/README.md
+++ b/tensorflow/tensorboard/README.md
@@ -105,7 +105,8 @@ For example, here is a well-organized TensorBoard log directory, with two runs,
 "run1" and "run2".
 
 ```
-/some/path/mnist_experiments/ some/path/mnist_experiments/run1/
+/some/path/mnist_experiments/
+/some/path/mnist_experiments/run1/
 /some/path/mnist_experiments/run1/events.out.tfevents.1456525581.name
 /some/path/mnist_experiments/run1/events.out.tfevents.1456525585.name
 /some/path/mnist_experiments/run2/
@@ -113,6 +114,14 @@ For example, here is a well-organized TensorBoard log directory, with two runs,
 /tensorboard --logdir=/some/path/mnist_experiments
 ```
 
+You may also pass a comma separated list of log directories, and TensorBoard
+will watch each directory. You can also assign names to individual log
+directories by putting a colon between the name and the path, as in
+
+```
+tensorboard --logdir=name1:/path/to/logs/1,name2:/path/to/logs/2
+```
+
 # The Visualizations
 
 ### Events Dashboard
diff --git a/tensorflow/tensorboard/components/tf-audio-dashboard/tf-audio-dashboard.html b/tensorflow/tensorboard/components/tf-audio-dashboard/tf-audio-dashboard.html
index 3c803247ac8..6b7ccb0f27c 100644
--- a/tensorflow/tensorboard/components/tf-audio-dashboard/tf-audio-dashboard.html
+++ b/tensorflow/tensorboard/components/tf-audio-dashboard/tf-audio-dashboard.html
@@ -25,8 +25,6 @@ tf-audio-dashboard displays a dashboard that loads audio from a TensorFlow run.
 
     <style>
       .center {
-        padding-left: 10px;
-        padding-right: 10px;
         height: 100%;
         width: 100%;
         -webkit-box-sizing: border-box;
diff --git a/tensorflow/tensorboard/components/tf-audio-dashboard/tf-audio-grid.html b/tensorflow/tensorboard/components/tf-audio-dashboard/tf-audio-grid.html
index 27d7220ebee..d48dddcd1d3 100644
--- a/tensorflow/tensorboard/components/tf-audio-dashboard/tf-audio-grid.html
+++ b/tensorflow/tensorboard/components/tf-audio-dashboard/tf-audio-grid.html
@@ -91,7 +91,7 @@ is high)
         height: 100%;
         flex-direction: column;
         padding-top: 20px;
-        overflow: scroll;
+        overflow: auto;
         -webkit-box-sizing: border-box;
         -moz-box-sizing: border-box;
         box-sizing: border-box;
diff --git a/tensorflow/tensorboard/components/tf-dashboard-common/tf-dashboard-layout.html b/tensorflow/tensorboard/components/tf-dashboard-common/tf-dashboard-layout.html
index a5584ec3425..faf1139922c 100644
--- a/tensorflow/tensorboard/components/tf-dashboard-common/tf-dashboard-layout.html
+++ b/tensorflow/tensorboard/components/tf-dashboard-common/tf-dashboard-layout.html
@@ -25,7 +25,7 @@ Generic layout for a dashboard.
 
       #center {
         height: 100%;
-        overflow-y: scroll;
+        overflow-y: auto;
         flex-grow: 1;
         flex-shrink: 1;
       }
diff --git a/tensorflow/tensorboard/components/tf-histogram-dashboard/tf-obsolete-histogram-chart.html b/tensorflow/tensorboard/components/tf-distribution-dashboard/tf-distribution-chart.html
similarity index 89%
rename from tensorflow/tensorboard/components/tf-histogram-dashboard/tf-obsolete-histogram-chart.html
rename to tensorflow/tensorboard/components/tf-distribution-dashboard/tf-distribution-chart.html
index d5ddf142a83..d58520a8a7c 100644
--- a/tensorflow/tensorboard/components/tf-histogram-dashboard/tf-obsolete-histogram-chart.html
+++ b/tensorflow/tensorboard/components/tf-distribution-dashboard/tf-distribution-chart.html
@@ -2,7 +2,7 @@
 <link rel="import" href="../tf-imports/plottable.html">
 <link rel="import" href="../tf-imports/lodash.html">
 
-<dom-module id="tf-obsolete-histogram-chart">
+<dom-module id="tf-distribution-chart">
   <template>
     <svg id="chartsvg"></svg>
     <style>
@@ -24,11 +24,11 @@
 
     </style>
   </template>
-  <script src="tf-obsolete-histogram-chart.js"></script>
+  <script src="tf-distribution-chart.js"></script>
   <script src="../vz-line-chart/vz-chart-helpers.js"></script>
   <script>
     Polymer({
-      is: "tf-obsolete-histogram-chart",
+      is: "tf-distribution-chart",
       properties: {
         _chart: Object,
         colorScale: Object,
@@ -60,7 +60,7 @@
           return;
         }
         if (this._chart) this._chart.destroy();
-        var chart = new TF.HistogramChart(tag, dataProvider, xType, colorScale);
+        var chart = new TF.DistributionChart(tag, dataProvider, xType, colorScale);
         var svg = d3.select(this.$.chartsvg);
         this.async(function() {
           chart.renderTo(svg);
diff --git a/tensorflow/tensorboard/components/tf-histogram-dashboard/tf-obsolete-histogram-chart.ts b/tensorflow/tensorboard/components/tf-distribution-dashboard/tf-distribution-chart.ts
similarity index 99%
rename from tensorflow/tensorboard/components/tf-histogram-dashboard/tf-obsolete-histogram-chart.ts
rename to tensorflow/tensorboard/components/tf-distribution-dashboard/tf-distribution-chart.ts
index c593b9d4d48..981656a8146 100644
--- a/tensorflow/tensorboard/components/tf-histogram-dashboard/tf-obsolete-histogram-chart.ts
+++ b/tensorflow/tensorboard/components/tf-distribution-dashboard/tf-distribution-chart.ts
@@ -15,7 +15,7 @@ limitations under the License.
 /* tslint:disable:no-namespace variable-name */
 
 module TF {
-  export class HistogramChart {
+  export class DistributionChart {
     protected dataFn: VZ.ChartHelpers.DataFn;
     protected tag: string;
     private run2datasets: {[run: string]: Plottable.Dataset};
diff --git a/tensorflow/tensorboard/components/tf-histogram-dashboard/tf-histogram-dashboard.html b/tensorflow/tensorboard/components/tf-distribution-dashboard/tf-distribution-dashboard.html
similarity index 91%
rename from tensorflow/tensorboard/components/tf-histogram-dashboard/tf-histogram-dashboard.html
rename to tensorflow/tensorboard/components/tf-distribution-dashboard/tf-distribution-dashboard.html
index 6dab1bda03e..962dcdef133 100644
--- a/tensorflow/tensorboard/components/tf-histogram-dashboard/tf-histogram-dashboard.html
+++ b/tensorflow/tensorboard/components/tf-distribution-dashboard/tf-distribution-dashboard.html
@@ -4,7 +4,7 @@
 <link rel="import" href="../tf-color-scale/tf-color-scale.html">
 <link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
 <link rel="import" href="../tf-categorizer/tf-categorizer.html">
-<link rel="import" href="tf-obsolete-histogram-chart.html">
+<link rel="import" href="tf-distribution-chart.html">
 <link rel="import" href="../tf-collapsable-pane/tf-collapsable-pane.html">
 <link rel="import" href="../iron-collapse/iron-collapse.html">
 <link rel="import" href="../paper-icon-button/paper-icon-button.html">
@@ -12,7 +12,7 @@
 <link rel="import" href="../tf-backend/tf-backend.html">
 
 <!--
-tf-histogram-dashboard is a complete frontend that loads runs from a backend,
+tf-distribution-dashboard is a complete frontend that loads runs from a backend,
 and creates chart panes that display data for those runs.
 
 It provides a categorizer, run selector, and x type selector, by which the user
@@ -24,10 +24,10 @@ charts are larger.
 Organizationally, the #plumbing div contains components that have no concrete
 manifestation and just effect data bindings or data loading. The #sidebar contains
 shared controls like the tf-categorizer, tf-run-selector, and tf-x-type-selector.
-The #center div contains tf-obsolete-histogram-charts embedded inside
+The #center div contains tf-distribution-charts embedded inside
 tf-collapsable-panes.
 -->
-<dom-module id="tf-histogram-dashboard">
+<dom-module id="tf-distribution-dashboard">
   <template>
     <div id="plumbing">
       <tf-color-scale
@@ -64,7 +64,7 @@ tf-collapsable-panes.
 
       <div class="center">
         <tf-no-data-warning
-          data-type="histogram"
+          data-type="distribution"
           show-warning="[[dataNotFound]]"
         ></tf-no-data-warning>
         <template is="dom-repeat" items="[[categories]]">
@@ -76,7 +76,7 @@ tf-collapsable-panes.
                     <div class="card">
                       <span class="card-title">[[tag]]</span>
                       <div class="card-content">
-                        <tf-obsolete-histogram-chart
+                        <tf-distribution-chart
                           tag="[[tag]]"
                           id="chart"
                           selected-runs="[[_array(run)]]"
@@ -85,7 +85,7 @@ tf-collapsable-panes.
                           color-scale="[[colorScale]]"
                           on-keyup="toggleSelected"
                           tabindex="2"
-                        ></tf-obsolete-histogram-chart>
+                        ></tf-distribution-chart>
                         <paper-icon-button
                           class="expand-button"
                           icon="fullscreen"
@@ -107,9 +107,9 @@ tf-collapsable-panes.
 
   <script>
     Polymer({
-      is: "tf-histogram-dashboard",
+      is: "tf-distribution-dashboard",
       behaviors: [
-        TF.Dashboard.ReloadBehavior("tf-obsolete-histogram-chart"),
+        TF.Dashboard.ReloadBehavior("tf-distribution-chart"),
         TF.Backend.Behavior,
       ],
       properties: {
diff --git a/tensorflow/tensorboard/components/tf-globals/globals.ts b/tensorflow/tensorboard/components/tf-globals/globals.ts
index 29e4a143e75..1e908ec034f 100644
--- a/tensorflow/tensorboard/components/tf-globals/globals.ts
+++ b/tensorflow/tensorboard/components/tf-globals/globals.ts
@@ -16,7 +16,7 @@ limitations under the License.
 /* tslint:disable:no-namespace */
 module TF.Globals {
   // The names of TensorBoard tabs.
-  export var TABS = ['events', 'images', 'audio', 'graphs', 'histograms'];
+  export var TABS = ['events', 'images', 'audio', 'graphs', 'distributions'];
 
   // If true, TensorBoard stores its hash in the URI state.
   // If false, tab switching in TensorBoard will not update location hash,
diff --git a/tensorflow/tensorboard/components/tf-histogram-dashboard/rebin.ts b/tensorflow/tensorboard/components/tf-histogram-dashboard/rebin.ts
deleted file mode 100644
index 92a6b12f456..00000000000
--- a/tensorflow/tensorboard/components/tf-histogram-dashboard/rebin.ts
+++ /dev/null
@@ -1,43 +0,0 @@
-module TF.Histogram {
-  /**
-   * Re-bins histogram data into uniform-width bins. Assumes a uniform
-   * distribution of values in given bins.
-   *
-   * @param bins - Original histogram data.
-   * @param numberOfBins Number of uniform-width bins to split the data into.
-   * @return Re-binned histogram data. Does not modify original data,
-   *      returns a new array.
-   */
-  export function rebinHistogram(
-      bins: TF.Backend.HistogramBin[], numberOfBins: number) {
-    if (bins.length === 0) {
-      return [];
-    }
-
-    var oldBinsXExtent = [
-      d3.min(bins, function(old: any) { return old.x; }),
-      d3.max(bins, function(old: any) { return old.x + old.dx; })
-    ];
-
-    var newDx: number = (oldBinsXExtent[1] - oldBinsXExtent[0]) / numberOfBins;
-
-    var newBins: TF.Backend.HistogramBin[] =
-        d3.range(oldBinsXExtent[0], oldBinsXExtent[1], newDx)
-            .map(function(newX) {
-
-              // Take the count of each existing bin, multiply it by the
-              // proportion of overlap with the new bin, then sum and store as
-              // the count for new bin. If no overlap, will add zero, if 100%
-              // overlap, will include full count into new bin.
-              var newY = d3.sum(bins.map(function(old) {
-                var intersectDx = Math.min(old.x + old.dx, newX + newDx) -
-                    Math.max(old.x, newX);
-                return (intersectDx > 0) ? (intersectDx / old.dx) * old.y : 0;
-              }));
-
-              return {x: newX, dx: newDx, y: newY};
-            });
-
-    return newBins;
-  }
-}
diff --git a/tensorflow/tensorboard/components/tf-histogram-dashboard/test/index.html b/tensorflow/tensorboard/components/tf-histogram-dashboard/test/index.html
deleted file mode 100644
index c645f7251bd..00000000000
--- a/tensorflow/tensorboard/components/tf-histogram-dashboard/test/index.html
+++ /dev/null
@@ -1,13 +0,0 @@
-<!doctype html>
-<html>
-<head>
-  <meta charset="utf-8">
-  <script src="../../web-component-tester/browser.js"></script>
-  <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-  <link rel="import" href="../../tf-imports/d3.html">
-</head>
-<body>
-  <script src="../rebin.js"></script>
-  <script src="rebinTests.js"></script>
-</body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf-histogram-dashboard/test/rebinTests.ts b/tensorflow/tensorboard/components/tf-histogram-dashboard/test/rebinTests.ts
deleted file mode 100644
index 661ba75b54f..00000000000
--- a/tensorflow/tensorboard/components/tf-histogram-dashboard/test/rebinTests.ts
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-module TF.Histogram {
-  let assert = chai.assert;
-
-  describe('Rebin', function() {
-
-    var assertHistogramEquality = function(h1, h2) {
-      h1.forEach(function(b1, i) {
-        var b2 = h2[i];
-        assert.closeTo(b1.x, b2.x, 1e-10);
-        assert.closeTo(b1.dx, b2.dx, 1e-10);
-        assert.closeTo(b1.y, b2.y, 1e-10);
-      });
-    };
-
-    //
-    // Rebinning
-    //
-
-    it('Returns an empty array if you don\'t have any bins',
-       function() { assert.deepEqual(rebinHistogram([], 10), []); });
-
-    it('Collapses two bins into one.', function() {
-      var histogram = [
-        {x: 0, dx: 1, y: 1},
-        {x: 1, dx: 1, y: 2}
-      ];
-      var oneBin = [
-        {x: 0, dx: 2, y: 3}
-      ];
-      assertHistogramEquality(rebinHistogram(histogram, 1), oneBin);
-    });
-
-    it('Splits one bin into two.', function() {
-      var histogram = [
-        {x: 0, dx: 1, y: 3}
-      ];
-      var twoBin = [
-        {x: 0, dx: 0.5, y: 1.5},
-        {x: 0.5, dx: 0.5, y: 1.5}
-      ];
-      assertHistogramEquality(rebinHistogram(histogram, 2), twoBin);
-    });
-
-    it('Regularizes non-uniform bins.', function() {
-      var histogram = [
-        {x: 0, dx: 2, y: 3},
-        {x: 2, dx: 3, y: 3},
-        {x: 5, dx: 1, y: 1}
-      ];
-      var twoBin = [
-        {x: 0, dx: 3, y: 4},
-        {x: 3, dx: 3, y: 3}
-      ];
-      assertHistogramEquality(rebinHistogram(histogram, 2), twoBin);
-    });
-
-  });
-}
diff --git a/tensorflow/tensorboard/components/tf-image-dashboard/tf-image-dashboard.html b/tensorflow/tensorboard/components/tf-image-dashboard/tf-image-dashboard.html
index f65f31cc333..a79745b87e4 100644
--- a/tensorflow/tensorboard/components/tf-image-dashboard/tf-image-dashboard.html
+++ b/tensorflow/tensorboard/components/tf-image-dashboard/tf-image-dashboard.html
@@ -25,8 +25,6 @@ tf-image-dashboard displays a dashboard that loads images from a TensorFlow run.
 
     <style>
       .center {
-        padding-left: 10px;
-        padding-right: 10px;
         height: 100%;
         width: 100%;
         -webkit-box-sizing: border-box;
diff --git a/tensorflow/tensorboard/components/tf-image-dashboard/tf-image-grid.html b/tensorflow/tensorboard/components/tf-image-dashboard/tf-image-grid.html
index 5662d034a23..7c174e6abea 100644
--- a/tensorflow/tensorboard/components/tf-image-dashboard/tf-image-grid.html
+++ b/tensorflow/tensorboard/components/tf-image-dashboard/tf-image-grid.html
@@ -91,7 +91,7 @@ is high)
         height: 100%;
         flex-direction: column;
         padding-top: 20px;
-        overflow: scroll;
+        overflow: auto;
         -webkit-box-sizing: border-box;
         -moz-box-sizing: border-box;
         box-sizing: border-box;
diff --git a/tensorflow/tensorboard/components/tf-multi-checkbox/tf-multi-checkbox.html b/tensorflow/tensorboard/components/tf-multi-checkbox/tf-multi-checkbox.html
index d230e54c86e..e397dba0703 100644
--- a/tensorflow/tensorboard/components/tf-multi-checkbox/tf-multi-checkbox.html
+++ b/tensorflow/tensorboard/components/tf-multi-checkbox/tf-multi-checkbox.html
@@ -21,13 +21,13 @@ handle these situations gracefully.
   <style include="run-color-style"></style>
 
   <template>
-    <div id="outer-container" class="scrollbar">
       <paper-input
         id="runs-regex"
         no-label-float
         label="Write a regex to filter runs"
         value="{{regexInput}}"
       ></paper-input>
+    <div id="outer-container" class="scrollbar">
       <template
         is="dom-repeat"
         items="[[namesMatchingRegex]]"
@@ -65,9 +65,10 @@ handle these situations gracefully.
       height: 100%;
     }
     #outer-container {
-      overflow-y: scroll;
+      overflow-y: auto;
       overflow-x: hidden;
       width: 100%;
+      height: 0; /* Quirk to make firefox add scrolling instead of expand div */
       flex-grow: 1;
       flex-shrink: 1;
       word-wrap: break-word;
@@ -202,8 +203,9 @@ handle these situations gracefully.
       window.requestAnimationFrame(function() {_this.updateStyles();});
     },
     _checkboxChange: function(e) {
-      var name = e.srcElement.name;
-      var checked = e.srcElement.checked;
+      var target = e.srcElement || e.target; // Firefox doesn't have srcElement.
+      var name = target.name;
+      var checked = target.checked;
       this.runToIsCheckedMapping[name] = checked;
       // n.b. notifyPath won't work because run names may have periods.
       this.runToIsCheckedMapping = _.clone(this.runToIsCheckedMapping);
diff --git a/tensorflow/tensorboard/components/tf-tensorboard/tf-tensorboard.html b/tensorflow/tensorboard/components/tf-tensorboard/tf-tensorboard.html
index d841562bad9..953f6474a62 100644
--- a/tensorflow/tensorboard/components/tf-tensorboard/tf-tensorboard.html
+++ b/tensorflow/tensorboard/components/tf-tensorboard/tf-tensorboard.html
@@ -7,7 +7,7 @@
 <link rel="import" href="../paper-header-panel/paper-header-panel.html">
 <link rel="import" href="../tf-globals/tf-globals.html">
 <link rel="import" href="../tf-event-dashboard/tf-event-dashboard.html">
-<link rel="import" href="../tf-histogram-dashboard/tf-histogram-dashboard.html">
+<link rel="import" href="../tf-distribution-dashboard/tf-distribution-dashboard.html">
 <link rel="import" href="../tf-image-dashboard/tf-image-dashboard.html">
 <link rel="import" href="../tf-audio-dashboard/tf-audio-dashboard.html">
 <link rel="import" href="../tf-graph-dashboard/tf-graph-dashboard.html">
@@ -90,11 +90,11 @@ allows the user to toggle between various dashboards.
           ></tf-graph-dashboard>
         </template>
 
-        <template is="dom-if" if="[[_modeIsHistograms(mode)]]">
-          <tf-histogram-dashboard
-            id="histograms"
+        <template is="dom-if" if="[[_modeIsDistributions(mode)]]">
+          <tf-distribution-dashboard
+            id="distributions"
             backend="[[_backend]]"
-          ></tf-histogram-dashboard>
+          ></tf-distribution-dashboard>
         </template>
       </div>
     </paper-header-panel>
@@ -227,8 +227,8 @@ allows the user to toggle between various dashboards.
       _modeIsGraphs: function(mode) {
         return mode === "graphs";
       },
-      _modeIsHistograms: function(mode) {
-        return mode === "histograms";
+      _modeIsDistributions: function(mode) {
+        return mode === "distributions";
       },
       selectedDashboard: function() {
         var dashboard = this.$$("#" + this.mode);
diff --git a/tensorflow/tensorboard/components/vz-line-chart/vz-chart-helpers.ts b/tensorflow/tensorboard/components/vz-line-chart/vz-chart-helpers.ts
index 39a296e9ba9..839f0fb8b24 100644
--- a/tensorflow/tensorboard/components/vz-line-chart/vz-chart-helpers.ts
+++ b/tensorflow/tensorboard/components/vz-line-chart/vz-chart-helpers.ts
@@ -21,6 +21,7 @@ module VZ.ChartHelpers {
 
   export interface Scalar {
     scalar: number;
+    smoothed: number;
   }
 
   export type ScalarDatum = Datum & Scalar;
diff --git a/tensorflow/tensorboard/components/vz-line-chart/vz-line-chart.html b/tensorflow/tensorboard/components/vz-line-chart/vz-line-chart.html
index 6fe110d9b9a..be2045ae9c9 100644
--- a/tensorflow/tensorboard/components/vz-line-chart/vz-line-chart.html
+++ b/tensorflow/tensorboard/components/vz-line-chart/vz-line-chart.html
@@ -14,13 +14,15 @@ smoothing.
 -->
 <dom-module id="vz-line-chart">
   <template>
-    <svg id="chartsvg"></svg>
     <div id="tooltip">
       <table>
         <thead>
           <tr>
             <th></th>
             <th>Name</th>
+            <template is="dom-if" if="{{smoothingEnabled}}">
+              <th>Smoothed</th>
+            </template>
             <th>Value</th>
             <th>Step</th>
             <th>Time</th>
@@ -31,6 +33,7 @@ smoothing.
         </tbody>
       </table>
     </div>
+    <svg id="chartsvg"></svg>
     <style>
       :host {
         -webkit-user-select: none;
@@ -160,6 +163,10 @@ smoothing.
         _seriesDataCache: {
           type: Object,
           value: function() { return {} }
+        },
+        _makeChartAsyncCallbackId: {
+          type: Number,
+          value: null
         }
       },
       observers: [
@@ -213,21 +220,25 @@ smoothing.
       detached: function() {
         this._attached = false;
       },
+      ready: function() {
+        this.scopeSubtree(this.$.tooltip, true);
+        this.scopeSubtree(this.$.chartsvg, true);
+      },
       _makeChart: function(xType, colorScale, _attached) {
-        if(!this._attached) {
-          return;
+        if (this._makeChartAsyncHandle === null) {
+          this.cancelAsync(this._makeChartAsyncCallbackId);
         }
 
-        if (this._chart) this._chart.destroy();
-        var tooltip = d3.select(this.$.tooltip);
-        this.scopeSubtree(this.$.tooltip, true);
-        var chart = new VZ.LineChart(xType, colorScale, tooltip);
-        var svg = d3.select(this.$.chartsvg);
-        this.async(function() {
+        this._makeChartAsyncHandle = this.async(function() {
+          this._makeChartAsyncCallbackId = null;
+          if (!this._attached) return;
+          if (this._chart) this._chart.destroy();
+          var tooltip = d3.select(this.$.tooltip);
+          var chart = new VZ.LineChart(xType, colorScale, tooltip);
+          var svg = d3.select(this.$.chartsvg);
           chart.renderTo(svg);
-          this.scopeSubtree(this.$.chartsvg, true);
           this._chart = chart;
-        }, 350);
+        }.bind(this), 350);
       },
       _reloadFromCache: function() {
         if(this._chart) {
diff --git a/tensorflow/tensorboard/components/vz-line-chart/vz-line-chart.ts b/tensorflow/tensorboard/components/vz-line-chart/vz-line-chart.ts
index 0eddd479e90..4e5d219f1c1 100644
--- a/tensorflow/tensorboard/components/vz-line-chart/vz-line-chart.ts
+++ b/tensorflow/tensorboard/components/vz-line-chart/vz-line-chart.ts
@@ -35,11 +35,10 @@ module VZ {
     private smoothLinePlot: Plottable.Plots.Line<number|Date>;
     private scatterPlot: Plottable.Plots.Scatter<number|Date, Number>;
     private nanDisplay: Plottable.Plots.Scatter<number|Date, Number>;
-    private yAccessor: Plottable.Accessor<number>;
+    private scalarAccessor: Plottable.Accessor<number>;
+    private smoothedAccessor: Plottable.Accessor<number>;
     private lastPointsDataset: Plottable.Dataset;
     private datasets: Plottable.Dataset[];
-    private smoothDatasets: Plottable.Dataset[];
-    private name2smoothDatasets: {[name: string]: Plottable.Dataset};
     private onDatasetChanged: (dataset: Plottable.Dataset) => void;
     private nanDataset: Plottable.Dataset;
     private smoothingDecay: number;
@@ -53,8 +52,6 @@ module VZ {
       this.colorScale = colorScale;
       this.tooltip = tooltip;
       this.datasets = [];
-      this.smoothDatasets = [];
-      this.name2smoothDatasets = {};
       // lastPointDataset is a dataset that contains just the last point of
       // every dataset we're currently drawing.
       this.lastPointsDataset = new Plottable.Dataset();
@@ -97,10 +94,11 @@ module VZ {
     }
 
     private buildPlot(xAccessor, xScale, yScale): Plottable.Component {
-      this.yAccessor = (d: VZ.ChartHelpers.ScalarDatum) => d.scalar;
+      this.scalarAccessor = (d: VZ.ChartHelpers.ScalarDatum) => d.scalar;
+      this.smoothedAccessor = (d: VZ.ChartHelpers.ScalarDatum) => d.smoothed;
       let linePlot = new Plottable.Plots.Line<number|Date>();
       linePlot.x(xAccessor, xScale);
-      linePlot.y(this.yAccessor, yScale);
+      linePlot.y(this.scalarAccessor, yScale);
       linePlot.attr(
           'stroke', (d: VZ.ChartHelpers.Datum, i: number,
                      dataset: Plottable.Dataset) =>
@@ -110,7 +108,7 @@ module VZ {
 
       let smoothLinePlot = new Plottable.Plots.Line<number|Date>();
       smoothLinePlot.x(xAccessor, xScale);
-      smoothLinePlot.y(this.yAccessor, yScale);
+      smoothLinePlot.y(this.smoothedAccessor, yScale);
       smoothLinePlot.attr(
           'stroke', (d: VZ.ChartHelpers.Datum, i: number,
                      dataset: Plottable.Dataset) =>
@@ -122,7 +120,7 @@ module VZ {
       // visible. We hide it when tooltips are active to keep things clean.
       let scatterPlot = new Plottable.Plots.Scatter<number|Date, number>();
       scatterPlot.x(xAccessor, xScale);
-      scatterPlot.y(this.yAccessor, yScale);
+      scatterPlot.y(this.scalarAccessor, yScale);
       scatterPlot.attr('fill', (d: any) => this.colorScale.scale(d.name));
       scatterPlot.attr('opacity', 1);
       scatterPlot.size(VZ.ChartHelpers.TOOLTIP_CIRCLE_SIZE * 2);
@@ -148,10 +146,10 @@ module VZ {
      */
     private _onDatasetChanged(dataset: Plottable.Dataset) {
       if (this.smoothingEnabled) {
-        this.resmoothDataset(this.getSmoothDataset(dataset.metadata().name));
-        this.updateSpecialDatasets(this.smoothDatasets);
+        this.resmoothDataset(dataset);
+        this.updateSpecialDatasets(this.smoothedAccessor);
       } else {
-        this.updateSpecialDatasets(this.datasets);
+        this.updateSpecialDatasets(this.scalarAccessor);
       }
     }
 
@@ -159,14 +157,16 @@ module VZ {
      * values from all of the regular datasets, e.g. last points in series, or
      * NaN values. Those points will have a `name` and `relative` property added
      * (since usually those are context in the surrounding dataset).
+     * The accessor will point to the correct data to access.
      */
-    private updateSpecialDatasets(datasets: Plottable.Dataset[]) {
+    private updateSpecialDatasets(accessor: Plottable.Accessor<number>) {
       let lastPointsData =
-          datasets
+          this.datasets
               .map((d) => {
                 let datum = null;
                 // filter out NaNs to ensure last point is a clean one
-                let nonNanData = d.data().filter((x) => !isNaN(x.scalar));
+                let nonNanData =
+                    d.data().filter((x) => !isNaN(accessor(x, -1, d)));
                 if (nonNanData.length > 0) {
                   let idx = nonNanData.length - 1;
                   datum = nonNanData[idx];
@@ -187,8 +187,8 @@ module VZ {
         let data = d.data();
         let i = 0;
         while (i < data.length && displayY == null) {
-          if (!isNaN(data[i].scalar)) {
-            displayY = data[i].scalar;
+          if (!isNaN(accessor(data[i], -1, d))) {
+            displayY = accessor(data[i], -1, d);
           }
           i++;
         }
@@ -197,8 +197,8 @@ module VZ {
         }
         let nanData = [];
         for (i = 0; i < data.length; i++) {
-          if (!isNaN(data[i].scalar)) {
-            displayY = data[i].scalar;
+          if (!isNaN(accessor(data[i], -1, d))) {
+            displayY = accessor(data[i], -1, d);
           } else {
             data[i].name = d.metadata().name;
             data[i].displayY = displayY;
@@ -208,7 +208,7 @@ module VZ {
         }
         return nanData;
       };
-      let nanData = _.flatten(datasets.map(datasetToNaNData));
+      let nanData = _.flatten(this.datasets.map(datasetToNaNData));
       this.nanDataset.data(nanData);
     }
 
@@ -250,10 +250,8 @@ module VZ {
 
         let centerBBox: SVGRect =
             (<any>this.gridlines.content().node()).getBBox();
-        let datasets =
-            this.smoothingEnabled ? this.smoothDatasets : plot.datasets();
-        let points =
-            datasets.map((dataset) => this.findClosestPoint(target, dataset));
+        let points = plot.datasets().map(
+            (dataset) => this.findClosestPoint(target, dataset));
         let pointsToCircle = points.filter(
             (p) => p != null &&
                 Plottable.Utils.DOM.intersectsBBox(p.x, p.y, centerBBox));
@@ -309,7 +307,7 @@ module VZ {
         let firstX =
             this.xScale.scale(this.xAccessor(firstPoint, 0, d.dataset));
         let lastX = this.xScale.scale(this.xAccessor(lastPoint, 0, d.dataset));
-        let s = d.datum.scalar;
+        let s = this.smoothingEnabled ? d.datum.smoothed : d.datum.scalar;
         let yD = this.yScale.domain();
         return target.x < firstX || target.x > lastX || s < yD[0] ||
             s > yD[1] || isNaN(s);
@@ -330,6 +328,11 @@ module VZ {
               'background-color',
               (d) => this.colorScale.scale(d.dataset.metadata().name));
       rows.append('td').text((d) => d.dataset.metadata().name);
+      if (this.smoothingEnabled) {
+        rows.append('td').text(
+            (d) => isNaN(d.datum.smoothed) ? 'NaN' :
+                                             valueFormatter(d.datum.smoothed));
+      }
       rows.append('td').text(
           (d) =>
               isNaN(d.datum.scalar) ? 'NaN' : valueFormatter(d.datum.scalar));
@@ -349,18 +352,9 @@ module VZ {
       // prevent it from falling off the right side of the screen
       let left =
           Math.min(0, documentWidth - parentRect.left - nodeRect.width - 60);
-      this.tooltip.style('left', left + 'px');
-      // compute top position
-      if (parentRect.bottom + nodeRect.height +
-              VZ.ChartHelpers.TOOLTIP_Y_PIXEL_OFFSET <
-          document.body.clientHeight) {
-        this.tooltip.style(
-            'top', parentRect.bottom + VZ.ChartHelpers.TOOLTIP_Y_PIXEL_OFFSET);
-      } else {
-        this.tooltip.style(
-            'bottom', parentRect.top - VZ.ChartHelpers.TOOLTIP_Y_PIXEL_OFFSET);
-      }
-
+      let top = parentRect.height + VZ.ChartHelpers.TOOLTIP_Y_PIXEL_OFFSET;
+      this.tooltip.style(
+          'transform', 'translate(' + left + 'px,' + top + 'px)');
       this.tooltip.style('opacity', 1);
     }
 
@@ -369,7 +363,8 @@ module VZ {
         dataset: Plottable.Dataset): VZ.ChartHelpers.Point {
       let points: VZ.ChartHelpers.Point[] = dataset.data().map((d, i) => {
         let x = this.xAccessor(d, i, dataset);
-        let y = this.yAccessor(d, i, dataset);
+        let y = this.smoothingEnabled ? this.smoothedAccessor(d, i, dataset) :
+                                        this.scalarAccessor(d, i, dataset);
         return {
           x: this.xScale.scale(x),
           y: this.yScale.scale(y),
@@ -392,28 +387,18 @@ module VZ {
       }
     }
 
-    private getSmoothDataset(name: string) {
-      if (this.name2smoothDatasets[name] === undefined) {
-        this.name2smoothDatasets[name] =
-            new Plottable.Dataset([], {name: name});
-      }
-      return this.name2smoothDatasets[name];
-    }
-
     private resmoothDataset(dataset: Plottable.Dataset) {
-      let unsmoothedData = this.getDataset(dataset.metadata().name).data();
+      let data = dataset.data();
 
       // EMA with first step initialized to first element.
-      let smoothedData = _.cloneDeep(unsmoothedData);
-      smoothedData.forEach((d, i) => {
+      data.forEach((d, i) => {
         if (i === 0) {
-          return;
+          d.smoothed = d.scalar;
+        } else {
+          d.smoothed = (1.0 - this.smoothingDecay) * d.scalar +
+              this.smoothingDecay * data[i - 1].smoothed;
         }
-        d.scalar = (1.0 - this.smoothingDecay) * d.scalar +
-            this.smoothingDecay * smoothedData[i - 1].scalar;
       });
-
-      dataset.data(smoothedData);
     }
 
     private getDataset(name: string) {
@@ -434,11 +419,6 @@ module VZ {
       this.datasets = names.map((r) => this.getDataset(r));
       this.datasets.forEach((d) => d.onUpdate(this.onDatasetChanged));
       this.linePlot.datasets(this.datasets);
-
-      if (this.smoothingEnabled) {
-        this.smoothDatasets = names.map((r) => this.getSmoothDataset(r));
-        this.smoothLinePlot.datasets(this.smoothDatasets);
-      }
     }
 
     /**
@@ -449,26 +429,26 @@ module VZ {
     }
 
     public smoothingUpdate(decay: number) {
+      this.smoothingDecay = decay;
+      this.datasets.forEach((d) => this.resmoothDataset(d));
+
       if (!this.smoothingEnabled) {
         this.linePlot.addClass('ghost');
+        this.scatterPlot.y(this.smoothedAccessor, this.yScale);
         this.smoothingEnabled = true;
-        this.smoothDatasets =
-            this.seriesNames.map((r) => this.getSmoothDataset(r));
-        this.smoothLinePlot.datasets(this.smoothDatasets);
+        this.smoothLinePlot.datasets(this.datasets);
       }
 
-      this.smoothingDecay = decay;
-      this.smoothDatasets.forEach((d) => this.resmoothDataset(d));
-      this.updateSpecialDatasets(this.smoothDatasets);
+      this.updateSpecialDatasets(this.smoothedAccessor);
     }
 
     public smoothingDisable() {
       if (this.smoothingEnabled) {
         this.linePlot.removeClass('ghost');
-        this.smoothDatasets = [];
-        this.smoothLinePlot.datasets(this.smoothDatasets);
+        this.scatterPlot.y(this.scalarAccessor, this.yScale);
+        this.smoothLinePlot.datasets([]);
         this.smoothingEnabled = false;
-        this.updateSpecialDatasets(this.datasets);
+        this.updateSpecialDatasets(this.scalarAccessor);
       }
     }
 
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 8c92b0bf780..194309b134b 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -634,6 +634,7 @@ def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[]):
                    srcs=srcs,
                    deps=deps + if_cuda(cuda_deps),
                    data=[name + "_check_deps"],
+                   copts=tf_copts(),
                    linkshared=1,
                    linkopts = select({
                        "//conditions:default": [
diff --git a/tensorflow/tf_exported_symbols.lds b/tensorflow/tf_exported_symbols.lds
index e02e51eae07..e7749ab0f70 100644
--- a/tensorflow/tf_exported_symbols.lds
+++ b/tensorflow/tf_exported_symbols.lds
@@ -1 +1,2 @@
 *tensorflow*
+*perftools*gputools*
diff --git a/tensorflow/tf_version_script.lds b/tensorflow/tf_version_script.lds
index 61ffb8d29aa..8c8c8be5a93 100644
--- a/tensorflow/tf_version_script.lds
+++ b/tensorflow/tf_version_script.lds
@@ -1,6 +1,7 @@
 tensorflow {
   global:
     *tensorflow*;
+    *perftools*gputools*;
   local:
     *;
 };
diff --git a/tensorflow/tools/ci_build/install/install_bazel.sh b/tensorflow/tools/ci_build/install/install_bazel.sh
index f033c2514ab..3c5ccb0a04e 100755
--- a/tensorflow/tools/ci_build/install/install_bazel.sh
+++ b/tensorflow/tools/ci_build/install/install_bazel.sh
@@ -17,7 +17,7 @@
 set -e
 
 # Select bazel version.
-BAZEL_VERSION="0.3.0"
+BAZEL_VERSION="0.3.1"
 
 # Install bazel.
 mkdir /bazel
diff --git a/tensorflow/tools/ci_build/install/install_proto3.sh b/tensorflow/tools/ci_build/install/install_proto3.sh
index c4c613b6349..2f1b7dd175c 100755
--- a/tensorflow/tools/ci_build/install/install_proto3.sh
+++ b/tensorflow/tools/ci_build/install/install_proto3.sh
@@ -19,7 +19,7 @@ set -e
 # Install protobuf3.
 
 # Select protobuf version.
-PROTOBUF_VERSION="3.0.0-beta-2"
+PROTOBUF_VERSION="3.0.0"
 
 PROTOBUF_URL="https://github.com/google/protobuf/releases/download/v${PROTOBUF_VERSION}/protoc-${PROTOBUF_VERSION}-linux-x86_64.zip"
 PROTOBUF_ZIP=$(basename "${PROTOBUF_URL}")
@@ -27,7 +27,7 @@ UNZIP_DEST="google-protobuf"
 
 wget -q "${PROTOBUF_URL}"
 unzip "${PROTOBUF_ZIP}" -d "${UNZIP_DEST}"
-cp "${UNZIP_DEST}/protoc" /usr/local/bin/
+cp "${UNZIP_DEST}/bin/protoc" /usr/local/bin/
 
 rm -f "${PROTOBUF_ZIP}"
 rm -rf "${UNZIP_DEST}"
diff --git a/tensorflow/tools/ci_build/update_version.sh b/tensorflow/tools/ci_build/update_version.sh
index 1d1e492ef87..dd02d02d17f 100755
--- a/tensorflow/tools/ci_build/update_version.sh
+++ b/tensorflow/tools/ci_build/update_version.sh
@@ -131,6 +131,13 @@ check_existence file "${TEST_SERVER_DOCKER_FILE}"
 
 sed -i -r -e "s/(.*tensorflow-)([0-9]+\.[0-9]+\.[[:alnum:]]+)(-.*\.whl)/\1${MAJOR}.${MINOR}.${PATCH}\3/g" "${TEST_SERVER_DOCKER_FILE}"
 
+# Update tensorflow/tools/gcs_test/Dockerfile
+GCS_TEST_DOCKER_FILE="${TF_SRC_DIR}/tools/gcs_test/Dockerfile"
+
+check_existence file "${GCS_TEST_DOCKER_FILE}"
+
+sed -i -r -e "s/(.*tensorflow-)([0-9]+\.[0-9]+\.[[:alnum:]]+)(-.*\.whl)/\1${MAJOR}.${MINOR}.${PATCH}\3/g" "${GCS_TEST_DOCKER_FILE}"
+
 
 # Updates to be made if there are major / minor version changes
 MAJOR_MINOR_CHANGE=0
diff --git a/tensorflow/tools/gcs_test/Dockerfile b/tensorflow/tools/gcs_test/Dockerfile
index be3ad40b157..2831a07de76 100644
--- a/tensorflow/tools/gcs_test/Dockerfile
+++ b/tensorflow/tools/gcs_test/Dockerfile
@@ -16,7 +16,7 @@ RUN ./install_google_cloud_sdk.bash --disable-prompts --install-dir=/var/gcloud
 
 # Install nightly TensorFlow pip
 RUN pip install \
-   http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.9.0-cp27-none-linux_x86_64.whl
+   http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl
 
 # Copy test files
 RUN mkdir -p /gcs-smoke/python
diff --git a/third_party/eigen3/BUILD b/third_party/eigen3/BUILD
index 9062ed2ec0d..15534fa9612 100644
--- a/third_party/eigen3/BUILD
+++ b/third_party/eigen3/BUILD
@@ -8,6 +8,7 @@ cc_library(
         "Eigen/Cholesky",
         "Eigen/Eigenvalues",
         "Eigen/QR",
+        "Eigen/SVD",
         "unsupported/Eigen/SpecialFunctions",
         "unsupported/Eigen/CXX11/Tensor",
         "unsupported/Eigen/CXX11/FixedPoint",
diff --git a/third_party/eigen3/Eigen/SVD b/third_party/eigen3/Eigen/SVD
new file mode 100644
index 00000000000..eecf47c1031
--- /dev/null
+++ b/third_party/eigen3/Eigen/SVD
@@ -0,0 +1 @@
+#include "Eigen/SVD"