diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 5e321686531..0e512772849 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -293,14 +293,13 @@ cc_library( ], ) -# TODO(opensource): Make it work externally tf_proto_library( name = "protos_all", srcs = glob(["**/*.proto"]), cc_api_version = 2, go_api_version = 2, java_api_version = 2, - py_api_version = 2, # TODO(irving): Handle 3 + py_api_version = 2, visibility = ["//visibility:public"], ) @@ -507,7 +506,6 @@ filegroup( "kernels/maxpooling_op.h", "kernels/pooling_ops_common.h", "kernels/pooling_ops_common.cc", - "kernels/reference_gemm.h", ], exclude = [ "**/*test.cc", @@ -571,7 +569,6 @@ filegroup( "//tensorflow/core:kernels/no_op.cc", "//tensorflow/core:kernels/no_op.h", "//tensorflow/core:kernels/pack_op.cc", - "//tensorflow/core:kernels/reference_gemm.h", "//tensorflow/core:kernels/reshape_op.cc", "//tensorflow/core:kernels/reshape_op.h", "//tensorflow/core:kernels/reverse_sequence_op.cc", @@ -628,6 +625,8 @@ filegroup( "//tensorflow/core:kernels/relu_op.h", "//tensorflow/core:kernels/softplus_op.cc", "//tensorflow/core:kernels/softplus_op.h", + "//tensorflow/core:kernels/softsign_op.cc", + "//tensorflow/core:kernels/softsign_op.h", "//tensorflow/core:kernels/stack_ops.cc", "//tensorflow/core:kernels/transpose_op.cc", "//tensorflow/core:kernels/transpose_op.h", diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 3178e91f617..63d49b50108 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -758,7 +758,11 @@ void ExecutorState::RunAsync(Executor::DoneCallback done) { // Ask the device to fill in the device context map. Device* device = impl_->params_.device; - device->FillContextMap(graph, &device_context_map_); + Status fill_status = device->FillContextMap(graph, &device_context_map_); + if (!fill_status.ok()) { + done(fill_status); + return; + } // Initialize the ready queue. for (const Node* n : graph->nodes()) { @@ -1077,7 +1081,7 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx, for (int i = 0; i < node->num_outputs(); ++i) { TensorValue val = ctx->release_output(i); - // Only Switch and Recv nodes can generate new dead outputs + // Only Switch and Recv can generate new dead outputs. 
if (*ctx->is_output_dead() || val.tensor == nullptr) { DCHECK(IsSwitch(node) || IsRecv(node)); } else { diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc index 1821289f4b6..32109157aa5 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc @@ -40,13 +40,13 @@ EventMgr::~EventMgr() { delete e; } while (!used_events_.empty()) { - delete used_events_[0].event; - delete used_events_[0].mem; - if (used_events_[0].bufrec.buf) { - used_events_[0].bufrec.alloc->DeallocateRaw(used_events_[0].bufrec.buf); + InUse* ue = &used_events_[0]; + delete ue->event; + delete ue->mem; + if (ue->bufrec.buf) { + ue->bufrec.alloc->DeallocateRaw(ue->bufrec.buf); } - if (used_events_[0].func != nullptr) - threadpool_.Schedule(used_events_[0].func); + if (ue->func != nullptr) threadpool_.Schedule(ue->func); used_events_.pop_front(); } } @@ -60,15 +60,17 @@ EventMgr::~EventMgr() { void EventMgr::PollLoop() { while (!stop_polling_.HasBeenNotified()) { Env::Default()->SleepForMicroseconds(1 * 1000); + ToFreeVector to_free; { mutex_lock l(mu_); - PollEvents(true); + PollEvents(true, &to_free); } + FreeMemory(to_free); } polling_stopped_.Notify(); } -void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu) { +void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu, gpu::Event** e) { VLOG(2) << "QueueInUse free_events_ " << free_events_.size() << " used_events_ " << used_events_.size(); // Events are created on demand, and repeatedly reused. There is no @@ -77,10 +79,9 @@ void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu) { free_events_.push_back(new gpu::Event(exec_)); free_events_.back()->Init(); } - gpu::Event* e = free_events_.back(); + *e = free_events_.back(); free_events_.pop_back(); - stream->ThenRecordEvent(e); - iu.event = e; + iu.event = *e; used_events_.push_back(iu); } @@ -103,7 +104,8 @@ void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu) { // GPU memory use to spike needlessly. An alternative strategy would // be to throttle new Op execution until the pending event queue // clears. -void EventMgr::PollEvents(bool is_dedicated_poller) { +void EventMgr::PollEvents(bool is_dedicated_poller, + gtl::InlinedVector* to_free) { VLOG(2) << "PollEvents free_events_ " << free_events_.size() << " used_events_ " << used_events_.size(); // Sweep the remaining events in order. If this is the dedicated @@ -123,11 +125,9 @@ void EventMgr::PollEvents(bool is_dedicated_poller) { if (!is_dedicated_poller) return; // quit processing queue break; case gpu::Event::Status::kComplete: - delete iu.mem; - if (iu.bufrec.buf) iu.bufrec.alloc->DeallocateRaw(iu.bufrec.buf); - // The function must be called in another thread, outside of - // the mutex held here. - if (iu.func != nullptr) threadpool_.Schedule(iu.func); + // Make a copy of the InUse record so we can free it after releasing + // the lock + to_free->push_back(iu); free_events_.push_back(iu.event); // Mark this InUse record as completed. iu.event = nullptr; diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h index 5fe9fd782db..f2a1ea26031 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h +++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h @@ -18,8 +18,10 @@ limitations under the License. 
#include #include +#include "tensorflow/stream_executor/stream.h" #include "tensorflow/core/lib/core/notification.h" #include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/platform/port.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/public/tensor.h" @@ -47,9 +49,15 @@ class EventMgr { // currently enqueued on *stream have completed. inline void ThenDeleteTensors(perftools::gputools::Stream* stream, std::vector* tensors) { - mutex_lock l(mu_); - QueueTensors(stream, tensors); - PollEvents(false); + ToFreeVector to_free; + ::perftools::gputools::Event* e; + { + mutex_lock l(mu_); + QueueTensors(stream, tensors, &e); + PollEvents(false, &to_free); + } + stream->ThenRecordEvent(e); + FreeMemory(to_free); } struct BufRec { @@ -61,16 +69,28 @@ class EventMgr { // on it as soon as all events currently enqueued on *stream have completed. inline void ThenDeleteBuffer(perftools::gputools::Stream* stream, BufRec bufrec) { - mutex_lock l(mu_); - QueueBuffer(stream, bufrec); - PollEvents(false); + ToFreeVector to_free; + ::perftools::gputools::Event* e; + { + mutex_lock l(mu_); + QueueBuffer(stream, bufrec, &e); + PollEvents(false, &to_free); + } + stream->ThenRecordEvent(e); + FreeMemory(to_free); } inline void ThenExecute(perftools::gputools::Stream* stream, std::function func) { - mutex_lock l(mu_); - QueueFunc(stream, func); - PollEvents(false); + ToFreeVector to_free; + ::perftools::gputools::Event* e; + { + mutex_lock l(mu_); + QueueFunc(stream, func, &e); + PollEvents(false, &to_free); + } + stream->ThenRecordEvent(e); + FreeMemory(to_free); } private: @@ -85,32 +105,50 @@ class EventMgr { std::function func; }; + typedef gtl::InlinedVector ToFreeVector; + + void FreeMemory(const ToFreeVector& to_free) { + for (const auto& iu : to_free) { + delete iu.mem; + if (iu.bufrec.buf) iu.bufrec.alloc->DeallocateRaw(iu.bufrec.buf); + // The function must be called in another thread. + if (iu.func != nullptr) threadpool_.Schedule(iu.func); + } + } + // Stream-enqueue an unused Event and save with it a collection of // Tensors and/or a BufRec to be deleted only after the Event // records. - void QueueInUse(perftools::gputools::Stream* stream, InUse in_use) + void QueueInUse(perftools::gputools::Stream* stream, InUse in_use, + ::perftools::gputools::Event** e) EXCLUSIVE_LOCKS_REQUIRED(mu_); void QueueTensors(perftools::gputools::Stream* stream, - std::vector* tensors) + std::vector* tensors, + ::perftools::gputools::Event** e) EXCLUSIVE_LOCKS_REQUIRED(mu_) { - QueueInUse(stream, {nullptr, tensors, BufRec(), nullptr}); + QueueInUse(stream, {nullptr, tensors, BufRec(), nullptr}, e); } - void QueueBuffer(perftools::gputools::Stream* stream, BufRec bufrec) + void QueueBuffer(perftools::gputools::Stream* stream, BufRec bufrec, + ::perftools::gputools::Event** e) EXCLUSIVE_LOCKS_REQUIRED(mu_) { - QueueInUse(stream, {nullptr, nullptr, bufrec, nullptr}); + QueueInUse(stream, {nullptr, nullptr, bufrec, nullptr}, e); } void QueueFunc(perftools::gputools::Stream* stream, - std::function func) EXCLUSIVE_LOCKS_REQUIRED(mu_) { - QueueInUse(stream, {nullptr, nullptr, BufRec(), func}); + std::function func, ::perftools::gputools::Event** e) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + QueueInUse(stream, {nullptr, nullptr, BufRec(), func}, e); } // This function should be called at roughly the same tempo as // QueueTensors() to check whether pending events have recorded, - // and then retire them. 
- void PollEvents(bool is_dedicated_poller) EXCLUSIVE_LOCKS_REQUIRED(mu_); + // and then retire them. It appends InUse elements that need cleanup + // to "*to_free". The caller should call FreeMemory(to_free) + // when this returns. + void PollEvents(bool is_dedicated_poller, ToFreeVector* to_free) + EXCLUSIVE_LOCKS_REQUIRED(mu_); // An internal polling loop that runs at a low frequency to clear // straggler Events. diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc index 6956ead643e..c6893c91e7e 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc @@ -42,13 +42,21 @@ class TEST_EventMgrHelper { void QueueTensors(perftools::gputools::Stream* stream, std::vector* tensors) { - mutex_lock l(em_->mu_); - em_->QueueTensors(stream, tensors); + ::perftools::gputools::Event* e; + { + mutex_lock l(em_->mu_); + em_->QueueTensors(stream, tensors, &e); + } + stream->ThenRecordEvent(e); } void PollEvents(bool is_dedicated_poller) { - mutex_lock l(em_->mu_); - em_->PollEvents(is_dedicated_poller); + EventMgr::ToFreeVector to_free; + { + mutex_lock l(em_->mu_); + em_->PollEvents(is_dedicated_poller, &to_free); + } + em_->FreeMemory(to_free); } private: diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h index 29f3cf483ac..66f181d4cf0 100644 --- a/tensorflow/core/framework/device_base.h +++ b/tensorflow/core/framework/device_base.h @@ -119,9 +119,10 @@ class DeviceBase { // "event_mgr" is used to delay deallocation of temporary GPU buffers. // TODO(pbar) Work out how to move this out of DeviceBase. struct GpuDeviceInfo { - perftools::gputools::Stream* stream; - DeviceContext* default_context; - EventMgr* event_mgr; + // Make sure all the defaults are NULL, so we can spot missing assignments. + perftools::gputools::Stream* stream = nullptr; + DeviceContext* default_context = nullptr; + EventMgr* event_mgr = nullptr; }; // Does not take ownership. 
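The EventMgr changes above all follow the same pattern: collect the completed InUse records while holding the mutex, then release the lock before recording the event on the stream and before running DeallocateRaw or user callbacks. A minimal standalone sketch of that "gather under lock, free outside the lock" pattern, using a hypothetical ResourceReaper class rather than the real EventMgr API:

```
// Sketch only: ResourceReaper is illustrative, not part of TensorFlow.
#include <deque>
#include <functional>
#include <mutex>
#include <vector>

class ResourceReaper {
 public:
  // Queue a cleanup callback to run once the matching work completes.
  void QueueCleanup(std::function<void()> cleanup) {
    std::lock_guard<std::mutex> l(mu_);
    pending_.push_back(std::move(cleanup));
  }

  // Move completed entries out while holding the lock, then run them
  // after the lock is released so callbacks never execute under mu_.
  void Poll() {
    std::vector<std::function<void()>> to_free;
    {
      std::lock_guard<std::mutex> l(mu_);
      while (!pending_.empty() && IsComplete(pending_.front())) {
        to_free.push_back(std::move(pending_.front()));
        pending_.pop_front();
      }
    }
    for (auto& fn : to_free) fn();  // runs outside the mutex
  }

 private:
  // Placeholder completion test; the real EventMgr polls the GPU event.
  bool IsComplete(const std::function<void()>&) const { return true; }

  std::mutex mu_;
  std::deque<std::function<void()>> pending_;
};

int main() {
  ResourceReaper reaper;
  reaper.QueueCleanup([] { /* e.g. delete a temporary buffer */ });
  reaper.Poll();  // cleanup runs here, outside the reaper's mutex
}
```

The same reasoning explains why ThenRecordEvent moved outside the mutex_lock scope in ThenDeleteTensors/ThenDeleteBuffer/ThenExecute: recording on the stream does not need to be serialized by mu_, and doing it under the lock only lengthens the critical section.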
diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc index 960c6535938..8d5ed3c2fe4 100644 --- a/tensorflow/core/kernels/cast_op.cc +++ b/tensorflow/core/kernels/cast_op.cc @@ -55,6 +55,24 @@ struct CastFunctor { } // namespace functor +#define CURRY_TYPES2(FN, arg0) \ + FN(arg0, bool); \ + FN(arg0, uint8); \ + FN(arg0, int16); \ + FN(arg0, int32); \ + FN(arg0, int64); \ + FN(arg0, float); \ + FN(arg0, double) + +#define CURRY_TYPES3(FN, arg0, arg1) \ + FN(arg0, arg1, bool); \ + FN(arg0, arg1, uint8); \ + FN(arg0, arg1, int16); \ + FN(arg0, arg1, int32); \ + FN(arg0, arg1, int64); \ + FN(arg0, arg1, float); \ + FN(arg0, arg1, double) + #define CAST_CASE(DEVICE, IN, OUT) \ if (DataTypeToEnum::value == src_dtype_ && \ DataTypeToEnum::value == dst_dtype_) { \ @@ -110,27 +128,14 @@ class CpuCastOp : public CastOpBase { work_ = nullptr; // Identity return Status::OK(); } - CAST_CASE(CPUDevice, bool, float); - CAST_CASE(CPUDevice, bool, int32); - CAST_CASE(CPUDevice, bool, double); - CAST_CASE(CPUDevice, double, float); - CAST_CASE(CPUDevice, double, int32); - CAST_CASE(CPUDevice, double, int64); - CAST_CASE(CPUDevice, float, double); - CAST_CASE(CPUDevice, float, uint8); - CAST_CASE(CPUDevice, float, int32); - CAST_CASE(CPUDevice, float, int64); - CAST_CASE(CPUDevice, int32, double); - CAST_CASE(CPUDevice, int32, float); - CAST_CASE(CPUDevice, int32, uint8); - CAST_CASE(CPUDevice, int32, int64); - CAST_CASE(CPUDevice, int64, double); - CAST_CASE(CPUDevice, int64, float); - CAST_CASE(CPUDevice, int64, int32); - CAST_CASE(CPUDevice, uint8, float); - CAST_CASE(CPUDevice, uint8, int32); - CAST_CASE(CPUDevice, uint8, int64); - CAST_CASE(CPUDevice, uint8, double); + CURRY_TYPES3(CAST_CASE, CPUDevice, bool); + CURRY_TYPES3(CAST_CASE, CPUDevice, uint8); + CURRY_TYPES3(CAST_CASE, CPUDevice, int16); + CURRY_TYPES3(CAST_CASE, CPUDevice, int32); + CURRY_TYPES3(CAST_CASE, CPUDevice, int64); + CURRY_TYPES3(CAST_CASE, CPUDevice, float); + CURRY_TYPES3(CAST_CASE, CPUDevice, double); + if (src_dtype_ == DT_BFLOAT16 && dst_dtype_ == DT_FLOAT) { work_ = [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) { int64 N = out->NumElements(); @@ -185,24 +190,15 @@ class GpuCastOp : public CastOpBase { work_ = nullptr; // Identity return Status::OK(); } - CAST_CASE(GPUDevice, bfloat16, float); - CAST_CASE(GPUDevice, bool, float); - CAST_CASE(GPUDevice, double, float); - CAST_CASE(GPUDevice, double, int64); + CURRY_TYPES3(CAST_CASE, GPUDevice, bool); + CURRY_TYPES3(CAST_CASE, GPUDevice, uint8); + CURRY_TYPES3(CAST_CASE, GPUDevice, int16); + CURRY_TYPES3(CAST_CASE, GPUDevice, int32); + CURRY_TYPES3(CAST_CASE, GPUDevice, int64); + CURRY_TYPES3(CAST_CASE, GPUDevice, float); + CURRY_TYPES3(CAST_CASE, GPUDevice, double); CAST_CASE(GPUDevice, float, bfloat16); - CAST_CASE(GPUDevice, float, double); - CAST_CASE(GPUDevice, float, int64); - CAST_CASE(GPUDevice, int64, double); - CAST_CASE(GPUDevice, int64, float); - CAST_CASE(GPUDevice, uint8, float); - CAST_CASE(GPUDevice, float, uint8); - CAST_CASE(GPUDevice, bool, int32); - CAST_CASE(GPUDevice, double, int32); - CAST_CASE(GPUDevice, float, int32); - CAST_CASE(GPUDevice, int32, double); - CAST_CASE(GPUDevice, int32, float); - CAST_CASE(GPUDevice, int32, int64); - CAST_CASE(GPUDevice, int64, int32); + CAST_CASE(GPUDevice, bfloat16, float); return Unimplemented(); } }; @@ -217,28 +213,24 @@ REGISTER_KERNEL_BUILDER(Name("Cast").Device(DEVICE_CPU), CpuCastOp); .TypeConstraint("SrcT") \ .TypeConstraint("DstT") \ .Device(DEVICE_GPU), \ - 
GpuCastOp); -REGISTER_CAST_GPU(bfloat16, float); -REGISTER_CAST_GPU(bool, float); -REGISTER_CAST_GPU(double, float); -REGISTER_CAST_GPU(double, int64); + GpuCastOp) + +CURRY_TYPES2(REGISTER_CAST_GPU, bool); +CURRY_TYPES2(REGISTER_CAST_GPU, uint8); +CURRY_TYPES2(REGISTER_CAST_GPU, int16); +CURRY_TYPES2(REGISTER_CAST_GPU, int32); +CURRY_TYPES2(REGISTER_CAST_GPU, int64); +CURRY_TYPES2(REGISTER_CAST_GPU, float); +CURRY_TYPES2(REGISTER_CAST_GPU, double); REGISTER_CAST_GPU(float, bfloat16); -REGISTER_CAST_GPU(float, double); -REGISTER_CAST_GPU(float, int64); -REGISTER_CAST_GPU(int64, double); -REGISTER_CAST_GPU(int64, float); -REGISTER_CAST_GPU(uint8, float); -REGISTER_CAST_GPU(float, uint8); -REGISTER_CAST_GPU(bool, int32); -REGISTER_CAST_GPU(double, int32); -REGISTER_CAST_GPU(float, int32); -REGISTER_CAST_GPU(int32, double); -REGISTER_CAST_GPU(int32, float); -REGISTER_CAST_GPU(int32, int64); -REGISTER_CAST_GPU(int64, int32); +REGISTER_CAST_GPU(bfloat16, float); + #undef REGISTER_CAST_GPU #endif // GOOGLE_CUDA +#undef CURRY_TYPES2 +#undef CURRY_TYPES3 + // HostCast differs from Cast in that its input and output are in host memory. REGISTER_KERNEL_BUILDER(Name("_HostCast").Device(DEVICE_CPU), CpuCastOp); REGISTER_KERNEL_BUILDER( diff --git a/tensorflow/core/kernels/cast_op_gpu.cu.cc b/tensorflow/core/kernels/cast_op_gpu.cu.cc index 43f8cd90edc..57f08736211 100644 --- a/tensorflow/core/kernels/cast_op_gpu.cu.cc +++ b/tensorflow/core/kernels/cast_op_gpu.cu.cc @@ -33,25 +33,27 @@ struct CastFunctor { } }; -#define DEFINE(O, I) template struct CastFunctor; -DEFINE(float, double); -DEFINE(float, int32); -DEFINE(float, int64); -DEFINE(double, float); -DEFINE(double, int32); -DEFINE(double, int64); -DEFINE(int32, float); -DEFINE(int32, double); -DEFINE(int32, int64); -DEFINE(int64, float); -DEFINE(int64, double); -DEFINE(int64, int32); -DEFINE(int32, bool); -DEFINE(float, bool); -DEFINE(float, uint8); -DEFINE(uint8, float); -DEFINE(float, bfloat16); +#define DEFINE(O, I) template struct CastFunctor +#define DEFINE_ALL_FROM(in_type) \ + DEFINE(in_type, bool); \ + DEFINE(in_type, uint8); \ + DEFINE(in_type, int16); \ + DEFINE(in_type, int32); \ + DEFINE(in_type, int64); \ + DEFINE(in_type, float); \ + DEFINE(in_type, double) + +DEFINE_ALL_FROM(bool); +DEFINE_ALL_FROM(uint8); +DEFINE_ALL_FROM(int16); +DEFINE_ALL_FROM(int32); +DEFINE_ALL_FROM(int64); +DEFINE_ALL_FROM(float); +DEFINE_ALL_FROM(double); DEFINE(bfloat16, float); +DEFINE(float, bfloat16); + +#undef DEFINE_ALL_FROM #undef DEFINE } // end namespace functor diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc index b93c0857db9..168914f5539 100644 --- a/tensorflow/core/kernels/cast_op_test.cc +++ b/tensorflow/core/kernels/cast_op_test.cc @@ -41,22 +41,48 @@ class CastOpTest : public OpsTestBase { void MakeOp(DataType src, DataType dst) { RequireDefaultOps(); EXPECT_OK(NodeDefBuilder("cast_op", "Cast") - .Input(FakeInput(DT_INT32)) + .Input(FakeInput(src)) .Attr("SrcT", src) .Attr("DstT", dst) .Finalize(node_def())); EXPECT_OK(InitOp()); } + + template + void CheckCast() { + DataType in_type = DataTypeToEnum::v(); + DataType out_type = DataTypeToEnum::v(); + MakeOp(in_type, out_type); + AddInputFromArray(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4}); + ASSERT_OK(RunOpKernel()); + Tensor expected(allocator(), out_type, TensorShape({1, 2, 2, 1})); + test::FillValues(&expected, {1, 2, 3, 4}); + test::ExpectTensorEqual(expected, *GetOutput(0)); + } }; -TEST_F(CastOpTest, Int32ToUint8) { - MakeOp(DT_INT32, DT_UINT8); 
- AddInputFromArray(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4}); - ASSERT_OK(RunOpKernel()); - Tensor expected(allocator(), DT_UINT8, TensorShape({1, 2, 2, 1})); - test::FillValues(&expected, {1, 2, 3, 4}); - test::ExpectTensorEqual(expected, *GetOutput(0)); -} +#define TEST_CAST(in, out) \ + TEST_F(CastOpTest, TestCast##_##in##_##out) { CheckCast(); } + +#define TEST_ALL_CASTS_FROM(in) \ + TEST_CAST(in, uint8); \ + TEST_CAST(in, int16); \ + TEST_CAST(in, int32); \ + TEST_CAST(in, int64); \ + TEST_CAST(in, float); \ + TEST_CAST(in, double) + +TEST_ALL_CASTS_FROM(uint8) +TEST_ALL_CASTS_FROM(int16) +TEST_ALL_CASTS_FROM(int32) +TEST_ALL_CASTS_FROM(int64) +TEST_ALL_CASTS_FROM(float) +TEST_ALL_CASTS_FROM(double) + +#undef TEST_ALL_CASTS_FROM +#undef TEST_CAST + +// TODO(wicke): check conversions from/to bool, and bfloat16 static void BM_cpu_float_int64(int iters, int num) { testing::ItemsProcessed(static_cast(iters) * num); diff --git a/tensorflow/core/kernels/concat_op_gpu.cu.cc b/tensorflow/core/kernels/concat_op_gpu.cu.cc index 581171c6bae..084ca9a7643 100644 --- a/tensorflow/core/kernels/concat_op_gpu.cu.cc +++ b/tensorflow/core/kernels/concat_op_gpu.cu.cc @@ -34,10 +34,12 @@ void ConcatGPU(const GPUDevice& d, const std::vector< std::unique_ptr::ConstMatrix>>& inputs, typename TTypes::Matrix* output) { - Eigen::array offset(0, 0); + Eigen::array offset{0, 0}; for (int i = 0; i < inputs.size(); ++i) { - Eigen::array size = inputs[i]->dimensions(); - output->slice(offset, size).device(d) = *inputs[i]; + Eigen::array size; + size[0] = inputs[i]->dimension(0); + size[1] = inputs[i]->dimension(1); + To32Bit(*output).slice(offset, size).device(d) = To32Bit(*inputs[i]); offset[1] += size[1]; } } diff --git a/tensorflow/core/kernels/constant_op_gpu.cu.cc b/tensorflow/core/kernels/constant_op_gpu.cu.cc index 5991391850a..bbb7a0ee284 100644 --- a/tensorflow/core/kernels/constant_op_gpu.cu.cc +++ b/tensorflow/core/kernels/constant_op_gpu.cu.cc @@ -73,7 +73,7 @@ struct FillFunctor { void operator()(const GPUDevice& d, typename TTypes::Flat out, typename TTypes::ConstScalar in) { Eigen::internal::scalar_const_op f(in.data()); - out.device(d) = out.nullaryExpr(f); + To32Bit(out).device(d) = To32Bit(out).nullaryExpr(f); } }; @@ -91,7 +91,7 @@ DEFINE_FILL_GPU(int64); template struct SetZeroFunctor { void operator()(const GPUDevice& d, typename TTypes::Flat out) { - out.device(d) = out.constant(0); + To32Bit(out).device(d) = To32Bit(out).constant(0); } }; diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc index dae06f4bfc7..8bd13b4be3d 100644 --- a/tensorflow/core/kernels/conv_grad_ops.cc +++ b/tensorflow/core/kernels/conv_grad_ops.cc @@ -242,13 +242,13 @@ typedef Eigen::GpuDevice GPUDevice; const auto expanded_out_cols = (output_cols - 1) * stride + 1; \ const auto padded_out_rows = input_rows + filter_rows - 1; \ const auto padded_out_cols = input_cols + filter_cols - 1; \ - const auto top_pad_rows = filter_rows - 1 - pad_rows; \ - const auto left_pad_cols = filter_cols - 1 - pad_cols; \ - const auto bottom_pad_rows = \ + const int top_pad_rows = filter_rows - 1 - pad_rows; \ + const int left_pad_cols = filter_cols - 1 - pad_cols; \ + const int bottom_pad_rows = \ padded_out_rows - expanded_out_rows - top_pad_rows; \ - const auto right_pad_cols = \ + const int right_pad_cols = \ padded_out_cols - expanded_out_cols - left_pad_cols; \ - Eigen::DSizes strides{1, stride, stride, 1}; \ + Eigen::DSizes strides{1, stride, stride, 1}; \ VLOG(2) << "Conv2d: " << label \ 
<< ": expanded_out_rows = " << expanded_out_rows \ << ", expanded_out_cols = " << expanded_out_cols \ @@ -809,9 +809,11 @@ class Conv2DSlowBackpropInputOp : public OpKernel { context->allocate_output(0, input_shape, &in_backprop)); const int padding_rows = - (output_rows - 1) * stride + filter_rows - input_rows; + (padding_ == VALID) ? 0 : (output_rows - 1) * stride + filter_rows - + input_rows; const int padding_cols = - (output_cols - 1) * stride + filter_cols - input_cols; + (padding_ == VALID) ? 0 : (output_cols - 1) * stride + filter_cols - + input_cols; // TODO(keveman): cuDNN only supports equal padding on both sides, so only // calling it when that is true. Remove this check when (if?) cuDNN starts @@ -954,16 +956,17 @@ class Conv2DSlowBackpropInputOp : public OpKernel { context->allocate_temp(DataTypeToEnum::v(), padded_out_shape, &padded_output)); - Eigen::DSizes trivial_order{0, 1, 2, 3}; - Eigen::array, 4> pad_dims{ + Eigen::DSizes trivial_order{0, 1, 2, 3}; + Eigen::array, 4> pad_dims{ {{0, 0}, {top_pad_rows, bottom_pad_rows}, {left_pad_cols, right_pad_cols}, {0, 0}}}; - functor::InflatePadAndShuffle()( - context->eigen_device(), out_backprop.tensor(), strides, - pad_dims, trivial_order, padded_output.tensor()); + functor::InflatePadAndShuffle()( + context->eigen_device(), To32Bit(out_backprop.tensor()), + strides, pad_dims, trivial_order, + To32Bit(padded_output.tensor())); const Tensor& padded_output_cref = padded_output; // We then need to fill a new "reverted" filter @@ -976,11 +979,11 @@ class Conv2DSlowBackpropInputOp : public OpKernel { context->allocate_temp(DataTypeToEnum::v(), r_filter_shape, &r_filter)); - Eigen::DSizes filter_order{0, 1, 3, 2}; + Eigen::DSizes filter_order{0, 1, 3, 2}; Eigen::array filter_rev_dims{true, true, false, false}; - functor::ShuffleAndReverse()( - context->eigen_device(), filter.tensor(), filter_order, - filter_rev_dims, r_filter.tensor()); + functor::ShuffleAndReverse()( + context->eigen_device(), To32Bit(filter.tensor()), + filter_order, filter_rev_dims, To32Bit(r_filter.tensor())); const Tensor& r_filter_cref = r_filter; // Now we can call conv_2d directly. @@ -1039,20 +1042,22 @@ class Conv2DSlowBackpropFilterOp : public OpKernel { context->allocate_output(0, filter_shape, &filter_backprop)); const int padding_rows = - (output_rows - 1) * stride + filter_rows - input_rows; + (padding_ == VALID) ? 0 : (output_rows - 1) * stride + filter_rows - + input_rows; const int padding_cols = - (output_cols - 1) * stride + filter_cols - input_cols; + (padding_ == VALID) ? 0 : (output_cols - 1) * stride + filter_cols - + input_cols; // TODO(zhengxq): cuDNN only supports equal padding on both sides, so only // calling it when that is true. Remove this check when (if?) cuDNN starts // supporting different padding. - bool padding_compatible = - (padding_rows % 2 == 0) && (padding_cols % 2 == 0); + bool rows_odd = (padding_rows % 2 != 0); + bool cols_odd = (padding_cols % 2 != 0); auto* stream = context->op_device_context()->stream(); OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); - if (use_cudnn_ && padding_compatible) { + if (use_cudnn_) { if (filter_rows == 1 && filter_cols == 1 && stride == 1) { const uint64 m = in_depth; const uint64 k = batch * input_rows * input_cols; @@ -1089,10 +1094,31 @@ class Conv2DSlowBackpropFilterOp : public OpKernel { return; } + Tensor compatible_input; + if (rows_odd || cols_odd) { + // If a padding dimension is odd, we have one more element on the right + // side or the bottom side. 
This is unsupported in cudnn. Therefore, + // we pad that extra element and make it compatible. + OP_REQUIRES_OK( + context, + context->allocate_temp( + DataTypeToEnum::value, + TensorShape({input.dim_size(0), input.dim_size(1) + rows_odd, + input.dim_size(2) + cols_odd, input.dim_size(3)}), + &compatible_input)); + + functor::PadInput()( + context->template eigen_device(), + To32Bit(input.tensor()), 0, rows_odd, 0, cols_odd, + To32Bit(compatible_input.tensor())); + } else { + compatible_input = input; + } + perftools::gputools::dnn::BatchDescriptor input_desc; input_desc.set_count(batch) - .set_height(input_rows) - .set_width(input_cols) + .set_height(compatible_input.dim_size(1)) + .set_width(compatible_input.dim_size(2)) .set_feature_map_count(in_depth) .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); perftools::gputools::dnn::BatchDescriptor output_desc; @@ -1146,14 +1172,19 @@ class Conv2DSlowBackpropFilterOp : public OpKernel { transformed_out_backprop.tensor()); Tensor transformed_input; - OP_REQUIRES_OK(context, - context->allocate_temp( - DataTypeToEnum::value, - TensorShape({batch, in_depth, input_rows, input_cols}), - &transformed_input)); - functor::NHWCToNCHW()(context->eigen_device(), - input.tensor(), - transformed_input.tensor()); + OP_REQUIRES_OK( + context, + context->allocate_temp( + DataTypeToEnum::value, + TensorShape({ + compatible_input.dim_size(0), compatible_input.dim_size(3), + compatible_input.dim_size(1), compatible_input.dim_size(2), + }), + &transformed_input)); + functor::NHWCToNCHW()( + context->eigen_device(), + const_cast(compatible_input).tensor(), + transformed_input.tensor()); auto out_backprop_ptr = AsDeviceMemory(transformed_out_backprop.template flat().data(), @@ -1193,7 +1224,7 @@ class Conv2DSlowBackpropFilterOp : public OpKernel { // [batch, out_rows, out_cols, out_depth] // And we need to change it to // [out_depth, out_rows, out_cols, batch] - Eigen::DSizes out_order{3, 1, 2, 0}; + Eigen::DSizes out_order{3, 1, 2, 0}; TensorShape padded_out_shape( {out_depth, padded_out_rows, padded_out_cols, batch}); Tensor padded_output; @@ -1201,14 +1232,14 @@ class Conv2DSlowBackpropFilterOp : public OpKernel { context->allocate_temp(DataTypeToEnum::v(), padded_out_shape, &padded_output)); - Eigen::array, 4> pad_dims{ + Eigen::array, 4> pad_dims{ {{0, 0}, {top_pad_rows, bottom_pad_rows}, {left_pad_cols, right_pad_cols}, {0, 0}}}; - functor::InflatePadAndShuffle()( - context->eigen_device(), out_backprop.tensor(), strides, - pad_dims, out_order, padded_output.tensor()); + functor::InflatePadAndShuffle()( + context->eigen_device(), To32Bit(out_backprop.tensor()), + strides, pad_dims, out_order, To32Bit(padded_output.tensor())); const Tensor& padded_output_cref = padded_output; // For the backprop of the filter, we need to transpose the input. @@ -1216,7 +1247,7 @@ class Conv2DSlowBackpropFilterOp : public OpKernel { // [batch, in_rows, in_cols, in_depth] // And we need to change it to // [in_rows, in_cols, batch, in_depth] - Eigen::DSizes in_order{1, 2, 0, 3}; + Eigen::DSizes in_order{1, 2, 0, 3}; TensorShape in_shuffle_shape({input_rows, input_cols, batch, in_depth}); Tensor in_shuffle; OP_REQUIRES_OK(context, @@ -1225,9 +1256,9 @@ class Conv2DSlowBackpropFilterOp : public OpKernel { // No need for reversing this time. 
Eigen::array trivial_dims{false, false, false, false}; - functor::ShuffleAndReverse()( - context->eigen_device(), input.tensor(), in_order, - trivial_dims, in_shuffle.tensor()); + functor::ShuffleAndReverse()( + context->eigen_device(), To32Bit(input.tensor()), + in_order, trivial_dims, To32Bit(in_shuffle.tensor())); const Tensor& in_shuffle_cref = in_shuffle; // The output of the conv_2d would be @@ -1250,12 +1281,13 @@ class Conv2DSlowBackpropFilterOp : public OpKernel { BrainPadding2EigenPadding(VALID)); // Now copy the filter_backprop back to the destination. - Eigen::DSizes filter_order{1, 2, 3, 0}; + Eigen::DSizes filter_order{1, 2, 3, 0}; Eigen::array filter_rev_dims{true, true, false, false}; const Tensor& filter_shuffle_cref = filter_shuffle; - functor::ShuffleAndReverse()( - context->eigen_device(), filter_shuffle_cref.tensor(), - filter_order, filter_rev_dims, filter_backprop->tensor()); + functor::ShuffleAndReverse()( + context->eigen_device(), + To32Bit(filter_shuffle_cref.tensor()), filter_order, + filter_rev_dims, To32Bit(filter_backprop->tensor())); } } @@ -1271,25 +1303,6 @@ class Conv2DSlowBackpropFilterOp : public OpKernel { namespace functor { #define DECLARE_GPU_SPEC(T) \ template <> \ - void ShuffleAndReverse::operator()( \ - const GPUDevice& d, \ - typename TTypes::ConstTensor input, \ - const Eigen::DSizes& order, \ - const Eigen::array& reverse_dims, \ - typename TTypes::Tensor output); \ - extern template struct ShuffleAndReverse; \ - template <> \ - void InflatePadAndShuffle::operator()( \ - const GPUDevice& d, \ - typename TTypes::ConstTensor input, \ - const Eigen::DSizes& strides, \ - const Eigen::array, 4>& pad_dims, \ - const Eigen::DSizes& order, \ - typename TTypes::Tensor output); \ - extern template struct InflatePadAndShuffle; \ - template <> \ void ShuffleAndReverse::operator()( \ const GPUDevice& d, typename TTypes::ConstTensor input, \ const Eigen::DSizes& order, \ @@ -1328,7 +1341,13 @@ namespace functor { typename TTypes::ConstTensor filter, \ typename TTypes::ConstTensor output_backprop, int input_rows, \ int input_cols, int stride); \ - extern template struct SpatialConvolutionBackwardInput + extern template struct SpatialConvolutionBackwardInput; \ + template <> \ + void PadInput::operator()( \ + const GPUDevice& d, typename TTypes::ConstTensor in, \ + int padding_rows_left, int padding_rows_right, int padding_cols_left, \ + int padding_cols_right, typename TTypes::Tensor out); \ + extern template struct PadInput; DECLARE_GPU_SPEC(float); #undef DECLARE_GPU_SPEC diff --git a/tensorflow/core/kernels/conv_ops_gpu.cu.cc b/tensorflow/core/kernels/conv_ops_gpu.cu.cc index 60ff6b00241..e4ee058406e 100644 --- a/tensorflow/core/kernels/conv_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/conv_ops_gpu.cu.cc @@ -33,12 +33,8 @@ struct SpatialConvolution { typename TTypes::ConstTensor input, typename TTypes::ConstTensor filter, int stride, const Eigen::PaddingType& padding) { - // TODO(keveman): nvcc 6.5 crashes when 32 bit indexing is turned on. Enable - // this when we move to cuda 7.0. 
- // SpatialConvolutionFunc(d, To32Bit(output), To32Bit(input), - // To32Bit(filter), stride, padding); - - SpatialConvolutionFunc(d, output, input, filter, stride, padding); + SpatialConvolutionFunc(d, To32Bit(output), To32Bit(input), To32Bit(filter), + stride, padding); } }; diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc index bc2b62375f1..8fed594b258 100644 --- a/tensorflow/core/kernels/cwise_op_div.cc +++ b/tensorflow/core/kernels/cwise_op_div.cc @@ -16,21 +16,11 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER5(BinaryOp, CPU, "Div", functor::div, float, double, int32, int64, - complex64); +REGISTER7(BinaryOp, CPU, "Div", functor::div, float, double, uint8, int16, + int32, int64, complex64); #if GOOGLE_CUDA -REGISTER3(BinaryOp, GPU, "Div", functor::div, float, double, int64); +REGISTER6(BinaryOp, GPU, "Div", functor::div, float, double, uint8, int16, + int32, int64); #endif -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. -REGISTER_KERNEL_BUILDER(Name("Div") - .Device(DEVICE_GPU) - .HostMemory("x") - .HostMemory("y") - .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); - } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_gpu_div.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_div.cu.cc index 80a02da6512..a2809d54811 100644 --- a/tensorflow/core/kernels/cwise_op_gpu_div.cu.cc +++ b/tensorflow/core/kernels/cwise_op_gpu_div.cu.cc @@ -19,7 +19,7 @@ limitations under the License. namespace tensorflow { namespace functor { -DEFINE_BINARY3(div, float, double, int64); +DEFINE_BINARY6(div, float, double, uint8, int16, int32, int64); } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc index a4ecaf185ac..068003b2945 100644 --- a/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc +++ b/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc @@ -19,7 +19,7 @@ limitations under the License. namespace tensorflow { namespace functor { -DEFINE_BINARY3(mul, float, double, int64); +DEFINE_BINARY7(mul, float, double, uint8, int8, int16, int32, int64); } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_mul.cc b/tensorflow/core/kernels/cwise_op_mul.cc index a7b9859b193..42d50358e63 100644 --- a/tensorflow/core/kernels/cwise_op_mul.cc +++ b/tensorflow/core/kernels/cwise_op_mul.cc @@ -16,21 +16,11 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER7(BinaryOp, CPU, "Mul", functor::mul, float, double, int32, int64, int8, - int16, complex64); +REGISTER8(BinaryOp, CPU, "Mul", functor::mul, float, double, uint8, int8, int16, + int32, int64, complex64); #if GOOGLE_CUDA -REGISTER3(BinaryOp, GPU, "Mul", functor::mul, float, double, int64); +REGISTER7(BinaryOp, GPU, "Mul", functor::mul, float, double, uint8, int8, int16, + int32, int64); #endif -// A special GPU kernel for int32. -// TODO(b/25387198): Also enable int32 in device memory. This kernel -// registration requires all int32 inputs and outputs to be in host memory. 
-REGISTER_KERNEL_BUILDER(Name("Mul") - .Device(DEVICE_GPU) - .HostMemory("x") - .HostMemory("y") - .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); - } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_ops_common.h b/tensorflow/core/kernels/cwise_ops_common.h index 3296826d483..adf4203322a 100644 --- a/tensorflow/core/kernels/cwise_ops_common.h +++ b/tensorflow/core/kernels/cwise_ops_common.h @@ -379,6 +379,8 @@ struct SelectFunctor { #define REGISTER6(OP, D, N, F, T0, T1, T2, T3, T4, T5) REGISTER(OP, D, N, F, T0) #define REGISTER7(OP, D, N, F, T0, T1, T2, T3, T4, T5, T6) \ REGISTER(OP, D, N, F, T0) +#define REGISTER8(OP, D, N, F, T0, T1, T2, T3, T4, T5, T6, T7) \ + REGISTER(OP, D, N, F, T0) #else // !defined(__ANDROID__) #define REGISTER2(OP, D, N, F, T0, T1) \ REGISTER(OP, D, N, F, T0) \ @@ -398,6 +400,9 @@ struct SelectFunctor { #define REGISTER7(OP, D, N, F, T0, T1, T2, T3, T4, T5, T6) \ REGISTER4(OP, D, N, F, T0, T1, T2, T3) \ REGISTER3(OP, D, N, F, T4, T5, T6) +#define REGISTER8(OP, D, N, F, T0, T1, T2, T3, T4, T5, T6, T7) \ + REGISTER4(OP, D, N, F, T0, T1, T2, T3) \ + REGISTER4(OP, D, N, F, T4, T5, T6, T7) #endif // defined(__ANDROID__) } // end namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h index 966d3393b65..091c6717dc3 100644 --- a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h +++ b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h @@ -40,7 +40,7 @@ template struct UnaryFunctor { void operator()(const GPUDevice& d, typename Functor::tout_type out, typename Functor::tin_type in) { - out.device(d) = in.unaryExpr(typename Functor::func()); + To32Bit(out).device(d) = To32Bit(in).unaryExpr(typename Functor::func()); } }; @@ -50,7 +50,8 @@ struct BinaryFunctor { void operator()(const GPUDevice& d, typename Functor::tout_type out, typename Functor::tin_type in0, typename Functor::tin_type in1) { - out.device(d) = in0.binaryExpr(in1, typename Functor::func()); + To32Bit(out).device(d) = + To32Bit(in0).binaryExpr(in1, typename Functor::func()); } void Left(const GPUDevice& d, typename Functor::tout_type out, @@ -60,7 +61,7 @@ struct BinaryFunctor { typedef typename Functor::in_type Tin; typedef typename Functor::func Binary; typedef typename Eigen::internal::scalar_left Unary; - out.device(d) = in.unaryExpr(Unary(scalar.data())); + To32Bit(out).device(d) = To32Bit(in).unaryExpr(Unary(scalar.data())); } void Right(const GPUDevice& d, typename Functor::tout_type out, @@ -70,7 +71,7 @@ struct BinaryFunctor { typedef typename Functor::in_type Tin; typedef typename Functor::func Binary; typedef typename Eigen::internal::scalar_right Unary; - out.device(d) = in.unaryExpr(Unary(scalar.data())); + To32Bit(out).device(d) = To32Bit(in).unaryExpr(Unary(scalar.data())); } void BCast(const GPUDevice& d, @@ -86,16 +87,18 @@ struct BinaryFunctor { const bool bcast0_all_one = AllOne(bcast0); const bool bcast1_all_one = AllOne(bcast1); if (bcast0_all_one && !bcast1_all_one) { - out.device(d) = in0.binaryExpr(in1.broadcast(bcast1), func); + To32Bit(out).device(d) = + To32Bit(in0).binaryExpr(To32Bit(in1).broadcast(bcast1), func); return; } if (!bcast0_all_one && bcast1_all_one) { - out.device(d) = in0.broadcast(bcast0).binaryExpr(in1, func); + To32Bit(out).device(d) = + To32Bit(in0).broadcast(bcast0).binaryExpr(To32Bit(in1), func); return; } } - out.device(d) = - in0.broadcast(bcast0).binaryExpr(in1.broadcast(bcast1), func); + To32Bit(out).device(d) = To32Bit(in0).broadcast(bcast0).binaryExpr( 
+ To32Bit(in1).broadcast(bcast1), func); } }; @@ -105,7 +108,8 @@ struct SelectFunctor { typename TTypes::ConstFlat cond_flat, typename TTypes::ConstFlat then_flat, typename TTypes::ConstFlat else_flat) { - out.device(d) = cond_flat.select(then_flat, else_flat); + To32Bit(out).device(d) = + To32Bit(cond_flat).select(To32Bit(then_flat), To32Bit(else_flat)); } }; @@ -143,6 +147,12 @@ struct SelectFunctor { #define DEFINE_BINARY5(F, T0, T1, T2, T3, T4) \ DEFINE_BINARY2(F, T0, T1); \ DEFINE_BINARY3(F, T2, T3, T4) +#define DEFINE_BINARY6(F, T0, T1, T2, T3, T4, T5) \ + DEFINE_BINARY3(F, T0, T1, T2); \ + DEFINE_BINARY3(F, T3, T4, T5) +#define DEFINE_BINARY7(F, T0, T1, T2, T3, T4, T5, T6) \ + DEFINE_BINARY3(F, T0, T1, T2); \ + DEFINE_BINARY4(F, T3, T4, T5, T6) } // end namespace functor } // end namespace tensorflow diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc index fb779f24665..9ae2eedb30c 100644 --- a/tensorflow/core/kernels/lrn_op.cc +++ b/tensorflow/core/kernels/lrn_op.cc @@ -30,10 +30,17 @@ limitations under the License. namespace tensorflow { +namespace { + +// When the depth is large and beta_ is 0.5 or 1.0, MognetLRN is faster than the +// main band matrix approach used below. Benchmarks suggest switching to +// MognetLRN when depth > 384. +const int kMognetLRNDepthCutoff = 384; + // Create a depth-by-depth band matrix with 1s along a swath of size (2 * // depth_radius + 1) around the diagonal. -static void GetBandMatrix(int depth, int64 depth_radius, - Eigen::Tensor* result) { +void GetBandMatrix(int depth, int64 depth_radius, + Eigen::Tensor* result) { result->setZero(); for (int row = 0; row < depth; ++row) { const int begin = std::max(0, row - depth_radius); @@ -44,6 +51,8 @@ static void GetBandMatrix(int depth, int64 depth_radius, } } +} // namespace + class LRNOp : public OpKernel { public: explicit LRNOp(OpKernelConstruction* context) : OpKernel(context) { @@ -69,6 +78,11 @@ class LRNOp : public OpKernel { #if defined(__ANDROID__) MognetLRN(in, batch, rows, cols, depth, output); #else + if (depth > kMognetLRNDepthCutoff && (beta_ == 0.5f || beta_ == 1.0f)) { + MognetLRN(in, batch, rows, cols, depth, output); + return; + } + const int nodes = cols * rows; auto in_shaped = in.shaped({nodes * batch, depth}); @@ -79,13 +93,16 @@ class LRNOp : public OpKernel { auto out_shaped = output->shaped({nodes * batch, depth}); Eigen::array dims = {{DimPair(1, 0)}}; - /// TODO(keveman): Optimize for beta in {0, 1, 0.5} - out_shaped.device(context->eigen_cpu_device()) = - in_shaped / - in_shaped.square() - .contract(multiplier, dims) - .unaryExpr([this](float x) { return bias_ + alpha_ * x; }) - .pow(beta_); + auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_; + if (beta_ == 1.0f) { + out_shaped.device(context->eigen_cpu_device()) = + in_shaped * tmp.inverse(); + } else if (beta_ == 0.5f) { + out_shaped.device(context->eigen_cpu_device()) = in_shaped * tmp.rsqrt(); + } else { + out_shaped.device(context->eigen_cpu_device()) = + in_shaped * (tmp.log() * -beta_).exp(); + } #endif } @@ -104,11 +121,11 @@ class LRNOp : public OpKernel { Eigen::VectorXf padded_square(data_in.rows() + double_depth_radius); padded_square.setZero(); for (int r = 0; r < data_in.cols(); ++r) { - // Do local response normalization for data_in(:, r) - // first, compute the square and store them in buffer for repeated use + // Do local response normalization for data_in(:, r). First, compute the + // square and store them in buffer for repeated use. 
padded_square.block(depth_radius_, 0, data_out.rows(), 1) = data_in.col(r).cwiseProduct(data_in.col(r)) * alpha_; - // Then, compute the scale and writes them to data_out + // Then, compute the scale and write it to data_out. float accumulated_scale = 0; for (int i = 0; i < double_depth_radius; ++i) { accumulated_scale += padded_square(i); @@ -120,13 +137,13 @@ class LRNOp : public OpKernel { } } - // In a few cases, the pow computation could benefit from speedups. if (beta_ == 1) { data_out.array() = data_in.array() * data_out.array().inverse(); } else if (beta_ == 0.5) { - data_out.array() = data_in.array() * data_out.array().sqrt().inverse(); + data_out.array() = data_in.array() * data_out.array().rsqrt(); } else { - data_out.array() = data_in.array() * data_out.array().pow(-beta_); + data_out.array() = + data_in.array() * (data_out.array().log() * -beta_).exp(); } } diff --git a/tensorflow/core/kernels/reference_gemm.h b/tensorflow/core/kernels/reference_gemm.h deleted file mode 100644 index 16fa541238f..00000000000 --- a/tensorflow/core/kernels/reference_gemm.h +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright 2015 Google Inc. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_KERNELS_REFERENCE_GEMM_H_ -#define TENSORFLOW_KERNELS_REFERENCE_GEMM_H_ - -// This is an unoptimized but debuggable implementation of the GEMM matrix -// multiply function, used to compare to faster but more opaque versions, or -// for bit depths or argument combinations that aren't supported by optimized -// code. -// It assumes the row-major convention used by TensorFlow, and implements -// C = A * B, like the standard BLAS GEMM interface. If the tranpose flags are -// true, then the relevant matrix is treated as stored in column-major order. - -namespace tensorflow { -template -void ReferenceGemm(bool transpose_a, bool transpose_b, bool transpose_c, - size_t m, size_t n, size_t k, const T1* a, T1 offset_a, - size_t lda, const T2* b, T2 offset_b, size_t ldb, T3* c, - int32 shift_c, int32 offset_c, int32 mult_c, size_t ldc) { - int a_i_stride; - int a_l_stride; - if (transpose_a) { - a_i_stride = 1; - a_l_stride = lda; - } else { - a_i_stride = lda; - a_l_stride = 1; - } - int b_j_stride; - int b_l_stride; - if (transpose_b) { - b_j_stride = ldb; - b_l_stride = 1; - } else { - b_j_stride = 1; - b_l_stride = ldb; - } - int c_i_stride; - int c_j_stride; - if (transpose_c) { - c_i_stride = 1; - c_j_stride = ldc; - } else { - c_i_stride = ldc; - c_j_stride = 1; - } - - const int32 highest = static_cast(Eigen::NumTraits::highest()); - const int32 lowest = static_cast(Eigen::NumTraits::lowest()); - const int32 rounding = (shift_c < 1) ? 
0 : (1 << (shift_c - 1)); - - int i, j, l; - for (j = 0; j < n; j++) { - for (i = 0; i < m; i++) { - int32 total = 0; - for (l = 0; l < k; l++) { - const size_t a_index = ((i * a_i_stride) + (l * a_l_stride)); - const int32 a_value = a[a_index] - offset_a; - const size_t b_index = ((j * b_j_stride) + (l * b_l_stride)); - const int32 b_value = b[b_index] - offset_b; - total += (a_value * b_value); - } - const size_t c_index = ((i * c_i_stride) + (j * c_j_stride)); - int32_t output = ((((total + offset_c) * mult_c) + rounding) >> shift_c); - if (output > highest) { - output = highest; - } - if (output < lowest) { - output = lowest; - } - c[c_index] = static_cast(output); - } - } -} -} // namespace tensorflow - -#endif // TENSORFLOW_KERNELS_REFERENCE_GEMM_H_ diff --git a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc index 0671414c510..a25c68a15ac 100644 --- a/tensorflow/core/kernels/reverse_sequence_op.cc +++ b/tensorflow/core/kernels/reverse_sequence_op.cc @@ -39,7 +39,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; template -void CheckErrors(OpKernelContext* context, int seq_dim) { +void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) { const Tensor& input = context->input(0); const Tensor& seq_lens = context->input(1); @@ -52,15 +52,18 @@ void CheckErrors(OpKernelContext* context, int seq_dim) { seq_lens_vec.data(), seq_lens_t.data(), sizeof(int64) * seq_lens_t.size()); - OP_REQUIRES(context, 0 != seq_dim, errors::InvalidArgument("0 == seq_dim")); + OP_REQUIRES(context, batch_dim != seq_dim, + errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim)); OP_REQUIRES(context, seq_dim < input.dims(), errors::InvalidArgument("seq_dim must be < input.dims()", "( ", seq_dim, " vs. ", input.dims(), ")")); - - OP_REQUIRES(context, seq_lens.NumElements() == input.dim_size(0), - errors::InvalidArgument("len(seq_lens) != input.dims(", 0, "), ", - "(", seq_lens.NumElements(), " vs. ", - input.dim_size(seq_dim))); + OP_REQUIRES(context, batch_dim < input.dims(), + errors::InvalidArgument("batch_dim must be < input.dims()", "( ", + batch_dim, " vs. ", input.dims(), ")")); + OP_REQUIRES(context, seq_lens.NumElements() == input.dim_size(batch_dim), + errors::InvalidArgument("len(seq_lens) != input.dims(", batch_dim, + "), ", "(", seq_lens.NumElements(), + " vs. ", input.dim_size(batch_dim))); for (int d = 0; d < seq_lens_vec.size(); ++d) { OP_REQUIRES(context, seq_lens_vec[d] >= 0, @@ -72,19 +75,24 @@ void CheckErrors(OpKernelContext* context, int seq_dim) { } template <> -void CheckErrors(OpKernelContext* context, int seq_dim) { +void CheckErrors(OpKernelContext* context, int batch_dim, + int seq_dim) { const Tensor& input = context->input(0); const Tensor& seq_lens = context->input(1); - OP_REQUIRES(context, 0 != seq_dim, errors::InvalidArgument("0 == seq_dim")); + OP_REQUIRES(context, batch_dim != seq_dim, + errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim)); OP_REQUIRES(context, seq_dim < input.dims(), errors::InvalidArgument("seq_dim must be < input.dims()", "( ", seq_dim, " vs. ", input.dims(), ")")); + OP_REQUIRES(context, batch_dim < input.dims(), + errors::InvalidArgument("batch_dim must be < input.dims()", "( ", + batch_dim, " vs. ", input.dims(), ")")); - OP_REQUIRES(context, seq_lens.NumElements() == input.dim_size(0), - errors::InvalidArgument("len(seq_lens) != input.dims(", 0, "), ", - "(", seq_lens.NumElements(), " vs. 
", - input.dim_size(seq_dim))); + OP_REQUIRES(context, seq_lens.NumElements() == input.dim_size(batch_dim), + errors::InvalidArgument("len(seq_lens) != input.dims(", batch_dim, + "), ", "(", seq_lens.NumElements(), + " vs. ", input.dim_size(batch_dim))); } template @@ -92,6 +100,7 @@ class ReverseSequenceOp : public OpKernel { public: explicit ReverseSequenceOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("batch_dim", &batch_dim_)); OP_REQUIRES_OK(context, context->GetAttr("seq_dim", &seq_dim_)); } @@ -106,7 +115,7 @@ class ReverseSequenceOp : public OpKernel { auto seq_lens_t = seq_lens.vec(); - CheckErrors(context, seq_dim_); + CheckErrors(context, batch_dim_, seq_dim_); const int input_dims = input.dims(); @@ -114,11 +123,11 @@ class ReverseSequenceOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, input.shape(), &output)); -#define HANDLE_DIM(NDIM) \ - case NDIM: \ - functor::ReverseSequence::Compute( \ - context->eigen_device(), input.tensor(), seq_dim_, \ - seq_lens_t, output->tensor()); \ +#define HANDLE_DIM(NDIM) \ + case NDIM: \ + functor::ReverseSequence::Compute( \ + context->eigen_device(), input.tensor(), batch_dim_, \ + seq_dim_, seq_lens_t, output->tensor()); \ break; switch (input_dims) { @@ -136,6 +145,7 @@ class ReverseSequenceOp : public OpKernel { } private: + int32 batch_dim_; int32 seq_dim_; TF_DISALLOW_COPY_AND_ASSIGN(ReverseSequenceOp); @@ -152,12 +162,12 @@ TF_CALL_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE); // Forward declarations of the functor specializations for GPU. namespace functor { -#define DECLARE_GPU_SPEC(T, Dims) \ - template <> \ - void ReverseSequence::Compute( \ - const GPUDevice& d, typename TTypes::ConstTensor input, \ - int32 seq_dim, TTypes::ConstVec seq_lens, \ - typename TTypes::Tensor output); \ +#define DECLARE_GPU_SPEC(T, Dims) \ + template <> \ + void ReverseSequence::Compute( \ + const GPUDevice& d, typename TTypes::ConstTensor input, \ + int32 batch_dim, int32 seq_dim, TTypes::ConstVec seq_lens, \ + typename TTypes::Tensor output); \ extern template struct ReverseSequence; #define DECLARE_GPU_SPECS(T) \ diff --git a/tensorflow/core/kernels/reverse_sequence_op.h b/tensorflow/core/kernels/reverse_sequence_op.h index ceb1b0b8801..9dd1e4d01dd 100644 --- a/tensorflow/core/kernels/reverse_sequence_op.h +++ b/tensorflow/core/kernels/reverse_sequence_op.h @@ -29,15 +29,19 @@ template class ReverseGenerator { public: EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - ReverseGenerator(typename TTypes::ConstTensor input, int32 seq_dim, - TTypes::ConstVec seq_lengths) - : input_(input), seq_dim_(seq_dim), seq_lengths_(seq_lengths) {} + ReverseGenerator(typename TTypes::ConstTensor input, int32 batch_dim, + int32 seq_dim, TTypes::ConstVec seq_lengths) + : input_(input), + batch_dim_(batch_dim), + seq_dim_(seq_dim), + seq_lengths_(seq_lengths) {} EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T operator()(const Eigen::array& coords) const { Eigen::array new_coords = coords; - if (coords[seq_dim_] < seq_lengths_(coords[0])) { - new_coords[seq_dim_] = seq_lengths_(coords[0]) - coords[seq_dim_] - 1; + if (coords[seq_dim_] < seq_lengths_(coords[batch_dim_])) { + new_coords[seq_dim_] = + seq_lengths_(coords[batch_dim_]) - coords[seq_dim_] - 1; } return input_(new_coords); @@ -45,6 +49,7 @@ class ReverseGenerator { private: typename TTypes::ConstTensor input_; + int32 batch_dim_; int32 seq_dim_; TTypes::ConstVec seq_lengths_; }; @@ -57,9 +62,10 @@ template struct ReverseSequence { EIGEN_ALWAYS_INLINE static 
void Compute( const Device& d, typename TTypes::ConstTensor input, - int32 seq_dim, TTypes::ConstVec seq_lengths, + int32 batch_dim, int32 seq_dim, TTypes::ConstVec seq_lengths, typename TTypes::Tensor output) { - generator::ReverseGenerator generator(input, seq_dim, seq_lengths); + generator::ReverseGenerator generator(input, batch_dim, seq_dim, + seq_lengths); output.device(d) = input.generate(generator); } }; diff --git a/tensorflow/core/kernels/softsign_op.cc b/tensorflow/core/kernels/softsign_op.cc new file mode 100644 index 00000000000..e3480e35947 --- /dev/null +++ b/tensorflow/core/kernels/softsign_op.cc @@ -0,0 +1,112 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/nn_ops.cc. + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/numeric_op.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/softsign_op.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template +class SoftsignOp : public UnaryElementWiseOp> { + public: + using UnaryElementWiseOp>::UnaryElementWiseOp; + + void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) { + functor::Softsign functor; + functor(context->eigen_device(), input.flat(), + output->flat()); + } +}; + +template +class SoftsignGradOp + : public BinaryElementWiseOp> { + public: + using BinaryElementWiseOp>::BinaryElementWiseOp; + + // INPUTS: + // g (gradients): backpropagated gradients + // a (inputs): inputs that were passed to SoftsignOp() + // OUTPUT: + // gradients to backprop + template + void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a, + Tensor* output) { + OP_REQUIRES(context, a.IsSameSize(g), + errors::InvalidArgument("g and a must be the same size")); + functor::SoftsignGrad functor; + functor(context->eigen_device(), g.flat(), a.flat(), + output->flat()); + } +}; + +#define REGISTER_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Softsign").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftsignOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("SoftsignGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftsignGradOp); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS); +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +// Forward declarations of the functor specializations for GPU. 
+namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void Softsign::operator()( \ + const GPUDevice& d, typename TTypes::ConstTensor features, \ + typename TTypes::Tensor activations); \ + extern template struct Softsign; \ + \ + template <> \ + void SoftsignGrad::operator()( \ + const GPUDevice& d, typename TTypes::ConstTensor gradients, \ + typename TTypes::ConstTensor features, \ + typename TTypes::Tensor backprops); \ + extern template struct SoftsignGrad; + +TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); +} // namespace functor + +// Registration of the GPU implementations. +#define REGISTER_GPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Softsign").Device(DEVICE_GPU).TypeConstraint("T"), \ + SoftsignOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("SoftsignGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + SoftsignGradOp); + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/softsign_op.h b/tensorflow/core/kernels/softsign_op.h new file mode 100644 index 00000000000..36790a5874c --- /dev/null +++ b/tensorflow/core/kernels/softsign_op.h @@ -0,0 +1,60 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_KERNELS_SOFTSIGN_OP_H_ +#define TENSORFLOW_KERNELS_SOFTSIGN_OP_H_ +// Functor definition for SoftsignOp and SoftsignGradOp, must be compilable by +// nvcc. + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +// Functor used by SoftsignOp to do the computations. +template +struct Softsign { + // Computes Softsign activation. + // + // features: any shape. + // activations: same shape as "features". + void operator()(const Device& d, typename TTypes::ConstTensor features, + typename TTypes::Tensor activations) { + activations.device(d) = + features / (features.abs() + features.constant(1.0f)); + } +}; + +// Functor used by SoftsignGradOp to do the computations. +template +struct SoftsignGrad { + // Computes SoftsignGrad backprops. + // + // gradients: gradients backpropagated to the Softsign op. + // features: inputs that were passed to the Softsign op. + // backprops: gradients to backpropagate to the Softsign inputs. 
+ void operator()(const Device& d, typename TTypes::ConstTensor gradients, + typename TTypes::ConstTensor features, + typename TTypes::Tensor backprops) { + backprops.device(d) = + gradients / (features.abs() + features.constant(1.0f)).square(); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_SOFTSIGN_OP_H_ diff --git a/tensorflow/core/kernels/softsign_op_gpu.cu.cc b/tensorflow/core/kernels/softsign_op_gpu.cu.cc new file mode 100644 index 00000000000..4ae941c9f01 --- /dev/null +++ b/tensorflow/core/kernels/softsign_op_gpu.cu.cc @@ -0,0 +1,40 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include + +#include "tensorflow/core/kernels/softsign_op.h" + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +// Definition of the GPU implementations declared in softsign_op.cc. +#define DEFINE_GPU_KERNELS(T) \ + template struct functor::Softsign; \ + template struct functor::SoftsignGrad; + +TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/split_op_gpu.cu.cc b/tensorflow/core/kernels/split_op_gpu.cu.cc index c79410b68c0..13463b705b0 100644 --- a/tensorflow/core/kernels/split_op_gpu.cu.cc +++ b/tensorflow/core/kernels/split_op_gpu.cu.cc @@ -33,7 +33,7 @@ void Split::operator()( typename TTypes::ConstTensor input, const Eigen::DSizes& slice_indices, const Eigen::DSizes& slice_sizes) { - output.device(d) = input.slice(slice_indices, slice_sizes); + To32Bit(output).device(d) = To32Bit(input).slice(slice_indices, slice_sizes); } #define DEFINE_GPU_KERNELS(T) template struct Split; diff --git a/tensorflow/core/kernels/stack_ops.cc b/tensorflow/core/kernels/stack_ops.cc index 055050cd34a..2c146b3d6c2 100644 --- a/tensorflow/core/kernels/stack_ops.cc +++ b/tensorflow/core/kernels/stack_ops.cc @@ -1,3 +1,18 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + // See docs in ../ops/data_flow_ops.cc. 
#include diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc index 20806cff684..8287d758f0b 100644 --- a/tensorflow/core/ops/array_ops.cc +++ b/tensorflow/core/ops/array_ops.cc @@ -404,8 +404,9 @@ Reshapes a tensor. Given `tensor`, this operation returns a tensor that has the same values as `tensor` with shape `shape`. -If `shape` is the special value `[-1]`, then `tensor` is flattened and the -operation outputs a 1-D tensor with all elements of `tensor`. +If one component of `shape` is the special value -1, the size of that dimension +is computed so that the total size remains constant. In particular, a `shape` +of `[-1]` flattens into 1-D. At most one component of `shape` can be -1. If `shape` is 1-D or higher, then the operation returns a tensor with shape `shape` filled with the values of `tensor`. In this case, the number of elements @@ -435,6 +436,13 @@ reshape(t, [2, 4]) ==> [[1, 1, 2, 2] # tensor 't' has shape [3, 2, 3] # pass '[-1]' to flatten 't' reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6] +# -1 can also be used with higher dimensional shapes +reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3], + [4, 4, 4, 5, 5, 5, 6, 6, 6]] + +# tensor 't' is [7] +# shape `[]` reshapes to a scalar +reshape(t, []) ==> 7 ``` shape: Defines the shape of the output tensor. @@ -535,25 +543,29 @@ REGISTER_OP("ReverseSequence") .Input("seq_lengths: int64") .Output("output: T") .Attr("seq_dim: int") + .Attr("batch_dim: int = 0") .Attr("T: type") .Doc(R"doc( -Reverses variable length slices in dimension `seq_dim`. +Reverses variable length slices. -This op first slices `input` along the first dimension, and for each slice `i`, -reverses the first `seq_lengths[i]` elements along the dimension `seq_dim`. +This op first slices `input` along the dimension `batch_dim`, and for each +slice `i`, reverses the first `seq_lengths[i]` elements along +the dimension `seq_dim`. The elements of `seq_lengths` must obey `seq_lengths[i] < input.dims[seq_dim]`, -and `seq_lengths` must be a vector of length `input.dims(0)`. +and `seq_lengths` must be a vector of length `input.dims[batch_dim]`. -The output slice `i` along dimension 0 is then given by input slice `i`, with -the first `seq_lengths[i]` slices along dimension `seq_dim` reversed. +The output slice `i` along dimension `batch_dim` is then given by input +slice `i`, with the first `seq_lengths[i]` slices along dimension +`seq_dim` reversed. For example: ```prettyprint # Given this: +batch_dim = 0 seq_dim = 1 -input.dims = (4, ...) +input.dims = (4, 8, ...) seq_lengths = [7, 2, 3, 5] # then slices of input are reversed on seq_dim, but only up to seq_lengths: @@ -569,10 +581,32 @@ output[2, 3:, :, ...] = input[2, 3:, :, ...] output[3, 2:, :, ...] = input[3, 2:, :, ...] ``` +In contrast, if: +```prettyprint +# Given this: +batch_dim = 2 +seq_dim = 0 +input.dims = (8, ?, 4, ...) +seq_lengths = [7, 2, 3, 5] + +# then slices of input are reversed on seq_dim, but only up to seq_lengths: +output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...] +output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...] +output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...] +output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...] + +# while entries past seq_lens are copied through: +output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...] +output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...] +output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...] +output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...] +``` + input: The input to reverse. 
seq_lengths: 1-D with length `input.dims(0)` and `max(seq_lengths) < input.dims(seq_dim)` seq_dim: The dimension which is partially reversed. +batch_dim: The dimension along which reversal is performed. output: The partially reversed input. It has the same shape as `input`. )doc"); diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 023c598aa61..a1f1db5f7f1 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -264,7 +264,7 @@ Returns element-wise smallest integer in not less than x. #define BINARY_MORE() \ Input("x: T").Input("y: T").Output("z: T").Attr( \ - "T: {float, double, int8, int16, int32, complex64, int64}") + "T: {float, double, uint8, int8, int16, int32, int64, complex64}") #define BINARY_FEWER() \ Input("x: T").Input("y: T").Output("z: T").Attr( \ @@ -293,7 +293,7 @@ Returns x * y element-wise. )doc"); REGISTER_OP("Div") - .BINARY_FEWER() + .BINARY_MORE() .Doc(R"doc( Returns x / y element-wise. )doc"); diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 593f986edb7..29a71730950 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -466,6 +466,27 @@ features: The features passed as input to the corresponding softplus operation. backprops: The gradients: `gradients / (1 + exp(-features))`. )doc"); +REGISTER_OP("Softsign") + .Input("features: T") + .Output("activations: T") + .Attr("T: realnumbertype") + .Doc(R"doc( +Computes softsign: `features / (abs(features) + 1)`. +)doc"); + +REGISTER_OP("SoftsignGrad") + .Input("gradients: T") + .Input("features: T") + .Output("backprops: T") + .Attr("T: realnumbertype") + .Doc(R"doc( +Computes softsign gradients for a softsign operation. + +gradients: The backpropagated gradients to the corresponding softsign operation. +features: The features passed as input to the corresponding softsign operation. +backprops: The gradients: `gradients / (1 + abs(-features)) ** 2`. +)doc"); + // -------------------------------------------------------------------------- REGISTER_OP("Softmax") diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 33875345873..9f48da94e1d 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -44,11 +44,12 @@ op { list { type: DT_FLOAT type: DT_DOUBLE + type: DT_UINT8 type: DT_INT8 type: DT_INT16 type: DT_INT32 - type: DT_COMPLEX64 type: DT_INT64 + type: DT_COMPLEX64 } } } @@ -1973,9 +1974,12 @@ op { list { type: DT_FLOAT type: DT_DOUBLE + type: DT_UINT8 + type: DT_INT8 + type: DT_INT16 type: DT_INT32 - type: DT_COMPLEX64 type: DT_INT64 + type: DT_COMPLEX64 } } } @@ -4251,11 +4255,12 @@ op { list { type: DT_FLOAT type: DT_DOUBLE + type: DT_UINT8 type: DT_INT8 type: DT_INT16 type: DT_INT32 - type: DT_COMPLEX64 type: DT_INT64 + type: DT_COMPLEX64 } } } @@ -5532,7 +5537,7 @@ op { type: "type" } summary: "Reshapes a tensor." - description: "Given `tensor`, this operation returns a tensor that has the same values\nas `tensor` with shape `shape`.\n\nIf `shape` is the special value `[-1]`, then `tensor` is flattened and the\noperation outputs a 1-D tensor with all elements of `tensor`.\n\nIf `shape` is 1-D or higher, then the operation returns a tensor with shape\n`shape` filled with the values of `tensor`. 
In this case, the number of elements\nimplied by `shape` must be the same as the number of elements in `tensor`.\n\nFor example:\n\n```prettyprint\n# tensor \'t\' is [1, 2, 3, 4, 5, 6, 7, 8, 9]\n# tensor \'t\' has shape [9]\nreshape(t, [3, 3]) ==> [[1, 2, 3]\n [4, 5, 6]\n [7, 8, 9]]\n\n# tensor \'t\' is [[[1, 1], [2, 2]]\n# [[3, 3], [4, 4]]]\n# tensor \'t\' has shape [2, 2, 2]\nreshape(t, [2, 4]) ==> [[1, 1, 2, 2]\n [3, 3, 4, 4]]\n\n# tensor \'t\' is [[[1, 1, 1],\n# [2, 2, 2]],\n# [[3, 3, 3],\n# [4, 4, 4]],\n# [[5, 5, 5],\n# [6, 6, 6]]]\n# tensor \'t\' has shape [3, 2, 3]\n# pass \'[-1]\' to flatten \'t\'\nreshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]\n```" + description: "Given `tensor`, this operation returns a tensor that has the same values\nas `tensor` with shape `shape`.\n\nIf one component of `shape` is the special value -1, the size of that dimension\nis computed so that the total size remains constant. In particular, a `shape`\nof `[-1]` flattens into 1-D. At most one component of `shape` can be -1.\n\nIf `shape` is 1-D or higher, then the operation returns a tensor with shape\n`shape` filled with the values of `tensor`. In this case, the number of elements\nimplied by `shape` must be the same as the number of elements in `tensor`.\n\nFor example:\n\n```prettyprint\n# tensor \'t\' is [1, 2, 3, 4, 5, 6, 7, 8, 9]\n# tensor \'t\' has shape [9]\nreshape(t, [3, 3]) ==> [[1, 2, 3]\n [4, 5, 6]\n [7, 8, 9]]\n\n# tensor \'t\' is [[[1, 1], [2, 2]]\n# [[3, 3], [4, 4]]]\n# tensor \'t\' has shape [2, 2, 2]\nreshape(t, [2, 4]) ==> [[1, 1, 2, 2]\n [3, 3, 4, 4]]\n\n# tensor \'t\' is [[[1, 1, 1],\n# [2, 2, 2]],\n# [[3, 3, 3],\n# [4, 4, 4]],\n# [[5, 5, 5],\n# [6, 6, 6]]]\n# tensor \'t\' has shape [3, 2, 3]\n# pass \'[-1]\' to flatten \'t\'\nreshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]\n# -1 can also be used with higher dimensional shapes\nreshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],\n [4, 4, 4, 5, 5, 5, 6, 6, 6]]\n\n# tensor \'t\' is [7]\n# shape `[]` reshapes to a scalar\nreshape(t, []) ==> 7\n```" } op { name: "ResizeArea" @@ -6770,6 +6775,67 @@ op { } summary: "Computes softplus gradients for a softplus operation." } +op { + name: "Softsign" + input_arg { + name: "features" + type_attr: "T" + } + output_arg { + name: "activations" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT32 + type: DT_INT64 + type: DT_UINT8 + type: DT_INT16 + type: DT_INT8 + } + } + } + summary: "Computes softsign: `features / (abs(features) + 1)`." +} +op { + name: "SoftsignGrad" + input_arg { + name: "gradients" + description: "The backpropagated gradients to the corresponding softsign operation." + type_attr: "T" + } + input_arg { + name: "features" + description: "The features passed as input to the corresponding softsign operation." + type_attr: "T" + } + output_arg { + name: "backprops" + description: "The gradients: `gradients / (1 + abs(-features)) ** 2`." + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT32 + type: DT_INT64 + type: DT_UINT8 + type: DT_INT16 + type: DT_INT8 + } + } + } + summary: "Computes softsign gradients for a softsign operation." 
+} op { name: "SparseApplyAdagrad" input_arg { diff --git a/tensorflow/core/public/README.md b/tensorflow/core/public/README.md index d5051ae690f..f5e10bf79f3 100644 --- a/tensorflow/core/public/README.md +++ b/tensorflow/core/public/README.md @@ -12,7 +12,7 @@ process. First, bring in tensorflow python dependency -//third_party/tensorflow:tensorflow_py +//third_party/py/tensorflow to get the python TensorFlow API. @@ -22,9 +22,9 @@ Then: import tensorflow as tf with tf.Session("local"): - input1 = tf.Constant(1.0, shape=[1, 1], name="input1") - input2 = tf.Constant(2.0, shape=[1, 1], name="input2") - output = tf.MatMul(input1, input2) + input1 = tf.constant(1.0, shape=[1, 1], name="input1") + input2 = tf.constant(2.0, shape=[1, 1], name="input2") + output = tf.matmul(input1, input2) # Run graph and fetch the output result = output.eval() diff --git a/tensorflow/examples/label_image/main.cc b/tensorflow/examples/label_image/main.cc index c78ee33e06d..9cff418c670 100644 --- a/tensorflow/examples/label_image/main.cc +++ b/tensorflow/examples/label_image/main.cc @@ -64,11 +64,13 @@ TF_DEFINE_string(image, "tensorflow/examples/label_image/data/grace_hopper.jpg", "The image to classify (JPEG or PNG)."); TF_DEFINE_string(graph, - "tensorflow/examples/label_image/data/googlenet_graph.pb", + "tensorflow/examples/label_image/data/" + "tensorflow_inception_graph.pb", "The location of the GraphDef file containing the protobuf" " definition of the network."); TF_DEFINE_string(labels, - "tensorflow/examples/label_image/data/googlenet_labels.txt", + "tensorflow/examples/label_image/data/" + "imagenet_comp_graph_label_strings.txt", "A text file containing the labels of all the categories, one" " per line."); TF_DEFINE_int32(input_width, 224, "Width of the image the network expects."); @@ -85,6 +87,10 @@ TF_DEFINE_string(root_dir, "", "The directory at the root of the data files."); // of the result is a multiple of 16, because our model expects that. Status ReadLabelsFile(string file_name, std::vector* result) { std::ifstream file(file_name); + if (!file) { + return tensorflow::errors::NotFound("Labels file ", file_name, + " not found."); + } result->clear(); string line; while (std::getline(file, line)) { diff --git a/tensorflow/g3doc/api_docs/python/array_ops.md b/tensorflow/g3doc/api_docs/python/array_ops.md index 14d1e396233..79abef17177 100644 --- a/tensorflow/g3doc/api_docs/python/array_ops.md +++ b/tensorflow/g3doc/api_docs/python/array_ops.md @@ -277,8 +277,9 @@ Reshapes a tensor. Given `tensor`, this operation returns a tensor that has the same values as `tensor` with shape `shape`. -If `shape` is the special value `[-1]`, then `tensor` is flattened and the -operation outputs a 1-D tensor with all elements of `tensor`. +If one component of `shape` is the special value -1, the size of that dimension +is computed so that the total size remains constant. In particular, a `shape` +of `[-1]` flattens into 1-D. At most one component of `shape` can be -1. If `shape` is 1-D or higher, then the operation returns a tensor with shape `shape` filled with the values of `tensor`. 
In this case, the number of elements @@ -308,6 +309,13 @@ reshape(t, [2, 4]) ==> [[1, 1, 2, 2] # tensor 't' has shape [3, 2, 3] # pass '[-1]' to flatten 't' reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6] +# -1 can also be used with higher dimensional shapes +reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3], + [4, 4, 4, 5, 5, 5, 6, 6, 6]] + +# tensor 't' is [7] +# shape `[]` reshapes to a scalar +reshape(t, []) ==> 7 ``` ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/framework.md b/tensorflow/g3doc/api_docs/python/framework.md index eea786647b5..8217729e9cf 100644 --- a/tensorflow/g3doc/api_docs/python/framework.md +++ b/tensorflow/g3doc/api_docs/python/framework.md @@ -1355,7 +1355,7 @@ for more details. - - - -### `tf.convert_to_tensor(value, dtype=None, name=None)` {#convert_to_tensor} +### `tf.convert_to_tensor(value, dtype=None, name=None, as_ref=False)` {#convert_to_tensor} Converts the given `value` to a `Tensor`. @@ -1390,6 +1390,7 @@ and scalars in addition to `Tensor` objects. * `dtype`: Optional element type for the returned tensor. If missing, the type is inferred from the type of `value`. * `name`: Optional name to use if a new `Tensor` is created. +* `as_ref`: True if we want the result as a ref tensor. ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/image.md b/tensorflow/g3doc/api_docs/python/image.md index 5ca185edf4a..3d8c51d5204 100644 --- a/tensorflow/g3doc/api_docs/python/image.md +++ b/tensorflow/g3doc/api_docs/python/image.md @@ -18,7 +18,8 @@ are all of variable size. If you need fixed size images, pass the output of the decode Ops to one of the cropping and resizing Ops. Note: The PNG encode and decode Ops support RGBA, but the conversions Ops -presently only support RGB, HSV, and GrayScale. +presently only support RGB, HSV, and GrayScale. Presently, the alpha channel has +to be stripped from the image and re-attached using slicing ops. - - - @@ -204,10 +205,6 @@ image = tf.image.decode_jpeg(...) resized_image = tf.image.resize_bilinear(image, [299, 299]) ``` -Maybe refer to the Queue examples that show how to add images to a Queue -after resizing them to a fixed size, and how to dequeue batches of resized -images from the Queue. - - - - ### `tf.image.resize_images(images, new_height, new_width, method=0)` {#resize_images} @@ -661,6 +658,43 @@ See also `transpose()`. +## Converting Between Colorspaces. + +Internally, images are either stored as one `float32` per channel per pixel +(implicitly, values are assumed to lie in `[0,1)`) or one `uint8` per channel +per pixel (values are assumed to lie in `[0,255]`). + +- - - + +### `tf.image.convert_image_dtype(image, dtype, name=None)` {#convert_image_dtype} + +Convert `image` to `dtype`, scaling its values if needed. + +Images that are represented using floating point values are expected to have +values in the range [0,1). Image data stored in integer data types are +expected to have values in the range `[0,MAX]`, where `MAX` is the largest +positive representable number for the data type. + +This op converts between data types, scaling the values appropriately before +casting. + +Note that for floating point inputs, this op expects values to lie in [0,1). +Conversion of an image containing values outside that range may lead to +overflow errors when converted to integer `Dtype`s. + +##### Args: + + +* `image`: An image. +* `dtype`: A `DType` to convert `image` to. +* `name`: A name for this operation (optional).
+ +##### Returns: + + `image`, converted to `dtype`. + + + ## Image Adjustments TensorFlow provides functions to adjust images in various ways: brightness, diff --git a/tensorflow/g3doc/api_docs/python/index.md b/tensorflow/g3doc/api_docs/python/index.md index 1211233fae9..b04559287cd 100644 --- a/tensorflow/g3doc/api_docs/python/index.md +++ b/tensorflow/g3doc/api_docs/python/index.md @@ -194,6 +194,7 @@ * **[Images](../../api_docs/python/image.md)**: * [`adjust_brightness`](../../api_docs/python/image.md#adjust_brightness) * [`adjust_contrast`](../../api_docs/python/image.md#adjust_contrast) + * [`convert_image_dtype`](../../api_docs/python/image.md#convert_image_dtype) * [`crop_to_bounding_box`](../../api_docs/python/image.md#crop_to_bounding_box) * [`decode_jpeg`](../../api_docs/python/image.md#decode_jpeg) * [`decode_png`](../../api_docs/python/image.md#decode_png) @@ -283,6 +284,7 @@ * [`nce_loss`](../../api_docs/python/nn.md#nce_loss) * [`relu`](../../api_docs/python/nn.md#relu) * [`relu6`](../../api_docs/python/nn.md#relu6) + * [`rnn`](../../api_docs/python/nn.md#rnn) * [`sampled_softmax_loss`](../../api_docs/python/nn.md#sampled_softmax_loss) * [`separable_conv2d`](../../api_docs/python/nn.md#separable_conv2d) * [`sigmoid`](../../api_docs/python/nn.md#sigmoid) @@ -290,6 +292,8 @@ * [`softmax`](../../api_docs/python/nn.md#softmax) * [`softmax_cross_entropy_with_logits`](../../api_docs/python/nn.md#softmax_cross_entropy_with_logits) * [`softplus`](../../api_docs/python/nn.md#softplus) + * [`softsign`](../../api_docs/python/nn.md#softsign) + * [`state_saving_rnn`](../../api_docs/python/nn.md#state_saving_rnn) * [`tanh`](../../api_docs/python/nn.md#tanh) * [`top_k`](../../api_docs/python/nn.md#top_k) * [`uniform_candidate_sampler`](../../api_docs/python/nn.md#uniform_candidate_sampler) diff --git a/tensorflow/g3doc/api_docs/python/io_ops.md b/tensorflow/g3doc/api_docs/python/io_ops.md index 0d4d52eea55..8d84df05302 100644 --- a/tensorflow/g3doc/api_docs/python/io_ops.md +++ b/tensorflow/g3doc/api_docs/python/io_ops.md @@ -1773,6 +1773,12 @@ Output strings (e.g. filenames) to a queue for an input pipeline. A queue with the output strings. A `QueueRunner` for the Queue is added to the current `Graph`'s `QUEUE_RUNNER` collection. +##### Raises: + + +* `ValueError`: If the string_tensor is a null Python list. At runtime, + will fail with an assertion if string_tensor becomes a null tensor. + ### Batching at the end of an input pipeline diff --git a/tensorflow/g3doc/api_docs/python/math_ops.md b/tensorflow/g3doc/api_docs/python/math_ops.md index 43261de10bf..346c2fbf956 100644 --- a/tensorflow/g3doc/api_docs/python/math_ops.md +++ b/tensorflow/g3doc/api_docs/python/math_ops.md @@ -23,7 +23,7 @@ Returns x + y element-wise. ##### Args: -* `x`: A `Tensor`. Must be one of the following types: `float32`, `float64`, `int8`, `int16`, `int32`, `complex64`, `int64`. +* `x`: A `Tensor`. Must be one of the following types: `float32`, `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`, `complex64`. * `y`: A `Tensor`. Must have the same type as `x`. * `name`: A name for the operation (optional). @@ -59,7 +59,7 @@ Returns x * y element-wise. ##### Args: -* `x`: A `Tensor`. Must be one of the following types: `float32`, `float64`, `int8`, `int16`, `int32`, `complex64`, `int64`. +* `x`: A `Tensor`. Must be one of the following types: `float32`, `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`, `complex64`. * `y`: A `Tensor`. Must have the same type as `x`. 
* `name`: A name for the operation (optional). @@ -77,7 +77,7 @@ Returns x / y element-wise. ##### Args: -* `x`: A `Tensor`. Must be one of the following types: `float32`, `float64`, `int32`, `complex64`, `int64`. +* `x`: A `Tensor`. Must be one of the following types: `float32`, `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`, `complex64`. * `y`: A `Tensor`. Must have the same type as `x`. * `name`: A name for the operation (optional). diff --git a/tensorflow/g3doc/api_docs/python/nn.md b/tensorflow/g3doc/api_docs/python/nn.md index 068e5f2ec47..67c315745dc 100644 --- a/tensorflow/g3doc/api_docs/python/nn.md +++ b/tensorflow/g3doc/api_docs/python/nn.md @@ -9,11 +9,10 @@ Note: Functions taking `Tensor` arguments can also take anything accepted by ## Activation Functions -The activation ops provide different types of nonlinearities for use in -neural networks. These include smooth nonlinearities (`sigmoid`, -`tanh`, and `softplus`), continuous but not everywhere differentiable -functions (`relu`, `relu6`, and `relu_x`), and random regularization -(`dropout`). +The activation ops provide different types of nonlinearities for use in neural +networks. These include smooth nonlinearities (`sigmoid`, `tanh`, `softplus`, +and `softsign`), continuous but not everywhere differentiable functions (`relu`, +`relu6`, and `relu_x`), and random regularization (`dropout`). All activation ops apply componentwise, and produce a tensor of the same shape as the input tensor. @@ -62,6 +61,23 @@ Computes softplus: `log(exp(features) + 1)`. ##### Args: +* `features`: A `Tensor`. Must be one of the following types: `float32`, `float64`, `int32`, `int64`, `uint8`, `int16`, `int8`. +* `name`: A name for the operation (optional). + +##### Returns: + + A `Tensor`. Has the same type as `features`. + + +- - - + +### `tf.nn.softsign(features, name=None)` {#softsign} + +Computes softsign: `features / (abs(features) + 1)`. + +##### Args: + + * `features`: A `Tensor`. Must be one of the following types: `float32`, `float64`, `int32`, `int64`, `uint8`, `int16`, `int8`. * `name`: A name for the operation (optional). @@ -1228,3 +1244,89 @@ target classes as noise classes for the same example. Each value is `-FLOAT_MAX`. + +## Other Functions and Classes +- - - + +### `tf.nn.rnn(cell, inputs, initial_state=None, dtype=None, sequence_length=None, scope=None)` {#rnn} + +Creates a recurrent neural network specified by RNNCell "cell". + +##### The simplest form of RNN network generated is: + + state = cell.zero_state(...) + outputs = [] + states = [] + for input_ in inputs: + output, state = cell(input_, state) + outputs.append(output) + states.append(state) + return (outputs, states) + +However, a few other options are available: + +An initial state can be provided. +If sequence_length is provided, dynamic calculation is performed. + +Dynamic calculation returns, at time t: + (t >= max(sequence_length) + ? (zeros(output_shape), zeros(state_shape)) + : cell(input, state) + +Thus saving computational time when unrolling past the max sequence length. + +##### Args: + + +* `cell`: An instance of RNNCell. +* `inputs`: A length T list of inputs, each a vector with shape [batch_size]. +* `initial_state`: (optional) An initial state for the RNN. This must be + a tensor of appropriate type and shape [batch_size x cell.state_size]. +* `dtype`: (optional) The data type for the initial state. Required if + initial_state is not provided. +* `sequence_length`: An int64 vector (tensor) size [batch_size]. 
+* `scope`: VariableScope for the created subgraph; defaults to "RNN". + +##### Returns: + + A pair (outputs, states) where: + outputs is a length T list of outputs (one for each input) + states is a length T list of states (one state following each input) + +##### Raises: + + +* `TypeError`: If "cell" is not an instance of RNNCell. +* `ValueError`: If inputs is None or an empty list. + + +- - - + +### `tf.nn.state_saving_rnn(cell, inputs, state_saver, state_name, sequence_length=None, scope=None)` {#state_saving_rnn} + +RNN that accepts a state saver for time-truncated RNN calculation. + +##### Args: + + +* `cell`: An instance of RNNCell. +* `inputs`: A length T list of inputs, each a vector with shape [batch_size]. +* `state_saver`: A state saver object with methods `state` and `save_state`. +* `state_name`: The name to use with the state_saver. +* `sequence_length`: (optional) An int64 vector (tensor) size [batch_size]. + See the documentation for rnn() for more details about sequence_length. +* `scope`: VariableScope for the created subgraph; defaults to "RNN". + +##### Returns: + + A pair (outputs, states) where: + outputs is a length T list of outputs (one for each input) + states is a length T list of states (one state following each input) + +##### Raises: + + +* `TypeError`: If "cell" is not an instance of RNNCell. +* `ValueError`: If inputs is None or an empty list. + + diff --git a/tensorflow/g3doc/api_docs/python/sparse_ops.md b/tensorflow/g3doc/api_docs/python/sparse_ops.md index 4c7db4b10f5..99a075f14d8 100644 --- a/tensorflow/g3doc/api_docs/python/sparse_ops.md +++ b/tensorflow/g3doc/api_docs/python/sparse_ops.md @@ -43,23 +43,23 @@ dense[tuple(indices[i])] = values[i] ``` By convention, `indices` should be sorted in row-major order (or equivalently -lexigraphic order on the tuples `indices[i]`). This is not enforced when -`SparseTensor` objects are constructed, but most Ops assume correct ordering. +lexicographic order on the tuples `indices[i]`). This is not enforced when +`SparseTensor` objects are constructed, but most ops assume correct ordering. If the ordering is wrong, it can be fixed by calling `sparse_reorder` on the misordered `SparseTensor`. Example: The sparse tensor ```python - SparseTensor(values=[1, 2], indices=[[0, 0], [1, 2]], shape=[3, 4]) +SparseTensor(values=[1, 2], indices=[[0, 0], [1, 2]], shape=[3, 4]) ``` represents the dense tensor ```python - [[1, 0, 0, 0] - [0, 0, 2, 0] - [0, 0, 0, 0]] +[[1, 0, 0, 0] + [0, 0, 2, 0] + [0, 0, 0, 0]] ``` - - - @@ -73,7 +73,7 @@ Creates a `SparseTensor`. * `indices`: A 2-D int64 tensor of shape `[N, ndims]`. * `values`: A 1-D tensor of any type and shape `[N]`. -* `dense_shape`: A 1-D int64 tensor of shape `[ndims]`. +* `shape`: A 1-D int64 tensor of shape `[ndims]`. ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/state_ops.md b/tensorflow/g3doc/api_docs/python/state_ops.md index cb9a090ebda..8b2e8b379f7 100644 --- a/tensorflow/g3doc/api_docs/python/state_ops.md +++ b/tensorflow/g3doc/api_docs/python/state_ops.md @@ -380,6 +380,51 @@ The `Operation` of this variable. +#### Other Methods +- - - + +#### `tf.Variable.ref()` {#Variable.ref} + +Returns a reference to this variable. + +You usually do not need to call this method as all ops that need a reference +to the variable call it automatically. + +Returns is a `Tensor` which holds a reference to the variable. You can +assign a new value to the variable by passing the tensor to an assign op. 
+See [`value()`](#Variable.value) if you want to get the value of the +variable. + +##### Returns: + + A `Tensor` that is a reference to the variable. + + +- - - + +#### `tf.Variable.value()` {#Variable.value} + +Returns the last snapshot of this variable. + +You usually do not need to call this method as all ops that need the value +of the variable call it automatically through a `convert_to_tensor()` call. + +Returns a `Tensor` which holds the value of the variable. You can not +assign a new value to this tensor as it is not a reference to the variable. +See [`ref()`](#Variable.ref) if you want to get a reference to the +variable. + +To avoid copies, if the consumer of the returned value is on the same device +as the variable, this actually returns the live value of the variable, not +a copy. Updates to the variable are seen by the consumer. If the consumer +is on a different device it will get a copy of the variable. + +##### Returns: + + A `Tensor` containing the value of the variable. + + + ## Variable helper functions diff --git a/tensorflow/g3doc/api_docs/python/train.md b/tensorflow/g3doc/api_docs/python/train.md index 6b36d913565..b686968a8c1 100644 --- a/tensorflow/g3doc/api_docs/python/train.md +++ b/tensorflow/g3doc/api_docs/python/train.md @@ -192,6 +192,7 @@ applies gradients. * `TypeError`: if `grads_and_vars` is malformed. +* `ValueError`: if none of the variables have gradients. @@ -388,9 +389,9 @@ current good choice is 1.0 or 0.1. * `beta1`: A float value or a constant float tensor. The exponential decay rate for the 1st moment estimates. * `beta2`: A float value or a constant float tensor. - The exponential decay rate for the 2st moment estimates. + The exponential decay rate for the 2nd moment estimates. * `epsilon`: A small constant for numerical stability. -* `use_locking`: If True use locks for update operation.s +* `use_locking`: If True use locks for update operations. * `name`: Optional name for the operations created when applying gradients. Defaults to "Adam". diff --git a/tensorflow/g3doc/get_started/basic_usage.md b/tensorflow/g3doc/get_started/basic_usage.md index e09b574cc7a..cca15c1de46 100644 --- a/tensorflow/g3doc/get_started/basic_usage.md +++ b/tensorflow/g3doc/get_started/basic_usage.md @@ -274,8 +274,8 @@ tf.placeholder() to create them: ```python -input1 = tf.placeholder(tf.types.float32) -input2 = tf.placeholder(tf.types.float32) +input1 = tf.placeholder(tf.float32) +input2 = tf.placeholder(tf.float32) output = tf.mul(input1, input2) with tf.Session() as sess: diff --git a/tensorflow/g3doc/how_tos/adding_an_op/index.md b/tensorflow/g3doc/how_tos/adding_an_op/index.md index 150ad8d6e68..fe943fac6cf 100644 --- a/tensorflow/g3doc/how_tos/adding_an_op/index.md +++ b/tensorflow/g3doc/how_tos/adding_an_op/index.md @@ -22,7 +22,7 @@ to: * Optionally, write a function to compute gradients for the Op. * Optionally, write a function that describes the input and output shapes for the Op. This allows shape inference to work with your Op. -* Test the Op, typically in Python. +* Test the Op, typically in Python. If you define gradients, you can verify them with the Python [`GradientChecker`](https://tensorflow.googlesource.com/tensorflow/+/master/tensorflow/python/kernel_tests/gradient_checker.py). 
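+
+As a rough sketch of such a check (illustrative only: it exercises the new
+`Softsign` op from this change with the `tf.test.compute_gradient_error`
+helper that the updated test below switches to; the class and method names
+here are made up):
+
+```python
+import tensorflow.python.platform  # Same import order as the tests in this change.
+import tensorflow as tf
+
+
+class SoftsignGradientTest(tf.test.TestCase):
+
+  def testGradient(self):
+    with self.test_session():
+      shape = (5,)
+      x = tf.constant([-2.0, -1.0, 0.5, 1.0, 2.0], dtype=tf.float32)
+      y = tf.nn.softsign(x)
+      # Compares the analytic Jacobian of y w.r.t. x against a numeric
+      # estimate and returns the largest elementwise difference.
+      err = tf.test.compute_gradient_error(x, shape, y, shape)
+      self.assertLess(err, 1e-4)
+```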
[TOC] diff --git a/tensorflow/g3doc/how_tos/adding_an_op/zero_out_2_test.py b/tensorflow/g3doc/how_tos/adding_an_op/zero_out_2_test.py index 49df02d2cbd..25229213b9a 100644 --- a/tensorflow/g3doc/how_tos/adding_an_op/zero_out_2_test.py +++ b/tensorflow/g3doc/how_tos/adding_an_op/zero_out_2_test.py @@ -24,7 +24,6 @@ import tensorflow.python.platform import tensorflow as tf from tensorflow.g3doc.how_tos.adding_an_op import gen_zero_out_op_2 from tensorflow.g3doc.how_tos.adding_an_op import zero_out_grad_2 -from tensorflow.python.kernel_tests import gradient_checker class ZeroOut2Test(tf.test.TestCase): @@ -39,7 +38,7 @@ class ZeroOut2Test(tf.test.TestCase): shape = (5,) x = tf.constant([5, 4, 3, 2, 1], dtype=tf.float32) y = gen_zero_out_op_2.zero_out(x) - err = gradient_checker.ComputeGradientError(x, shape, y, shape) + err = tf.test.compute_gradient_error(x, shape, y, shape) self.assertLess(err, 1e-4) diff --git a/tensorflow/g3doc/how_tos/reading_data/convert_to_records.py b/tensorflow/g3doc/how_tos/reading_data/convert_to_records.py index 00b351545c2..ce3b016798f 100644 --- a/tensorflow/g3doc/how_tos/reading_data/convert_to_records.py +++ b/tensorflow/g3doc/how_tos/reading_data/convert_to_records.py @@ -53,7 +53,7 @@ def convert_to(images, labels, name): num_examples = labels.shape[0] if images.shape[0] != num_examples: raise ValueError("Images size %d does not match label size %d." % - (dat.shape[0], num_examples)) + (images.shape[0], num_examples)) rows = images.shape[1] cols = images.shape[2] depth = images.shape[3] diff --git a/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md b/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md index fdec071aeec..f1b7bb8205b 100644 --- a/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md +++ b/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md @@ -62,18 +62,66 @@ Now that you've modified your graph and have a `SummaryWriter`, you're ready to start running your network! If you want, you could run the merged summary op every single step, and record a ton of training data. That's likely to be more data than you need, though. Instead, consider running the merged summary op -every hundred steps or so, as in the following code example. +every `n` steps. + +The code example below is a modification of the [simple MNIST tutorial] +(http://tensorflow.org/tutorials/mnist/beginners/index.md), in which we have +added some summary ops, and run them every ten steps. If you run this and then +launch `tensorboard --logdir=/tmp/mnist_logs`, you'll be able to visualize +statistics, such as how the weights or accuracy varied during training. +The code below is an excerpt; full source is [here](mnist_with_summaries.py).
```python -merged_summary_op = tf.merge_all_summaries() -summary_writer = tf.train.SummaryWriter('/tmp/mnist_logs', sess.graph_def) -total_step = 0 -while training: - total_step += 1 - session.run(training_op) - if total_step % 100 == 0: - summary_str = session.run(merged_summary_op) - summary_writer.add_summary(summary_str, total_step) +# Create the model +x = tf.placeholder("float", [None, 784], name="x-input") +W = tf.Variable(tf.zeros([784,10]), name="weights") +b = tf.Variable(tf.zeros([10], name="bias")) + +# use a name scope to organize nodes in the graph visualizer +with tf.name_scope("Wx_b") as scope: + y = tf.nn.softmax(tf.matmul(x,W) + b) + +# Add summary ops to collect data +w_hist = tf.histogram_summary("weights", W) +b_hist = tf.histogram_summary("biases", b) +y_hist = tf.histogram_summary("y", y) + +# Define loss and optimizer +y_ = tf.placeholder("float", [None,10], name="y-input") +# More name scopes will clean up the graph representation +with tf.name_scope("xent") as scope: + cross_entropy = -tf.reduce_sum(y_*tf.log(y)) + ce_summ = tf.scalar_summary("cross entropy", cross_entropy) +with tf.name_scope("train") as scope: + train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy) + +with tf.name_scope("test") as scope: + correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1)) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) + accuracy_summary = tf.scalar_summary("accuracy", accuracy) + +# Merge all the summaries and write them out to /tmp/mnist_logs +merged = tf.merge_all_summaries() +writer = tf.train.SummaryWriter("/tmp/mnist_logs", sess.graph_def) +tf.initialize_all_variables().run() + +# Train the model, and feed in test data and record summaries every 10 steps + +for i in range(1000): + if i % 10 == 0: # Record summary data, and the accuracy + feed = {x: mnist.test.images, y_: mnist.test.labels} + result = sess.run([merged, accuracy], feed_dict=feed) + summary_str = result[0] + acc = result[1] + writer.add_summary(summary_str, i) + print("Accuracy at step %s: %s" % (i, acc)) + else: + batch_xs, batch_ys = mnist.train.next_batch(100) + feed = {x: batch_xs, y_: batch_ys} + sess.run(train_step, feed_dict=feed) + +print(accuracy.eval({x: mnist.test.images, y_: mnist.test.labels})) + ``` You're now all set to visualize this data using TensorBoard. 
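+
+For reference, the summary plumbing from that example in isolation looks
+roughly like this (a sketch, not part of the tutorial code; `some_weights`
+and `total` are stand-ins for whatever tensors you want to track):
+
+```python
+import tensorflow as tf
+
+sess = tf.InteractiveSession()
+some_weights = tf.Variable(tf.zeros([10]), name="weights")
+total = tf.reduce_sum(some_weights)
+
+# Attach summary ops to the tensors you care about...
+tf.histogram_summary("weights", some_weights)
+tf.scalar_summary("total", total)
+
+# ...merge them into one op and point a writer at a log directory.
+merged = tf.merge_all_summaries()
+writer = tf.train.SummaryWriter("/tmp/mnist_logs", sess.graph_def)
+tf.initialize_all_variables().run()
+
+for step in range(100):
+  # Run the merged op every few steps and hand the serialized result,
+  # tagged with the current step, to the writer.
+  if step % 10 == 0:
+    summary_str = sess.run(merged)
+    writer.add_summary(summary_str, step)
+```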
diff --git a/tensorflow/g3doc/how_tos/summaries_and_tensorboard/mnist_with_summaries.py b/tensorflow/g3doc/how_tos/summaries_and_tensorboard/mnist_with_summaries.py new file mode 100644 index 00000000000..cea82b137ee --- /dev/null +++ b/tensorflow/g3doc/how_tos/summaries_and_tensorboard/mnist_with_summaries.py @@ -0,0 +1,69 @@ +"""A very simple MNIST classifer, modified to display data in TensorBoard + +See extensive documentation for the original model at +http://tensorflow.org/tutorials/mnist/beginners/index.md + +See documentaion on the TensorBoard specific pieces at +http://tensorflow.org/how_tos/summaries_and_tensorboard/index.md + +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Import data +import input_data +mnist = input_data.read_data_sets("/tmp/data/", one_hot=True) + +import tensorflow as tf +sess = tf.InteractiveSession() + +# Create the model +x = tf.placeholder("float", [None, 784], name="x-input") +W = tf.Variable(tf.zeros([784,10]), name="weights") +b = tf.Variable(tf.zeros([10], name="bias")) + +# use a name scope to organize nodes in the graph visualizer +with tf.name_scope("Wx_b") as scope: + y = tf.nn.softmax(tf.matmul(x,W) + b) + +# Add summary ops to collect data +w_hist = tf.histogram_summary("weights", W) +b_hist = tf.histogram_summary("biases", b) +y_hist = tf.histogram_summary("y", y) + +# Define loss and optimizer +y_ = tf.placeholder("float", [None,10], name="y-input") +# More name scopes will clean up the graph representation +with tf.name_scope("xent") as scope: + cross_entropy = -tf.reduce_sum(y_*tf.log(y)) + ce_summ = tf.scalar_summary("cross entropy", cross_entropy) +with tf.name_scope("train") as scope: + train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy) + +with tf.name_scope("test") as scope: + correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1)) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) + accuracy_summary = tf.scalar_summary("accuracy", accuracy) + +# Merge all the summaries and write them out to /tmp/mnist_logs +merged = tf.merge_all_summaries() +writer = tf.train.SummaryWriter("/tmp/mnist_logs", sess.graph_def) +tf.initialize_all_variables().run() + +# Train the model, and feed in test data and record summaries every 10 steps + +for i in range(1000): + if i % 10 == 0: # Record summary data, and the accuracy + feed = {x: mnist.test.images, y_: mnist.test.labels} + result = sess.run([merged, accuracy], feed_dict=feed) + summary_str = result[0] + acc = result[1] + writer.add_summary(summary_str, i) + print("Accuracy at step %s: %s" % (i, acc)) + else: + batch_xs, batch_ys = mnist.train.next_batch(100) + feed = {x: batch_xs, y_: batch_ys} + sess.run(train_step, feed_dict=feed) + +print(accuracy.eval({x: mnist.test.images, y_: mnist.test.labels})) diff --git a/tensorflow/g3doc/tutorials/mnist/beginners/index.md b/tensorflow/g3doc/tutorials/mnist/beginners/index.md index fc29a47ceba..44efd432352 100644 --- a/tensorflow/g3doc/tutorials/mnist/beginners/index.md +++ b/tensorflow/g3doc/tutorials/mnist/beginners/index.md @@ -224,13 +224,13 @@ We describe these interacting operations by manipulating symbolic variables. Let's create one: ```python -x = tf.placeholder("float", [None, 784]) +x = tf.placeholder(tf.float32, [None, 784]) ``` `x` isn't a specific value. It's a `placeholder`, a value that we'll input when we ask TensorFlow to run a computation. 
We want to be able to input any number of MNIST images, each flattened into a 784-dimensional vector. We represent -this as a 2d tensor of floating point numbers, with a shape `[None, 784]`. +this as a 2-D tensor of floating-point numbers, with a shape `[None, 784]`. (Here `None` means that a dimension can be of any length.) We also need the weights and biases for our model. We could imagine treating @@ -242,7 +242,7 @@ operations. It can be used and even modified by the computation. For machine learning applications, one generally has the model parameters be `Variable`s. ```python -W = tf.Variable(tf.zeros([784,10])) +W = tf.Variable(tf.zeros([784, 10])) b = tf.Variable(tf.zeros([10])) ``` @@ -259,10 +259,10 @@ to the output. We can now implement our model. It only takes one line! ```python -y = tf.nn.softmax(tf.matmul(x,W) + b) +y = tf.nn.softmax(tf.matmul(x, W) + b) ``` -First, we multiply `x` by `W` with the expression `tf.matmul(x,W)`. This is +First, we multiply `x` by `W` with the expression `tf.matmul(x, W)`. This is flipped from when we multiplied them in our equation, where we had \\(Wx\\), as a small trick to deal with `x` being a 2D tensor with multiple inputs. We then add `b`, and @@ -301,7 +301,7 @@ To implement cross-entropy we need to first add a new placeholder to input the correct answers: ```python -y_ = tf.placeholder("float", [None,10]) +y_ = tf.placeholder(tf.float32, [None, 10]) ``` Then we can implement the cross-entropy, \\(-\sum y'\log(y)\\): diff --git a/tensorflow/models/embedding/BUILD b/tensorflow/models/embedding/BUILD index 9cd0d24b5b4..fbed1b0a380 100644 --- a/tensorflow/models/embedding/BUILD +++ b/tensorflow/models/embedding/BUILD @@ -38,6 +38,9 @@ py_test( size = "small", srcs = ["word2vec_test.py"], srcs_version = "PY2AND3", + tags = [ + "notsan", # b/25864127 + ], deps = [ ":word2vec", "//tensorflow:tensorflow_py", diff --git a/tensorflow/models/rnn/BUILD b/tensorflow/models/rnn/BUILD index 1a81ce2801e..118884fd28d 100644 --- a/tensorflow/models/rnn/BUILD +++ b/tensorflow/models/rnn/BUILD @@ -7,8 +7,6 @@ licenses(["notice"]) # Apache 2.0 exports_files(["LICENSE"]) -load("/tensorflow/tensorflow", "cuda_py_tests") - py_library( name = "linear", srcs = [ @@ -20,17 +18,6 @@ py_library( ], ) -py_test( - name = "linear_test", - size = "small", - srcs = ["linear_test.py"], - srcs_version = "PY2AND3", - deps = [ - ":linear", - "//tensorflow:tensorflow_py", - ], -) - py_library( name = "rnn_cell", srcs = [ @@ -43,17 +30,6 @@ py_library( ], ) -py_test( - name = "rnn_cell_test", - size = "small", - srcs = ["rnn_cell_test.py"], - srcs_version = "PY2AND3", - deps = [ - ":rnn_cell", - "//tensorflow:tensorflow_py", - ], -) - py_library( name = "package", srcs = [ @@ -79,16 +55,6 @@ py_library( ], ) -cuda_py_tests( - name = "rnn_tests", - srcs = [ - "rnn_test.py", - ], - additional_deps = [ - ":rnn", - ], -) - py_library( name = "seq2seq", srcs = [ @@ -101,18 +67,6 @@ py_library( ], ) -py_test( - name = "seq2seq_test", - srcs = [ - "seq2seq_test.py", - ], - srcs_version = "PY2AND3", - deps = [ - ":seq2seq", - "//tensorflow:tensorflow_py", - ], -) - filegroup( name = "all_files", srcs = glob( diff --git a/tensorflow/models/rnn/linear.py b/tensorflow/models/rnn/linear.py index 1c8eda67151..30b420087c8 100644 --- a/tensorflow/models/rnn/linear.py +++ b/tensorflow/models/rnn/linear.py @@ -12,57 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== - -"""Basic linear combinations that implicitly generate variables.""" - +"""Import linear python op for backward compatibility.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function +# pylint: disable=g-bad-import-order,unused-import +import tensorflow.python.platform + import tensorflow as tf - -def linear(args, output_size, bias, bias_start=0.0, scope=None): - """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. - - Args: - args: a 2D Tensor or a list of 2D, batch x n, Tensors. - output_size: int, second dimension of W[i]. - bias: boolean, whether to add a bias term or not. - bias_start: starting value to initialize the bias; 0 by default. - scope: VariableScope for the created subgraph; defaults to "Linear". - - Returns: - A 2D Tensor with shape [batch x output_size] equal to - sum_i(args[i] * W[i]), where W[i]s are newly created matrices. - - Raises: - ValueError: if some of the arguments has unspecified or wrong shape. - """ - assert args - if not isinstance(args, (list, tuple)): - args = [args] - - # Calculate the total size of arguments on dimension 1. - total_arg_size = 0 - shapes = [a.get_shape().as_list() for a in args] - for shape in shapes: - if len(shape) != 2: - raise ValueError("Linear is expecting 2D arguments: %s" % str(shapes)) - if not shape[1]: - raise ValueError("Linear expects shape[1] of arguments: %s" % str(shapes)) - else: - total_arg_size += shape[1] - - # Now the computation. - with tf.variable_scope(scope or "Linear"): - matrix = tf.get_variable("Matrix", [total_arg_size, output_size]) - if len(args) == 1: - res = tf.matmul(args[0], matrix) - else: - res = tf.matmul(tf.concat(1, args), matrix) - if not bias: - return res - bias_term = tf.get_variable("Bias", [output_size], - initializer=tf.constant_initializer(bias_start)) - return res + bias_term +linear = tf.nn.linear diff --git a/tensorflow/models/rnn/rnn.py b/tensorflow/models/rnn/rnn.py index b95bf98f723..9bfc978db1f 100644 --- a/tensorflow/models/rnn/rnn.py +++ b/tensorflow/models/rnn/rnn.py @@ -12,137 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - -"""RNN helpers for TensorFlow models.""" +"""Import rnn python ops for backward compatibility.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import tensorflow as tf - -from tensorflow.models.rnn import rnn_cell -from tensorflow.python.ops import control_flow_ops - - -def rnn(cell, inputs, initial_state=None, dtype=None, - sequence_length=None, scope=None): - """Creates a recurrent neural network specified by RNNCell "cell". - - The simplest form of RNN network generated is: - state = cell.zero_state(...) - outputs = [] - states = [] - for input_ in inputs: - output, state = cell(input_, state) - outputs.append(output) - states.append(state) - return (outputs, states) - - However, a few other options are available: - - An initial state can be provided. - If sequence_length is provided, dynamic calculation is performed. - - Dynamic calculation returns, at time t: - (t >= max(sequence_length) - ? (zeros(output_shape), zeros(state_shape)) - : cell(input, state) - - Thus saving computational time when unrolling past the max sequence length. - - Args: - cell: An instance of RNNCell. 
- inputs: A length T list of inputs, each a vector with shape [batch_size]. - initial_state: (optional) An initial state for the RNN. This must be - a tensor of appropriate type and shape [batch_size x cell.state_size]. - dtype: (optional) The data type for the initial state. Required if - initial_state is not provided. - sequence_length: An int64 vector (tensor) size [batch_size]. - scope: VariableScope for the created subgraph; defaults to "RNN". - - Returns: - A pair (outputs, states) where: - outputs is a length T list of outputs (one for each input) - states is a length T list of states (one state following each input) - - Raises: - TypeError: If "cell" is not an instance of RNNCell. - ValueError: If inputs is None or an empty list. - """ - - if not isinstance(cell, rnn_cell.RNNCell): - raise TypeError("cell must be an instance of RNNCell") - if not isinstance(inputs, list): - raise TypeError("inputs must be a list") - if not inputs: - raise ValueError("inputs must not be empty") - - outputs = [] - states = [] - with tf.variable_scope(scope or "RNN"): - batch_size = tf.shape(inputs[0])[0] - if initial_state is not None: - state = initial_state - else: - if not dtype: - raise ValueError("If no initial_state is provided, dtype must be.") - state = cell.zero_state(batch_size, dtype) - - if sequence_length: # Prepare variables - zero_output_state = ( - tf.zeros(tf.pack([batch_size, cell.output_size]), - inputs[0].dtype), - tf.zeros(tf.pack([batch_size, cell.state_size]), - state.dtype)) - max_sequence_length = tf.reduce_max(sequence_length) - - for time, input_ in enumerate(inputs): - if time > 0: tf.get_variable_scope().reuse_variables() - # pylint: disable=cell-var-from-loop - def output_state(): - return cell(input_, state) - # pylint: enable=cell-var-from-loop - if sequence_length: - (output, state) = control_flow_ops.cond( - time >= max_sequence_length, - lambda: zero_output_state, output_state) - else: - (output, state) = output_state() - - outputs.append(output) - states.append(state) - - return (outputs, states) - - -def state_saving_rnn(cell, inputs, state_saver, state_name, - sequence_length=None, scope=None): - """RNN that accepts a state saver for time-truncated RNN calculation. - - Args: - cell: An instance of RNNCell. - inputs: A length T list of inputs, each a vector with shape [batch_size]. - state_saver: A state saver object with methods `state` and `save_state`. - state_name: The name to use with the state_saver. - sequence_length: (optional) An int64 vector (tensor) size [batch_size]. - See the documentation for rnn() for more details about sequence_length. - scope: VariableScope for the created subgraph; defaults to "RNN". - - Returns: - A pair (outputs, states) where: - outputs is a length T list of outputs (one for each input) - states is a length T list of states (one state following each input) - - Raises: - TypeError: If "cell" is not an instance of RNNCell. - ValueError: If inputs is None or an empty list. 
- """ - initial_state = state_saver.state(state_name) - (outputs, states) = rnn(cell, inputs, initial_state=initial_state, - sequence_length=sequence_length, scope=scope) - save_state = state_saver.save_state(state_name, states[-1]) - with tf.control_dependencies([save_state]): - outputs[-1] = tf.identity(outputs[-1]) - - return (outputs, states) +# pylint: disable=g-bad-import-order,wildcard-import,unused-import +import tensorflow.python.platform +from tensorflow.python.ops.rnn import * diff --git a/tensorflow/models/rnn/rnn_cell.py b/tensorflow/models/rnn/rnn_cell.py index bdedfebd7fc..6ff94e8026f 100644 --- a/tensorflow/models/rnn/rnn_cell.py +++ b/tensorflow/models/rnn/rnn_cell.py @@ -12,614 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""Import rnn_cell python ops for backward compatibility.""" -"""Module for constructing RNN Cells.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import math - -from six.moves import xrange # pylint: disable=redefined-builtin -import tensorflow as tf - -from tensorflow.models.rnn import linear - - -class RNNCell(object): - """Abstract object representing an RNN cell. - - An RNN cell, in the most abstract setting, is anything that has - a state -- a vector of floats of size self.state_size -- and performs some - operation that takes inputs of size self.input_size. This operation - results in an output of size self.output_size and a new state. - - This module provides a number of basic commonly used RNN cells, such as - LSTM (Long Short Term Memory) or GRU (Gated Recurrent Unit), and a number - of operators that allow add dropouts, projections, or embeddings for inputs. - Constructing multi-layer cells is supported by a super-class, MultiRNNCell, - defined later. Every RNNCell must have the properties below and and - implement __call__ with the following signature. - """ - - def __call__(self, inputs, state, scope=None): - """Run this RNN cell on inputs, starting from the given state. - - Args: - inputs: 2D Tensor with shape [batch_size x self.input_size]. - state: 2D Tensor with shape [batch_size x self.state_size]. - scope: VariableScope for the created subgraph; defaults to class name. - - Returns: - A pair containing: - - Output: A 2D Tensor with shape [batch_size x self.output_size] - - New state: A 2D Tensor with shape [batch_size x self.state_size]. - """ - raise NotImplementedError("Abstract method") - - @property - def input_size(self): - """Integer: size of inputs accepted by this cell.""" - raise NotImplementedError("Abstract method") - - @property - def output_size(self): - """Integer: size of outputs produced by this cell.""" - raise NotImplementedError("Abstract method") - - @property - def state_size(self): - """Integer: size of state used by this cell.""" - raise NotImplementedError("Abstract method") - - def zero_state(self, batch_size, dtype): - """Return state tensor (shape [batch_size x state_size]) filled with 0. - - Args: - batch_size: int, float, or unit Tensor representing the batch size. - dtype: the data type to use for the state. - - Returns: - A 2D Tensor of shape [batch_size x state_size] filled with zeros. 
- """ - zeros = tf.zeros(tf.pack([batch_size, self.state_size]), dtype=dtype) - zeros.set_shape([None, self.state_size]) - return zeros - - -class BasicRNNCell(RNNCell): - """The most basic RNN cell.""" - - def __init__(self, num_units): - self._num_units = num_units - - @property - def input_size(self): - return self._num_units - - @property - def output_size(self): - return self._num_units - - @property - def state_size(self): - return self._num_units - - def __call__(self, inputs, state, scope=None): - """Most basic RNN: output = new_state = tanh(W * input + U * state + B).""" - with tf.variable_scope(scope or type(self).__name__): # "BasicRNNCell" - output = tf.tanh(linear.linear([inputs, state], self._num_units, True)) - return output, output - - -class GRUCell(RNNCell): - """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078).""" - - def __init__(self, num_units): - self._num_units = num_units - - @property - def input_size(self): - return self._num_units - - @property - def output_size(self): - return self._num_units - - @property - def state_size(self): - return self._num_units - - def __call__(self, inputs, state, scope=None): - """Gated recurrent unit (GRU) with nunits cells.""" - with tf.variable_scope(scope or type(self).__name__): # "GRUCell" - with tf.variable_scope("Gates"): # Reset gate and update gate. - # We start with bias of 1.0 to not reset and not udpate. - r, u = tf.split(1, 2, linear.linear([inputs, state], - 2 * self._num_units, True, 1.0)) - r, u = tf.sigmoid(r), tf.sigmoid(u) - with tf.variable_scope("Candidate"): - c = tf.tanh(linear.linear([inputs, r * state], self._num_units, True)) - new_h = u * state + (1 - u) * c - return new_h, new_h - - -class BasicLSTMCell(RNNCell): - """Basic LSTM recurrent network cell. - - The implementation is based on: http://arxiv.org/pdf/1409.2329v5.pdf. - - It does not allow cell clipping, a projection layer, and does not - use peep-hole connections: it is the basic baseline. - - Biases of the forget gate are initialized by default to 1 in order to reduce - the scale of forgetting in the beginning of the training. - """ - - def __init__(self, num_units, forget_bias=1.0): - self._num_units = num_units - self._forget_bias = forget_bias - - @property - def input_size(self): - return self._num_units - - @property - def output_size(self): - return self._num_units - - @property - def state_size(self): - return 2 * self._num_units - - def __call__(self, inputs, state, scope=None): - """Long short-term memory cell (LSTM).""" - with tf.variable_scope(scope or type(self).__name__): # "BasicLSTMCell" - # Parameters of gates are concatenated into one multiply for efficiency. - c, h = tf.split(1, 2, state) - concat = linear.linear([inputs, h], 4 * self._num_units, True) - - # i = input_gate, j = new_input, f = forget_gate, o = output_gate - i, j, f, o = tf.split(1, 4, concat) - - new_c = c * tf.sigmoid(f + self._forget_bias) + tf.sigmoid(i) * tf.tanh(j) - new_h = tf.tanh(new_c) * tf.sigmoid(o) - - return new_h, tf.concat(1, [new_c, new_h]) - - -class LSTMCell(RNNCell): - """Long short-term memory unit (LSTM) recurrent network cell. - - This implementation is based on: - - https://research.google.com/pubs/archive/43905.pdf - - Hasim Sak, Andrew Senior, and Francoise Beaufays. - "Long short-term memory recurrent neural network architectures for - large scale acoustic modeling." INTERSPEECH, 2014. - - It uses peep-hole connections, optional cell clipping, and an optional - projection layer. 
- """ - - def __init__(self, num_units, input_size, - use_peepholes=False, cell_clip=None, - initializer=None, num_proj=None, - num_unit_shards=1, num_proj_shards=1): - """Initialize the parameters for an LSTM cell. - - Args: - num_units: int, The number of units in the LSTM cell - input_size: int, The dimensionality of the inputs into the LSTM cell - use_peepholes: bool, set True to enable diagonal/peephole connections. - cell_clip: (optional) A float value, if provided the cell state is clipped - by this value prior to the cell output activation. - initializer: (optional) The initializer to use for the weight and - projection matrices. - num_proj: (optional) int, The output dimensionality for the projection - matrices. If None, no projection is performed. - num_unit_shards: How to split the weight matrix. If >1, the weight - matrix is stored across num_unit_shards. - Note that num_unit_shards must evenly divide num_units * 4. - num_proj_shards: How to split the projection matrix. If >1, the - projection matrix is stored across num_proj_shards. - Note that num_proj_shards must evenly divide num_proj - (if num_proj is not None). - - Raises: - ValueError: if num_unit_shards doesn't divide 4 * num_units or - num_proj_shards doesn't divide num_proj - """ - self._num_units = num_units - self._input_size = input_size - self._use_peepholes = use_peepholes - self._cell_clip = cell_clip - self._initializer = initializer - self._num_proj = num_proj - self._num_unit_shards = num_unit_shards - self._num_proj_shards = num_proj_shards - - if (num_units * 4) % num_unit_shards != 0: - raise ValueError("num_unit_shards must evently divide 4 * num_units") - if num_proj and num_proj % num_proj_shards != 0: - raise ValueError("num_proj_shards must evently divide num_proj") - - if num_proj: - self._state_size = num_units + num_proj - self._output_size = num_proj - else: - self._state_size = 2 * num_units - self._output_size = num_units - - @property - def input_size(self): - return self._input_size - - @property - def output_size(self): - return self._output_size - - @property - def state_size(self): - return self._state_size - - def __call__(self, input_, state, scope=None): - """Run one step of LSTM. - - Args: - input_: input Tensor, 2D, batch x num_units. - state: state Tensor, 2D, batch x state_size. - scope: VariableScope for the created subgraph; defaults to "LSTMCell". - - Returns: - A tuple containing: - - A 2D, batch x output_dim, Tensor representing the output of the LSTM - after reading "input_" when previous state was "state". - Here output_dim is: - num_proj if num_proj was set, - num_units otherwise. - - A 2D, batch x state_size, Tensor representing the new state of LSTM - after reading "input_" when previous state was "state". 
- """ - num_proj = self._num_units if self._num_proj is None else self._num_proj - - c_prev = tf.slice(state, [0, 0], [-1, self._num_units]) - m_prev = tf.slice(state, [0, self._num_units], [-1, num_proj]) - - dtype = input_.dtype - - unit_shard_size = (4 * self._num_units) // self._num_unit_shards - - with tf.variable_scope(scope or type(self).__name__): # "LSTMCell" - w = tf.concat( - 1, - [tf.get_variable("W_%d" % i, - shape=[self.input_size + num_proj, unit_shard_size], - initializer=self._initializer, - dtype=dtype) for i in xrange(self._num_unit_shards)]) - - b = tf.get_variable( - "B", shape=[4 * self._num_units], - initializer=tf.zeros_initializer, dtype=dtype) - - # i = input_gate, j = new_input, f = forget_gate, o = output_gate - cell_inputs = tf.concat(1, [input_, m_prev]) - i, j, f, o = tf.split(1, 4, tf.nn.bias_add(tf.matmul(cell_inputs, w), b)) - - # Diagonal connections - if self._use_peepholes: - w_f_diag = tf.get_variable( - "W_F_diag", shape=[self._num_units], dtype=dtype) - w_i_diag = tf.get_variable( - "W_I_diag", shape=[self._num_units], dtype=dtype) - w_o_diag = tf.get_variable( - "W_O_diag", shape=[self._num_units], dtype=dtype) - - if self._use_peepholes: - c = (tf.sigmoid(f + 1 + w_f_diag * c_prev) * c_prev + - tf.sigmoid(i + w_i_diag * c_prev) * tf.tanh(j)) - else: - c = (tf.sigmoid(f + 1) * c_prev + tf.sigmoid(i) * tf.tanh(j)) - - if self._cell_clip is not None: - c = tf.clip_by_value(c, -self._cell_clip, self._cell_clip) - - if self._use_peepholes: - m = tf.sigmoid(o + w_o_diag * c) * tf.tanh(c) - else: - m = tf.sigmoid(o) * tf.tanh(c) - - if self._num_proj is not None: - proj_shard_size = self._num_proj // self._num_proj_shards - w_proj = tf.concat( - 1, - [tf.get_variable("W_P_%d" % i, - shape=[self._num_units, proj_shard_size], - initializer=self._initializer, - dtype=dtype) - for i in xrange(self._num_proj_shards)]) - # TODO(ebrevdo), use matmulsum - m = tf.matmul(m, w_proj) - - return m, tf.concat(1, [c, m]) - - -class OutputProjectionWrapper(RNNCell): - """Operator adding an output projection to the given cell. - - Note: in many cases it may be more efficient to not use this wrapper, - but instead concatenate the whole sequence of your outputs in time, - do the projection on this batch-concated sequence, then split it - if needed or directly feed into a softmax. - """ - - def __init__(self, cell, output_size): - """Create a cell with output projection. - - Args: - cell: an RNNCell, a projection to output_size is added to it. - output_size: integer, the size of the output after projection. - - Raises: - TypeError: if cell is not an RNNCell. - ValueError: if output_size is not positive. - """ - if not isinstance(cell, RNNCell): - raise TypeError("The parameter cell is not RNNCell.") - if output_size < 1: - raise ValueError("Parameter output_size must be > 0: %d." 
% output_size) - self._cell = cell - self._output_size = output_size - - @property - def input_size(self): - return self._cell.input_size - - @property - def output_size(self): - return self._output_size - - @property - def state_size(self): - return self._cell.state_size - - def __call__(self, inputs, state, scope=None): - """Run the cell and output projection on inputs, starting from state.""" - output, res_state = self._cell(inputs, state) - # Default scope: "OutputProjectionWrapper" - with tf.variable_scope(scope or type(self).__name__): - projected = linear.linear(output, self._output_size, True) - return projected, res_state - - -class InputProjectionWrapper(RNNCell): - """Operator adding an input projection to the given cell. - - Note: in many cases it may be more efficient to not use this wrapper, - but instead concatenate the whole sequence of your inputs in time, - do the projection on this batch-concated sequence, then split it. - """ - - def __init__(self, cell, input_size): - """Create a cell with input projection. - - Args: - cell: an RNNCell, a projection of inputs is added before it. - input_size: integer, the size of the inputs before projection. - - Raises: - TypeError: if cell is not an RNNCell. - ValueError: if input_size is not positive. - """ - if not isinstance(cell, RNNCell): - raise TypeError("The parameter cell is not RNNCell.") - if input_size < 1: - raise ValueError("Parameter input_size must be > 0: %d." % input_size) - self._cell = cell - self._input_size = input_size - - @property - def input_size(self): - return self._input_size - - @property - def output_size(self): - return self._cell.output_size - - @property - def state_size(self): - return self._cell.state_size - - def __call__(self, inputs, state, scope=None): - """Run the input projection and then the cell.""" - # Default scope: "InputProjectionWrapper" - with tf.variable_scope(scope or type(self).__name__): - projected = linear.linear(inputs, self._cell.input_size, True) - return self._cell(projected, state) - - -class DropoutWrapper(RNNCell): - """Operator adding dropout to inputs and outputs of the given cell.""" - - def __init__(self, cell, input_keep_prob=1.0, output_keep_prob=1.0, - seed=None): - """Create a cell with added input and/or output dropout. - - Dropout is never used on the state. - - Args: - cell: an RNNCell, a projection to output_size is added to it. - input_keep_prob: unit Tensor or float between 0 and 1, input keep - probability; if it is float and 1, no input dropout will be added. - output_keep_prob: unit Tensor or float between 0 and 1, output keep - probability; if it is float and 1, no output dropout will be added. - seed: (optional) integer, the randomness seed. - - Raises: - TypeError: if cell is not an RNNCell. - ValueError: if keep_prob is not between 0 and 1. 
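The wrapper cells defined above are designed to nest. A hypothetical composition is sketched here; the sizes and keep probabilities are arbitrary, and the old tensorflow.models.rnn import path is assumed to keep working because rnn_cell.py is now a re-export of tensorflow.python.ops.rnn_cell.

```python
# Hypothetical composition of the wrapper cells above (0.x-era API).
from tensorflow.models.rnn import rnn_cell

base = rnn_cell.GRUCell(num_units=128)
cell = rnn_cell.InputProjectionWrapper(base, input_size=64)
cell = rnn_cell.DropoutWrapper(cell, input_keep_prob=0.9, output_keep_prob=0.9)
cell = rnn_cell.OutputProjectionWrapper(cell, output_size=32)
# cell now maps [batch, 64] inputs to [batch, 32] outputs with a 128-unit state.
```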
- """ - if not isinstance(cell, RNNCell): - raise TypeError("The parameter cell is not a RNNCell.") - if (isinstance(input_keep_prob, float) and - not (input_keep_prob >= 0.0 and input_keep_prob <= 1.0)): - raise ValueError("Parameter input_keep_prob must be between 0 and 1: %d" - % input_keep_prob) - if (isinstance(output_keep_prob, float) and - not (output_keep_prob >= 0.0 and output_keep_prob <= 1.0)): - raise ValueError("Parameter input_keep_prob must be between 0 and 1: %d" - % output_keep_prob) - self._cell = cell - self._input_keep_prob = input_keep_prob - self._output_keep_prob = output_keep_prob - self._seed = seed - - @property - def input_size(self): - return self._cell.input_size - - @property - def output_size(self): - return self._cell.output_size - - @property - def state_size(self): - return self._cell.state_size - - def __call__(self, inputs, state): - """Run the cell with the declared dropouts.""" - if (not isinstance(self._input_keep_prob, float) or - self._input_keep_prob < 1): - inputs = tf.nn.dropout(inputs, self._input_keep_prob, seed=self._seed) - output, new_state = self._cell(inputs, state) - if (not isinstance(self._output_keep_prob, float) or - self._output_keep_prob < 1): - output = tf.nn.dropout(output, self._output_keep_prob, seed=self._seed) - return output, new_state - - -class EmbeddingWrapper(RNNCell): - """Operator adding input embedding to the given cell. - - Note: in many cases it may be more efficient to not use this wrapper, - but instead concatenate the whole sequence of your inputs in time, - do the embedding on this batch-concated sequence, then split it and - feed into your RNN. - """ - - def __init__(self, cell, embedding_classes=0, embedding=None, - initializer=None): - """Create a cell with an added input embedding. - - Args: - cell: an RNNCell, an embedding will be put before its inputs. - embedding_classes: integer, how many symbols will be embedded. - embedding: Variable, the embedding to use; if None, a new embedding - will be created; if set, then embedding_classes is not required. - initializer: an initializer to use when creating the embedding; - if None, the initializer from variable scope or a default one is used. - - Raises: - TypeError: if cell is not an RNNCell. - ValueError: if embedding_classes is not positive. - """ - if not isinstance(cell, RNNCell): - raise TypeError("The parameter cell is not RNNCell.") - if embedding_classes < 1 and embedding is None: - raise ValueError("Pass embedding or embedding_classes must be > 0: %d." - % embedding_classes) - if embedding_classes > 0 and embedding is not None: - if embedding.size[0] != embedding_classes: - raise ValueError("You declared embedding_classes=%d but passed an " - "embedding for %d classes." % (embedding.size[0], - embedding_classes)) - if embedding.size[1] != cell.input_size: - raise ValueError("You passed embedding with output size %d and a cell" - " that accepts size %d." 
% (embedding.size[1], - cell.input_size)) - self._cell = cell - self._embedding_classes = embedding_classes - self._embedding = embedding - self._initializer = initializer - - @property - def input_size(self): - return 1 - - @property - def output_size(self): - return self._cell.output_size - - @property - def state_size(self): - return self._cell.state_size - - def __call__(self, inputs, state, scope=None): - """Run the cell on embedded inputs.""" - with tf.variable_scope(scope or type(self).__name__): # "EmbeddingWrapper" - with tf.device("/cpu:0"): - if self._embedding: - embedding = self._embedding - else: - if self._initializer: - initializer = self._initializer - elif tf.get_variable_scope().initializer: - initializer = tf.get_variable_scope().initializer - else: - # Default initializer for embeddings should have variance=1. - sqrt3 = math.sqrt(3) # Uniform(-sqrt(3), sqrt(3)) has variance=1. - initializer = tf.random_uniform_initializer(-sqrt3, sqrt3) - embedding = tf.get_variable("embedding", [self._embedding_classes, - self._cell.input_size], - initializer=initializer) - embedded = tf.nn.embedding_lookup(embedding, tf.reshape(inputs, [-1])) - return self._cell(embedded, state) - - -class MultiRNNCell(RNNCell): - """RNN cell composed sequentially of multiple simple cells.""" - - def __init__(self, cells): - """Create a RNN cell composed sequentially of a number of RNNCells. - - Args: - cells: list of RNNCells that will be composed in this order. - - Raises: - ValueError: if cells is empty (not allowed) or if their sizes don't match. - """ - if not cells: - raise ValueError("Must specify at least one cell for MultiRNNCell.") - for i in xrange(len(cells) - 1): - if cells[i + 1].input_size != cells[i].output_size: - raise ValueError("In MultiRNNCell, the input size of each next" - " cell must match the output size of the previous one." - " Mismatched output size in cell %d." % i) - self._cells = cells - - @property - def input_size(self): - return self._cells[0].input_size - - @property - def output_size(self): - return self._cells[-1].output_size - - @property - def state_size(self): - return sum([cell.state_size for cell in self._cells]) - - def __call__(self, inputs, state, scope=None): - """Run this multi-layer cell on inputs, starting from state.""" - with tf.variable_scope(scope or type(self).__name__): # "MultiRNNCell" - cur_state_pos = 0 - cur_inp = inputs - new_states = [] - for i, cell in enumerate(self._cells): - with tf.variable_scope("Cell%d" % i): - cur_state = tf.slice(state, [0, cur_state_pos], [-1, cell.state_size]) - cur_state_pos += cell.state_size - cur_inp, new_state = cell(cur_inp, cur_state) - new_states.append(new_state) - return cur_inp, tf.concat(1, new_states) +# pylint: disable=g-bad-import-order,wildcard-import,unused-import +import tensorflow.python.platform +from tensorflow.python.ops.rnn_cell import * diff --git a/tensorflow/models/rnn/seq2seq.py b/tensorflow/models/rnn/seq2seq.py index 77782ee9347..3732a096922 100644 --- a/tensorflow/models/rnn/seq2seq.py +++ b/tensorflow/models/rnn/seq2seq.py @@ -12,757 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== +"""Import seq2seq python ops for backward compatibility.""" -"""Library for creating sequence-to-sequence models.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function +# pylint: disable=g-bad-import-order,wildcard-import,unused-import import tensorflow.python.platform - -from six.moves import xrange # pylint: disable=redefined-builtin -import tensorflow as tf - -from tensorflow.models.rnn import linear -from tensorflow.models.rnn import rnn -from tensorflow.models.rnn import rnn_cell - - -def rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None, - scope=None): - """RNN decoder for the sequence-to-sequence model. - - Args: - decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. - initial_state: 2D Tensor with shape [batch_size x cell.state_size]. - cell: rnn_cell.RNNCell defining the cell function and size. - loop_function: if not None, this function will be applied to i-th output - in order to generate i+1-th input, and decoder_inputs will be ignored, - except for the first element ("GO" symbol). This can be used for decoding, - but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf. - Signature -- loop_function(prev, i) = next - * prev is a 2D Tensor of shape [batch_size x cell.output_size], - * i is an integer, the step number (when advanced control is needed), - * next is a 2D Tensor of shape [batch_size x cell.input_size]. - scope: VariableScope for the created subgraph; defaults to "rnn_decoder". - - Returns: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x cell.output_size] containing generated outputs. - states: The state of each cell in each time-step. This is a list with - length len(decoder_inputs) -- one item for each time-step. - Each item is a 2D Tensor of shape [batch_size x cell.state_size]. - (Note that in some cases, like basic RNN cell or GRU cell, outputs and - states can be the same. They are different for LSTM cells though.) - """ - with tf.variable_scope(scope or "rnn_decoder"): - states = [initial_state] - outputs = [] - prev = None - for i in xrange(len(decoder_inputs)): - inp = decoder_inputs[i] - if loop_function is not None and prev is not None: - with tf.variable_scope("loop_function", reuse=True): - # We do not propagate gradients over the loop function. - inp = tf.stop_gradient(loop_function(prev, i)) - if i > 0: - tf.get_variable_scope().reuse_variables() - output, new_state = cell(inp, states[-1]) - outputs.append(output) - states.append(new_state) - if loop_function is not None: - prev = tf.stop_gradient(output) - return outputs, states - - -def basic_rnn_seq2seq( - encoder_inputs, decoder_inputs, cell, dtype=tf.float32, scope=None): - """Basic RNN sequence-to-sequence model. - - This model first runs an RNN to encode encoder_inputs into a state vector, and - then runs decoder, initialized with the last encoder state, on decoder_inputs. - Encoder and decoder use the same RNN cell type, but don't share parameters. - - Args: - encoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. - decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. - cell: rnn_cell.RNNCell defining the cell function and size. - dtype: The dtype of the initial state of the RNN cell (default: tf.float32). - scope: VariableScope for the created subgraph; default: "basic_rnn_seq2seq". 
- - Returns: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x cell.output_size] containing the generated outputs. - states: The state of each decoder cell in each time-step. This is a list - with length len(decoder_inputs) -- one item for each time-step. - Each item is a 2D Tensor of shape [batch_size x cell.state_size]. - """ - with tf.variable_scope(scope or "basic_rnn_seq2seq"): - _, enc_states = rnn.rnn(cell, encoder_inputs, dtype=dtype) - return rnn_decoder(decoder_inputs, enc_states[-1], cell) - - -def tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, - loop_function=None, dtype=tf.float32, scope=None): - """RNN sequence-to-sequence model with tied encoder and decoder parameters. - - This model first runs an RNN to encode encoder_inputs into a state vector, and - then runs decoder, initialized with the last encoder state, on decoder_inputs. - Encoder and decoder use the same RNN cell and share parameters. - - Args: - encoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. - decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. - cell: rnn_cell.RNNCell defining the cell function and size. - loop_function: if not None, this function will be applied to i-th output - in order to generate i+1-th input, and decoder_inputs will be ignored, - except for the first element ("GO" symbol), see rnn_decoder for details. - dtype: The dtype of the initial state of the rnn cell (default: tf.float32). - scope: VariableScope for the created subgraph; default: "tied_rnn_seq2seq". - - Returns: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x cell.output_size] containing the generated outputs. - states: The state of each decoder cell in each time-step. This is a list - with length len(decoder_inputs) -- one item for each time-step. - Each item is a 2D Tensor of shape [batch_size x cell.state_size]. - """ - with tf.variable_scope("combined_tied_rnn_seq2seq"): - scope = scope or "tied_rnn_seq2seq" - _, enc_states = rnn.rnn( - cell, encoder_inputs, dtype=dtype, scope=scope) - tf.get_variable_scope().reuse_variables() - return rnn_decoder(decoder_inputs, enc_states[-1], cell, - loop_function=loop_function, scope=scope) - - -def embedding_rnn_decoder(decoder_inputs, initial_state, cell, num_symbols, - output_projection=None, feed_previous=False, - scope=None): - """RNN decoder with embedding and a pure-decoding option. - - Args: - decoder_inputs: a list of 1D batch-sized int32-Tensors (decoder inputs). - initial_state: 2D Tensor [batch_size x cell.state_size]. - cell: rnn_cell.RNNCell defining the cell function. - num_symbols: integer, how many symbols come into the embedding. - output_projection: None or a pair (W, B) of output projection weights and - biases; W has shape [cell.output_size x num_symbols] and B has - shape [num_symbols]; if provided and feed_previous=True, each fed - previous output will first be multiplied by W and added B. - feed_previous: Boolean; if True, only the first of decoder_inputs will be - used (the "GO" symbol), and all other decoder inputs will be generated by: - next = embedding_lookup(embedding, argmax(previous_output)), - In effect, this implements a greedy decoder. It can also be used - during training to emulate http://arxiv.org/pdf/1506.03099v2.pdf. - If False, decoder_inputs are used as given (the standard decoder case). - scope: VariableScope for the created subgraph; defaults to - "embedding_rnn_decoder". 
- - Returns: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x cell.output_size] containing the generated outputs. - states: The state of each decoder cell in each time-step. This is a list - with length len(decoder_inputs) -- one item for each time-step. - Each item is a 2D Tensor of shape [batch_size x cell.state_size]. - - Raises: - ValueError: when output_projection has the wrong shape. - """ - if output_projection is not None: - proj_weights = tf.convert_to_tensor(output_projection[0], dtype=tf.float32) - proj_weights.get_shape().assert_is_compatible_with([cell.output_size, - num_symbols]) - proj_biases = tf.convert_to_tensor(output_projection[1], dtype=tf.float32) - proj_biases.get_shape().assert_is_compatible_with([num_symbols]) - - with tf.variable_scope(scope or "embedding_rnn_decoder"): - with tf.device("/cpu:0"): - embedding = tf.get_variable("embedding", [num_symbols, cell.input_size]) - - def extract_argmax_and_embed(prev, _): - """Loop_function that extracts the symbol from prev and embeds it.""" - if output_projection is not None: - prev = tf.nn.xw_plus_b(prev, output_projection[0], output_projection[1]) - prev_symbol = tf.stop_gradient(tf.argmax(prev, 1)) - return tf.nn.embedding_lookup(embedding, prev_symbol) - - loop_function = None - if feed_previous: - loop_function = extract_argmax_and_embed - - emb_inp = [tf.nn.embedding_lookup(embedding, i) for i in decoder_inputs] - return rnn_decoder(emb_inp, initial_state, cell, - loop_function=loop_function) - - -def embedding_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, - num_encoder_symbols, num_decoder_symbols, - output_projection=None, feed_previous=False, - dtype=tf.float32, scope=None): - """Embedding RNN sequence-to-sequence model. - - This model first embeds encoder_inputs by a newly created embedding (of shape - [num_encoder_symbols x cell.input_size]). Then it runs an RNN to encode - embedded encoder_inputs into a state vector. Next, it embeds decoder_inputs - by another newly created embedding (of shape [num_decoder_symbols x - cell.input_size]). Then it runs RNN decoder, initialized with the last - encoder state, on embedded decoder_inputs. - - Args: - encoder_inputs: a list of 1D int32-Tensors of shape [batch_size]. - decoder_inputs: a list of 1D int32-Tensors of shape [batch_size]. - cell: rnn_cell.RNNCell defining the cell function and size. - num_encoder_symbols: integer; number of symbols on the encoder side. - num_decoder_symbols: integer; number of symbols on the decoder side. - output_projection: None or a pair (W, B) of output projection weights and - biases; W has shape [cell.output_size x num_decoder_symbols] and B has - shape [num_decoder_symbols]; if provided and feed_previous=True, each - fed previous output will first be multiplied by W and added B. - feed_previous: Boolean or scalar Boolean Tensor; if True, only the first - of decoder_inputs will be used (the "GO" symbol), and all other decoder - inputs will be taken from previous outputs (as in embedding_rnn_decoder). - If False, decoder_inputs are used as given (the standard decoder case). - dtype: The dtype of the initial state for both the encoder and encoder - rnn cells (default: tf.float32). - scope: VariableScope for the created subgraph; defaults to - "embedding_rnn_seq2seq" - - Returns: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x num_decoder_symbols] containing the generated outputs. - states: The state of each decoder cell in each time-step. 
This is a list - with length len(decoder_inputs) -- one item for each time-step. - Each item is a 2D Tensor of shape [batch_size x cell.state_size]. - """ - with tf.variable_scope(scope or "embedding_rnn_seq2seq"): - # Encoder. - encoder_cell = rnn_cell.EmbeddingWrapper(cell, num_encoder_symbols) - _, encoder_states = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype) - - # Decoder. - if output_projection is None: - cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) - - if isinstance(feed_previous, bool): - return embedding_rnn_decoder(decoder_inputs, encoder_states[-1], cell, - num_decoder_symbols, output_projection, - feed_previous) - else: # If feed_previous is a Tensor, we construct 2 graphs and use cond. - outputs1, states1 = embedding_rnn_decoder( - decoder_inputs, encoder_states[-1], cell, num_decoder_symbols, - output_projection, True) - tf.get_variable_scope().reuse_variables() - outputs2, states2 = embedding_rnn_decoder( - decoder_inputs, encoder_states[-1], cell, num_decoder_symbols, - output_projection, False) - - outputs = tf.control_flow_ops.cond(feed_previous, - lambda: outputs1, lambda: outputs2) - states = tf.control_flow_ops.cond(feed_previous, - lambda: states1, lambda: states2) - return outputs, states - - -def embedding_tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, - num_symbols, output_projection=None, - feed_previous=False, dtype=tf.float32, - scope=None): - """Embedding RNN sequence-to-sequence model with tied (shared) parameters. - - This model first embeds encoder_inputs by a newly created embedding (of shape - [num_symbols x cell.input_size]). Then it runs an RNN to encode embedded - encoder_inputs into a state vector. Next, it embeds decoder_inputs using - the same embedding. Then it runs RNN decoder, initialized with the last - encoder state, on embedded decoder_inputs. - - Args: - encoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. - decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. - cell: rnn_cell.RNNCell defining the cell function and size. - num_symbols: integer; number of symbols for both encoder and decoder. - output_projection: None or a pair (W, B) of output projection weights and - biases; W has shape [cell.output_size x num_symbols] and B has - shape [num_symbols]; if provided and feed_previous=True, each - fed previous output will first be multiplied by W and added B. - feed_previous: Boolean or scalar Boolean Tensor; if True, only the first - of decoder_inputs will be used (the "GO" symbol), and all other decoder - inputs will be taken from previous outputs (as in embedding_rnn_decoder). - If False, decoder_inputs are used as given (the standard decoder case). - dtype: The dtype to use for the initial RNN states (default: tf.float32). - scope: VariableScope for the created subgraph; defaults to - "embedding_tied_rnn_seq2seq". - - Returns: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x num_decoder_symbols] containing the generated outputs. - states: The state of each decoder cell in each time-step. This is a list - with length len(decoder_inputs) -- one item for each time-step. - Each item is a 2D Tensor of shape [batch_size x cell.state_size]. - - Raises: - ValueError: when output_projection has the wrong shape. 
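When feed_previous is a scalar boolean Tensor rather than a Python bool, the seq2seq functions above build both decoder graphs under shared variables and select between them at run time. A stripped-down sketch of that pattern follows, assuming a hypothetical build_decoder callable and the 0.x-era control-flow call used in this file.

```python
import tensorflow as tf

def decode_maybe_feeding_previous(feed_previous_flag, build_decoder):
  # Build the greedy "feed previous outputs" decoder first, then reuse the
  # same variables for the teacher-forced decoder, and pick one with cond.
  outputs1, states1 = build_decoder(feed_previous=True)
  tf.get_variable_scope().reuse_variables()
  outputs2, states2 = build_decoder(feed_previous=False)
  outputs = tf.control_flow_ops.cond(feed_previous_flag,
                                     lambda: outputs1, lambda: outputs2)
  states = tf.control_flow_ops.cond(feed_previous_flag,
                                    lambda: states1, lambda: states2)
  return outputs, states
```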
- """ - if output_projection is not None: - proj_weights = tf.convert_to_tensor(output_projection[0], dtype=dtype) - proj_weights.get_shape().assert_is_compatible_with([cell.output_size, - num_symbols]) - proj_biases = tf.convert_to_tensor(output_projection[1], dtype=dtype) - proj_biases.get_shape().assert_is_compatible_with([num_symbols]) - - with tf.variable_scope(scope or "embedding_tied_rnn_seq2seq"): - with tf.device("/cpu:0"): - embedding = tf.get_variable("embedding", [num_symbols, cell.input_size]) - - emb_encoder_inputs = [tf.nn.embedding_lookup(embedding, x) - for x in encoder_inputs] - emb_decoder_inputs = [tf.nn.embedding_lookup(embedding, x) - for x in decoder_inputs] - - def extract_argmax_and_embed(prev, _): - """Loop_function that extracts the symbol from prev and embeds it.""" - if output_projection is not None: - prev = tf.nn.xw_plus_b(prev, output_projection[0], output_projection[1]) - prev_symbol = tf.stop_gradient(tf.argmax(prev, 1)) - return tf.nn.embedding_lookup(embedding, prev_symbol) - - if output_projection is None: - cell = rnn_cell.OutputProjectionWrapper(cell, num_symbols) - - if isinstance(feed_previous, bool): - loop_function = extract_argmax_and_embed if feed_previous else None - return tied_rnn_seq2seq(emb_encoder_inputs, emb_decoder_inputs, cell, - loop_function=loop_function, dtype=dtype) - else: # If feed_previous is a Tensor, we construct 2 graphs and use cond. - outputs1, states1 = tied_rnn_seq2seq( - emb_encoder_inputs, emb_decoder_inputs, cell, - loop_function=extract_argmax_and_embed, dtype=dtype) - tf.get_variable_scope().reuse_variables() - outputs2, states2 = tied_rnn_seq2seq( - emb_encoder_inputs, emb_decoder_inputs, cell, dtype=dtype) - - outputs = tf.control_flow_ops.cond(feed_previous, - lambda: outputs1, lambda: outputs2) - states = tf.control_flow_ops.cond(feed_previous, - lambda: states1, lambda: states2) - return outputs, states - - -def attention_decoder(decoder_inputs, initial_state, attention_states, cell, - output_size=None, num_heads=1, loop_function=None, - dtype=tf.float32, scope=None): - """RNN decoder with attention for the sequence-to-sequence model. - - Args: - decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. - initial_state: 2D Tensor [batch_size x cell.state_size]. - attention_states: 3D Tensor [batch_size x attn_length x attn_size]. - cell: rnn_cell.RNNCell defining the cell function and size. - output_size: size of the output vectors; if None, we use cell.output_size. - num_heads: number of attention heads that read from attention_states. - loop_function: if not None, this function will be applied to i-th output - in order to generate i+1-th input, and decoder_inputs will be ignored, - except for the first element ("GO" symbol). This can be used for decoding, - but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf. - Signature -- loop_function(prev, i) = next - * prev is a 2D Tensor of shape [batch_size x cell.output_size], - * i is an integer, the step number (when advanced control is needed), - * next is a 2D Tensor of shape [batch_size x cell.input_size]. - dtype: The dtype to use for the RNN initial state (default: tf.float32). - scope: VariableScope for the created subgraph; default: "attention_decoder". - - Returns: - outputs: A list of the same length as decoder_inputs of 2D Tensors of shape - [batch_size x output_size]. These represent the generated outputs. - Output i is computed from input i (which is either i-th decoder_inputs or - loop_function(output {i-1}, i)) as follows. 
First, we run the cell - on a combination of the input and previous attention masks: - cell_output, new_state = cell(linear(input, prev_attn), prev_state). - Then, we calculate new attention masks: - new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) - and then we calculate the output: - output = linear(cell_output, new_attn). - states: The state of each decoder cell in each time-step. This is a list - with length len(decoder_inputs) -- one item for each time-step. - Each item is a 2D Tensor of shape [batch_size x cell.state_size]. - - Raises: - ValueError: when num_heads is not positive, there are no inputs, or shapes - of attention_states are not set. - """ - if not decoder_inputs: - raise ValueError("Must provide at least 1 input to attention decoder.") - if num_heads < 1: - raise ValueError("With less than 1 heads, use a non-attention decoder.") - if not attention_states.get_shape()[1:2].is_fully_defined(): - raise ValueError("Shape[1] and [2] of attention_states must be known: %s" - % attention_states.get_shape()) - if output_size is None: - output_size = cell.output_size - - with tf.variable_scope(scope or "attention_decoder"): - batch_size = tf.shape(decoder_inputs[0])[0] # Needed for reshaping. - attn_length = attention_states.get_shape()[1].value - attn_size = attention_states.get_shape()[2].value - - # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. - hidden = tf.reshape(attention_states, [-1, attn_length, 1, attn_size]) - hidden_features = [] - v = [] - attention_vec_size = attn_size # Size of query vectors for attention. - for a in xrange(num_heads): - k = tf.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size]) - hidden_features.append(tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) - v.append(tf.get_variable("AttnV_%d" % a, [attention_vec_size])) - - states = [initial_state] - - def attention(query): - """Put attention masks on hidden using hidden_features and query.""" - ds = [] # Results of attention reads will be stored here. - for a in xrange(num_heads): - with tf.variable_scope("Attention_%d" % a): - y = linear.linear(query, attention_vec_size, True) - y = tf.reshape(y, [-1, 1, 1, attention_vec_size]) - # Attention mask is a softmax of v^T * tanh(...). - s = tf.reduce_sum(v[a] * tf.tanh(hidden_features[a] + y), [2, 3]) - a = tf.nn.softmax(s) - # Now calculate the attention-weighted vector d. - d = tf.reduce_sum(tf.reshape(a, [-1, attn_length, 1, 1]) * hidden, - [1, 2]) - ds.append(tf.reshape(d, [-1, attn_size])) - return ds - - outputs = [] - prev = None - batch_attn_size = tf.pack([batch_size, attn_size]) - attns = [tf.zeros(batch_attn_size, dtype=dtype) - for _ in xrange(num_heads)] - for a in attns: # Ensure the second shape of attention vectors is set. - a.set_shape([None, attn_size]) - for i in xrange(len(decoder_inputs)): - if i > 0: - tf.get_variable_scope().reuse_variables() - inp = decoder_inputs[i] - # If loop_function is set, we use it instead of decoder_inputs. - if loop_function is not None and prev is not None: - with tf.variable_scope("loop_function", reuse=True): - inp = tf.stop_gradient(loop_function(prev, i)) - # Merge input and previous attentions into one vector of the right size. - x = linear.linear([inp] + attns, cell.input_size, True) - # Run the RNN. - cell_output, new_state = cell(x, states[-1]) - states.append(new_state) - # Run the attention mechanism. 
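The docstring formula above, new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)), is implemented with a 1x1 convolution over the reshaped encoder states plus a linear map of the query. A single-head, single-example NumPy sketch of the same arithmetic is given below; the shapes and names are illustrative assumptions, not the graph variables.

```python
import numpy as np

def additive_attention(hidden, query, w, u, v):
  # hidden: [attn_length, attn_size], query: [state_size],
  # w: [attn_size, attn_vec], u: [state_size, attn_vec], v: [attn_vec].
  scores = np.tanh(hidden.dot(w) + query.dot(u)).dot(v)   # [attn_length]
  weights = np.exp(scores - scores.max())
  weights /= weights.sum()                                 # softmax over steps
  return weights.dot(hidden)                               # weighted read, [attn_size]
```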
- attns = attention(new_state) - with tf.variable_scope("AttnOutputProjection"): - output = linear.linear([cell_output] + attns, output_size, True) - if loop_function is not None: - # We do not propagate gradients over the loop function. - prev = tf.stop_gradient(output) - outputs.append(output) - - return outputs, states - - -def embedding_attention_decoder(decoder_inputs, initial_state, attention_states, - cell, num_symbols, num_heads=1, - output_size=None, output_projection=None, - feed_previous=False, dtype=tf.float32, - scope=None): - """RNN decoder with embedding and attention and a pure-decoding option. - - Args: - decoder_inputs: a list of 1D batch-sized int32-Tensors (decoder inputs). - initial_state: 2D Tensor [batch_size x cell.state_size]. - attention_states: 3D Tensor [batch_size x attn_length x attn_size]. - cell: rnn_cell.RNNCell defining the cell function. - num_symbols: integer, how many symbols come into the embedding. - num_heads: number of attention heads that read from attention_states. - output_size: size of the output vectors; if None, use cell.output_size. - output_projection: None or a pair (W, B) of output projection weights and - biases; W has shape [output_size x num_symbols] and B has shape - [num_symbols]; if provided and feed_previous=True, each fed previous - output will first be multiplied by W and added B. - feed_previous: Boolean; if True, only the first of decoder_inputs will be - used (the "GO" symbol), and all other decoder inputs will be generated by: - next = embedding_lookup(embedding, argmax(previous_output)), - In effect, this implements a greedy decoder. It can also be used - during training to emulate http://arxiv.org/pdf/1506.03099v2.pdf. - If False, decoder_inputs are used as given (the standard decoder case). - dtype: The dtype to use for the RNN initial states (default: tf.float32). - scope: VariableScope for the created subgraph; defaults to - "embedding_attention_decoder". - - Returns: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x output_size] containing the generated outputs. - states: The state of each decoder cell in each time-step. This is a list - with length len(decoder_inputs) -- one item for each time-step. - Each item is a 2D Tensor of shape [batch_size x cell.state_size]. - - Raises: - ValueError: when output_projection has the wrong shape. 
- """ - if output_size is None: - output_size = cell.output_size - if output_projection is not None: - proj_weights = tf.convert_to_tensor(output_projection[0], dtype=dtype) - proj_weights.get_shape().assert_is_compatible_with([cell.output_size, - num_symbols]) - proj_biases = tf.convert_to_tensor(output_projection[1], dtype=dtype) - proj_biases.get_shape().assert_is_compatible_with([num_symbols]) - - with tf.variable_scope(scope or "embedding_attention_decoder"): - with tf.device("/cpu:0"): - embedding = tf.get_variable("embedding", [num_symbols, cell.input_size]) - - def extract_argmax_and_embed(prev, _): - """Loop_function that extracts the symbol from prev and embeds it.""" - if output_projection is not None: - prev = tf.nn.xw_plus_b(prev, output_projection[0], output_projection[1]) - prev_symbol = tf.stop_gradient(tf.argmax(prev, 1)) - emb_prev = tf.nn.embedding_lookup(embedding, prev_symbol) - return emb_prev - - loop_function = None - if feed_previous: - loop_function = extract_argmax_and_embed - - emb_inp = [tf.nn.embedding_lookup(embedding, i) for i in decoder_inputs] - return attention_decoder( - emb_inp, initial_state, attention_states, cell, output_size=output_size, - num_heads=num_heads, loop_function=loop_function) - - -def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell, - num_encoder_symbols, num_decoder_symbols, - num_heads=1, output_projection=None, - feed_previous=False, dtype=tf.float32, - scope=None): - """Embedding sequence-to-sequence model with attention. - - This model first embeds encoder_inputs by a newly created embedding (of shape - [num_encoder_symbols x cell.input_size]). Then it runs an RNN to encode - embedded encoder_inputs into a state vector. It keeps the outputs of this - RNN at every step to use for attention later. Next, it embeds decoder_inputs - by another newly created embedding (of shape [num_decoder_symbols x - cell.input_size]). Then it runs attention decoder, initialized with the last - encoder state, on embedded decoder_inputs and attending to encoder outputs. - - Args: - encoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. - decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. - cell: rnn_cell.RNNCell defining the cell function and size. - num_encoder_symbols: integer; number of symbols on the encoder side. - num_decoder_symbols: integer; number of symbols on the decoder side. - num_heads: number of attention heads that read from attention_states. - output_projection: None or a pair (W, B) of output projection weights and - biases; W has shape [cell.output_size x num_decoder_symbols] and B has - shape [num_decoder_symbols]; if provided and feed_previous=True, each - fed previous output will first be multiplied by W and added B. - feed_previous: Boolean or scalar Boolean Tensor; if True, only the first - of decoder_inputs will be used (the "GO" symbol), and all other decoder - inputs will be taken from previous outputs (as in embedding_rnn_decoder). - If False, decoder_inputs are used as given (the standard decoder case). - dtype: The dtype of the initial RNN state (default: tf.float32). - scope: VariableScope for the created subgraph; defaults to - "embedding_attention_seq2seq". - - Returns: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x num_decoder_symbols] containing the generated outputs. - states: The state of each decoder cell in each time-step. This is a list - with length len(decoder_inputs) -- one item for each time-step. 
- Each item is a 2D Tensor of shape [batch_size x cell.state_size]. - """ - with tf.variable_scope(scope or "embedding_attention_seq2seq"): - # Encoder. - encoder_cell = rnn_cell.EmbeddingWrapper(cell, num_encoder_symbols) - encoder_outputs, encoder_states = rnn.rnn( - encoder_cell, encoder_inputs, dtype=dtype) - - # First calculate a concatenation of encoder outputs to put attention on. - top_states = [tf.reshape(e, [-1, 1, cell.output_size]) - for e in encoder_outputs] - attention_states = tf.concat(1, top_states) - - # Decoder. - output_size = None - if output_projection is None: - cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) - output_size = num_decoder_symbols - - if isinstance(feed_previous, bool): - return embedding_attention_decoder( - decoder_inputs, encoder_states[-1], attention_states, cell, - num_decoder_symbols, num_heads, output_size, output_projection, - feed_previous) - else: # If feed_previous is a Tensor, we construct 2 graphs and use cond. - outputs1, states1 = embedding_attention_decoder( - decoder_inputs, encoder_states[-1], attention_states, cell, - num_decoder_symbols, num_heads, output_size, output_projection, True) - tf.get_variable_scope().reuse_variables() - outputs2, states2 = embedding_attention_decoder( - decoder_inputs, encoder_states[-1], attention_states, cell, - num_decoder_symbols, num_heads, output_size, output_projection, False) - - outputs = tf.control_flow_ops.cond(feed_previous, - lambda: outputs1, lambda: outputs2) - states = tf.control_flow_ops.cond(feed_previous, - lambda: states1, lambda: states2) - return outputs, states - - -def sequence_loss_by_example(logits, targets, weights, num_decoder_symbols, - average_across_timesteps=True, - softmax_loss_function=None, name=None): - """Weighted cross-entropy loss for a sequence of logits (per example). - - Args: - logits: list of 2D Tensors of shape [batch_size x num_decoder_symbols]. - targets: list of 1D batch-sized int32-Tensors of the same length as logits. - weights: list of 1D batch-sized float-Tensors of the same length as logits. - num_decoder_symbols: integer, number of decoder symbols (output classes). - average_across_timesteps: If set, divide the returned cost by the total - label weight. - softmax_loss_function: function (inputs-batch, labels-batch) -> loss-batch - to be used instead of the standard softmax (the default if this is None). - name: optional name for this operation, default: "sequence_loss_by_example". - - Returns: - 1D batch-sized float Tensor: the log-perplexity for each sequence. - - Raises: - ValueError: if len(logits) is different from len(targets) or len(weights). - """ - if len(targets) != len(logits) or len(weights) != len(logits): - raise ValueError("Lengths of logits, weights, and targets must be the same " - "%d, %d, %d." % (len(logits), len(weights), len(targets))) - with tf.op_scope(logits + targets + weights, name, - "sequence_loss_by_example"): - batch_size = tf.shape(targets[0])[0] - log_perp_list = [] - length = batch_size * num_decoder_symbols - for i in xrange(len(logits)): - if softmax_loss_function is None: - # TODO(lukaszkaiser): There is no SparseCrossEntropy in TensorFlow, so - # we need to first cast targets into a dense representation, and as - # SparseToDense does not accept batched inputs, we need to do this by - # re-indexing and re-sizing. When TensorFlow adds SparseCrossEntropy, - # rewrite this method. 
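To make the workaround described in the comment above concrete: each (batch row, target id) pair is flattened into a single index of a length batch_size * num_decoder_symbols vector, densified, and reshaped back into per-row one-hot targets. A small NumPy illustration with made-up values:

```python
import numpy as np

batch_size, num_symbols = 3, 5
targets = np.array([1, 4, 0])                       # one class id per batch row
indices = targets + num_symbols * np.arange(batch_size)
dense = np.zeros(batch_size * num_symbols)
dense[indices] = 1.0                                # flat "sparse to dense"
one_hot = dense.reshape(batch_size, num_symbols)    # row b is one-hot at targets[b]
```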
- indices = targets[i] + num_decoder_symbols * tf.range(batch_size) - with tf.device("/cpu:0"): # Sparse-to-dense must happen on CPU for now. - dense = tf.sparse_to_dense(indices, tf.expand_dims(length, 0), 1.0, - 0.0) - target = tf.reshape(dense, [-1, num_decoder_symbols]) - crossent = tf.nn.softmax_cross_entropy_with_logits( - logits[i], target, name="SequenceLoss/CrossEntropy{0}".format(i)) - else: - crossent = softmax_loss_function(logits[i], targets[i]) - log_perp_list.append(crossent * weights[i]) - log_perps = tf.add_n(log_perp_list) - if average_across_timesteps: - total_size = tf.add_n(weights) - total_size += 1e-12 # Just to avoid division by 0 for all-0 weights. - log_perps /= total_size - return log_perps - - -def sequence_loss(logits, targets, weights, num_decoder_symbols, - average_across_timesteps=True, average_across_batch=True, - softmax_loss_function=None, name=None): - """Weighted cross-entropy loss for a sequence of logits, batch-collapsed. - - Args: - logits: list of 2D Tensors os shape [batch_size x num_decoder_symbols]. - targets: list of 1D batch-sized int32-Tensors of the same length as logits. - weights: list of 1D batch-sized float-Tensors of the same length as logits. - num_decoder_symbols: integer, number of decoder symbols (output classes). - average_across_timesteps: If set, divide the returned cost by the total - label weight. - average_across_batch: If set, divide the returned cost by the batch size. - softmax_loss_function: function (inputs-batch, labels-batch) -> loss-batch - to be used instead of the standard softmax (the default if this is None). - name: optional name for this operation, defaults to "sequence_loss". - - Returns: - A scalar float Tensor: the average log-perplexity per symbol (weighted). - - Raises: - ValueError: if len(logits) is different from len(targets) or len(weights). - """ - with tf.op_scope(logits + targets + weights, name, "sequence_loss"): - cost = tf.reduce_sum(sequence_loss_by_example( - logits, targets, weights, num_decoder_symbols, - average_across_timesteps=average_across_timesteps, - softmax_loss_function=softmax_loss_function)) - if average_across_batch: - batch_size = tf.shape(targets[0])[0] - return cost / tf.cast(batch_size, tf.float32) - else: - return cost - - -def model_with_buckets(encoder_inputs, decoder_inputs, targets, weights, - buckets, num_decoder_symbols, seq2seq, - softmax_loss_function=None, name=None): - """Create a sequence-to-sequence model with support for bucketing. - - The seq2seq argument is a function that defines a sequence-to-sequence model, - e.g., seq2seq = lambda x, y: basic_rnn_seq2seq(x, y, rnn_cell.GRUCell(24)) - - Args: - encoder_inputs: a list of Tensors to feed the encoder; first seq2seq input. - decoder_inputs: a list of Tensors to feed the decoder; second seq2seq input. - targets: a list of 1D batch-sized int32-Tensors (desired output sequence). - weights: list of 1D batch-sized float-Tensors to weight the targets. - buckets: a list of pairs of (input size, output size) for each bucket. - num_decoder_symbols: integer, number of decoder symbols (output classes). - seq2seq: a sequence-to-sequence model function; it takes 2 input that - agree with encoder_inputs and decoder_inputs, and returns a pair - consisting of outputs and states (as, e.g., basic_rnn_seq2seq). - softmax_loss_function: function (inputs-batch, labels-batch) -> loss-batch - to be used instead of the standard softmax (the default if this is None). 
- name: optional name for this operation, defaults to "model_with_buckets". - - Returns: - outputs: The outputs for each bucket. Its j'th element consists of a list - of 2D Tensors of shape [batch_size x num_decoder_symbols] (j'th outputs). - losses: List of scalar Tensors, representing losses for each bucket. - Raises: - ValueError: if length of encoder_inputsut, targets, or weights is smaller - than the largest (last) bucket. - """ - if len(encoder_inputs) < buckets[-1][0]: - raise ValueError("Length of encoder_inputs (%d) must be at least that of la" - "st bucket (%d)." % (len(encoder_inputs), buckets[-1][0])) - if len(targets) < buckets[-1][1]: - raise ValueError("Length of targets (%d) must be at least that of last" - "bucket (%d)." % (len(targets), buckets[-1][1])) - if len(weights) < buckets[-1][1]: - raise ValueError("Length of weights (%d) must be at least that of last" - "bucket (%d)." % (len(weights), buckets[-1][1])) - - all_inputs = encoder_inputs + decoder_inputs + targets + weights - losses = [] - outputs = [] - with tf.op_scope(all_inputs, name, "model_with_buckets"): - for j in xrange(len(buckets)): - if j > 0: - tf.get_variable_scope().reuse_variables() - bucket_encoder_inputs = [encoder_inputs[i] - for i in xrange(buckets[j][0])] - bucket_decoder_inputs = [decoder_inputs[i] - for i in xrange(buckets[j][1])] - bucket_outputs, _ = seq2seq(bucket_encoder_inputs, - bucket_decoder_inputs) - outputs.append(bucket_outputs) - - bucket_targets = [targets[i] for i in xrange(buckets[j][1])] - bucket_weights = [weights[i] for i in xrange(buckets[j][1])] - losses.append(sequence_loss( - outputs[-1], bucket_targets, bucket_weights, num_decoder_symbols, - softmax_loss_function=softmax_loss_function)) - - return outputs, losses +from tensorflow.python.ops.seq2seq import * diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 76126b30435..49f42dd6f3b 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -34,6 +34,7 @@ py_library( ":client_testlib", ":framework", ":framework_test_lib", + ":kernel_tests/gradient_checker", ":platform", ":platform_test", ":summary", @@ -467,6 +468,7 @@ tf_gen_op_wrapper_py( "ReluGrad", "Relu6Grad", "SoftplusGrad", + "SoftsignGrad", "BiasAdd", "Relu6", "AvgPool", @@ -588,6 +590,9 @@ py_library( "ops/op_def_library.py", "ops/parsing_ops.py", "ops/random_ops.py", + "ops/rnn.py", + "ops/rnn_cell.py", + "ops/seq2seq.py", "ops/sparse_grad.py", "ops/sparse_ops.py", "ops/standard_ops.py", diff --git a/tensorflow/python/framework/gen_docs_combined.py b/tensorflow/python/framework/gen_docs_combined.py index 9646ef6673d..7c4018332d8 100644 --- a/tensorflow/python/framework/gen_docs_combined.py +++ b/tensorflow/python/framework/gen_docs_combined.py @@ -93,8 +93,8 @@ def all_libraries(module_to_name, members, documented): "max_pool_grad", "max_pool_grad_with_argmax", "batch_norm_with_global_normalization_grad", "lrn_grad", "relu6_grad", "softplus_grad", - "xw_plus_b", "relu_layer", "lrn", - "batch_norm_with_global_normalization", + "softsign_grad", "xw_plus_b", "relu_layer", + "lrn", "batch_norm_with_global_normalization", "batch_norm_with_global_normalization_grad", "all_candidate_sampler", "embedding_lookup_sparse"], diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index d66e93300d9..352c73c0f77 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -442,8 +442,8 @@ class Tensor(object): return _eval_using_default_session(self, feed_dict, self.graph, session) -def 
_TensorTensorConversionFunction(t, dtype=None, name=None): - _ = name +def _TensorTensorConversionFunction(t, dtype=None, name=None, as_ref=False): + _ = name, as_ref if dtype and not dtype.is_compatible_with(t.dtype): raise ValueError( "Tensor conversion requested dtype %s for Tensor with dtype %s: %r" @@ -455,7 +455,7 @@ _tensor_conversion_func_registry = { 0: [(Tensor, _TensorTensorConversionFunction)]} -def convert_to_tensor(value, dtype=None, name=None): +def convert_to_tensor(value, dtype=None, name=None, as_ref=False): """Converts the given `value` to a `Tensor`. This function converts Python objects of various types to `Tensor` @@ -487,6 +487,7 @@ def convert_to_tensor(value, dtype=None, name=None): dtype: Optional element type for the returned tensor. If missing, the type is inferred from the type of `value`. name: Optional name to use if a new `Tensor` is created. + as_ref: True if we want the result as a ref tensor. Returns: A `Tensor` based on `value`. @@ -502,7 +503,7 @@ def convert_to_tensor(value, dtype=None, name=None): for _, funcs_at_priority in sorted(_tensor_conversion_func_registry.items()): for base_type, conversion_func in funcs_at_priority: if isinstance(value, base_type): - ret = conversion_func(value, dtype=dtype, name=name) + ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref) if not isinstance(ret, Tensor): raise RuntimeError( "%sConversion function %r for type %s returned non-Tensor: %r" @@ -519,7 +520,8 @@ def convert_to_tensor(value, dtype=None, name=None): % (error_prefix, value, type(value))) -def convert_to_tensor_or_indexed_slices(value, dtype=None, name=None): +def convert_to_tensor_or_indexed_slices(value, dtype=None, name=None, + as_ref=False): """Converts the given object to a `Tensor` or an `IndexedSlices`. If `value` is an `IndexedSlices` it is returned @@ -532,6 +534,7 @@ def convert_to_tensor_or_indexed_slices(value, dtype=None, name=None): dtype: (Optional.) The required `DType` of the returned `Tensor` or `IndexedSlices`. name: (Optional.) A name to use if a new `Tensor` is created. + as_ref: True if the caller wants the results as ref tensors. Returns: An `Tensor` or an `IndexedSlices` based on `value`. @@ -546,10 +549,11 @@ def convert_to_tensor_or_indexed_slices(value, dtype=None, name=None): % (dtypes.as_dtype(dtype).name, value.dtype.name, str(value))) return value else: - return convert_to_tensor(value, dtype, name) + return convert_to_tensor(value, dtype=dtype, name=name, as_ref=as_ref) -def convert_n_to_tensor_or_indexed_slices(values, dtype=None, name=None): +def convert_n_to_tensor_or_indexed_slices(values, dtype=None, name=None, + as_ref=False): """Converts `values` to a list of `Tensor` or `IndexedSlices` objects. Args: @@ -557,10 +561,10 @@ def convert_n_to_tensor_or_indexed_slices(values, dtype=None, name=None): by `convert_to_tensor()`. dtype: (Optional.) The required `DType` of the returned `Tensor` `IndexedSlices`. - name: (Optional.) A name prefix to used when a new `Tensor` is created, in which case element `i` will be given the name `name + '_' + i`. + as_ref: True if the caller wants the results as ref tensors. Returns: A list of `Tensor` and/or `IndexedSlices` objects. 
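With as_ref now threaded through convert_to_tensor and the conversion-function registry, a user-registered converter accepts (and may simply ignore) the extra argument. A minimal sketch of a conforming converter for a hypothetical wrapper type, modeled on _TensorTensorConversionFunction above; the type, function names, and the commented registration call are assumptions for illustration.

```python
class TensorHolder(object):
  # Hypothetical user type that simply wraps an existing Tensor.
  def __init__(self, tensor):
    self.tensor = tensor

def _holder_to_tensor(value, dtype=None, name=None, as_ref=False):
  _ = name, as_ref  # nothing ref-like to hand back for this simple wrapper
  if dtype and not dtype.is_compatible_with(value.tensor.dtype):
    raise ValueError("Tensor conversion requested dtype %s for Tensor with "
                     "dtype %s" % (dtype.name, value.tensor.dtype.name))
  return value.tensor

# ops.register_tensor_conversion_function(TensorHolder, _holder_to_tensor)
```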
@@ -580,7 +584,8 @@ def convert_n_to_tensor_or_indexed_slices(values, dtype=None, name=None): else: n = None if name is None else "%s_%d" % (name, i) ret.append( - convert_to_tensor_or_indexed_slices(value, dtype=dtype, name=n)) + convert_to_tensor_or_indexed_slices(value, dtype=dtype, name=n, + as_ref=as_ref)) return ret @@ -590,13 +595,16 @@ def register_tensor_conversion_function(base_type, conversion_func, The conversion function must have the following signature: - def conversion_func(value, dtype=None, name=None): + def conversion_func(value, dtype=None, name=None, as_ref=False): # ... It must return a Tensor with the given dtype if specified. If the conversion function creates a new Tensor, it should use the given name if specified. All exceptions will be propagated to the caller. + If `as_ref` is true, the function must return a Tensor reference, + such as a VariableOp. + NOTE: The conversion functions will execute in order of priority, followed by order of registration. To ensure that a conversion function F runs before another conversion function G, ensure that @@ -762,23 +770,23 @@ class SparseTensor(object): ``` By convention, `indices` should be sorted in row-major order (or equivalently - lexigraphic order on the tuples `indices[i]`). This is not enforced when - `SparseTensor` objects are constructed, but most Ops assume correct ordering. + lexicographic order on the tuples `indices[i]`). This is not enforced when + `SparseTensor` objects are constructed, but most ops assume correct ordering. If the ordering is wrong, it can be fixed by calling `sparse_reorder` on the misordered `SparseTensor`. Example: The sparse tensor ```python - SparseTensor(values=[1, 2], indices=[[0, 0], [1, 2]], shape=[3, 4]) + SparseTensor(values=[1, 2], indices=[[0, 0], [1, 2]], shape=[3, 4]) ``` represents the dense tensor ```python - [[1, 0, 0, 0] - [0, 0, 2, 0] - [0, 0, 0, 0]] + [[1, 0, 0, 0] + [0, 0, 2, 0] + [0, 0, 0, 0]] ``` @@__init__ @@ -795,14 +803,18 @@ class SparseTensor(object): Args: indices: A 2-D int64 tensor of shape `[N, ndims]`. values: A 1-D tensor of any type and shape `[N]`. - dense_shape: A 1-D int64 tensor of shape `[ndims]`. + shape: A 1-D int64 tensor of shape `[ndims]`. Returns: A `SparseTensor` """ with op_scope([indices, values, shape], None, "SparseTensor"): indices = convert_to_tensor(indices, name="indices") - values = convert_to_tensor(values, name="values") + # Always pass as_ref=True because we want to be able to update + # values later if it is a VariableOp. + # TODO(touts): Consider adding mutable_values() when 'values' + # is a VariableOp and updating users of SparseTensor. + values = convert_to_tensor(values, name="values", as_ref=True) shape = convert_to_tensor(shape, name="shape") self._indices = indices self._values = values @@ -987,7 +999,9 @@ class Operation(object): self._graph = g if inputs is None: inputs = [] - self._inputs = inputs + elif not isinstance(inputs, list): + raise TypeError("inputs needs to be a list of Tensors: %s" % inputs) + self._inputs = list(inputs) # Defensive copy. for a in self._inputs: if not isinstance(a, Tensor): raise TypeError("input needs to be a Tensor: %s" % a) @@ -1391,6 +1405,7 @@ def get_gradient_function(op): _shape_registry = registry.Registry("shape functions") _default_shape_function_registry = registry.Registry("default shape functions") + class RegisterShape(object): """A decorator for registering the shape function for an op type. @@ -1924,6 +1939,7 @@ class Graph(object): A list of Operations. 
""" return list(self._nodes_by_id.values()) + def get_operation_by_name(self, name): """Returns the `Operation` with the given `name`. @@ -2045,7 +2061,7 @@ class Graph(object): else: c = [] for item in self._collections.get(name, list()): - if hasattr(item, 'name') and item.name.startswith(scope): + if hasattr(item, "name") and item.name.startswith(scope): c.append(item) return c diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py index 7802db473e4..dd0c6e01b6d 100644 --- a/tensorflow/python/framework/tensor_util.py +++ b/tensorflow/python/framework/tensor_util.py @@ -522,19 +522,21 @@ def ConstantValue(tensor): elif tensor.op.type == "Shape": input_shape = tensor.op.inputs[0].get_shape() if input_shape.is_fully_defined(): - return np.array([dim.value for dim in input_shape.dims]) + return np.array([dim.value for dim in input_shape.dims], + dtype=tensor.dtype.as_numpy_dtype) else: return None elif tensor.op.type == "Size": input_shape = tensor.op.inputs[0].get_shape() if input_shape.is_fully_defined(): - return np.array([np.prod([dim.value for dim in input_shape.dims])]) + return np.array([np.prod([dim.value for dim in input_shape.dims])], + dtype=tensor.dtype.as_numpy_dtype) else: return None elif tensor.op.type == "Rank": input_shape = tensor.op.inputs[0].get_shape() if input_shape.ndims is not None: - return np.array([input_shape.ndims]) + return np.array([input_shape.ndims], dtype=tensor.dtype.as_numpy_dtype) else: return None elif tensor.op.type == "Range": diff --git a/tensorflow/python/framework/tensor_util_test.py b/tensorflow/python/framework/tensor_util_test.py index c7e672d460e..f2828475aef 100644 --- a/tensorflow/python/framework/tensor_util_test.py +++ b/tensorflow/python/framework/tensor_util_test.py @@ -378,19 +378,25 @@ class ConstantValueTest(test_util.TensorFlowTestCase): self.assertIs(None, tensor_util.ConstantValue(tf_val)) def testShape(self): - np_val = np.array([1, 2, 3]) + np_val = np.array([1, 2, 3], dtype=np.int32) tf_val = array_ops.shape(constant_op.constant(0.0, shape=[1, 2, 3])) - self.assertAllEqual(np_val, tensor_util.ConstantValue(tf_val)) + c_val = tensor_util.ConstantValue(tf_val) + self.assertAllEqual(np_val, c_val) + self.assertEqual(np.int32, c_val.dtype) def testSize(self): - np_val = np.array([6]) + np_val = np.array([6], dtype=np.int32) tf_val = array_ops.size(constant_op.constant(0.0, shape=[1, 2, 3])) - self.assertAllEqual(np_val, tensor_util.ConstantValue(tf_val)) + c_val = tensor_util.ConstantValue(tf_val) + self.assertAllEqual(np_val, c_val) + self.assertEqual(np.int32, c_val.dtype) def testRank(self): - np_val = np.array([3]) + np_val = np.array([3], dtype=np.int32) tf_val = array_ops.rank(constant_op.constant(0.0, shape=[1, 2, 3])) - self.assertAllEqual(np_val, tensor_util.ConstantValue(tf_val)) + c_val = tensor_util.ConstantValue(tf_val) + self.assertAllEqual(np_val, c_val) + self.assertEqual(np.int32, c_val.dtype) if __name__ == "__main__": diff --git a/tensorflow/python/kernel_tests/batch_matmul_op_test.py b/tensorflow/python/kernel_tests/batch_matmul_op_test.py index 2a09594ad43..809b23bd7d5 100644 --- a/tensorflow/python/kernel_tests/batch_matmul_op_test.py +++ b/tensorflow/python/kernel_tests/batch_matmul_op_test.py @@ -23,8 +23,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker as gc - class BatchMatmulOpTest(tf.test.TestCase): @@ -176,9 +174,14 @@ class BatchMatmulGradientTest(tf.test.TestCase): 
z = tf.batch_matmul(inx, iny, adj_x, adj_y) loss = tf.reduce_sum(z) epsilon = 1e-2 - ((x_jacob_t, x_jacob_n), (y_jacob_t, y_jacob_n)) = gc.ComputeGradient( - [inx, iny], [x.shape, y.shape], loss, [1], - x_init_value=[x, y], delta=epsilon) + ((x_jacob_t, x_jacob_n), + (y_jacob_t, y_jacob_n)) = tf.test.compute_gradient( + [inx, iny], + [x.shape, y.shape], + loss, + [1], + x_init_value=[x, y], + delta=epsilon) tf.logging.info("x_jacob_t = %s", x_jacob_t.reshape(x.shape)) tf.logging.info("x_jacob_n = %s", x_jacob_n.reshape(x.shape)) diff --git a/tensorflow/python/kernel_tests/bias_op_test.py b/tensorflow/python/kernel_tests/bias_op_test.py index cffbfc97c4e..e79cb8fc022 100644 --- a/tensorflow/python/kernel_tests/bias_op_test.py +++ b/tensorflow/python/kernel_tests/bias_op_test.py @@ -23,8 +23,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker - class BiasAddTest(tf.test.TestCase): @@ -82,7 +80,7 @@ class BiasAddTest(tf.test.TestCase): dtype=tf.float64) b = tf.constant([1.3, 2.4], dtype=tf.float64) bo = tf.nn.bias_add(t, b) - err = gradient_checker.ComputeGradientError(t, [3, 2], bo, [3, 2]) + err = tf.test.compute_gradient_error(t, [3, 2], bo, [3, 2]) print("bias add tensor gradient err = ", err) self.assertLess(err, 1e-10) @@ -92,7 +90,7 @@ class BiasAddTest(tf.test.TestCase): dtype=tf.float64) b = tf.constant([1.3, 2.4], dtype=tf.float64) bo = tf.nn.bias_add(t, b) - err = gradient_checker.ComputeGradientError(b, [2], bo, [3, 2]) + err = tf.test.compute_gradient_error(b, [2], bo, [3, 2]) print("bias add bias gradient err = ", err) self.assertLess(err, 1e-10) @@ -103,7 +101,7 @@ class BiasAddTest(tf.test.TestCase): t = tf.constant(x, shape=s, dtype=tf.float32) b = tf.constant([1.3, 2.4], dtype=tf.float32) bo = tf.nn.bias_add(t, b) - err = gradient_checker.ComputeGradientError(t, s, bo, s, x_init_value=x) + err = tf.test.compute_gradient_error(t, s, bo, s, x_init_value=x) print("bias add tensor gradient err = ", err) self.assertLess(err, 1e-3) diff --git a/tensorflow/python/kernel_tests/cast_op_test.py b/tensorflow/python/kernel_tests/cast_op_test.py index 4b3699c0611..cf2a8949cbe 100644 --- a/tensorflow/python/kernel_tests/cast_op_test.py +++ b/tensorflow/python/kernel_tests/cast_op_test.py @@ -23,8 +23,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker as gc - class CastOpTest(tf.test.TestCase): @@ -160,7 +158,7 @@ class CastOpTest(tf.test.TestCase): x = tf.constant(1.0, src_t) z = tf.identity(x) y = tf.cast(z, dst_t) - err = gc.ComputeGradientError(x, [1], y, [1]) + err = tf.test.compute_gradient_error(x, [1], y, [1]) self.assertLess(err, 1e-3) diff --git a/tensorflow/python/kernel_tests/concat_op_test.py b/tensorflow/python/kernel_tests/concat_op_test.py index f96750d4b04..9bd73f710a5 100644 --- a/tensorflow/python/kernel_tests/concat_op_test.py +++ b/tensorflow/python/kernel_tests/concat_op_test.py @@ -303,6 +303,63 @@ class ConcatOpTest(tf.test.TestCase): dxs = sess.run(tf.gradients(c, xs, dc)) self.assertAllEqual(dc, np.concatenate(dxs, axis=axis)) + def testTensorConcatDim0Grad(self): + x_shapes = [[20, 7, 3], [10, 7, 3], [14, 7, 3]] + output_shape = [44, 7, 3] + x_vals = [np.random.random_sample(x_shape).astype( + np.float64) for x_shape in x_shapes] + with self.test_session(): + xs = [tf.constant(x_val) for x_val in x_vals] + output = tf.concat(0, xs) + err = tf.test.compute_gradient_error(xs, x_shapes, 
output, output_shape) + self.assertLess(err, 1e-11) + + def testTensorConcatDim1Grad(self): + x_shapes = [[20, 7, 3], [20, 3, 3], [20, 1, 3]] + output_shape = [20, 11, 3] + x_vals = [np.random.random_sample(x_shape).astype( + np.float64) for x_shape in x_shapes] + with self.test_session(): + xs = [tf.constant(x_val) for x_val in x_vals] + output = tf.concat(1, xs) + err = tf.test.compute_gradient_error(xs, x_shapes, output, output_shape) + self.assertLess(err, 1e-11) + + def testIndexedSlicesConcatDim0Grad(self): + x_shapes = [[20, 7, 3], [10, 7, 3], [14, 7, 3]] + output_shape = [4, 7, 3] + x_vals = [np.random.random_sample(x_shape).astype( + np.float64) for x_shape in x_shapes] + with self.test_session(): + xs = [tf.constant(x_val) for x_val in x_vals] + x_concat = tf.concat(0, xs) + output = tf.gather(x_concat, [1, 2, 0, 5]) + err = tf.test.compute_gradient_error(xs, x_shapes, output, output_shape) + self.assertLess(err, 1e-11) + + def testIndexedSlicesConcatDim1Grad(self): + x_shapes = [[20, 7, 3], [20, 3, 3], [20, 1, 3]] + output_shape = [4, 11, 3] + x_vals = [np.random.random_sample(x_shape).astype( + np.float64) for x_shape in x_shapes] + with self.test_session(): + xs = [tf.constant(x_val) for x_val in x_vals] + x_concat = tf.concat(1, xs) + output = tf.gather(x_concat, [1, 2, 0, 5]) + err = tf.test.compute_gradient_error(xs, x_shapes, output, output_shape) + self.assertLess(err, 1e-11) + + def testIndexedSlicesConcatDim2Grad(self): + x_shapes = [[20, 7, 3], [20, 7, 1], [20, 7, 2]] + output_shape = [4, 7, 6] + x_vals = [np.random.random_sample(x_shape).astype( + np.float64) for x_shape in x_shapes] + with self.test_session(): + xs = [tf.constant(x_val) for x_val in x_vals] + x_concat = tf.concat(2, xs) + output = tf.gather(x_concat, [1, 2, 0, 5]) + err = tf.test.compute_gradient_error(xs, x_shapes, output, output_shape) + self.assertLess(err, 1e-11) if __name__ == "__main__": tf.test.main() diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py index c9634562549..b70ec134aba 100644 --- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py +++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py @@ -1091,9 +1091,10 @@ class ControlFlowTest(tf.test.TestCase): # Use a control dependency to ensure init_variable is run # while asking for c - real_v = control_flow_ops.with_dependencies(name="real_tensor", - output_tensor=v, - dependencies=[v.initializer]) + real_v = control_flow_ops.with_dependencies( + name="real_tensor", + output_tensor=v.ref(), + dependencies=[v.initializer]) c_val, real_v_val = sess.run([c, real_v]) # Ensure the result of 'real_c' is the same as 'c' @@ -1259,12 +1260,12 @@ class TupleTest(tf.test.TestCase): with self.test_session(): v1 = tf.Variable([1.0]) add1 = tf.add( - control_flow_ops.with_dependencies([v1.initializer], v1), + control_flow_ops.with_dependencies([v1.initializer], v1.ref()), 2.0) v2 = tf.Variable([10.0]) - add2 = tf.add(control_flow_ops.with_dependencies([v2.initializer], - v2), - 20.0) + add2 = tf.add( + control_flow_ops.with_dependencies([v2.initializer], v2.ref()), + 20.0) t1, _, t2 = control_flow_ops.tuple([add1, None, add2]) # v1 is not initialized. 
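The switch to `v.ref()` in the tests above passes the variable's ref-typed tensor into `with_dependencies`, so the returned value stays ref-typed and the read is gated on the initializer. A rough usage sketch, assuming the same era APIs these tests rely on (`tf.Session`, `control_flow_ops.with_dependencies`, `Variable.ref()`):

```python
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops

with tf.Session() as sess:
    v = tf.Variable([10.0])
    # Gate the read on the initializer and pass the ref tensor so the
    # wrapper returns a ref-typed result instead of a snapshot.
    read_v = control_flow_ops.with_dependencies([v.initializer], v.ref())
    add = tf.add(read_v, 2.0)
    # No explicit initialize_all_variables() call: the initializer runs
    # as a dependency of the read.
    print(sess.run(add))  # [12.0]
```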
@@ -1291,14 +1292,14 @@ class TupleTest(tf.test.TestCase): np.array([[0.0, 1.0], [10.0, 11.0], [20.0, 21.0]]).astype( np.float32)) v1_at_1 = tf.IndexedSlices( - control_flow_ops.with_dependencies([v1.initializer], v1), + control_flow_ops.with_dependencies([v1.initializer], v1.ref()), tf.constant([1])) v2 = tf.Variable( np.array([[0.1, 1.1], [10.1, 11.1], [20.1, 21.1]]).astype( np.float32)) v2_at_1 = tf.IndexedSlices( - control_flow_ops.with_dependencies([v2.initializer], v2), + control_flow_ops.with_dependencies([v2.initializer], v2.ref()), tf.constant([1])) st1, st2 = control_flow_ops.tuple([v1_at_1, v2_at_1]) diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py index 1ce09c48a6a..5efe4855670 100644 --- a/tensorflow/python/kernel_tests/conv_ops_test.py +++ b/tensorflow/python/kernel_tests/conv_ops_test.py @@ -23,8 +23,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker as gc - def GetInceptionShapes(): """Iterator for the convolution shapes used in the Inception 2015 model. @@ -429,11 +427,11 @@ class Conv2DTest(tf.test.TestCase): name="conv") self.assertEqual(output_shape, conv.get_shape()) if test_input: - err = gc.ComputeGradientError(input_tensor, input_shape, - conv, output_shape) + err = tf.test.compute_gradient_error(input_tensor, input_shape, conv, + output_shape) else: - err = gc.ComputeGradientError(filter_tensor, filter_shape, - conv, output_shape) + err = tf.test.compute_gradient_error(filter_tensor, filter_shape, conv, + output_shape) print("conv_2d gradient error = ", err) self.assertLess(err, tolerance) diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py index 4fb2fcafbf4..a823250d512 100644 --- a/tensorflow/python/kernel_tests/cwise_ops_test.py +++ b/tensorflow/python/kernel_tests/cwise_ops_test.py @@ -24,7 +24,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker as gc _ADD = lambda x, y: x + y _SUB = lambda x, y: x - y @@ -58,11 +57,19 @@ class UnaryOpTest(tf.test.TestCase): self.assertAllClose(np_ans, tf_cpu) if x.dtype == np.float32: s = list(np.shape(x)) - jacob_t, jacob_n = gc.ComputeGradient(inx, s, y, s, x_init_value=x) + jacob_t, jacob_n = tf.test.compute_gradient(inx, + s, + y, + s, + x_init_value=x) self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3) elif x.dtype == np.float64: s = list(np.shape(x)) - jacob_t, jacob_n = gc.ComputeGradient(inx, s, y, s, x_init_value=x) + jacob_t, jacob_n = tf.test.compute_gradient(inx, + s, + y, + s, + x_init_value=x) self.assertAllClose(jacob_t, jacob_n, rtol=1e-5, atol=1e-5) def _compareGpu(self, x, np_func, tf_func): @@ -216,7 +223,11 @@ class BinaryOpTest(tf.test.TestCase): iny = tf.convert_to_tensor(y) out = tf_func(inx, iny) xs = list(x.shape) - jacob_t, jacob_n = gc.ComputeGradient(inx, xs, out, zs, x_init_value=x) + jacob_t, jacob_n = tf.test.compute_gradient(inx, + xs, + out, + zs, + x_init_value=x) if x.dtype == np.float32: self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3) elif x.dtype == np.float64: @@ -230,7 +241,11 @@ class BinaryOpTest(tf.test.TestCase): iny = tf.convert_to_tensor(y) out = tf_func(inx, iny) ys = list(np.shape(y)) - jacob_t, jacob_n = gc.ComputeGradient(iny, ys, out, zs, x_init_value=y) + jacob_t, jacob_n = tf.test.compute_gradient(iny, + ys, + out, + zs, + x_init_value=y) if x.dtype == 
np.float32: self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3) elif x.dtype == np.float64: @@ -833,7 +848,11 @@ class SelectOpTest(tf.test.TestCase): iny = tf.convert_to_tensor(y) out = tf.select(c, inx, iny) s = list(np.shape(c)) - jacob_t, jacob_n = gc.ComputeGradient(inx, s, out, s, x_init_value=x) + jacob_t, jacob_n = tf.test.compute_gradient(inx, + s, + out, + s, + x_init_value=x) if x.dtype == np.float32: self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3) elif x.dtype == np.float64: @@ -845,7 +864,11 @@ class SelectOpTest(tf.test.TestCase): iny = tf.convert_to_tensor(y) out = tf.select(c, inx, iny) s = list(np.shape(c)) - jacob_t, jacob_n = gc.ComputeGradient(iny, s, out, s, x_init_value=y) + jacob_t, jacob_n = tf.test.compute_gradient(iny, + s, + out, + s, + x_init_value=y) if x.dtype == np.float32: self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3) elif x.dtype == np.float64: @@ -923,7 +946,11 @@ class MinMaxOpTest(tf.test.TestCase): iny = tf.convert_to_tensor(y) out = func(inx, iny) s = list(np.shape(x)) - jacob_t, jacob_n = gc.ComputeGradient(inx, s, out, s, x_init_value=x) + jacob_t, jacob_n = tf.test.compute_gradient(inx, + s, + out, + s, + x_init_value=x) if x.dtype == np.float32: self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3) elif x.dtype == np.float64: @@ -935,7 +962,11 @@ class MinMaxOpTest(tf.test.TestCase): iny = tf.convert_to_tensor(y) out = func(inx, iny) s = list(np.shape(x)) - jacob_t, jacob_n = gc.ComputeGradient(iny, s, out, s, x_init_value=y) + jacob_t, jacob_n = tf.test.compute_gradient(iny, + s, + out, + s, + x_init_value=y) if x.dtype == np.float32: self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3) elif x.dtype == np.float64: @@ -1159,8 +1190,12 @@ class ComplexMakeRealImagTest(tf.test.TestCase): tf.square(tf.real(cplx))) + tf.reduce_sum( tf.square(tf.imag(cplx))) epsilon = 1e-3 - jacob_t, jacob_n = gc.ComputeGradient(inx, list(x.shape), loss, [1], - x_init_value=x, delta=epsilon) + jacob_t, jacob_n = tf.test.compute_gradient(inx, + list(x.shape), + loss, + [1], + x_init_value=x, + delta=epsilon) self.assertAllClose(jacob_t, jacob_n, rtol=epsilon, atol=epsilon) def testGradient(self): @@ -1187,8 +1222,12 @@ class ComplexMakeRealImagTest(tf.test.TestCase): # Defines the loss function as the sum of all coefficients of z. loss = tf.reduce_sum(tf.real(z) + tf.imag(z)) epsilon = 0.005 - jacob_t, jacob_n = gc.ComputeGradient(inp, list(data.shape), loss, [1], - x_init_value=data, delta=epsilon) + jacob_t, jacob_n = tf.test.compute_gradient(inp, + list(data.shape), + loss, + [1], + x_init_value=data, + delta=epsilon) self.assertAllClose(jacob_t, jacob_n, rtol=epsilon, atol=epsilon) def testMulGradient(self): diff --git a/tensorflow/python/kernel_tests/embedding_ops_test.py b/tensorflow/python/kernel_tests/embedding_ops_test.py index b17cdc0ed54..5f54f02bf06 100644 --- a/tensorflow/python/kernel_tests/embedding_ops_test.py +++ b/tensorflow/python/kernel_tests/embedding_ops_test.py @@ -26,8 +26,6 @@ import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker as gc - def _AsLong(array): """Casts arrays elements to long type. 
Used to convert from numpy tf.""" @@ -225,8 +223,11 @@ class EmbeddingLookupTest(tf.test.TestCase): x_name = [_PName(i) for i in range(num_shards)] x_init_value = [params[x_n + ":0"] for x_n in x_name] x_shape = [i.shape for i in x_init_value] - err = gc.ComputeGradientError(x, x_shape, y, y_shape, - x_init_value=x_init_value) + err = tf.test.compute_gradient_error(x, + x_shape, + y, + y_shape, + x_init_value=x_init_value) self.assertLess(err, 1e-4) def testGradientsEmbeddingLookupWithComputedParams(self): @@ -246,8 +247,11 @@ class EmbeddingLookupTest(tf.test.TestCase): x_name = [_PName(i) for i in range(num_shards)] x_init_value = [params[x_n + ":0"] for x_n in x_name] x_shape = [i.shape for i in x_init_value] - err = gc.ComputeGradientError(x, x_shape, y, y_shape, - x_init_value=x_init_value) + err = tf.test.compute_gradient_error(x, + x_shape, + y, + y_shape, + x_init_value=x_init_value) self.assertLess(err, 1e-3) def testConstructionNonSharded(self): @@ -381,8 +385,11 @@ class EmbeddingLookupSparseTest(tf.test.TestCase): x_init_value = [params[x_n + ":0"] for x_n in x_name] x_shape = [i.shape for i in x_init_value] y_shape = [batch_size] + list(params[_PName(0) + ":0"].shape[1:]) - err = gc.ComputeGradientError(x, x_shape, y, y_shape, - x_init_value=x_init_value) + err = tf.test.compute_gradient_error(x, + x_shape, + y, + y_shape, + x_init_value=x_init_value) self.assertLess(err, 1e-5 if dtype == tf.float64 else 2e-3) diff --git a/tensorflow/python/kernel_tests/gradient_checker.py b/tensorflow/python/kernel_tests/gradient_checker.py index 69cc811a6ba..d0cdc3b3bcb 100644 --- a/tensorflow/python/kernel_tests/gradient_checker.py +++ b/tensorflow/python/kernel_tests/gradient_checker.py @@ -34,7 +34,7 @@ from tensorflow.python.ops import gradients from tensorflow.python.platform import logging -def _Product(t): +def _product(t): if isinstance(t, int): return t else: @@ -44,11 +44,11 @@ def _Product(t): return y -def _ComputeTheoricalJacobian(x, x_shape, x_data, dy, dy_shape, dx): +def _compute_theoretical_jacobian(x, x_shape, x_data, dy, dy_shape, dx): """Computes the theoretical Jacobian for dy/dx. Computes the theoretical Jacobian using the ops generated by - ComputeGradient(). + compute_gradient(). Args: x: the tensor "x". @@ -64,9 +64,9 @@ def _ComputeTheoricalJacobian(x, x_shape, x_data, dy, dy_shape, dx): "dy_size" is the number of elements in dy. """ # To compute the jacobian, we treat x and y are one-dimensional vectors - x_size = _Product(x_shape) - x_val_size = _Product(x_shape[1:]) # This is used for sparse gradients - dy_size = _Product(dy_shape) + x_size = _product(x_shape) + x_val_size = _product(x_shape[1:]) # This is used for sparse gradients + dy_size = _product(dy_shape) jacobian = np.zeros((x_size, dy_size), dtype=x_data.dtype) # For each of the entry of dy, we set this to be 1 and @@ -92,7 +92,7 @@ def _ComputeTheoricalJacobian(x, x_shape, x_data, dy, dy_shape, dx): return jacobian -def _ComputeNumericJacobian(x, x_shape, x_data, y, y_shape, delta): +def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta): """Computes the numeric Jacobian for dy/dx. 
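The numeric Jacobian described here can be illustrated without TensorFlow at all; the sketch below uses a centered difference with a `delta` perturbation per input element, in the spirit of the checker above (`f`, `x`, and `delta` are made-up example inputs):

```python
import numpy as np

def numeric_jacobian(f, x, delta=1e-3):
    # Treat x and y as flat vectors and fill a (x_size, y_size) matrix by
    # perturbing one input element at a time with +/- delta.
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(f(x))
    jacobian = np.zeros((x.size, y.size), dtype=np.float64)
    for i in range(x.size):
        x_pos = x.reshape(-1).copy()
        x_neg = x.reshape(-1).copy()
        x_pos[i] += delta
        x_neg[i] -= delta
        y_pos = np.asarray(f(x_pos.reshape(x.shape))).reshape(-1)
        y_neg = np.asarray(f(x_neg.reshape(x.shape))).reshape(-1)
        jacobian[i, :] = (y_pos - y_neg) / (2.0 * delta)
    return jacobian

# Elementwise square: the exact Jacobian is diag(2 * x).
print(numeric_jacobian(lambda x: x ** 2, np.array([1.0, 2.0, 3.0])))
```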
Computes the numeric Jacobian by slightly perturbing the inputs and @@ -113,8 +113,8 @@ def _ComputeNumericJacobian(x, x_shape, x_data, y, y_shape, delta): """ # To compute the jacobian, we treat x and y are one-dimensional vectors - x_size = _Product(x_shape) - y_size = _Product(y_shape) + x_size = _product(x_shape) + y_size = _product(y_shape) jacobian = np.zeros((x_size, y_size), dtype=x_data.dtype) # For each of the entry of x, we slightly perturbs this by adding and @@ -134,7 +134,7 @@ def _ComputeNumericJacobian(x, x_shape, x_data, y, y_shape, delta): return jacobian -def _ComputeDxAndDy(x, y, y_shape): +def _compute_dx_and_dy(x, y, y_shape): """Returns a node to compute gradient of x wrt y.""" # We make up a dy so that we can compute the gradients. We don't really use # the value of dy -- we will always feed it. We need to add an identity node @@ -149,8 +149,14 @@ def _ComputeDxAndDy(x, y, y_shape): return grads[0], dy_orig -def _ComputeGradient(x, x_shape, dx, y, y_shape, dy, - x_init_value=None, delta=1e-3): +def _compute_gradient(x, + x_shape, + dx, + y, + y_shape, + dy, + x_init_value=None, + delta=1e-3): """Computes the theoretical and numerical jacobian.""" t = dtypes.as_dtype(x.dtype) allowed_types = [dtypes.float32, dtypes.float64] @@ -170,16 +176,21 @@ def _ComputeGradient(x, x_shape, dx, y, y_shape, dy, dtype = np.float64 x_data = np.asfarray(np.random.random_sample(x_shape), dtype=dtype) - jacob_t = _ComputeTheoricalJacobian(x, x_shape, x_data, dy, y_shape, dx) - jacob_n = _ComputeNumericJacobian(x, x_shape, x_data, y, y_shape, delta) + jacob_t = _compute_theoretical_jacobian(x, x_shape, x_data, dy, y_shape, dx) + jacob_n = _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta) return jacob_t, jacob_n -def _ComputeGradientList( - x, x_shape, y, y_shape, x_init_value=None, delta=1e-3, init_targets=None): +def _compute_gradient_list(x, + x_shape, + y, + y_shape, + x_init_value=None, + delta=1e-3, + init_targets=None): """Compute gradients for a list of x values.""" assert isinstance(x, list) - dx, dy = zip(*[_ComputeDxAndDy(xi, y, y_shape) for xi in x]) + dx, dy = zip(*[_compute_dx_and_dy(xi, y, y_shape) for xi in x]) if init_targets is not None: assert isinstance(init_targets, (list, tuple)) @@ -187,15 +198,20 @@ def _ComputeGradientList( init.run() if x_init_value is None: x_init_value = [None] * len(x) - ret = [_ComputeGradient(xi, x_shapei, dxi, y, y_shape, dyi, - x_init_valuei, delta) - for xi, x_shapei, dxi, dyi, x_init_valuei in - zip(x, x_shape, dx, dy, x_init_value)] + ret = [_compute_gradient(xi, x_shapei, dxi, y, y_shape, dyi, x_init_valuei, + delta) + for xi, x_shapei, dxi, dyi, x_init_valuei in zip(x, x_shape, dx, dy, + x_init_value)] return ret -def ComputeGradient( - x, x_shape, y, y_shape, x_init_value=None, delta=1e-3, init_targets=None): +def compute_gradient(x, + x_shape, + y, + y_shape, + x_init_value=None, + delta=1e-3, + init_targets=None): """Computes and returns the theoretical and numerical Jacobian. Args: @@ -219,20 +235,25 @@ def ComputeGradient( number of elements in y. If x is a list, returns a list of two numpy arrays. 
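For reference, a usage sketch of the renamed public entry points as the tests in this change call them; it assumes the `tf.test.compute_gradient` / `tf.test.compute_gradient_error` wrappers referenced above and passes no `x_init_value`, so random input data is used:

```python
import tensorflow as tf

with tf.Session():
    x = tf.constant([[1.0, 2.0], [3.0, 4.0]], dtype=tf.float64, name="x")
    y = tf.matmul(x, x, name="y")
    # Theoretical and numeric Jacobians, each of shape (x_size, y_size).
    jacob_t, jacob_n = tf.test.compute_gradient(x, [2, 2], y, [2, 2])
    print(jacob_t.shape)  # (4, 4)
    # Maximum absolute difference between the two Jacobians.
    err = tf.test.compute_gradient_error(x, [2, 2], y, [2, 2])
    print(err)  # small for float64, well below typical test tolerances
```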
""" if isinstance(x, list): - return _ComputeGradientList(x, x_shape, y, y_shape, x_init_value, - delta, init_targets) + return _compute_gradient_list(x, x_shape, y, y_shape, x_init_value, delta, + init_targets) else: if init_targets is not None: assert isinstance(init_targets, (list, tuple)) for init in init_targets: init.run() - dx, dy = _ComputeDxAndDy(x, y, y_shape) - ret = _ComputeGradient(x, x_shape, dx, y, y_shape, dy, x_init_value, delta) + dx, dy = _compute_dx_and_dy(x, y, y_shape) + ret = _compute_gradient(x, x_shape, dx, y, y_shape, dy, x_init_value, delta) return ret -def ComputeGradientError( - x, x_shape, y, y_shape, x_init_value=None, delta=1e-3, init_targets=None): +def compute_gradient_error(x, + x_shape, + y, + y_shape, + x_init_value=None, + delta=1e-3, + init_targets=None): """Computes the gradient error. Computes the maximum error for dy/dx between the computed Jacobian and the @@ -263,8 +284,8 @@ def ComputeGradientError( Returns: The maximum error in between the two Jacobians. """ - grad = ComputeGradient(x, x_shape, y, y_shape, x_init_value, - delta, init_targets) + grad = compute_gradient(x, x_shape, y, y_shape, x_init_value, delta, + init_targets) if isinstance(grad, tuple): grad = [grad] return max(np.fabs(j_t - j_n).max() for j_t, j_n in grad) diff --git a/tensorflow/python/kernel_tests/gradient_checker_test.py b/tensorflow/python/kernel_tests/gradient_checker_test.py index 6a835ff651c..2ded0375a87 100644 --- a/tensorflow/python/kernel_tests/gradient_checker_test.py +++ b/tensorflow/python/kernel_tests/gradient_checker_test.py @@ -23,8 +23,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests.gradient_checker import ComputeGradientError - class GradientCheckerTest(tf.test.TestCase): @@ -37,7 +35,7 @@ class GradientCheckerTest(tf.test.TestCase): y = tf.add(x1, x2, name="y") # checking gradients for x1 - error = ComputeGradientError(x1, size, y, size) + error = tf.test.compute_gradient_error(x1, size, y, size) tf.logging.info("x1 error = %f", error) assert error < 1e-4 @@ -50,7 +48,7 @@ class GradientCheckerTest(tf.test.TestCase): y = tf.add(x1, x2, name="y") # checking gradients for x1 - error = ComputeGradientError(x1, size, y, size) + error = tf.test.compute_gradient_error(x1, size, y, size) tf.logging.info("x1 error = %f", error) assert error < 1e-4 @@ -66,8 +64,12 @@ class GradientCheckerTest(tf.test.TestCase): # checkint gradients for x2 using a special init_value and delta x_init_value = np.asarray(np.arange(6, dtype=np.float64).reshape(2, 3)) - error = ComputeGradientError(x2, size, y, size, x_init_value=x_init_value, - delta=1e-2) + error = tf.test.compute_gradient_error(x2, + size, + y, + size, + x_init_value=x_init_value, + delta=1e-2) tf.logging.info("x2 error = %f", error) assert error < 1e-10 @@ -82,7 +84,7 @@ class GradientCheckerTest(tf.test.TestCase): indices = tf.constant(index_values, name="i") y = tf.gather(params, indices, name="y") - error = ComputeGradientError(params, p_shape, y, y_shape) + error = tf.test.compute_gradient_error(params, p_shape, y, y_shape) tf.logging.info("gather error = %f", error) assert error < 1e-4 @@ -101,7 +103,7 @@ class GradientCheckerTest(tf.test.TestCase): indices2 = tf.constant(index_values2, name="i2") y2 = tf.gather(y, indices2, name="y2") - error = ComputeGradientError(params, p_shape, y2, y2_shape) + error = tf.test.compute_gradient_error(params, p_shape, y2, y2_shape) tf.logging.info("nested gather error = %f", error) assert error < 1e-4 @@ 
-166,9 +168,11 @@ def BuildAndTestMiniMNIST(param_index, tag): cost = tf.nn.softmax_cross_entropy_with_logits(logits, labels, name="cost") # Test the gradients. - err = ComputeGradientError(all_params[param_index], - param_sizes[param_index], - cost, [batch], delta=1e-5) + err = tf.test.compute_gradient_error(all_params[param_index], + param_sizes[param_index], + cost, + [batch], + delta=1e-5) tf.logging.info("Mini MNIST: %s gradient error = %g", tag, err) return err diff --git a/tensorflow/python/kernel_tests/linalg_grad_test.py b/tensorflow/python/kernel_tests/linalg_grad_test.py index 40da6c3ad51..8c9c47ac622 100644 --- a/tensorflow/python/kernel_tests/linalg_grad_test.py +++ b/tensorflow/python/kernel_tests/linalg_grad_test.py @@ -23,8 +23,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker as gc - class MatrixInverseGradientTest(tf.test.TestCase): pass # Filled in below @@ -49,11 +47,11 @@ def _GetMatrixInverseGradientTest(dtype_, shape_): else: ainv = tf.batch_matrix_inverse(a) - theoretical, numerical = gc.ComputeGradient(a, - shape_, - ainv, - shape_, - delta=delta) + theoretical, numerical = tf.test.compute_gradient(a, + shape_, + ainv, + shape_, + delta=delta) self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol) return Test @@ -87,8 +85,11 @@ def _GetMatrixDeterminantGradientTest(dtype_, shape_): c = tf.batch_matrix_determinant(a) out_shape = shape_[:-2] # last two dimensions hold matrices - theoretical, numerical = gc.ComputeGradient(a, shape_, c, out_shape, - delta=delta) + theoretical, numerical = tf.test.compute_gradient(a, + shape_, + c, + out_shape, + delta=delta) self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol) diff --git a/tensorflow/models/rnn/linear_test.py b/tensorflow/python/kernel_tests/linear_test.py similarity index 89% rename from tensorflow/models/rnn/linear_test.py rename to tensorflow/python/kernel_tests/linear_test.py index 22c38434133..fdb45411147 100644 --- a/tensorflow/models/rnn/linear_test.py +++ b/tensorflow/python/kernel_tests/linear_test.py @@ -23,8 +23,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.models.rnn import linear - class LinearTest(tf.test.TestCase): @@ -32,21 +30,21 @@ class LinearTest(tf.test.TestCase): with self.test_session() as sess: with tf.variable_scope("root", initializer=tf.constant_initializer(1.0)): x = tf.zeros([1, 2]) - l = linear.linear([x], 2, False) + l = tf.nn.rnn_cell.linear([x], 2, False) sess.run([tf.variables.initialize_all_variables()]) res = sess.run([l], {x.name: np.array([[1., 2.]])}) self.assertAllClose(res[0], [[3.0, 3.0]]) # Checks prevent you from accidentally creating a shared function. with self.assertRaises(ValueError) as exc: - l1 = linear.linear([x], 2, False) + l1 = tf.nn.rnn_cell.linear([x], 2, False) self.assertEqual(str(exc.exception)[:12], "Over-sharing") # But you can create a new one in a new scope and share the variables. 
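The sharing behaviour exercised above rests on the variable-scope machinery: creating the same variable twice in one scope is an error, while re-entering the scope with `reuse=True` returns the existing variable. A rough sketch, assuming `tf.get_variable` is exported alongside `tf.variable_scope` in this era of the API:

```python
import tensorflow as tf

with tf.variable_scope("layer"):
    w = tf.get_variable("w", [2, 2])
    # A second tf.get_variable("w", ...) here would raise a ValueError.

with tf.variable_scope("layer", reuse=True):
    w_again = tf.get_variable("w", [2, 2])  # picks up the existing variable

print(w.name)        # layer/w:0
print(w_again.name)  # layer/w:0
```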
with tf.variable_scope("l1") as new_scope: - l1 = linear.linear([x], 2, False) + l1 = tf.nn.rnn_cell.linear([x], 2, False) with tf.variable_scope(new_scope, reuse=True): - linear.linear([l1], 2, False) + tf.nn.rnn_cell.linear([l1], 2, False) self.assertEqual(len(tf.trainable_variables()), 2) diff --git a/tensorflow/python/kernel_tests/lrn_op_test.py b/tensorflow/python/kernel_tests/lrn_op_test.py index 4dd7372f4a6..2d7a082b863 100644 --- a/tensorflow/python/kernel_tests/lrn_op_test.py +++ b/tensorflow/python/kernel_tests/lrn_op_test.py @@ -25,9 +25,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests.gradient_checker import ComputeGradientError - - class LRNOpTest(tf.test.TestCase): @@ -107,7 +104,7 @@ class LRNOpTest(tf.test.TestCase): lrn_op = tf.nn.local_response_normalization( inp, name="lrn", depth_radius=lrn_depth_radius, bias=bias, alpha=alpha, beta=beta) - err = ComputeGradientError(inp, shape, lrn_op, shape) + err = tf.test.compute_gradient_error(inp, shape, lrn_op, shape) print("LRN Gradient error ", err) self.assertLess(err, 1e-4) diff --git a/tensorflow/python/kernel_tests/matmul_op_test.py b/tensorflow/python/kernel_tests/matmul_op_test.py index 791951eb982..986aa9797ed 100644 --- a/tensorflow/python/kernel_tests/matmul_op_test.py +++ b/tensorflow/python/kernel_tests/matmul_op_test.py @@ -23,8 +23,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker as gc - class MatMulTest(tf.test.TestCase): @@ -161,7 +159,7 @@ class MatMulGradientTest(tf.test.TestCase): y = tf.constant([1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7], shape=[2, 4], dtype=tf.float64, name="y") m = tf.matmul(x, y, name="matmul") - err = gc.ComputeGradientError(x, [3, 2], m, [3, 4]) + err = tf.test.compute_gradient_error(x, [3, 2], m, [3, 4]) print("matmul input0 gradient err = ", err) self.assertLess(err, 1e-10) @@ -172,7 +170,7 @@ class MatMulGradientTest(tf.test.TestCase): y = tf.constant([1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7], shape=[2, 4], dtype=tf.float64, name="y") m = tf.matmul(x, y, name="matmul") - err = gc.ComputeGradientError(y, [2, 4], m, [3, 4]) + err = tf.test.compute_gradient_error(y, [2, 4], m, [3, 4]) print("matmul input1 gradient err = ", err) self.assertLess(err, 1e-10) @@ -189,7 +187,7 @@ class MatMulGradientTest(tf.test.TestCase): y = tf.constant([1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7], shape=shape_y, dtype=tf.float64, name="y") m = tf.matmul(x, y, transpose_a, transpose_b, name="matmul") - err = gc.ComputeGradientError(x, shape_x, m, [3, 4]) + err = tf.test.compute_gradient_error(x, shape_x, m, [3, 4]) print("matmul input0 gradient err = ", err) self.assertLess(err, 1e-10) @@ -211,7 +209,7 @@ class MatMulGradientTest(tf.test.TestCase): y = tf.constant([1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7], shape=shape_y, dtype=tf.float64, name="y") m = tf.matmul(x, y, transpose_a, transpose_b, name="matmul") - err = gc.ComputeGradientError(y, shape_y, m, [3, 4]) + err = tf.test.compute_gradient_error(y, shape_y, m, [3, 4]) print("matmul input1 gradient err = ", err) self.assertLess(err, 1e-10) diff --git a/tensorflow/python/kernel_tests/pack_op_test.py b/tensorflow/python/kernel_tests/pack_op_test.py index f9bdadb82b7..03f580169f2 100644 --- a/tensorflow/python/kernel_tests/pack_op_test.py +++ b/tensorflow/python/kernel_tests/pack_op_test.py @@ -23,8 +23,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from 
tensorflow.python.kernel_tests import gradient_checker - class PackOpTest(tf.test.TestCase): @@ -51,7 +49,7 @@ class PackOpTest(tf.test.TestCase): # TODO(irving): Remove list() once we handle maps correctly xs = list(map(tf.constant, data)) c = tf.pack(xs) - err = gradient_checker.ComputeGradientError(xs, shapes, c, shape) + err = tf.test.compute_gradient_error(xs, shapes, c, shape) self.assertLess(err, 1e-6) def testZeroSize(self): diff --git a/tensorflow/python/kernel_tests/pad_op_test.py b/tensorflow/python/kernel_tests/pad_op_test.py index 46f1b4a3a1e..754642b204d 100644 --- a/tensorflow/python/kernel_tests/pad_op_test.py +++ b/tensorflow/python/kernel_tests/pad_op_test.py @@ -24,8 +24,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker as gc - class PadOpTest(tf.test.TestCase): @@ -58,7 +56,11 @@ class PadOpTest(tf.test.TestCase): y = tf.pad(inx, ina) # Expected y's shape to be: ys = list(np.array(x.shape) + np.sum(np.array(a), axis=1)) - jacob_t, jacob_n = gc.ComputeGradient(inx, xs, y, ys, x_init_value=x) + jacob_t, jacob_n = tf.test.compute_gradient(inx, + xs, + y, + ys, + x_init_value=x) self.assertAllClose(jacob_t, jacob_n, rtol=1e-5, atol=1e-5) def _testAll(self, np_inputs, paddings): diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py index ab36ec5fde5..427d83b2106 100644 --- a/tensorflow/python/kernel_tests/pooling_ops_test.py +++ b/tensorflow/python/kernel_tests/pooling_ops_test.py @@ -23,7 +23,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker as gc from tensorflow.python.ops import gen_nn_ops @@ -436,9 +435,12 @@ class PoolingTest(tf.test.TestCase): t = pool_func(input_tensor, ksize=[1, window_rows, window_rows, 1], strides=[1, row_stride, col_stride, 1], padding=padding, name=func_name) - err = gc.ComputeGradientError( - input_tensor, input_sizes, t, output_sizes, - x_init_value=x_init_value, delta=1e-2) + err = tf.test.compute_gradient_error(input_tensor, + input_sizes, + t, + output_sizes, + x_init_value=x_init_value, + delta=1e-2) print("%s gradient error = " % func_name, err) self.assertLess(err, err_margin) diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py index afb437ea3c2..3867034dc16 100644 --- a/tensorflow/python/kernel_tests/reduction_ops_test.py +++ b/tensorflow/python/kernel_tests/reduction_ops_test.py @@ -24,7 +24,6 @@ import numpy as np import tensorflow as tf from tensorflow.python.framework import tensor_shape -from tensorflow.python.kernel_tests import gradient_checker class SumReductionTest(tf.test.TestCase): @@ -150,13 +149,12 @@ class SumReductionTest(tf.test.TestCase): with self.test_session(): t = tf.convert_to_tensor(x) su = tf.reduce_sum(t, reduction_axes) - jacob_t, jacob_n = gradient_checker.ComputeGradient( - t, - shape, - su, - sum_shape, - x_init_value=x, - delta=1) + jacob_t, jacob_n = tf.test.compute_gradient(t, + shape, + su, + sum_shape, + x_init_value=x, + delta=1) self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8) def testGradient(self): @@ -211,18 +209,30 @@ class MeanReductionTest(tf.test.TestCase): with self.test_session(): t = tf.convert_to_tensor(x) su = tf.reduce_mean(t, [1, 2]) - jacob_t, jacob_n = gradient_checker.ComputeGradient( - t, s, su, [2, 2], x_init_value=x, delta=1) + jacob_t, jacob_n = 
tf.test.compute_gradient(t, + s, + su, + [2, 2], + x_init_value=x, + delta=1) self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3) su = tf.reduce_mean(t, [0, 1, 2, 3]) - jacob_t, jacob_n = gradient_checker.ComputeGradient( - t, s, su, [1], x_init_value=x, delta=1) + jacob_t, jacob_n = tf.test.compute_gradient(t, + s, + su, + [1], + x_init_value=x, + delta=1) self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3) su = tf.reduce_mean(t, []) - jacob_t, jacob_n = gradient_checker.ComputeGradient( - t, s, su, [2, 3, 4, 2], x_init_value=x, delta=1) + jacob_t, jacob_n = tf.test.compute_gradient(t, + s, + su, + [2, 3, 4, 2], + x_init_value=x, + delta=1) self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3) @@ -269,18 +279,30 @@ class ProdReductionTest(tf.test.TestCase): t = tf.convert_to_tensor(x) su = tf.reduce_prod(t, []) - jacob_t, jacob_n = gradient_checker.ComputeGradient( - t, s, su, [2, 3, 4, 2], x_init_value=x, delta=1) + jacob_t, jacob_n = tf.test.compute_gradient(t, + s, + su, + [2, 3, 4, 2], + x_init_value=x, + delta=1) self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3) su = tf.reduce_prod(t, [1, 2]) - jacob_t, jacob_n = gradient_checker.ComputeGradient( - t, s, su, [2, 2], x_init_value=x, delta=1) + jacob_t, jacob_n = tf.test.compute_gradient(t, + s, + su, + [2, 2], + x_init_value=x, + delta=1) self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3) su = tf.reduce_prod(t, [0, 1, 2, 3]) - jacob_t, jacob_n = gradient_checker.ComputeGradient( - t, s, su, [1], x_init_value=x, delta=1) + jacob_t, jacob_n = tf.test.compute_gradient(t, + s, + su, + [1], + x_init_value=x, + delta=1) self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3) # NOTE(kearnes): the current gradient calculation gives NaNs for 0 inputs @@ -288,8 +310,12 @@ class ProdReductionTest(tf.test.TestCase): with self.test_session(): t = tf.convert_to_tensor(x) su = tf.reduce_prod(t, []) - jacob_t, _ = gradient_checker.ComputeGradient( - t, s, su, [2, 3, 4, 2], x_init_value=x, delta=1) + jacob_t, _ = tf.test.compute_gradient(t, + s, + su, + [2, 3, 4, 2], + x_init_value=x, + delta=1) with self.assertRaisesOpError("Tensor had NaN values"): tf.check_numerics(jacob_t, message="_ProdGrad NaN test").op.run() @@ -336,8 +362,12 @@ class MinReductionTest(tf.test.TestCase): with self.test_session(): t = tf.convert_to_tensor(x) su = tf.reduce_min(t, [1, 2]) - jacob_t, jacob_n = gradient_checker.ComputeGradient( - t, s, su, [2, 2], x_init_value=x, delta=1) + jacob_t, jacob_n = tf.test.compute_gradient(t, + s, + su, + [2, 2], + x_init_value=x, + delta=1) self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8) def testGradient2(self): @@ -346,8 +376,12 @@ class MinReductionTest(tf.test.TestCase): with self.test_session(): t = tf.convert_to_tensor(x) su = tf.reduce_min(t, [1]) - jacob_t, jacob_n = gradient_checker.ComputeGradient( - t, s, su, [2, 4, 2], x_init_value=x, delta=1) + jacob_t, jacob_n = tf.test.compute_gradient(t, + s, + su, + [2, 4, 2], + x_init_value=x, + delta=1) self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8) def testGradient3(self): @@ -356,8 +390,12 @@ class MinReductionTest(tf.test.TestCase): with self.test_session(): t = tf.convert_to_tensor(x) su = tf.reduce_min(t, [2]) - jacob_t, jacob_n = gradient_checker.ComputeGradient( - t, s, su, [2, 3, 2], x_init_value=x, delta=1) + jacob_t, jacob_n = tf.test.compute_gradient(t, + s, + su, + [2, 3, 2], + x_init_value=x, + delta=1) self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8) def testGradient4(self): @@ -366,8 +404,12 
@@ class MinReductionTest(tf.test.TestCase): with self.test_session(): t = tf.convert_to_tensor(x) su = tf.reduce_min(t) - jacob_t, jacob_n = gradient_checker.ComputeGradient( - t, s, su, [1], x_init_value=x, delta=1) + jacob_t, jacob_n = tf.test.compute_gradient(t, + s, + su, + [1], + x_init_value=x, + delta=1) self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8) @@ -414,8 +456,12 @@ class MaxReductionTest(tf.test.TestCase): with self.test_session(): t = tf.convert_to_tensor(x) su = tf.reduce_max(t, [1, 2]) - jacob_t, jacob_n = gradient_checker.ComputeGradient( - t, s, su, [2, 2], x_init_value=x, delta=1) + jacob_t, jacob_n = tf.test.compute_gradient(t, + s, + su, + [2, 2], + x_init_value=x, + delta=1) self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8) def testGradient2(self): @@ -424,8 +470,12 @@ class MaxReductionTest(tf.test.TestCase): with self.test_session(): t = tf.convert_to_tensor(x) su = tf.reduce_max(t, [1]) - jacob_t, jacob_n = gradient_checker.ComputeGradient( - t, s, su, [2, 4, 2], x_init_value=x, delta=1) + jacob_t, jacob_n = tf.test.compute_gradient(t, + s, + su, + [2, 4, 2], + x_init_value=x, + delta=1) self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8) def testGradient3(self): @@ -434,8 +484,12 @@ class MaxReductionTest(tf.test.TestCase): with self.test_session(): t = tf.convert_to_tensor(x) su = tf.reduce_max(t, [2]) - jacob_t, jacob_n = gradient_checker.ComputeGradient( - t, s, su, [2, 3, 2], x_init_value=x, delta=1) + jacob_t, jacob_n = tf.test.compute_gradient(t, + s, + su, + [2, 3, 2], + x_init_value=x, + delta=1) self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8) def testGradient4(self): @@ -444,8 +498,12 @@ class MaxReductionTest(tf.test.TestCase): with self.test_session(): t = tf.convert_to_tensor(x) su = tf.reduce_max(t) - jacob_t, jacob_n = gradient_checker.ComputeGradient( - t, s, su, [1], x_init_value=x, delta=1) + jacob_t, jacob_n = tf.test.compute_gradient(t, + s, + su, + [1], + x_init_value=x, + delta=1) self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8) diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py index 0dbece5897d..38ab52b0c16 100644 --- a/tensorflow/python/kernel_tests/relu_op_test.py +++ b/tensorflow/python/kernel_tests/relu_op_test.py @@ -23,8 +23,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker as gc - class ReluTest(tf.test.TestCase): @@ -67,7 +65,11 @@ class ReluTest(tf.test.TestCase): x_init = np.asarray( [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]], dtype=np.float32, order="F") - err = gc.ComputeGradientError(x, [2, 5], y, [2, 5], x_init_value=x_init) + err = tf.test.compute_gradient_error(x, + [2, 5], + y, + [2, 5], + x_init_value=x_init) print("relu (float) gradient err = ", err) self.assertLess(err, 1e-4) @@ -98,7 +100,11 @@ class ReluTest(tf.test.TestCase): x_init = np.asarray( [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]], dtype=np.float64, order="F") - err = gc.ComputeGradientError(x, [2, 5], y, [2, 5], x_init_value=x_init) + err = tf.test.compute_gradient_error(x, + [2, 5], + y, + [2, 5], + x_init_value=x_init) print("relu (double) gradient err = ", err) self.assertLess(err, 1e-10) @@ -112,8 +118,11 @@ class ReluTest(tf.test.TestCase): x_init = np.asarray( [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]], dtype=np.float32, order="F") - err = gc.ComputeGradientError(x, [2, 5], z[0], [2, 5], - 
x_init_value=x_init) + err = tf.test.compute_gradient_error(x, + [2, 5], + z[0], + [2, 5], + x_init_value=x_init) print("relu (float) gradient of gradient err = ", err) self.assertLess(err, 1e-4) @@ -127,8 +136,11 @@ class ReluTest(tf.test.TestCase): x_init = np.asarray( [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]], dtype=np.float64, order="F") - err = gc.ComputeGradientError(x, [2, 5], z[0], [2, 5], - x_init_value=x_init) + err = tf.test.compute_gradient_error(x, + [2, 5], + z[0], + [2, 5], + x_init_value=x_init) print("relu (double) gradient of gradient err = ", err) self.assertLess(err, 1e-10) @@ -178,7 +190,11 @@ class Relu6Test(tf.test.TestCase): x_init = np.asarray( [[-0.9, -0.7, -0.5, -0.3, -0.1], [6.1, 6.3, 6.5, 6.7, 6.9]], dtype=np.float32, order="F") - err = gc.ComputeGradientError(x, [2, 5], y, [2, 5], x_init_value=x_init) + err = tf.test.compute_gradient_error(x, + [2, 5], + y, + [2, 5], + x_init_value=x_init) print("relu6 (float) gradient err = ", err) self.assertLess(err, 1e-4) @@ -191,7 +207,11 @@ class Relu6Test(tf.test.TestCase): x_init = np.asarray( [[-0.9, -0.7, -0.5, -0.3, -0.1], [6.1, 6.3, 6.5, 6.7, 6.9]], dtype=np.float64, order="F") - err = gc.ComputeGradientError(x, [2, 5], y, [2, 5], x_init_value=x_init) + err = tf.test.compute_gradient_error(x, + [2, 5], + y, + [2, 5], + x_init_value=x_init) print("relu6 (double) gradient err = ", err) self.assertLess(err, 1e-10) diff --git a/tensorflow/python/kernel_tests/reshape_op_test.py b/tensorflow/python/kernel_tests/reshape_op_test.py index fd1130c082d..f3fc9086d6c 100644 --- a/tensorflow/python/kernel_tests/reshape_op_test.py +++ b/tensorflow/python/kernel_tests/reshape_op_test.py @@ -23,8 +23,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker as gc - class ReshapeTest(tf.test.TestCase): @@ -81,8 +79,11 @@ class ReshapeTest(tf.test.TestCase): with self.test_session(): input_tensor = tf.constant(x, shape=[2, 3, 4]) reshape_out = tf.reshape(input_tensor, [1, 8, 3]) - err = gc.ComputeGradientError(input_tensor, s, - reshape_out, s, x_init_value=x) + err = tf.test.compute_gradient_error(input_tensor, + s, + reshape_out, + s, + x_init_value=x) print("Reshape gradient error = " % err) self.assertLess(err, 1e-3) diff --git a/tensorflow/python/kernel_tests/reverse_sequence_op_test.py b/tensorflow/python/kernel_tests/reverse_sequence_op_test.py index ba90a35b1b6..f2bc9641091 100644 --- a/tensorflow/python/kernel_tests/reverse_sequence_op_test.py +++ b/tensorflow/python/kernel_tests/reverse_sequence_op_test.py @@ -23,15 +23,14 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker as gc - class ReverseSequenceTest(tf.test.TestCase): - def _testReverseSequence(self, x, seq_dim, seq_lengths, + def _testReverseSequence(self, x, batch_dim, seq_dim, seq_lengths, truth, use_gpu=False, expected_err_re=None): with self.test_session(use_gpu=use_gpu): ans = tf.reverse_sequence(x, + batch_dim=batch_dim, seq_dim=seq_dim, seq_lengths=seq_lengths) if expected_err_re is None: @@ -42,11 +41,11 @@ class ReverseSequenceTest(tf.test.TestCase): with self.assertRaisesOpError(expected_err_re): ans.eval() - def _testBothReverseSequence(self, x, seq_dim, seq_lengths, + def _testBothReverseSequence(self, x, batch_dim, seq_dim, seq_lengths, truth, expected_err_re=None): - self._testReverseSequence(x, seq_dim, seq_lengths, + self._testReverseSequence(x, batch_dim, 
seq_dim, seq_lengths, truth, True, expected_err_re) - self._testReverseSequence(x, seq_dim, seq_lengths, + self._testReverseSequence(x, batch_dim, seq_dim, seq_lengths, truth, False, expected_err_re) def _testBasic(self, dtype): @@ -55,18 +54,22 @@ class ReverseSequenceTest(tf.test.TestCase): [[9, 10, 11, 12], [13, 14, 15, 16]], [[17, 18, 19, 20], [21, 22, 23, 24]]], dtype=dtype) x = x.reshape(3, 2, 4, 1, 1) + x = x.transpose([2, 1, 0, 3, 4]) # permute axes 0 <=> 2 # reverse dim 2 up to (0:3, none, 0:4) along dim=0 - seq_dim = 2 seq_lengths = np.asarray([3, 0, 4], dtype=np.int64) - truth = np.asarray( + truth_orig = np.asarray( [[[3, 2, 1, 4], [7, 6, 5, 8]], # reverse 0:3 [[9, 10, 11, 12], [13, 14, 15, 16]], # reverse none [[20, 19, 18, 17], [24, 23, 22, 21]]], # reverse 0:4 (all) dtype=dtype) - truth = truth.reshape(3, 2, 4, 1, 1) - self._testBothReverseSequence(x, seq_dim, seq_lengths, truth) + truth_orig = truth_orig.reshape(3, 2, 4, 1, 1) + truth = truth_orig.transpose([2, 1, 0, 3, 4]) # permute axes 0 <=> 2 + + seq_dim = 0 # permute seq_dim and batch_dim (originally 2 and 0, resp.) + batch_dim = 2 + self._testBothReverseSequence(x, batch_dim, seq_dim, seq_lengths, truth) def testFloatBasic(self): self._testBasic(np.float32) @@ -89,22 +92,25 @@ class ReverseSequenceTest(tf.test.TestCase): [[9, 10, 11, 12], [13, 14, 15, 16]], [[17, 18, 19, 20], [21, 22, 23, 24]]], dtype=np.float) x = x.reshape(3, 2, 4, 1, 1) + x = x.transpose([2, 1, 0, 3, 4]) # transpose axes 0 <=> 2 - # reverse dim 2 up to (0:3, none, 0:4) along dim=0 - seq_dim = 2 + # reverse dim 0 up to (0:3, none, 0:4) along dim=2 + seq_dim = 0 + batch_dim = 2 seq_lengths = np.asarray([3, 0, 4], dtype=np.int64) with self.test_session(): input_t = tf.constant(x, shape=x.shape) seq_lengths_t = tf.constant(seq_lengths, shape=seq_lengths.shape) reverse_sequence_out = tf.reverse_sequence(input_t, + batch_dim=batch_dim, seq_dim=seq_dim, seq_lengths=seq_lengths_t) - err = gc.ComputeGradientError(input_t, - x.shape, - reverse_sequence_out, - x.shape, - x_init_value=x) + err = tf.test.compute_gradient_error(input_t, + x.shape, + reverse_sequence_out, + x.shape, + x_init_value=x) print("ReverseSequence gradient error = %g" % err) self.assertLess(err, 1e-8) @@ -123,6 +129,26 @@ class ReverseSequenceTest(tf.test.TestCase): seq_lengths=tf.placeholder(tf.int64, shape=(32,)), seq_dim=3) + # batch_dim out of bounds. 
+ with self.assertRaisesRegexp( + ValueError, "batch_dim must be < input.dims()"): + tf.reverse_sequence( + tf.placeholder(tf.float32, shape=(32, 2, 3)), + seq_lengths=tf.placeholder(tf.int64, shape=(32,)), + seq_dim=0, + batch_dim=3) + + with self.test_session(): + inputs = tf.placeholder(tf.float32, shape=(32, 2, 3)) + seq_lengths = tf.placeholder(tf.int64, shape=(32,)) + output = tf.reverse_sequence( + inputs, + seq_lengths=seq_lengths, + seq_dim=0) # batch_dim default is 0 + with self.assertRaisesOpError("batch_dim == seq_dim"): + output.eval(feed_dict={inputs: np.random.rand(32, 2, 3), + seq_lengths: xrange(32)}) + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow/models/rnn/rnn_cell_test.py b/tensorflow/python/kernel_tests/rnn_cell_test.py similarity index 97% rename from tensorflow/models/rnn/rnn_cell_test.py rename to tensorflow/python/kernel_tests/rnn_cell_test.py index 53d7caf2b7f..fefe4b078dc 100644 --- a/tensorflow/models/rnn/rnn_cell_test.py +++ b/tensorflow/python/kernel_tests/rnn_cell_test.py @@ -26,7 +26,7 @@ import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf -from tensorflow.models.rnn import rnn_cell +from tensorflow.python.ops import rnn_cell class RNNCellTest(tf.test.TestCase): @@ -96,9 +96,9 @@ class RNNCellTest(tf.test.TestCase): # Different inputs so different outputs and states for i in range(1, batch_size): self.assertTrue( - float(np.linalg.norm((res[0][0,:] - res[0][i,:]))) > 1e-6) + float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) > 1e-6) self.assertTrue( - float(np.linalg.norm((res[1][0,:] - res[1][i,:]))) > 1e-6) + float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) > 1e-6) def testOutputProjectionWrapper(self): with self.test_session() as sess: diff --git a/tensorflow/models/rnn/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py similarity index 91% rename from tensorflow/models/rnn/rnn_test.py rename to tensorflow/python/kernel_tests/rnn_test.py index 108a615f9af..1ed53c0a1f3 100644 --- a/tensorflow/models/rnn/rnn_test.py +++ b/tensorflow/python/kernel_tests/rnn_test.py @@ -25,11 +25,8 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.models.rnn import rnn -from tensorflow.models.rnn import rnn_cell - -class Plus1RNNCell(rnn_cell.RNNCell): +class Plus1RNNCell(tf.nn.rnn_cell.RNNCell): """RNN Cell generating (output, new_state) = (input + 1, state + 1).""" @property @@ -68,7 +65,7 @@ class RNNTest(tf.test.TestCase): cell = Plus1RNNCell() batch_size = 2 inputs = [tf.placeholder(tf.float32, shape=(batch_size, 5))] * 10 - outputs, states = rnn.rnn(cell, inputs, dtype=tf.float32) + outputs, states = tf.nn.rnn(cell, inputs, dtype=tf.float32) self.assertEqual(len(outputs), len(inputs)) for out, inp in zip(outputs, inputs): self.assertEqual(out.get_shape(), inp.get_shape()) @@ -89,14 +86,15 @@ class RNNTest(tf.test.TestCase): def testDropout(self): cell = Plus1RNNCell() - full_dropout_cell = rnn_cell.DropoutWrapper( + full_dropout_cell = tf.nn.rnn_cell.DropoutWrapper( cell, input_keep_prob=1e-12, seed=0) batch_size = 2 inputs = [tf.placeholder(tf.float32, shape=(batch_size, 5))] * 10 with tf.variable_scope("share_scope"): - outputs, states = rnn.rnn(cell, inputs, dtype=tf.float32) + outputs, states = tf.nn.rnn(cell, inputs, dtype=tf.float32) with tf.variable_scope("drop_scope"): - dropped_outputs, _ = rnn.rnn(full_dropout_cell, inputs, dtype=tf.float32) + dropped_outputs, _ = tf.nn.rnn( + full_dropout_cell, inputs, dtype=tf.float32) 
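The permuted `batch_dim`/`seq_dim` in the tests above can obscure what `reverse_sequence` actually computes; with the default `batch_dim=0` and `seq_dim=1` it simply reverses each batch row up to that row's own length. A numpy-only sketch of that semantics (the arrays are illustrative):

```python
import numpy as np

x = np.array([[1, 2, 3, 4],
              [5, 6, 7, 8],
              [9, 10, 11, 12]])
seq_lengths = [3, 0, 4]

# Reverse row i along the sequence axis up to seq_lengths[i]; elements
# beyond that length are left untouched.
out = x.copy()
for i, n in enumerate(seq_lengths):
    out[i, :n] = x[i, :n][::-1]
print(out)
# [[ 3  2  1  4]
#  [ 5  6  7  8]
#  [12 11 10  9]]
```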
self.assertEqual(len(outputs), len(inputs)) for out, inp in zip(outputs, inputs): self.assertEqual(out.get_shape().as_list(), inp.get_shape().as_list()) @@ -120,7 +118,7 @@ class RNNTest(tf.test.TestCase): batch_size = 2 inputs = [tf.placeholder(tf.float32, shape=(batch_size, 5))] * 10 with tf.variable_scope("drop_scope"): - dynamic_outputs, dynamic_states = rnn.rnn( + dynamic_outputs, dynamic_states = tf.nn.rnn( cell, inputs, sequence_length=sequence_length, dtype=tf.float32) self.assertEqual(len(dynamic_outputs), len(inputs)) self.assertEqual(len(dynamic_states), len(inputs)) @@ -158,11 +156,11 @@ class LSTMTest(tf.test.TestCase): batch_size = 2 with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) - cell = rnn_cell.LSTMCell( + cell = tf.nn.rnn_cell.LSTMCell( num_units, input_size, initializer=initializer) inputs = 10 * [ tf.placeholder(tf.float32, shape=(batch_size, input_size))] - outputs, _ = rnn.rnn(cell, inputs, dtype=tf.float32) + outputs, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) self.assertEqual(len(outputs), len(inputs)) for out in outputs: self.assertEqual(out.get_shape().as_list(), [batch_size, num_units]) @@ -177,12 +175,12 @@ class LSTMTest(tf.test.TestCase): batch_size = 2 with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) - cell = rnn_cell.LSTMCell( + cell = tf.nn.rnn_cell.LSTMCell( num_units, input_size, use_peepholes=True, cell_clip=0.0, initializer=initializer) inputs = 10 * [ tf.placeholder(tf.float32, shape=(batch_size, input_size))] - outputs, _ = rnn.rnn(cell, inputs, dtype=tf.float32) + outputs, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) self.assertEqual(len(outputs), len(inputs)) for out in outputs: self.assertEqual(out.get_shape().as_list(), [batch_size, num_units]) @@ -202,12 +200,12 @@ class LSTMTest(tf.test.TestCase): with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess: initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) state_saver = TestStateSaver(batch_size, 2*num_units) - cell = rnn_cell.LSTMCell( + cell = tf.nn.rnn_cell.LSTMCell( num_units, input_size, use_peepholes=False, initializer=initializer) inputs = 10 * [ tf.placeholder(tf.float32, shape=(batch_size, input_size))] with tf.variable_scope("share_scope"): - outputs, states = rnn.state_saving_rnn( + outputs, states = tf.nn.state_saving_rnn( cell, inputs, state_saver=state_saver, state_name="save_lstm") self.assertEqual(len(outputs), len(inputs)) for out in outputs: @@ -229,10 +227,10 @@ class LSTMTest(tf.test.TestCase): initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) inputs = 10 * [ tf.placeholder(tf.float32, shape=(None, input_size))] - cell = rnn_cell.LSTMCell( + cell = tf.nn.rnn_cell.LSTMCell( num_units, input_size, use_peepholes=True, num_proj=num_proj, initializer=initializer) - outputs, _ = rnn.rnn(cell, inputs, dtype=tf.float32) + outputs, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) self.assertEqual(len(outputs), len(inputs)) tf.initialize_all_variables().run() @@ -252,7 +250,7 @@ class LSTMTest(tf.test.TestCase): inputs = 10 * [ tf.placeholder(tf.float32, shape=(None, input_size))] - cell = rnn_cell.LSTMCell( + cell = tf.nn.rnn_cell.LSTMCell( num_units, input_size=input_size, use_peepholes=True, @@ -261,7 +259,7 @@ class LSTMTest(tf.test.TestCase): num_proj_shards=num_proj_shards, initializer=initializer) - outputs, _ = rnn.rnn(cell, inputs, 
dtype=tf.float32) + outputs, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) self.assertEqual(len(outputs), len(inputs)) @@ -280,7 +278,7 @@ class LSTMTest(tf.test.TestCase): initializer = tf.random_uniform_initializer(-1, 1, seed=self._seed) inputs = 10 * [tf.placeholder(tf.float64)] - cell = rnn_cell.LSTMCell( + cell = tf.nn.rnn_cell.LSTMCell( num_units, input_size=input_size, use_peepholes=True, @@ -289,7 +287,7 @@ class LSTMTest(tf.test.TestCase): num_proj_shards=num_proj_shards, initializer=initializer) - outputs, _ = rnn.rnn( + outputs, _ = tf.nn.rnn( cell, inputs, initial_state=cell.zero_state(batch_size, tf.float64)) self.assertEqual(len(outputs), len(inputs)) @@ -311,7 +309,7 @@ class LSTMTest(tf.test.TestCase): inputs = 10 * [tf.placeholder(tf.float32)] initializer = tf.constant_initializer(0.001) - cell_noshard = rnn_cell.LSTMCell( + cell_noshard = tf.nn.rnn_cell.LSTMCell( num_units, input_size, num_proj=num_proj, use_peepholes=True, @@ -319,15 +317,15 @@ class LSTMTest(tf.test.TestCase): num_unit_shards=num_unit_shards, num_proj_shards=num_proj_shards) - cell_shard = rnn_cell.LSTMCell( + cell_shard = tf.nn.rnn_cell.LSTMCell( num_units, input_size, use_peepholes=True, initializer=initializer, num_proj=num_proj) with tf.variable_scope("noshard_scope"): - outputs_noshard, states_noshard = rnn.rnn( + outputs_noshard, states_noshard = tf.nn.rnn( cell_noshard, inputs, dtype=tf.float32) with tf.variable_scope("shard_scope"): - outputs_shard, states_shard = rnn.rnn( + outputs_shard, states_shard = tf.nn.rnn( cell_shard, inputs, dtype=tf.float32) self.assertEqual(len(outputs_noshard), len(inputs)) @@ -362,7 +360,7 @@ class LSTMTest(tf.test.TestCase): initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) inputs = 10 * [tf.placeholder(tf.float64)] - cell = rnn_cell.LSTMCell( + cell = tf.nn.rnn_cell.LSTMCell( num_units, input_size=input_size, use_peepholes=True, @@ -370,9 +368,9 @@ class LSTMTest(tf.test.TestCase): num_unit_shards=num_unit_shards, num_proj_shards=num_proj_shards, initializer=initializer) - dropout_cell = rnn_cell.DropoutWrapper(cell, 0.5, seed=0) + dropout_cell = tf.nn.rnn_cell.DropoutWrapper(cell, 0.5, seed=0) - outputs, states = rnn.rnn( + outputs, states = tf.nn.rnn( dropout_cell, inputs, sequence_length=sequence_length, initial_state=cell.zero_state(batch_size, tf.float64)) @@ -398,16 +396,16 @@ class LSTMTest(tf.test.TestCase): initializer = tf.random_uniform_initializer(-1, 1, seed=self._seed) inputs = 10 * [ tf.placeholder(tf.float32, shape=(None, input_size))] - cell = rnn_cell.LSTMCell( + cell = tf.nn.rnn_cell.LSTMCell( num_units, input_size, use_peepholes=True, num_proj=num_proj, initializer=initializer) with tf.variable_scope("share_scope"): - outputs0, _ = rnn.rnn(cell, inputs, dtype=tf.float32) + outputs0, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) with tf.variable_scope("share_scope", reuse=True): - outputs1, _ = rnn.rnn(cell, inputs, dtype=tf.float32) + outputs1, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) with tf.variable_scope("diff_scope"): - outputs2, _ = rnn.rnn(cell, inputs, dtype=tf.float32) + outputs2, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) tf.initialize_all_variables().run() input_value = np.random.randn(batch_size, input_size) @@ -433,16 +431,16 @@ class LSTMTest(tf.test.TestCase): initializer = tf.random_uniform_initializer(-1, 1, seed=self._seed) inputs = 10 * [ tf.placeholder(tf.float32, shape=(None, input_size))] - cell = rnn_cell.LSTMCell( + cell = tf.nn.rnn_cell.LSTMCell( num_units, input_size, 
use_peepholes=True, num_proj=num_proj, initializer=initializer) with tf.name_scope("scope0"): with tf.variable_scope("share_scope"): - outputs0, _ = rnn.rnn(cell, inputs, dtype=tf.float32) + outputs0, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) with tf.name_scope("scope1"): with tf.variable_scope("share_scope", reuse=True): - outputs1, _ = rnn.rnn(cell, inputs, dtype=tf.float32) + outputs1, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) tf.initialize_all_variables().run() input_value = np.random.randn(batch_size, input_size) diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py index c84921f21e1..adfe42009a0 100644 --- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py +++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py @@ -23,8 +23,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker - class SegmentReductionHelper(tf.test.TestCase): @@ -127,8 +125,12 @@ class SegmentReductionOpTest(SegmentReductionHelper): with self.test_session(): tf_x, np_x = self._input(shape, dtype=tf.float64) s = tf_op(data=tf_x, segment_ids=indices) - jacob_t, jacob_n = gradient_checker.ComputeGradient( - tf_x, shape, s, [3, 4], x_init_value=np_x.astype(np.double), + jacob_t, jacob_n = tf.test.compute_gradient( + tf_x, + shape, + s, + [3, 4], + x_init_value=np_x.astype(np.double), delta=1) self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3) @@ -170,7 +172,7 @@ class UnsortedSegmentSumTest(SegmentReductionHelper): s = tf.unsorted_segment_sum(data=tf_x, segment_ids=indices, num_segments=num_segments) - jacob_t, jacob_n = gradient_checker.ComputeGradient( + jacob_t, jacob_n = tf.test.compute_gradient( tf_x, shape, s, @@ -196,14 +198,20 @@ class UnsortedSegmentSumTest(SegmentReductionHelper): unsorted_s = tf.unsorted_segment_sum(data=tf_x, segment_ids=indices, num_segments=num_segments) - unsorted_jacob_t, unsorted_jacob_n = gradient_checker.ComputeGradient( - tf_x, shape, unsorted_s, [num_segments, num_cols], + (unsorted_jacob_t, unsorted_jacob_n) = tf.test.compute_gradient( + tf_x, + shape, + unsorted_s, + [num_segments, num_cols], x_init_value=np_x.astype(np.double), delta=1) # Results from SegmentSum sorted_s = tf.segment_sum(data=tf_x, segment_ids=indices) - sorted_jacob_t, sorted_jacob_n = gradient_checker.ComputeGradient( - tf_x, shape, sorted_s, [num_segments, num_cols], + sorted_jacob_t, sorted_jacob_n = tf.test.compute_gradient( + tf_x, + shape, + sorted_s, + [num_segments, num_cols], x_init_value=np_x.astype(np.double), delta=1) self.assertAllClose(unsorted_jacob_t, sorted_jacob_t, rtol=1e-3, atol=1e-3) @@ -277,8 +285,12 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper): tf_indices, _, tf_x, np_x = self._sparse_input( shape, num_indices, dtype=tf.float64) s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices) - jacob_t, jacob_n = gradient_checker.ComputeGradient( - tf_x, shape, s, [3, 4], x_init_value=np_x.astype(np.double), + jacob_t, jacob_n = tf.test.compute_gradient( + tf_x, + shape, + s, + [3, 4], + x_init_value=np_x.astype(np.double), delta=1) self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3) diff --git a/tensorflow/models/rnn/seq2seq_test.py b/tensorflow/python/kernel_tests/seq2seq_test.py similarity index 74% rename from tensorflow/models/rnn/seq2seq_test.py rename to tensorflow/python/kernel_tests/seq2seq_test.py index 12d22630f0f..5ee2845780d 100644 
--- a/tensorflow/models/rnn/seq2seq_test.py +++ b/tensorflow/python/kernel_tests/seq2seq_test.py @@ -21,16 +21,13 @@ from __future__ import print_function import math import random +# pylint: disable=g-bad-import-order,unused-import import tensorflow.python.platform import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf -from tensorflow.models.rnn import rnn -from tensorflow.models.rnn import rnn_cell -from tensorflow.models.rnn import seq2seq - class Seq2SeqTest(tf.test.TestCase): @@ -38,10 +35,12 @@ class Seq2SeqTest(tf.test.TestCase): with self.test_session() as sess: with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): inp = [tf.constant(0.5, shape=[2, 2]) for _ in xrange(2)] - _, enc_states = rnn.rnn(rnn_cell.GRUCell(2), inp, dtype=tf.float32) + _, enc_states = tf.nn.rnn( + tf.nn.rnn_cell.GRUCell(2), inp, dtype=tf.float32) dec_inp = [tf.constant(0.4, shape=[2, 2]) for _ in xrange(3)] - cell = rnn_cell.OutputProjectionWrapper(rnn_cell.GRUCell(2), 4) - dec, mem = seq2seq.rnn_decoder(dec_inp, enc_states[-1], cell) + cell = tf.nn.rnn_cell.OutputProjectionWrapper( + tf.nn.rnn_cell.GRUCell(2), 4) + dec, mem = tf.nn.seq2seq.rnn_decoder(dec_inp, enc_states[-1], cell) sess.run([tf.initialize_all_variables()]) res = sess.run(dec) self.assertEqual(len(res), 3) @@ -56,8 +55,9 @@ class Seq2SeqTest(tf.test.TestCase): with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): inp = [tf.constant(0.5, shape=[2, 2]) for _ in xrange(2)] dec_inp = [tf.constant(0.4, shape=[2, 2]) for _ in xrange(3)] - cell = rnn_cell.OutputProjectionWrapper(rnn_cell.GRUCell(2), 4) - dec, mem = seq2seq.basic_rnn_seq2seq(inp, dec_inp, cell) + cell = tf.nn.rnn_cell.OutputProjectionWrapper( + tf.nn.rnn_cell.GRUCell(2), 4) + dec, mem = tf.nn.seq2seq.basic_rnn_seq2seq(inp, dec_inp, cell) sess.run([tf.initialize_all_variables()]) res = sess.run(dec) self.assertEqual(len(res), 3) @@ -72,8 +72,9 @@ class Seq2SeqTest(tf.test.TestCase): with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): inp = [tf.constant(0.5, shape=[2, 2]) for _ in xrange(2)] dec_inp = [tf.constant(0.4, shape=[2, 2]) for _ in xrange(3)] - cell = rnn_cell.OutputProjectionWrapper(rnn_cell.GRUCell(2), 4) - dec, mem = seq2seq.tied_rnn_seq2seq(inp, dec_inp, cell) + cell = tf.nn.rnn_cell.OutputProjectionWrapper( + tf.nn.rnn_cell.GRUCell(2), 4) + dec, mem = tf.nn.seq2seq.tied_rnn_seq2seq(inp, dec_inp, cell) sess.run([tf.initialize_all_variables()]) res = sess.run(dec) self.assertEqual(len(res), 3) @@ -87,11 +88,11 @@ class Seq2SeqTest(tf.test.TestCase): with self.test_session() as sess: with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): inp = [tf.constant(0.5, shape=[2, 2]) for _ in xrange(2)] - cell = rnn_cell.BasicLSTMCell(2) - _, enc_states = rnn.rnn(cell, inp, dtype=tf.float32) + cell = tf.nn.rnn_cell.BasicLSTMCell(2) + _, enc_states = tf.nn.rnn(cell, inp, dtype=tf.float32) dec_inp = [tf.constant(i, tf.int32, shape=[2]) for i in xrange(3)] - dec, mem = seq2seq.embedding_rnn_decoder(dec_inp, enc_states[-1], - cell, 4) + dec, mem = tf.nn.seq2seq.embedding_rnn_decoder(dec_inp, enc_states[-1], + cell, 4) sess.run([tf.initialize_all_variables()]) res = sess.run(dec) self.assertEqual(len(res), 3) @@ -106,8 +107,9 @@ class Seq2SeqTest(tf.test.TestCase): with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): enc_inp = [tf.constant(1, tf.int32, shape=[2]) for i in xrange(2)] dec_inp = [tf.constant(i, tf.int32, shape=[2]) for 
i in xrange(3)] - cell = rnn_cell.BasicLSTMCell(2) - dec, mem = seq2seq.embedding_rnn_seq2seq(enc_inp, dec_inp, cell, 2, 5) + cell = tf.nn.rnn_cell.BasicLSTMCell(2) + dec, mem = tf.nn.seq2seq.embedding_rnn_seq2seq( + enc_inp, dec_inp, cell, 2, 5) sess.run([tf.variables.initialize_all_variables()]) res = sess.run(dec) self.assertEqual(len(res), 3) @@ -121,7 +123,7 @@ class Seq2SeqTest(tf.test.TestCase): w = tf.get_variable("proj_w", [2, 5]) b = tf.get_variable("proj_b", [5]) with tf.variable_scope("proj_seq2seq"): - dec, _ = seq2seq.embedding_rnn_seq2seq( + dec, _ = tf.nn.seq2seq.embedding_rnn_seq2seq( enc_inp, dec_inp, cell, 2, 5, output_projection=(w, b)) sess.run([tf.variables.initialize_all_variables()]) res = sess.run(dec) @@ -131,12 +133,15 @@ class Seq2SeqTest(tf.test.TestCase): # Test that previous-feeding model ignores inputs after the first. dec_inp2 = [tf.constant(0, tf.int32, shape=[2]) for _ in xrange(3)] tf.get_variable_scope().reuse_variables() - d1, _ = seq2seq.embedding_rnn_seq2seq(enc_inp, dec_inp, cell, 2, 5, - feed_previous=True) - d2, _ = seq2seq.embedding_rnn_seq2seq(enc_inp, dec_inp2, cell, 2, 5, - feed_previous=True) - d3, _ = seq2seq.embedding_rnn_seq2seq(enc_inp, dec_inp2, cell, 2, 5, - feed_previous=tf.constant(True)) + d1, _ = tf.nn.seq2seq.embedding_rnn_seq2seq( + enc_inp, dec_inp, cell, 2, 5, + feed_previous=True) + d2, _ = tf.nn.seq2seq.embedding_rnn_seq2seq( + enc_inp, dec_inp2, cell, 2, 5, + feed_previous=True) + d3, _ = tf.nn.seq2seq.embedding_rnn_seq2seq( + enc_inp, dec_inp2, cell, 2, 5, + feed_previous=tf.constant(True)) res1 = sess.run(d1) res2 = sess.run(d2) res3 = sess.run(d3) @@ -148,8 +153,9 @@ class Seq2SeqTest(tf.test.TestCase): with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): enc_inp = [tf.constant(1, tf.int32, shape=[2]) for i in xrange(2)] dec_inp = [tf.constant(i, tf.int32, shape=[2]) for i in xrange(3)] - cell = rnn_cell.BasicLSTMCell(2) - dec, mem = seq2seq.embedding_tied_rnn_seq2seq(enc_inp, dec_inp, cell, 5) + cell = tf.nn.rnn_cell.BasicLSTMCell(2) + dec, mem = tf.nn.seq2seq.embedding_tied_rnn_seq2seq( + enc_inp, dec_inp, cell, 5) sess.run([tf.variables.initialize_all_variables()]) res = sess.run(dec) self.assertEqual(len(res), 3) @@ -163,7 +169,7 @@ class Seq2SeqTest(tf.test.TestCase): w = tf.get_variable("proj_w", [2, 5]) b = tf.get_variable("proj_b", [5]) with tf.variable_scope("proj_seq2seq"): - dec, _ = seq2seq.embedding_tied_rnn_seq2seq( + dec, _ = tf.nn.seq2seq.embedding_tied_rnn_seq2seq( enc_inp, dec_inp, cell, 5, output_projection=(w, b)) sess.run([tf.variables.initialize_all_variables()]) res = sess.run(dec) @@ -173,11 +179,13 @@ class Seq2SeqTest(tf.test.TestCase): # Test that previous-feeding model ignores inputs after the first. 
dec_inp2 = [tf.constant(0, tf.int32, shape=[2]) for _ in xrange(3)] tf.get_variable_scope().reuse_variables() - d1, _ = seq2seq.embedding_tied_rnn_seq2seq(enc_inp, dec_inp, cell, 5, - feed_previous=True) - d2, _ = seq2seq.embedding_tied_rnn_seq2seq(enc_inp, dec_inp2, cell, 5, - feed_previous=True) - d3, _ = seq2seq.embedding_tied_rnn_seq2seq( + d1, _ = tf.nn.seq2seq.embedding_tied_rnn_seq2seq( + enc_inp, dec_inp, cell, 5, + feed_previous=True) + d2, _ = tf.nn.seq2seq.embedding_tied_rnn_seq2seq( + enc_inp, dec_inp2, cell, 5, + feed_previous=True) + d3, _ = tf.nn.seq2seq.embedding_tied_rnn_seq2seq( enc_inp, dec_inp2, cell, 5, feed_previous=tf.constant(True)) res1 = sess.run(d1) res2 = sess.run(d2) @@ -188,14 +196,15 @@ class Seq2SeqTest(tf.test.TestCase): def testAttentionDecoder1(self): with self.test_session() as sess: with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): - cell = rnn_cell.GRUCell(2) + cell = tf.nn.rnn_cell.GRUCell(2) inp = [tf.constant(0.5, shape=[2, 2]) for _ in xrange(2)] - enc_outputs, enc_states = rnn.rnn(cell, inp, dtype=tf.float32) + enc_outputs, enc_states = tf.nn.rnn(cell, inp, dtype=tf.float32) attn_states = tf.concat(1, [tf.reshape(e, [-1, 1, cell.output_size]) for e in enc_outputs]) dec_inp = [tf.constant(0.4, shape=[2, 2]) for _ in xrange(3)] - dec, mem = seq2seq.attention_decoder(dec_inp, enc_states[-1], - attn_states, cell, output_size=4) + dec, mem = tf.nn.seq2seq.attention_decoder( + dec_inp, enc_states[-1], + attn_states, cell, output_size=4) sess.run([tf.initialize_all_variables()]) res = sess.run(dec) self.assertEqual(len(res), 3) @@ -208,15 +217,16 @@ class Seq2SeqTest(tf.test.TestCase): def testAttentionDecoder2(self): with self.test_session() as sess: with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): - cell = rnn_cell.GRUCell(2) + cell = tf.nn.rnn_cell.GRUCell(2) inp = [tf.constant(0.5, shape=[2, 2]) for _ in xrange(2)] - enc_outputs, enc_states = rnn.rnn(cell, inp, dtype=tf.float32) + enc_outputs, enc_states = tf.nn.rnn(cell, inp, dtype=tf.float32) attn_states = tf.concat(1, [tf.reshape(e, [-1, 1, cell.output_size]) for e in enc_outputs]) dec_inp = [tf.constant(0.4, shape=[2, 2]) for _ in xrange(3)] - dec, mem = seq2seq.attention_decoder(dec_inp, enc_states[-1], - attn_states, cell, output_size=4, - num_heads=2) + dec, mem = tf.nn.seq2seq.attention_decoder( + dec_inp, enc_states[-1], + attn_states, cell, output_size=4, + num_heads=2) sess.run([tf.initialize_all_variables()]) res = sess.run(dec) self.assertEqual(len(res), 3) @@ -230,14 +240,15 @@ class Seq2SeqTest(tf.test.TestCase): with self.test_session() as sess: with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): inp = [tf.constant(0.5, shape=[2, 2]) for _ in xrange(2)] - cell = rnn_cell.GRUCell(2) - enc_outputs, enc_states = rnn.rnn(cell, inp, dtype=tf.float32) + cell = tf.nn.rnn_cell.GRUCell(2) + enc_outputs, enc_states = tf.nn.rnn(cell, inp, dtype=tf.float32) attn_states = tf.concat(1, [tf.reshape(e, [-1, 1, cell.output_size]) for e in enc_outputs]) dec_inp = [tf.constant(i, tf.int32, shape=[2]) for i in xrange(3)] - dec, mem = seq2seq.embedding_attention_decoder(dec_inp, enc_states[-1], - attn_states, cell, 4, - output_size=3) + dec, mem = tf.nn.seq2seq.embedding_attention_decoder( + dec_inp, enc_states[-1], + attn_states, cell, 4, + output_size=3) sess.run([tf.initialize_all_variables()]) res = sess.run(dec) self.assertEqual(len(res), 3) @@ -252,8 +263,8 @@ class Seq2SeqTest(tf.test.TestCase): with tf.variable_scope("root", 
initializer=tf.constant_initializer(0.5)): enc_inp = [tf.constant(1, tf.int32, shape=[2]) for i in xrange(2)] dec_inp = [tf.constant(i, tf.int32, shape=[2]) for i in xrange(3)] - cell = rnn_cell.BasicLSTMCell(2) - dec, mem = seq2seq.embedding_attention_seq2seq( + cell = tf.nn.rnn_cell.BasicLSTMCell(2) + dec, mem = tf.nn.seq2seq.embedding_attention_seq2seq( enc_inp, dec_inp, cell, 2, 5) sess.run([tf.initialize_all_variables()]) res = sess.run(dec) @@ -268,7 +279,7 @@ class Seq2SeqTest(tf.test.TestCase): w = tf.get_variable("proj_w", [2, 5]) b = tf.get_variable("proj_b", [5]) with tf.variable_scope("proj_seq2seq"): - dec, _ = seq2seq.embedding_attention_seq2seq( + dec, _ = tf.nn.seq2seq.embedding_attention_seq2seq( enc_inp, dec_inp, cell, 2, 5, output_projection=(w, b)) sess.run([tf.variables.initialize_all_variables()]) res = sess.run(dec) @@ -278,11 +289,11 @@ class Seq2SeqTest(tf.test.TestCase): # Test that previous-feeding model ignores inputs after the first. dec_inp2 = [tf.constant(0, tf.int32, shape=[2]) for _ in xrange(3)] tf.get_variable_scope().reuse_variables() - d1, _ = seq2seq.embedding_attention_seq2seq( + d1, _ = tf.nn.seq2seq.embedding_attention_seq2seq( enc_inp, dec_inp, cell, 2, 5, feed_previous=True) - d2, _ = seq2seq.embedding_attention_seq2seq( + d2, _ = tf.nn.seq2seq.embedding_attention_seq2seq( enc_inp, dec_inp2, cell, 2, 5, feed_previous=True) - d3, _ = seq2seq.embedding_attention_seq2seq( + d3, _ = tf.nn.seq2seq.embedding_attention_seq2seq( enc_inp, dec_inp2, cell, 2, 5, feed_previous=tf.constant(True)) res1 = sess.run(d1) res2 = sess.run(d2) @@ -297,21 +308,21 @@ class Seq2SeqTest(tf.test.TestCase): targets = [tf.constant(i, tf.int32, shape=[2]) for i in xrange(3)] weights = [tf.constant(1.0, shape=[2]) for i in xrange(3)] - average_loss_per_example = seq2seq.sequence_loss( + average_loss_per_example = tf.nn.seq2seq.sequence_loss( logits, targets, weights, output_classes, average_across_timesteps=True, average_across_batch=True) res = sess.run(average_loss_per_example) self.assertAllClose(res, 1.60944) - average_loss_per_sequence = seq2seq.sequence_loss( + average_loss_per_sequence = tf.nn.seq2seq.sequence_loss( logits, targets, weights, output_classes, average_across_timesteps=False, average_across_batch=True) res = sess.run(average_loss_per_sequence) self.assertAllClose(res, 4.828314) - total_loss = seq2seq.sequence_loss( + total_loss = tf.nn.seq2seq.sequence_loss( logits, targets, weights, output_classes, average_across_timesteps=False, average_across_batch=False) @@ -326,13 +337,13 @@ class Seq2SeqTest(tf.test.TestCase): targets = [tf.constant(i, tf.int32, shape=[2]) for i in xrange(3)] weights = [tf.constant(1.0, shape=[2]) for i in xrange(3)] - average_loss_per_example = seq2seq.sequence_loss_by_example( + average_loss_per_example = tf.nn.seq2seq.sequence_loss_by_example( logits, targets, weights, output_classes, average_across_timesteps=True) res = sess.run(average_loss_per_example) self.assertAllClose(res, np.asarray([1.609438, 1.609438])) - loss_per_sequence = seq2seq.sequence_loss_by_example( + loss_per_sequence = tf.nn.seq2seq.sequence_loss_by_example( logits, targets, weights, output_classes, average_across_timesteps=False) res = sess.run(loss_per_sequence) @@ -343,26 +354,30 @@ class Seq2SeqTest(tf.test.TestCase): # We learn to copy 10 symbols in 2 buckets: length 4 and length 8. classes = 10 buckets = [(4, 4), (8, 8)] - # We use sampled softmax so we keep output projection separate. 
- w = tf.get_variable("proj_w", [24, classes]) - w_t = tf.transpose(w) - b = tf.get_variable("proj_b", [classes]) - # Here comes a sample Seq2Seq model using GRU cells. - def SampleGRUSeq2Seq(enc_inp, dec_inp, weights): - """Example sequence-to-sequence model that uses GRU cells.""" - def GRUSeq2Seq(enc_inp, dec_inp): - cell = rnn_cell.MultiRNNCell([rnn_cell.GRUCell(24)] * 2) - return seq2seq.embedding_attention_seq2seq( - enc_inp, dec_inp, cell, classes, classes, output_projection=(w, b)) - targets = [dec_inp[i+1] for i in xrange(len(dec_inp) - 1)] + [0] - def SampledLoss(inputs, labels): - labels = tf.reshape(labels, [-1, 1]) - return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, 8, classes) - return seq2seq.model_with_buckets(enc_inp, dec_inp, targets, weights, - buckets, classes, GRUSeq2Seq, - softmax_loss_function=SampledLoss) - # Now we construct the copy model. + with self.test_session() as sess: + # We use sampled softmax so we keep output projection separate. + w = tf.get_variable("proj_w", [24, classes]) + w_t = tf.transpose(w) + b = tf.get_variable("proj_b", [classes]) + # Here comes a sample Seq2Seq model using GRU cells. + def SampleGRUSeq2Seq(enc_inp, dec_inp, weights): + """Example sequence-to-sequence model that uses GRU cells.""" + def GRUSeq2Seq(enc_inp, dec_inp): + cell = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.GRUCell(24)] * 2) + return tf.nn.seq2seq.embedding_attention_seq2seq( + enc_inp, dec_inp, cell, classes, classes, + output_projection=(w, b)) + targets = [dec_inp[i+1] for i in xrange(len(dec_inp) - 1)] + [0] + def SampledLoss(inputs, labels): + labels = tf.reshape(labels, [-1, 1]) + return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, 8, classes) + return tf.nn.seq2seq.model_with_buckets( + enc_inp, dec_inp, targets, weights, + buckets, classes, GRUSeq2Seq, + softmax_loss_function=SampledLoss) + + # Now we construct the copy model. 
tf.set_random_seed(111) batch_size = 32 inp = [tf.placeholder(tf.int32, shape=[None]) for _ in xrange(8)] diff --git a/tensorflow/python/kernel_tests/shape_ops_test.py b/tensorflow/python/kernel_tests/shape_ops_test.py index b2ff0b92b43..2621ad9dec2 100644 --- a/tensorflow/python/kernel_tests/shape_ops_test.py +++ b/tensorflow/python/kernel_tests/shape_ops_test.py @@ -24,8 +24,6 @@ import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker as gc - class ShapeOpsTest(tf.test.TestCase): @@ -119,7 +117,7 @@ class ShapeOpsTest(tf.test.TestCase): dtype=tf.float32) squeezed = tf.expand_dims(inp, 1) - err = gc.ComputeGradientError(inp, [4, 2], squeezed, [4, 1, 2]) + err = tf.test.compute_gradient_error(inp, [4, 2], squeezed, [4, 1, 2]) self.assertLess(err, 1e-3) def testExpandDimsScalar(self): @@ -202,7 +200,7 @@ class ShapeOpsTest(tf.test.TestCase): a = tf.reshape(inp, [4, 1, 2]) squeezed = tf.squeeze(a, []) - err = gc.ComputeGradientError(a, [4, 1, 2], squeezed, [4, 2]) + err = tf.test.compute_gradient_error(a, [4, 1, 2], squeezed, [4, 2]) self.assertLess(err, 1e-3) def testSqueezeGradientWithSqueezeDims(self): @@ -211,7 +209,7 @@ class ShapeOpsTest(tf.test.TestCase): a = tf.reshape(inp, [4, 1, 2, 1]) squeezed = tf.squeeze(a, [1]) - err = gc.ComputeGradientError(a, [4, 1, 2, 1], squeezed, [4, 2, 1]) + err = tf.test.compute_gradient_error(a, [4, 1, 2, 1], squeezed, [4, 2, 1]) self.assertLess(err, 1e-3) @@ -366,8 +364,11 @@ class TileTest(tf.test.TestCase): shape=input_shape, dtype=tf.float64) tiled = tf.tile(a, multiples) grad_shape = list(np.array(multiples) * np.array(inp.shape)) - err = gc.ComputeGradientError(a, list(input_shape), tiled, grad_shape, - x_init_value=inp) + err = tf.test.compute_gradient_error(a, + list(input_shape), + tiled, + grad_shape, + x_init_value=inp) print("tile(float) error = ", err) self.assertLess(err, 1e-3) @@ -382,7 +383,7 @@ class TileTest(tf.test.TestCase): a = tf.constant([float(x) for x in inp.flatten()], shape=[4, 2], dtype=tf.float32) tiled = tf.tile(a, [1, 2]) - err = gc.ComputeGradientError(a, [4, 2], tiled, [4, 4]) + err = tf.test.compute_gradient_error(a, [4, 2], tiled, [4, 4]) self.assertLess(err, 1e-3) def testShapeFunctionEdgeCases(self): diff --git a/tensorflow/python/kernel_tests/softplus_op_test.py b/tensorflow/python/kernel_tests/softplus_op_test.py index e79fc1ca20f..3575f7ab7c3 100644 --- a/tensorflow/python/kernel_tests/softplus_op_test.py +++ b/tensorflow/python/kernel_tests/softplus_op_test.py @@ -23,8 +23,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker as gc - class SoftplusTest(tf.test.TestCase): @@ -57,7 +55,11 @@ class SoftplusTest(tf.test.TestCase): x_init = np.asarray( [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]], dtype=np.float32, order="F") - err = gc.ComputeGradientError(x, [2, 5], y, [2, 5], x_init_value=x_init) + err = tf.test.compute_gradient_error(x, + [2, 5], + y, + [2, 5], + x_init_value=x_init) print("softplus (float) gradient err = ", err) self.assertLess(err, 1e-4) diff --git a/tensorflow/python/kernel_tests/softsign_op_test.py b/tensorflow/python/kernel_tests/softsign_op_test.py new file mode 100644 index 00000000000..fd8431c7c76 --- /dev/null +++ b/tensorflow/python/kernel_tests/softsign_op_test.py @@ -0,0 +1,68 @@ +# Copyright 2015 Google Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for Softsign and SoftsignGrad.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.python.platform + +import numpy as np +import tensorflow as tf + + +class SoftsignTest(tf.test.TestCase): + + def _npSoftsign(self, np_features): + return np_features / (1 + np.abs(np_features)) + + def _testSoftsign(self, np_features, use_gpu=False): + np_softsign = self._npSoftsign(np_features) + with self.test_session(use_gpu=use_gpu): + softsign = tf.nn.softsign(np_features) + tf_softsign = softsign.eval() + self.assertAllClose(np_softsign, tf_softsign) + self.assertShapeEqual(np_softsign, softsign) + + def testNumbers(self): + for t in [np.float, np.double]: + self._testSoftsign( + np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t), + use_gpu=False) + self._testSoftsign( + np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t), + use_gpu=True) + + def testGradient(self): + with self.test_session(): + x = tf.constant( + [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9], + shape=[2, 5], name="x") + y = tf.nn.softsign(x, name="softsign") + x_init = np.asarray( + [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]], + dtype=np.float32, order="F") + err = tf.test.compute_gradient_error(x, + [2, 5], + y, + [2, 5], + x_init_value=x_init) + print("softsign (float) gradient err = ", err) + self.assertLess(err, 1e-4) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/python/kernel_tests/sparse_matmul_op_test.py b/tensorflow/python/kernel_tests/sparse_matmul_op_test.py index 8f0c60c4553..89e9fda178b 100644 --- a/tensorflow/python/kernel_tests/sparse_matmul_op_test.py +++ b/tensorflow/python/kernel_tests/sparse_matmul_op_test.py @@ -23,8 +23,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker as gc - def RandMatrix(rows, cols, tr): if tr: @@ -96,8 +94,10 @@ class MatMulGradientTest(tf.test.TestCase): transpose_b=tr_b, a_is_sparse=sp_a, b_is_sparse=sp_b) - err = (gc.ComputeGradientError(a, [2, 3] if tr_a else [3, 2], m, [3, 4]) + - gc.ComputeGradientError(b, [4, 2] if tr_b else [2, 4], m, [3, 4])) + err = (tf.test.compute_gradient_error(a, [2, 3] + if tr_a else [3, 2], m, [3, 4]) + + tf.test.compute_gradient_error(b, [4, 2] + if tr_b else [2, 4], m, [3, 4])) print("sparse_matmul gradient err = ", err) self.assertLess(err, 1e-3) diff --git a/tensorflow/python/kernel_tests/transpose_op_test.py b/tensorflow/python/kernel_tests/transpose_op_test.py index 8c5ff7bd7e6..fa38152a865 100644 --- a/tensorflow/python/kernel_tests/transpose_op_test.py +++ b/tensorflow/python/kernel_tests/transpose_op_test.py @@ -24,8 +24,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests.gradient_checker import 
ComputeGradient - class TransposeTest(tf.test.TestCase): @@ -48,10 +46,10 @@ class TransposeTest(tf.test.TestCase): xs = list(np.shape(x)) ys = list(np.shape(tf_ans)) if x.dtype == np.float32: - jacob_t, jacob_n = ComputeGradient(inx, xs, y, ys, x, 1e-2) + jacob_t, jacob_n = tf.test.compute_gradient(inx, xs, y, ys, x, 1e-2) self.assertAllClose(jacob_t, jacob_n, 1e-3, 1e-3) elif x.dtype == np.float64: - jacob_t, jacob_n = ComputeGradient(inx, xs, y, ys, x, 1e-2) + jacob_t, jacob_n = tf.test.compute_gradient(inx, xs, y, ys, x, 1e-2) self.assertAllClose(jacob_t, jacob_n, 1e-6, 1e-6) return tf_ans, jacob_t @@ -70,10 +68,10 @@ class TransposeTest(tf.test.TestCase): xs = list(np.shape(x)) ys = list(np.shape(tf_ans)) if x.dtype == np.float32: - jacob_t, jacob_n = ComputeGradient(inx, xs, y, ys, x, 1e-2) + jacob_t, jacob_n = tf.test.compute_gradient(inx, xs, y, ys, x, 1e-2) self.assertAllClose(jacob_t, jacob_n, 1e-3, 1e-3) elif x.dtype == np.float64: - jacob_t, jacob_n = ComputeGradient(inx, xs, y, ys, x, 1e-2) + jacob_t, jacob_n = tf.test.compute_gradient(inx, xs, y, ys, x, 1e-2) self.assertAllClose(jacob_t, jacob_n, 1e-6, 1e-6) return tf_ans, jacob_t diff --git a/tensorflow/python/kernel_tests/unpack_op_test.py b/tensorflow/python/kernel_tests/unpack_op_test.py index 308b219f318..47ed9e617cb 100644 --- a/tensorflow/python/kernel_tests/unpack_op_test.py +++ b/tensorflow/python/kernel_tests/unpack_op_test.py @@ -24,8 +24,6 @@ import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker - class UnpackOpTest(tf.test.TestCase): @@ -53,8 +51,7 @@ class UnpackOpTest(tf.test.TestCase): with self.test_session(use_gpu=use_gpu): x = tf.constant(data) cs = tf.unpack(x, num=shape[0]) - err = gradient_checker.ComputeGradientError(x, shape, cs[i], - shapes[i]) + err = tf.test.compute_gradient_error(x, shape, cs[i], shapes[i]) self.assertLess(err, 1e-6) def testInferNum(self): diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py index d3b01640529..39ec5f10a63 100644 --- a/tensorflow/python/kernel_tests/xent_op_test.py +++ b/tensorflow/python/kernel_tests/xent_op_test.py @@ -23,8 +23,6 @@ import tensorflow.python.platform import numpy as np import tensorflow as tf -from tensorflow.python.kernel_tests import gradient_checker as gc - class XentTest(tf.test.TestCase): @@ -120,7 +118,7 @@ class XentTest(tf.test.TestCase): 0.1, 0.8, 2.7, 6.4], shape=[3, 4], dtype=tf.float64, name="f") x = tf.nn.softmax_cross_entropy_with_logits(f, l, name="xent") - err = gc.ComputeGradientError(f, [3, 4], x, [3]) + err = tf.test.compute_gradient_error(f, [3, 4], x, [3]) print("cross entropy gradient err = ", err) self.assertLess(err, 5e-8) diff --git a/tensorflow/python/lib/io/py_record_writer.cc b/tensorflow/python/lib/io/py_record_writer.cc index 63c1460ac07..956b8719221 100644 --- a/tensorflow/python/lib/io/py_record_writer.cc +++ b/tensorflow/python/lib/io/py_record_writer.cc @@ -42,7 +42,7 @@ PyRecordWriter::~PyRecordWriter() { delete file_; } -bool PyRecordWriter::WriteRecord(::tensorflow::StringPiece record) { +bool PyRecordWriter::WriteRecord(tensorflow::StringPiece record) { if (writer_ == nullptr) return false; Status s = writer_->WriteRecord(record); return s.ok(); diff --git a/tensorflow/python/lib/io/py_record_writer.h b/tensorflow/python/lib/io/py_record_writer.h index 99720f3b8ee..637ee1b8bb2 100644 --- a/tensorflow/python/lib/io/py_record_writer.h +++ 
b/tensorflow/python/lib/io/py_record_writer.h @@ -36,7 +36,7 @@ class PyRecordWriter { static PyRecordWriter* New(const string& filename); ~PyRecordWriter(); - bool WriteRecord(::tensorflow::StringPiece record); + bool WriteRecord(tensorflow::StringPiece record); void Close(); private: diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py index ce171ed9db0..8d288525dc7 100644 --- a/tensorflow/python/ops/array_grad.py +++ b/tensorflow/python/ops/array_grad.py @@ -20,16 +20,17 @@ from __future__ import division from __future__ import print_function from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import constant_op -from tensorflow.python.ops import math_ops from tensorflow.python.ops import gen_array_ops +from tensorflow.python.ops import math_ops @ops.RegisterGradient("Pack") def _PackGrad(op, grad): """Gradient for pack op.""" - return array_ops.unpack(grad, num=op.get_attr('N')) + return array_ops.unpack(grad, num=op.get_attr("N")) @ops.RegisterGradient("Unpack") @@ -41,28 +42,82 @@ def _UnpackGrad(_, *grads): @ops.RegisterGradient("Concat") def _ConcatGrad(op, grad): """Gradient for concat op.""" - assert isinstance(grad, ops.Tensor) + + def _CreateDenseMaskAndBegin(sizes, concat_dim): + """Create variables for iteratively slicing a dense gradients tensor.""" + # Since shape is 1-D, shape_of_shape = [rank-of-inputs] + shape_of_shape = array_ops.shape(sizes[0]) + # Make a vector of length equal to the input's dimensions, + # with 0's everywhere and 1 in the concat dim position. + # Note: Can't use sparse_to_dense since it isn't GPU-capable (for now) + mask = array_ops.concat(0, + [array_ops.fill( + array_ops.expand_dims(concat_dim, 0), 0), + [1], + array_ops.fill( + shape_of_shape - concat_dim - 1, 0)]) + begin = array_ops.fill(shape_of_shape, 0) + return mask, begin + # Degenerate concatenation, just return grad. if len(op.inputs) == 2: return [None, grad] - # Get the inputs' tensor shapes - sizes = [array_ops.shape(x) for x in op.inputs[1:]] + concat_dim = op.inputs[0] - # Since shape is 1-D, shape_of_shape = [rank-of-inputs] - shape_of_shape = array_ops.shape(sizes[0]) - # Make a vector of length equal to the input's dimensions, - # with 0's everywhere and 1 in the concat dim position. - # Note: Can't use sparse_to_dense since it isn't GPU-capable (for now) - mask = array_ops.concat(0, - [array_ops.fill( - array_ops.expand_dims(concat_dim, 0), 0), [1], - array_ops.fill(shape_of_shape - concat_dim - 1, 0)]) out_grads = [] - begin = array_ops.fill(shape_of_shape, 0) - for i in range(len(sizes)): - out_grads.append(array_ops.slice(grad, begin, sizes[i])) - # Lint complains begin = begin + ... - begin = math_ops.add(begin, sizes[i] * mask) + if isinstance(grad, ops.Tensor): + # Get the inputs' tensor shapes + sizes = [array_ops.shape(x) for x in op.inputs[1:]] + mask, begin = _CreateDenseMaskAndBegin(sizes, concat_dim) + for size in sizes: + out_grads.append(array_ops.slice(grad, begin, size)) + # Lint complains begin = begin + ... 
+ begin = math_ops.add(begin, size * mask) + elif isinstance(grad, ops.IndexedSlices): + concat_dim_static = tensor_util.ConstantValue(concat_dim) + if concat_dim_static is None: + raise ValueError("Can only compute IndexedSlices gradient with " + "statically-known concat_dim") + # Get the inputs' tensor shapes + sizes = [array_ops.shape(x) for x in op.inputs[1:]] + if concat_dim_static > 0: + # IndexedSlices, concat_dim > 0. Each input gets IndexedSlices gradients + # with all the indices, but with grad.values sliced accordingly. This + # is like the Tensor case, except shape(grad.values)[0] is not equal to + # shape(sizes[i])[0], since only a subset of the dim-0 values are stored. + mask, begin = _CreateDenseMaskAndBegin(sizes, concat_dim) + for size in sizes: + new_values = array_ops.slice( + grad.values, + begin, + array_ops.concat(0, [[-1], array_ops.slice(size, [1], [-1])])) + out_grads.append( + ops.IndexedSlices(new_values, grad.indices, size)) + # Lint complains begin = begin + ... + begin = math_ops.add(begin, size * mask) + else: + # IndexedSlices, concat_dim == 0. Each input gets IndexedSlices gradients + # only for the relevant indices. + start = constant_op.constant(0, dtype=grad.indices.dtype) + for size in sizes: + size_concat_dim = array_ops.gather(size, concat_dim) + if size_concat_dim.dtype != grad.indices.dtype: + size_concat_dim = math_ops.cast(size_concat_dim, + dtype=grad.indices.dtype) + end = start + size_concat_dim + # Compute the 1-D Tensor of indices relevant for this input. + indices_to_select = array_ops.squeeze( + array_ops.where(math_ops.logical_and(grad.indices >= start, + grad.indices < end)), + squeeze_dims=[1]) + new_indices = array_ops.gather(grad.indices, indices_to_select) - start + new_values = array_ops.gather(grad.values, indices_to_select) + out_grads.append( + ops.IndexedSlices(new_values, new_indices, size)) + start = end + else: + raise TypeError("Expected Tensor or IndexedSlices, got %s" % type(grad)) + return [None] + out_grads @@ -201,6 +256,7 @@ def _PadGrad(op, grad): def _ReverseSequenceGrad(op, grad): seq_lengths = op.inputs[1] return [array_ops.reverse_sequence(grad, - seq_dim=op.get_attr("seq_dim"), - seq_lengths=seq_lengths), + batch_dim=op.get_attr("batch_dim"), + seq_dim=op.get_attr("seq_dim"), + seq_lengths=seq_lengths), None] diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 50f3facf2e8..1e2950e74ff 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -990,17 +990,22 @@ def _ReverseSequenceShape(op): A single-element list containing the shape of the output. Raises: - ValueError: If the input shapes are incompatible. + ValueError: If the input shapes are incompatible or seq_dim == batch_dim. 
""" input_shape = op.inputs[0].get_shape() seq_lens_shape = op.inputs[1].get_shape().with_rank(1) - batch_size = input_shape[0].merge_with(seq_lens_shape[0]) - input_shape = tensor_shape.TensorShape([batch_size]).concatenate( - input_shape[1:]) seq_dim = op.get_attr("seq_dim") + batch_dim = op.get_attr("batch_dim") + if batch_dim >= input_shape.ndims: + raise ValueError("batch_dim must be < input.dims() (%d vs %d)" % + (batch_dim, input_shape.ndims)) if seq_dim >= input_shape.ndims: raise ValueError("seq_dim must be < input.dims() (%d vs %d)" % (seq_dim, input_shape.ndims)) + batch_size = input_shape[batch_dim].merge_with(seq_lens_shape[0]) + input_shape = tensor_shape.TensorShape([ + value if ix != batch_dim else batch_size + for ix, value in enumerate(input_shape)]) return [input_shape] diff --git a/tensorflow/python/ops/constant_op.py b/tensorflow/python/ops/constant_op.py index f2aaad37a99..5d8d8a88d08 100644 --- a/tensorflow/python/ops/constant_op.py +++ b/tensorflow/python/ops/constant_op.py @@ -172,12 +172,24 @@ def _ConstantShape(op): [d.size for d in op.get_attr("value").tensor_shape.dim])] -ops.register_tensor_conversion_function((list, tuple), constant, 100) -ops.register_tensor_conversion_function(np.ndarray, constant, 100) -ops.register_tensor_conversion_function(np.generic, constant, 100) -ops.register_tensor_conversion_function(object, constant, 200) +def _constant_tensor_conversion_function(v, dtype=None, name=None, + as_ref=False): + _ = as_ref + return constant(v, dtype=dtype, name=name) -def _tensor_shape_tensor_conversion_function(s, dtype=None, name=None): + +ops.register_tensor_conversion_function( + (list, tuple), _constant_tensor_conversion_function, 100) +ops.register_tensor_conversion_function( + np.ndarray, _constant_tensor_conversion_function, 100) +ops.register_tensor_conversion_function( + np.generic, _constant_tensor_conversion_function, 100) +ops.register_tensor_conversion_function( + object, _constant_tensor_conversion_function, 200) + +def _tensor_shape_tensor_conversion_function(s, dtype=None, name=None, + as_ref=False): + _ = as_ref if not s.is_fully_defined(): raise ValueError( "Cannot convert a partially known TensorShape to a Tensor: %s" % s) @@ -193,7 +205,9 @@ def _tensor_shape_tensor_conversion_function(s, dtype=None, name=None): ops.register_tensor_conversion_function( tensor_shape.TensorShape, _tensor_shape_tensor_conversion_function, 100) -def _dimension_tensor_conversion_function(d, dtype=None, name=None): +def _dimension_tensor_conversion_function(d, dtype=None, name=None, + as_ref=False): + _ = as_ref if d.value is None: raise ValueError("Cannot convert an unknown Dimension to a Tensor: %s" % d) if dtype is not None: diff --git a/tensorflow/python/ops/control_flow_grad.py b/tensorflow/python/ops/control_flow_grad.py index 8803ea62344..53bb20776c0 100644 --- a/tensorflow/python/ops/control_flow_grad.py +++ b/tensorflow/python/ops/control_flow_grad.py @@ -33,7 +33,7 @@ def _SwitchGrad(op, *grad): if isinstance(ctxt, WhileContext): merge_op = ctxt.switch_map.get(op) if merge_op: - merge_op._update_input(1, grad[1]) + merge_op._update_input(1, next_iteration(grad[1])) return None, None else: merge_op = merge(grad, name="b_switch")[0] @@ -70,7 +70,7 @@ def _MergeGrad(op, grad, _): else: num_inputs = len(op.inputs) cond = [math_ops.equal(op.outputs[1], i) for i in xrange(num_inputs)] - return [Switch(grad, cond[i])[1] for i in xrange(num_inputs)] + return [switch(grad, cond[i])[1] for i in xrange(num_inputs)] @ops.RegisterGradient("Exit") @@ -89,7 
+89,7 @@ def _ExitGrad(op, grad): @ops.RegisterGradient("NextIteration") def _NextIterationGrad(_, grad): - return next_iteration(grad) + return grad @ops.RegisterGradient("Enter") diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index 8eb1bd79bff..b2660c210ad 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -75,8 +75,9 @@ from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import common_shapes from tensorflow.python.ops import constant_op -from tensorflow.python.ops import gen_control_flow_ops from tensorflow.python.ops import gen_array_ops +from tensorflow.python.ops import gen_control_flow_ops +from tensorflow.python.ops import gen_data_flow_ops from tensorflow.python.ops import logging_ops from tensorflow.python.ops import math_ops # pylint: disable=wildcard-import,undefined-variable @@ -248,7 +249,7 @@ def _SwitchRefOrTensor(data, pred, name="Switch"): Raises: TypeError: if data is not a Tensor or IndexedSlices """ - data = ops.convert_to_tensor_or_indexed_slices(data, name="data") + data = ops.convert_to_tensor_or_indexed_slices(data, name="data", as_ref=True) if isinstance(data, ops.Tensor): if not data.dtype.is_ref_dtype: return switch(data, pred, name=name) @@ -418,8 +419,9 @@ def _GetRealValue(value): Returns: The same tensor value from the saved history. """ - real_value = value + # pylint: disable=protected-access forward_ctxt = value.op._get_control_flow_context() + # pylint: enable=protected-access real_value = forward_ctxt.history_map.get(value.name) assert value.op.type != "Variable" if real_value is None: @@ -432,29 +434,11 @@ def _GetRealValue(value): # to deepcopy the constants for the grad while context. history_value = forward_ctxt.AddForwardAccumulateLoop(value) - # The shapes of the whole history and a single event element. - forward_ctxt.grad_context.Exit() - elem_rank = array_ops.rank(history_value) - 1 - elem_rank_vec = array_ops.expand_dims(elem_rank, 0) - elem_shape = array_ops.slice(array_ops.shape(history_value), [1], - elem_rank_vec) - slice_shape = array_ops.concat(0, [[1], elem_shape]) - forward_ctxt.grad_context.Enter() - - # The begin position of the slice at slice_index. - slice_index = forward_ctxt.grad_context.index - b1 = array_ops.zeros(elem_rank_vec, dtype=dtypes.int32) - b = array_ops.concat(0, [array_ops.expand_dims(slice_index, 0), b1]) - - # The slice at slice_index. - # TODO(irving): Replace with gather once that's GPU accelerated - real_value = array_ops.squeeze( - array_ops.slice(history_value, - b, - slice_shape, - name="real"), - squeeze_dims=[0]) - forward_ctxt.history_map[value.name] = real_value + # pylint: disable=protected-access + real_value = gen_data_flow_ops._stack_pop(history_value, + value.dtype.base_dtype) + # pylint: enable=protected-access + forward_ctxt.history_map[value.name] = real_value return real_value @@ -656,7 +640,7 @@ def cond(pred, fn1, fn2, name=None): context_f = CondContext(pred, pivot_2, 0) context_f.Enter() res_f = context_f.BuildCondBranch(fn2) - context_t.ExitResult(res_f) + context_f.ExitResult(res_f) context_f.Exit() # Add the final merge to the graph. 
@@ -693,8 +677,10 @@ class WhileContext(ControlFlowContext): # generation for gradient computation self._pivot = None - # The tensors for the counters added by AddForwardCounterLoop or - # AddBackPropCounterLoop + # The loop counter added either by AddForwardCounterLoop or + # AddBackPropCounterLoop. For forward, it is the value of the loop + # counter for the next iteration. For backprop, it is the value of + # the loop counter for the current iteration. self._index = None # Information needed by backprop @@ -703,10 +689,10 @@ class WhileContext(ControlFlowContext): self._history_map = {} self._switch_map = {} - # values considered to have been already seen in this context + # Values considered to have been already seen in this context self._values = set() - # values referenced by but external to this context + # Values referenced by but external to this context self._external_values = {} @property @@ -841,10 +827,9 @@ class WhileContext(ControlFlowContext): name="f_count") merge_n = merge([enter_n, enter_n])[0] switch_n = switch(merge_n, self._pivot) - self._index = switch_n[1] - add_n = math_ops.add(self._index, 1) - next_n = next_iteration(add_n) + self._index = math_ops.add(switch_n[1], 1) + next_n = next_iteration(self._index) merge_n.op._update_input(1, next_n) self._total_iterations = exit(switch_n[0], name="f_count") @@ -859,54 +844,39 @@ class WhileContext(ControlFlowContext): The pseudocode is: ``` - acc; + acc = stack(); while (_pivot) { - if (index == 0) [value] else Concat(acc, [value]); + acc = stack_push(acc, value); } ``` Args: - value: The tensor that is accumulated. + value: The tensor that is to be accumulated. Returns: - The accumulated history of value. + The stack that contains the accumulated history of value. Raises: ValueError: If the shape of "value" is not known statically. """ - if not value.get_shape().is_fully_defined(): - raise ValueError("Must have known shape: %s" % value) self._grad_context.Exit() - # TODO(irving): Now that acc starts out empty, most of the - # conditional logic can go away. - acc = constant_op.constant([], - value.dtype, - shape=[0] + value.get_shape().as_list(), - name="f_acc") + # pylint: disable=protected-access + acc = gen_data_flow_ops._stack(value.dtype.base_dtype, name="f_acc") + # pylint: enable=protected-access self.Enter() self.AddName(acc.name) - enter_acc = _Enter(acc, self._name, is_constant=False, + enter_acc = _Enter(acc, self._name, is_constant=True, parallel_iterations=self._parallel_iterations, name="f_acc") - merge_acc = merge([enter_acc, enter_acc])[0] - switch_acc = switch(merge_acc, self._pivot) - # If index = 0 then [value] else Concat(acc, [value]). - cond = math_ops.greater(self._index, 0) - switch_add_acc = switch(switch_acc[1], cond) - expand_value = array_ops.expand_dims(value, 0) - true_branch = array_ops.concat(0, [switch_add_acc[1], expand_value]) - false_branch = array_ops.identity(switch_add_acc[0]) - false_branch = with_dependencies([false_branch], expand_value) - add_acc = merge([false_branch, true_branch])[0] + # pylint: disable=protected-access + push_op = gen_data_flow_ops._stack_push(enter_acc, value) + self._index.op._add_control_input(push_op.op) + # pylint: enable=protected-access - next_acc = next_iteration(add_acc) - merge_acc.op._update_input(1, next_acc) - - exit_acc = exit(switch_acc[0], name="f_acc") self.Exit() self._grad_context.Enter() - return exit_acc + return acc def AddForwardAccumulateCondLoop(self, value): """Add an accumulation loop for each conditional switch. 
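Aside for readers of the control-flow hunks above: the diff replaces the old concat-and-slice history accumulation with a stack that is pushed once per forward-loop iteration (gen_data_flow_ops._stack_push in AddForwardAccumulateLoop, the "f_acc" stack) and popped during backprop (gen_data_flow_ops._stack_pop in _GetRealValue). The plain-Python sketch below is not part of the diff and uses ordinary lists instead of the TF stack ops; it only illustrates the push-in-forward / pop-in-backprop pattern on a toy loop.

```
# Toy illustration (plain Python, not the TF ops): the forward while-loop pushes
# each iteration's value onto a stack; the gradient loop pops them back in
# reverse order, which is exactly the order backprop consumes them in.

def forward(x, num_iters):
    """y starts at 1.0 and is multiplied by x each iteration; the input of each
    multiply is saved on a stack (the role of _stack_push / f_acc)."""
    history = []
    y = 1.0
    for _ in range(num_iters):
        history.append(y)   # push the value the multiply will need at backprop
        y = y * x
    return y, history

def backward(x, history, grad_y):
    """Pop the saved values (the role of _stack_pop) to accumulate dy/dx."""
    grad_x = 0.0
    while history:
        y_prev = history.pop()
        grad_x += grad_y * y_prev   # contribution of this iteration's multiply
        grad_y = grad_y * x         # propagate the gradient to the previous y
    return grad_x

y, hist = forward(2.0, 3)
print(y, backward(2.0, hist, 1.0))   # 8.0 (= 2**3) and 12.0 (= d(x**3)/dx at 2)
```

Popping yields the saved values in reverse iteration order, which is the order the gradient loop needs them in; that is why the removed slice-indexing into a concatenated history is no longer required.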
@@ -916,9 +886,9 @@ class WhileContext(ControlFlowContext): The pseudocode is: ``` - acc; + acc = [] while (_pivot) { - Concat(acc, value); + acc = concat([acc, value]); } ``` @@ -929,19 +899,19 @@ class WhileContext(ControlFlowContext): The accumulated history of value. """ self._grad_context.Exit() - acc = constant_op.constant(False, name="f_acc") + acc = constant_op.constant(False, name="f_cond") self.Enter() self.AddName(acc.name) enter_acc = _Enter(acc, self._name, is_constant=False, parallel_iterations=self._parallel_iterations, - name="f_acc") + name="f_cond") merge_acc = merge([enter_acc, enter_acc])[0] switch_acc = switch(merge_acc, self._pivot) acc = array_ops.concat(0, [switch_add_acc[1], value]) next_acc = next_iteration(acc) merge_acc.op._update_input(1, next_acc) - exit_acc = exit(switch_acc[0], name="f_acc") + exit_acc = exit(switch_acc[0], name="f_cond") self.Exit() self._grad_context.Enter() return exit_acc @@ -974,11 +944,10 @@ class WhileContext(ControlFlowContext): self._pivot = loop_cond(cond, name="b_count") switch_count = switch(merge_count, self._pivot) - # Add next_iteration right after Switch to match the gradient function. - next_count = next_iteration(switch_count[1]) - self._pivot_for_body = next_count - self._index = math_ops.sub(next_count, one) - merge_count.op._update_input(1, self._index) + self._index = math_ops.sub(switch_count[1], one) + self._pivot_for_body = self._index + next_count = next_iteration(self._index) + merge_count.op._update_input(1, next_count) exit_count = exit(switch_count[0], name="b_count") self.Exit() @@ -1015,9 +984,9 @@ class WhileContext(ControlFlowContext): merge_acc = merge([enter_acc, enter_acc], name="b_acc")[0] switch_acc = switch(merge_acc, self._pivot) - next_acc = next_iteration(switch_acc[1]) - add_acc = math_ops.add(next_acc, value) - merge_acc.op._update_input(1, add_acc) + add_acc = math_ops.add(switch_acc[1], value) + next_acc = next_iteration(add_acc) + merge_acc.op._update_input(1, next_acc) exit_acc = exit(switch_acc[0], name="b_acc") return exit_acc diff --git a/tensorflow/python/ops/gradients.py b/tensorflow/python/ops/gradients.py index b790d9af6c9..599875ecb4b 100644 --- a/tensorflow/python/ops/gradients.py +++ b/tensorflow/python/ops/gradients.py @@ -50,7 +50,7 @@ from tensorflow.python.platform import logging _LARGE_SPARSE_NUM_ELEMENTS = 100000000 -def _IndexedSlicesToTensor(value, dtype=None, name=None): +def _IndexedSlicesToTensor(value, dtype=None, name=None, as_ref=False): """Converts an IndexedSlices object `value` to a Tensor. NOTE(mrry): This function is potentially expensive. @@ -59,6 +59,7 @@ def _IndexedSlicesToTensor(value, dtype=None, name=None): value: An ops.IndexedSlices object. dtype: The dtype of the Tensor to be returned. name: Optional name to use for the returned Tensor. + as_ref: True if a ref is requested. Returns: A dense Tensor representing the values in the given IndexedSlices. @@ -66,6 +67,7 @@ def _IndexedSlicesToTensor(value, dtype=None, name=None): Raises: ValueError: If the IndexedSlices does not have the same dtype. 
""" + _ = as_ref if dtype and not dtype.is_compatible_with(value.dtype): raise ValueError( "Tensor conversion requested dtype %s for IndexedSlices with dtype %s" % diff --git a/tensorflow/python/ops/image_grad_test.py b/tensorflow/python/ops/image_grad_test.py index 488e741e5b4..0c42f637aa4 100644 --- a/tensorflow/python/ops/image_grad_test.py +++ b/tensorflow/python/ops/image_grad_test.py @@ -21,7 +21,6 @@ from __future__ import print_function # pylint: disable=g-bad-import-order, # pylint: disable=unused-import import tensorflow.python.platform -from tensorflow.python.kernel_tests import gradient_checker as gc import numpy as np import tensorflow as tf @@ -56,11 +55,11 @@ class ResizeNearestNeighborOpTest(tf.test.TestCase): input_tensor = tf.constant(x, shape=in_shape) resize_out = tf.image.resize_nearest_neighbor(input_tensor, out_shape[1:3]) - err = gc.ComputeGradientError(input_tensor, - in_shape, - resize_out, - out_shape, - x_init_value=x) + err = tf.test.compute_gradient_error(input_tensor, + in_shape, + resize_out, + out_shape, + x_init_value=x) self.assertLess(err, 1e-3) def testGradFromResizeToSmallerInBothDims(self): @@ -73,11 +72,11 @@ class ResizeNearestNeighborOpTest(tf.test.TestCase): input_tensor = tf.constant(x, shape=in_shape) resize_out = tf.image.resize_nearest_neighbor(input_tensor, out_shape[1:3]) - err = gc.ComputeGradientError(input_tensor, - in_shape, - resize_out, - out_shape, - x_init_value=x) + err = tf.test.compute_gradient_error(input_tensor, + in_shape, + resize_out, + out_shape, + x_init_value=x) self.assertLess(err, 1e-3) diff --git a/tensorflow/python/ops/image_ops.py b/tensorflow/python/ops/image_ops.py index 7be02b220f6..2392042d504 100644 --- a/tensorflow/python/ops/image_ops.py +++ b/tensorflow/python/ops/image_ops.py @@ -25,7 +25,8 @@ are all of variable size. If you need fixed size images, pass the output of the decode Ops to one of the cropping and resizing Ops. Note: The PNG encode and decode Ops support RGBA, but the conversions Ops -presently only support RGB, HSV, and GrayScale. +presently only support RGB, HSV, and GrayScale. Presently, the alpha channel has +to be stripped from the image and re-attached using slicing ops. @@decode_jpeg @@encode_jpeg @@ -82,6 +83,14 @@ resized_image = tf.image.resize_bilinear(image, [299, 299]) @@transpose_image +## Converting Between Colorspaces. + +Internally, images are either stored in as one `float32` per channel per pixel +(implicitly, values are assumed to lie in `[0,1)`) or one `uint8` per channel +per pixel (values are assumed to lie in `[0,255]`). + +@@convert_image_dtype + ## Image Adjustments TensorFlow provides functions to adjust images in various ways: brightness, @@ -805,3 +814,64 @@ def random_crop(image, size, seed=None, name=None): seed1, seed2 = random_seed.get_seed(seed) return gen_image_ops.random_crop(image, size, seed=seed1, seed2=seed2, name=name) + + +def convert_image_dtype(image, dtype, name=None): + """Convert `image` to `dtype`, scaling its values if needed. + + Images that are represented using floating point values are expected to have + values in the range [0,1). Image data stored in integer data types are + expected to have values in the range `[0,MAX]`, wbere `MAX` is the largest + positive representable number for the data type. + + This op converts between data types, scaling the values appropriately before + casting. + + Note that for floating point inputs, this op expects values to lie in [0,1). 
+ Conversion of an image containing values outside that range may lead to + overflow errors when converted to integer `Dtype`s. + + Args: + image: An image. + dtype: A `DType` to convert `image` to. + name: A name for this operation (optional). + + Returns: + `image`, converted to `dtype`. + """ + + if dtype == image.dtype: + return image + + with ops.op_scope([image], name, 'convert_image') as name: + # Both integer: use integer multiplication in the larger range + if image.dtype.is_integer and dtype.is_integer: + scale_in = image.dtype.max + scale_out = dtype.max + if scale_in > scale_out: + # Scaling down, scale first, then cast. The scaling factor will + # cause in.max to be mapped to above out.max but below out.max+1, + # so that the output is safely in the supported range. + scale = (scale_in + 1) // (scale_out + 1) + scaled = math_ops.div(image, scale) + return math_ops.cast(scaled, dtype) + else: + # Scaling up, cast first, then scale. The scale will not map in.max to + # out.max, but converting back and forth should result in no change. + cast = math_ops.cast(image, dtype) + scale = (scale_out + 1) // (scale_in + 1) + return math_ops.mul(cast, scale) + elif image.dtype.is_floating and dtype.is_floating: + # Both float: Just cast, no possible overflows in the allowed ranges. + return math_ops.cast(image, dtype) + else: + if image.dtype.is_integer: + # Converting to float: first cast, then scale + cast = math_ops.cast(image, dtype) + scale = 1. / image.dtype.max + return math_ops.mul(cast, scale) + else: + # Converting from float: first scale, then cast + scale = dtype.max + 0.5 # avoid rounding problems in the cast + scaled = math_ops.mul(image, scale) + return math_ops.cast(scaled, dtype) diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py index 7d315487d7a..1b7292e1e79 100644 --- a/tensorflow/python/ops/image_ops_test.py +++ b/tensorflow/python/ops/image_ops_test.py @@ -26,6 +26,7 @@ import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.python.framework import test_util +from tensorflow.python.framework import dtypes from tensorflow.python.ops import constant_op from tensorflow.python.ops import image_ops from tensorflow.python.ops import io_ops @@ -787,5 +788,46 @@ class PngTest(test_util.TensorFlowTestCase): [None, None, channels or None]) +class ConvertImageTest(test_util.TensorFlowTestCase): + + def _convert(self, original, original_dtype, output_dtype, expected): + x_np = np.array(original, dtype=original_dtype.as_numpy_dtype()) + y_np = np.array(expected, dtype=output_dtype.as_numpy_dtype()) + + with self.test_session(): + image = constant_op.constant(x_np) + y = image_ops.convert_image_dtype(image, output_dtype) + self.assertTrue(y.dtype == output_dtype) + self.assertAllClose(y.eval(), y_np, atol=1e-5) + + def testNoConvert(self): + # Make sure converting to the same data type creates no ops + with self.test_session(): + image = constant_op.constant([1], dtype=dtypes.uint8) + y = image_ops.convert_image_dtype(image, dtypes.uint8) + self.assertEquals(image, y) + + def testConvertBetweenInteger(self): + # Make sure converting to between integer types scales appropriately + with self.test_session(): + self._convert([0, 255], dtypes.uint8, dtypes.int16, [0, 255 * 128]) + self._convert([0, 32767], dtypes.int16, dtypes.uint8, [0, 255]) + + def testConvertBetweenFloat(self): + # Make sure converting to between float types does nothing interesting + with self.test_session(): + 
self._convert([-1.0, 0, 1.0, 200000], dtypes.float32, dtypes.float64, + [-1.0, 0, 1.0, 200000]) + self._convert([-1.0, 0, 1.0, 200000], dtypes.float64, dtypes.float32, + [-1.0, 0, 1.0, 200000]) + + def testConvertBetweenIntegerAndFloat(self): + # Make sure converting from and to a float type scales appropriately + with self.test_session(): + self._convert([0, 1, 255], dtypes.uint8, dtypes.float32, + [0, 1.0 / 255.0, 1]) + self._convert([0, 1.1 / 255.0, 1], dtypes.float32, dtypes.uint8, + [0, 1, 255]) + if __name__ == '__main__': googletest.main() diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py index 17160f909e0..3bd0e875631 100644 --- a/tensorflow/python/ops/nn.py +++ b/tensorflow/python/ops/nn.py @@ -16,11 +16,10 @@ # pylint: disable=wildcard-import,unused-import,g-bad-import-order """## Activation Functions -The activation ops provide different types of nonlinearities for use in -neural networks. These include smooth nonlinearities (`sigmoid`, -`tanh`, and `softplus`), continuous but not everywhere differentiable -functions (`relu`, `relu6`, and `relu_x`), and random regularization -(`dropout`). +The activation ops provide different types of nonlinearities for use in neural +networks. These include smooth nonlinearities (`sigmoid`, `tanh`, `softplus`, +and `softsign`), continuous but not everywhere differentiable functions (`relu`, +`relu6`, and `relu_x`), and random regularization (`dropout`). All activation ops apply componentwise, and produce a tensor of the same shape as the input tensor. @@ -28,6 +27,7 @@ shape as the input tensor. @@relu @@relu6 @@softplus +@@softsign @@dropout @@bias_add @@sigmoid @@ -212,12 +212,16 @@ from tensorflow.python.ops import candidate_sampling_ops from tensorflow.python.ops import constant_op from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import embedding_ops +from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_grad from tensorflow.python.ops import nn_ops from tensorflow.python.ops import numerics from tensorflow.python.ops import random_ops +from tensorflow.python.ops import rnn_cell +from tensorflow.python.ops import seq2seq from tensorflow.python.ops import sparse_ops +from tensorflow.python.ops import variable_scope as vs from tensorflow.python.ops.math_ops import sigmoid from tensorflow.python.ops.math_ops import tanh @@ -225,6 +229,7 @@ from tensorflow.python.ops.math_ops import tanh from tensorflow.python.ops.nn_ops import * from tensorflow.python.ops.candidate_sampling_ops import * from tensorflow.python.ops.embedding_ops import * +from tensorflow.python.ops.rnn import * def sigmoid_cross_entropy_with_logits(logits, targets, name=None): @@ -268,28 +273,6 @@ def sigmoid_cross_entropy_with_logits(logits, targets, name=None): name=name) -def xw_plus_b(x, weights, biases, name=None): - """Computes matmul(x, weights) + biases. - - Args: - x: a 2D tensor. Dimensions typically: batch, in_units - weights: a 2D tensor. Dimensions typically: in_units, out_units - biases: a 1D tensor. Dimensions: out_units - name: A name for the operation (optional). If not specified - "wx_plus_b" is used. - - Returns: - A 2-D Tensor computing matmul(x, weights) + biases. - Dimensions typically: batch, out_units. 
- """ - with ops.op_scope([x, weights, biases], name, "xw_plus_b") as name: - x = ops.convert_to_tensor(x, name="x") - weights = ops.convert_to_tensor(weights, name="weights") - biases = ops.convert_to_tensor(biases, name="biases") - mm = math_ops.matmul(x, weights) - return nn_ops.bias_add(mm, biases, name=name) - - def relu_layer(x, weights, biases, name=None): """Computes Relu(x * weight + biases). @@ -363,59 +346,6 @@ def zero_fraction(value, name=None): dtypes.float32)) -def dropout(x, keep_prob, noise_shape=None, seed=None, name=None): - """Computes dropout. - - With probability `keep_prob`, outputs the input element scaled up by - `1 / keep_prob`, otherwise outputs `0`. The scaling is so that the expected - sum is unchanged. - - By default, each element is kept or dropped independently. If `noise_shape` - is specified, it must be - [broadcastable](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - to the shape of `x`, and only dimensions with `noise_shape[i] == shape(x)[i]` - will make independent decisions. For example, if `shape(x) = [k, l, m, n]` - and `noise_shape = [k, 1, 1, n]`, each batch and channel component will be - kept independently and each row and column will be kept or not kept together. - - Args: - x: A tensor. - keep_prob: A scalar `Tensor` with the same type as x. The probability - that each element is kept. - noise_shape: A 1-D `Tensor` of type `int32`, representing the - shape for randomly generated keep/drop flags. - seed: A Python integer. Used to create random seeds. See - [`set_random_seed`](../../api_docs/python/constant_op.md#set_random_seed) - for behavior. - name: A name for this operation (optional). - - Returns: - A Tensor of the same shape of `x`. - - Raises: - ValueError: If `keep_prob` is not in `(0, 1]`. - """ - with ops.op_scope([x], name, "dropout") as name: - x = ops.convert_to_tensor(x, name="x") - if isinstance(keep_prob, float) and not(0 < keep_prob <= 1): - raise ValueError("keep_prob must be a scalar tensor or a float in the " - "range (0, 1], got %g" % keep_prob) - keep_prob = ops.convert_to_tensor( - keep_prob, dtype=x.dtype, name="keep_prob") - keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar()) - - noise_shape = noise_shape or array_ops.shape(x) - # uniform [keep_prob, 1.0 + keep_prob) - random_tensor = keep_prob - random_tensor += random_ops.random_uniform( - noise_shape, seed=seed, dtype=x.dtype) - # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob) - binary_tensor = math_ops.floor(random_tensor) - ret = x * math_ops.inv(keep_prob) * binary_tensor - ret.set_shape(x.get_shape()) - return ret - - def depthwise_conv2d(input, filter, strides, padding, name=None): """Depthwise 2-D convolution. @@ -672,9 +602,9 @@ def _compute_sampled_logits(weights, biases, inputs, labels, num_sampled, labels_flat = array_ops.reshape(labels, [-1]) # Sample the negative labels. 
- # sampled shape: num_sampled vector - # true_expected_count shape = [batch_size, 1] - # sampled_expected_count shape = num_sampled vector + # sampled shape: [num_sampled] tensor + # true_expected_count shape = [batch_size, 1] tensor + # sampled_expected_count shape = [num_sampled] tensor if sampled_values is None: sampled_values = candidate_sampling_ops.log_uniform_candidate_sampler( true_classes=labels, @@ -687,12 +617,18 @@ def _compute_sampled_logits(weights, biases, inputs, labels, num_sampled, sampled, true_expected_count, sampled_expected_count = sampled_values # pylint: enable=unpacking-non-sequence + # labels_flat is a [batch_size * num_true] tensor + # sampled is a [num_sampled] int tensor + all_ids = array_ops.concat(0, [labels_flat, sampled]) + # weights shape is [num_classes, dim] - # labels_flat is a [batch_size * num_true] vector + all_w = embedding_ops.embedding_lookup(weights, all_ids) + all_b = embedding_ops.embedding_lookup(biases, all_ids) # true_w shape is [batch_size * num_true, dim] - # true_b is a [batch_size * num_true] vector - true_w = embedding_ops.embedding_lookup(weights, labels_flat) - true_b = embedding_ops.embedding_lookup(biases, labels_flat) + # true_b is a [batch_size * num_true] tensor + true_w = array_ops.slice( + all_w, [0, 0], array_ops.pack([array_ops.shape(labels_flat)[0], -1])) + true_b = array_ops.slice(all_b, [0], array_ops.shape(labels_flat)) # inputs shape is [batch_size, dim] # true_w shape is [batch_size * num_true, dim] @@ -711,11 +647,11 @@ def _compute_sampled_logits(weights, biases, inputs, labels, num_sampled, true_logits += true_b # Lookup weights and biases for sampled labels. - # sampled is a num_sampled int vector # sampled_w shape is [num_sampled, dim] - # sampled_b is a num_sampled float vector - sampled_w = embedding_ops.embedding_lookup(weights, sampled) - sampled_b = embedding_ops.embedding_lookup(biases, sampled) + # sampled_b is a [num_sampled] float tensor + sampled_w = array_ops.slice( + all_w, array_ops.pack([array_ops.shape(labels_flat)[0], 0]), [-1, -1]) + sampled_b = array_ops.slice(all_b, array_ops.shape(labels_flat), [-1]) # inputs has shape [batch_size, dim] # sampled_w has shape [num_sampled, dim] @@ -740,6 +676,8 @@ def _compute_sampled_logits(weights, biases, inputs, labels, num_sampled, sampled_logits_shape = array_ops.concat( 0, [array_ops.shape(labels)[:1], array_ops.expand_dims(num_sampled, 0)]) + if sampled_logits.dtype != acc_weights.dtype: + acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype) sampled_logits += sparse_ops.sparse_to_dense( sparse_indices, sampled_logits_shape, acc_weights, 0.0) @@ -879,5 +817,5 @@ def sampled_softmax_loss(weights, biases, inputs, labels, num_sampled, remove_accidental_hits=remove_accidental_hits, name=name) sampled_losses = nn_ops.softmax_cross_entropy_with_logits(logits, labels) - # sampled_losses is a batch_size vector. + # sampled_losses is a [batch_size] tensor. 
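
A note on the `_compute_sampled_logits` hunk above: it replaces two separate `embedding_lookup` calls (one for the true ids, one for the sampled ids) with a single lookup over the concatenated id vector, followed by slicing. A minimal NumPy sketch of that pattern, purely illustrative:

```python
import numpy as np

num_classes, dim = 10, 4
weights = np.random.randn(num_classes, dim).astype(np.float32)  # [num_classes, dim]
biases = np.random.randn(num_classes).astype(np.float32)        # [num_classes]

labels_flat = np.array([2, 7, 1])   # true ids, shape [batch_size * num_true]
sampled = np.array([0, 3, 9, 5])    # sampled ids, shape [num_sampled]

# One gather over the concatenated ids ...
all_ids = np.concatenate([labels_flat, sampled])
all_w, all_b = weights[all_ids], biases[all_ids]

# ... then slice the result back into its "true" and "sampled" parts.
n_true = labels_flat.shape[0]
true_w, sampled_w = all_w[:n_true], all_w[n_true:]
true_b, sampled_b = all_b[:n_true], all_b[n_true:]

# Identical results to two separate gathers, but only one lookup touches the
# (possibly sharded) embedding.
assert np.array_equal(true_w, weights[labels_flat])
assert np.array_equal(sampled_w, weights[sampled])
```
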
return sampled_losses diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py index 7c8ad8883ca..48f57b65279 100644 --- a/tensorflow/python/ops/nn_grad.py +++ b/tensorflow/python/ops/nn_grad.py @@ -137,6 +137,11 @@ def _SoftplusGrad(op, grad): return gen_nn_ops._softplus_grad(grad, op.inputs[0]) +@ops.RegisterGradient("Softsign") +def _SoftsignGrad(op, grad): + return gen_nn_ops._softsign_grad(grad, op.inputs[0]) + + @ops.RegisterGradient("ReluGrad") def _ReluGradGrad(op, grad): x = op.inputs[1] diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 1eb8ef4c693..604739a6b6e 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -26,8 +26,11 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util +from tensorflow.python.ops import array_ops from tensorflow.python.ops import common_shapes from tensorflow.python.ops import gen_nn_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import random_ops # pylint: disable=wildcard-import from tensorflow.python.ops.gen_nn_ops import * @@ -235,11 +238,13 @@ def max_pool(value, ksize, strides, padding, name=None): ops.RegisterShape("Relu")(common_shapes.unchanged_shape) ops.RegisterShape("Relu6")(common_shapes.unchanged_shape) ops.RegisterShape("Softplus")(common_shapes.unchanged_shape) +ops.RegisterShape("Softsign")(common_shapes.unchanged_shape) @ops.RegisterShape("ReluGrad") @ops.RegisterShape("Relu6Grad") @ops.RegisterShape("SoftplusGrad") +@ops.RegisterShape("SoftsignGrad") def _BinaryElementwiseShape(op): """Returns same shape as both inputs to op. @@ -383,3 +388,81 @@ def _MaxPoolGradShape(op): """Shape function for the MaxPoolGrad op.""" orig_input_shape = op.inputs[0].get_shape().with_rank(4) return [orig_input_shape] + + +def xw_plus_b(x, weights, biases, name=None): # pylint: disable=invalid-name + """Computes matmul(x, weights) + biases. + + Args: + x: a 2D tensor. Dimensions typically: batch, in_units + weights: a 2D tensor. Dimensions typically: in_units, out_units + biases: a 1D tensor. Dimensions: out_units + name: A name for the operation (optional). If not specified + "wx_plus_b" is used. + + Returns: + A 2-D Tensor computing matmul(x, weights) + biases. + Dimensions typically: batch, out_units. + """ + with ops.op_scope([x, weights, biases], name, "xw_plus_b") as name: + x = ops.convert_to_tensor(x, name="x") + weights = ops.convert_to_tensor(weights, name="weights") + biases = ops.convert_to_tensor(biases, name="biases") + mm = math_ops.matmul(x, weights) + return bias_add(mm, biases, name=name) + + +# pylint: disable=invalid-name +def dropout(x, keep_prob, noise_shape=None, seed=None, name=None): + """Computes dropout. + + With probability `keep_prob`, outputs the input element scaled up by + `1 / keep_prob`, otherwise outputs `0`. The scaling is so that the expected + sum is unchanged. + + By default, each element is kept or dropped independently. If `noise_shape` + is specified, it must be + [broadcastable](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + to the shape of `x`, and only dimensions with `noise_shape[i] == shape(x)[i]` + will make independent decisions. For example, if `shape(x) = [k, l, m, n]` + and `noise_shape = [k, 1, 1, n]`, each batch and channel component will be + kept independently and each row and column will be kept or not kept together. 
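
To make the `noise_shape` broadcasting rule above concrete, here is a hedged NumPy sketch of the keep-mask construction (it mirrors the uniform-then-floor trick used in the implementation that follows; the helper is illustrative, not library code):

```python
import numpy as np

def dropout_np(x, keep_prob, noise_shape=None, rng=np.random):
    """NumPy sketch of the dropout keep-mask construction described above."""
    noise_shape = noise_shape if noise_shape is not None else x.shape
    # Uniform in [keep_prob, 1.0 + keep_prob); floor() turns it into a 0/1 mask.
    random_tensor = keep_prob + rng.uniform(size=noise_shape)
    binary_mask = np.floor(random_tensor)      # broadcasts against x
    return x * (1.0 / keep_prob) * binary_mask

x = np.ones((4, 3), dtype=np.float32)
# noise_shape (4, 1): each row is kept or dropped as a unit; columns move together.
print(dropout_np(x, keep_prob=0.5, noise_shape=(4, 1)))
```
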
+ + Args: + x: A tensor. + keep_prob: A scalar `Tensor` with the same type as x. The probability + that each element is kept. + noise_shape: A 1-D `Tensor` of type `int32`, representing the + shape for randomly generated keep/drop flags. + seed: A Python integer. Used to create random seeds. See + [`set_random_seed`](../../api_docs/python/constant_op.md#set_random_seed) + for behavior. + name: A name for this operation (optional). + + Returns: + A Tensor of the same shape of `x`. + + Raises: + ValueError: If `keep_prob` is not in `(0, 1]`. + """ + with ops.op_scope([x], name, "dropout") as name: + x = ops.convert_to_tensor(x, name="x") + if isinstance(keep_prob, float) and not 0 < keep_prob <= 1: + raise ValueError("keep_prob must be a scalar tensor or a float in the " + "range (0, 1], got %g" % keep_prob) + keep_prob = ops.convert_to_tensor( + keep_prob, dtype=x.dtype, name="keep_prob") + keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar()) + + noise_shape = noise_shape or array_ops.shape(x) + # uniform [keep_prob, 1.0 + keep_prob) + random_tensor = keep_prob + random_tensor += random_ops.random_uniform( + noise_shape, seed=seed, dtype=x.dtype) + # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob) + binary_tensor = math_ops.floor(random_tensor) + ret = x * math_ops.inv(keep_prob) * binary_tensor + ret.set_shape(x.get_shape()) + return ret + +# pylint: enable=invalid-name diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py index 65e28978baa..4146803c255 100644 --- a/tensorflow/python/ops/nn_test.py +++ b/tensorflow/python/ops/nn_test.py @@ -22,26 +22,17 @@ import math import tensorflow.python.platform +import tensorflow as tf import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import test_util -from tensorflow.python.kernel_tests import gradient_checker as gc -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import constant_op from tensorflow.python.ops import gen_nn_ops -from tensorflow.python.ops import gradients -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import nn -from tensorflow.python.ops import nn_grad -from tensorflow.python.platform import googletest exp = math.exp log = math.log -class SigmoidCrossEntropyWithLogitsTest(test_util.TensorFlowTestCase): +class SigmoidCrossEntropyWithLogitsTest(tf.test.TestCase): def _SigmoidCrossEntropyWithLogits(self, logits, targets): assert len(logits) == len(targets) @@ -50,28 +41,29 @@ class SigmoidCrossEntropyWithLogitsTest(test_util.TensorFlowTestCase): pred = [min(max(p, eps), 1 - eps) for p in pred] return [-z * log(y) - (1 - z) * log(1 - y) for y, z in zip(pred, targets)] - def _Inputs(self, x=None, y=None, dtype=dtypes.float64, sizes=None): + def _Inputs(self, x=None, y=None, dtype=tf.float64, sizes=None): x = [-100, -2, -2, 0, 2, 2, 2, 100] if x is None else x y = [0, 0, 1, 0, 0, 1, 0.5, 1] if y is None else y assert len(x) == len(y) sizes = sizes if sizes else [len(x)] - logits = constant_op.constant(x, shape=sizes, dtype=dtype, name="logits") - targets = constant_op.constant(y, shape=sizes, dtype=dtype, name="targets") + logits = tf.constant(x, shape=sizes, dtype=dtype, name="logits") + targets = tf.constant(y, shape=sizes, dtype=dtype, name="targets") losses = np.array(self._SigmoidCrossEntropyWithLogits(x, y)).reshape(*sizes) return logits, targets, losses def testConstructionNamed(self): with self.test_session(): 
logits, targets, _ = self._Inputs() - loss = nn.sigmoid_cross_entropy_with_logits(logits, targets, - name="mylogistic") + loss = tf.nn.sigmoid_cross_entropy_with_logits(logits, + targets, + name="mylogistic") self.assertEqual("mylogistic", loss.op.name) def testLogisticOutput(self): for use_gpu in [True, False]: with self.test_session(use_gpu=use_gpu): - logits, targets, losses = self._Inputs(dtype=dtypes.float32) - loss = nn.sigmoid_cross_entropy_with_logits(logits, targets) + logits, targets, losses = self._Inputs(dtype=tf.float32) + loss = tf.nn.sigmoid_cross_entropy_with_logits(logits, targets) np_loss = np.array(losses).astype(np.float32) tf_loss = loss.eval() self.assertAllClose(np_loss, tf_loss, atol=0.001) @@ -79,9 +71,9 @@ class SigmoidCrossEntropyWithLogitsTest(test_util.TensorFlowTestCase): def testLogisticOutputMultiDim(self): for use_gpu in [True, False]: with self.test_session(use_gpu=use_gpu): - logits, targets, losses = self._Inputs(dtype=dtypes.float32, + logits, targets, losses = self._Inputs(dtype=tf.float32, sizes=[2, 2, 2]) - loss = nn.sigmoid_cross_entropy_with_logits(logits, targets) + loss = tf.nn.sigmoid_cross_entropy_with_logits(logits, targets) np_loss = np.array(losses).astype(np.float32) tf_loss = loss.eval() self.assertAllClose(np_loss, tf_loss, atol=0.001) @@ -90,13 +82,13 @@ class SigmoidCrossEntropyWithLogitsTest(test_util.TensorFlowTestCase): sizes = [4, 2] with self.test_session(): logits, targets, _ = self._Inputs(sizes=sizes) - loss = nn.sigmoid_cross_entropy_with_logits(logits, targets) - err = gc.ComputeGradientError(logits, sizes, loss, sizes) + loss = tf.nn.sigmoid_cross_entropy_with_logits(logits, targets) + err = tf.test.compute_gradient_error(logits, sizes, loss, sizes) print("logistic loss gradient err = ", err) self.assertLess(err, 1e-7) -class ZeroFractionTest(test_util.TensorFlowTestCase): +class ZeroFractionTest(tf.test.TestCase): def _ZeroFraction(self, x): assert x.shape @@ -109,9 +101,9 @@ class ZeroFractionTest(test_util.TensorFlowTestCase): x_np = np.random.randint(0, 2, size=x_shape).astype(np.float32) y_np = self._ZeroFraction(x_np) with self.test_session(): - x_tf = constant_op.constant(x_np) + x_tf = tf.constant(x_np) x_tf.set_shape(x_shape) - y_tf = nn.zero_fraction(x_tf) + y_tf = tf.nn.zero_fraction(x_tf) y_tf_np = y_tf.eval() eps = 1e-8 self.assertAllClose(y_tf_np, y_np, eps) @@ -119,11 +111,11 @@ class ZeroFractionTest(test_util.TensorFlowTestCase): def testZeroFractionEmpty(self): with self.test_session(): x = np.zeros(0) - y = nn.zero_fraction(x).eval() + y = tf.nn.zero_fraction(x).eval() self.assertTrue(np.isnan(y)) -class SoftmaxTest(test_util.TensorFlowTestCase): +class SoftmaxTest(tf.test.TestCase): def _softmax(self, x): assert len(x.shape) == 2 @@ -137,8 +129,8 @@ class SoftmaxTest(test_util.TensorFlowTestCase): x_np = np.random.randn(*x_shape).astype(np.float32) y_np = self._softmax(x_np) with self.test_session(): - x_tf = constant_op.constant(x_np) - y_tf = nn.softmax(x_tf) + x_tf = tf.constant(x_np) + y_tf = tf.nn.softmax(x_tf) y_tf_np = y_tf.eval() eps = 1e-3 self.assertAllClose(y_tf_np, y_np, eps) @@ -147,14 +139,14 @@ class SoftmaxTest(test_util.TensorFlowTestCase): x_shape = [5, 10] x_np = np.random.randn(*x_shape).astype(np.float64) with self.test_session(): - x_tf = constant_op.constant(x_np) - y_tf = nn.softmax(x_tf) - err = gc.ComputeGradientError(x_tf, x_shape, y_tf, x_shape) + x_tf = tf.constant(x_np) + y_tf = tf.nn.softmax(x_tf) + err = tf.test.compute_gradient_error(x_tf, x_shape, y_tf, x_shape) eps = 1e-8 
self.assertLess(err, eps) -class DeConv2DTest(test_util.TensorFlowTestCase): +class DeConv2DTest(tf.test.TestCase): def testDeConv2DSingleStride(self): with self.test_session(): @@ -167,11 +159,9 @@ class DeConv2DTest(test_util.TensorFlowTestCase): # Filter: [kernel_height, kernel_width, output_depth, input_depth] f_shape = [3, 3, 2, 3] - x = constant_op.constant(1.0, shape=x_shape, name="x", - dtype=dtypes.float32) - f = constant_op.constant(1.0, shape=f_shape, name="filter", - dtype=dtypes.float32) - output = nn.deconv2d(x, f, y_shape, strides=strides, padding="SAME") + x = tf.constant(1.0, shape=x_shape, name="x", dtype=tf.float32) + f = tf.constant(1.0, shape=f_shape, name="filter", dtype=tf.float32) + output = tf.nn.deconv2d(x, f, y_shape, strides=strides, padding="SAME") value = output.eval() # We count the number of cells being added at the locations in the output. @@ -204,11 +194,9 @@ class DeConv2DTest(test_util.TensorFlowTestCase): # Filter: [kernel_height, kernel_width, output_depth, input_depth] f_shape = [3, 3, 2, 3] - x = constant_op.constant(1.0, shape=x_shape, name="x", - dtype=dtypes.float32) - f = constant_op.constant(1.0, shape=f_shape, name="filter", - dtype=dtypes.float32) - output = nn.deconv2d(x, f, y_shape, strides=strides, padding="SAME") + x = tf.constant(1.0, shape=x_shape, name="x", dtype=tf.float32) + f = tf.constant(1.0, shape=f_shape, name="filter", dtype=tf.float32) + output = tf.nn.deconv2d(x, f, y_shape, strides=strides, padding="SAME") value = output.eval() for n in xrange(x_shape[0]): @@ -236,11 +224,9 @@ class DeConv2DTest(test_util.TensorFlowTestCase): # Filter: [kernel_height, kernel_width, output_depth, input_depth] f_shape = [3, 3, 2, 3] - x = constant_op.constant(1.0, shape=x_shape, name="x", - dtype=dtypes.float32) - f = constant_op.constant(1.0, shape=f_shape, name="filter", - dtype=dtypes.float32) - output = nn.deconv2d(x, f, y_shape, strides=strides, padding="VALID") + x = tf.constant(1.0, shape=x_shape, name="x", dtype=tf.float32) + f = tf.constant(1.0, shape=f_shape, name="filter", dtype=tf.float32) + output = tf.nn.deconv2d(x, f, y_shape, strides=strides, padding="VALID") value = output.eval() cache_values = np.zeros(y_shape, dtype=np.float32) @@ -281,21 +267,22 @@ class DeConv2DTest(test_util.TensorFlowTestCase): x_val = np.random.random_sample(x_shape).astype(np.float64) f_val = np.random.random_sample(f_shape).astype(np.float64) with self.test_session(): - x = constant_op.constant(x_val, name="x", dtype=dtypes.float32) - f = constant_op.constant(f_val, name="f", dtype=dtypes.float32) - output = nn.deconv2d(x, f, y_shape, strides=strides, padding="SAME") - err = gc.ComputeGradientError([x, f], [x_shape, f_shape], output, y_shape) + x = tf.constant(x_val, name="x", dtype=tf.float32) + f = tf.constant(f_val, name="f", dtype=tf.float32) + output = tf.nn.deconv2d(x, f, y_shape, strides=strides, padding="SAME") + err = tf.test.compute_gradient_error( + [x, f], [x_shape, f_shape], output, y_shape) print("DeConv gradient err = %g " % err) err_tolerance = 0.0005 self.assertLess(err, err_tolerance) -class L2LossTest(test_util.TensorFlowTestCase): +class L2LossTest(tf.test.TestCase): def testL2Loss(self): with self.test_session(): - x = constant_op.constant([1.0, 0.0, 3.0, 2.0], shape=[2, 2], name="x") - l2loss = nn.l2_loss(x) + x = tf.constant([1.0, 0.0, 3.0, 2.0], shape=[2, 2], name="x") + l2loss = tf.nn.l2_loss(x) value = l2loss.eval() self.assertAllClose(7.0, value) @@ -304,15 +291,15 @@ class L2LossTest(test_util.TensorFlowTestCase): 
np.random.seed(1) # Make it reproducible. x_val = np.random.random_sample(x_shape).astype(np.float64) with self.test_session(): - x = constant_op.constant(x_val, name="x") - output = nn.l2_loss(x) - err = gc.ComputeGradientError(x, x_shape, output, [1]) + x = tf.constant(x_val, name="x") + output = tf.nn.l2_loss(x) + err = tf.test.compute_gradient_error(x, x_shape, output, [1]) print("L2Loss gradient err = %g " % err) err_tolerance = 1e-11 self.assertLess(err, err_tolerance) -class L2NormalizeTest(test_util.TensorFlowTestCase): +class L2NormalizeTest(tf.test.TestCase): def _l2Normalize(self, x, dim): norm = np.apply_along_axis(np.linalg.norm, dim, x) @@ -325,8 +312,8 @@ class L2NormalizeTest(test_util.TensorFlowTestCase): for dim in range(len(x_shape)): y_np = self._l2Normalize(x_np, dim) with self.test_session(): - x_tf = constant_op.constant(x_np, name="x") - y_tf = nn.l2_normalize(x_tf, dim) + x_tf = tf.constant(x_np, name="x") + y_tf = tf.nn.l2_normalize(x_tf, dim) self.assertAllClose(y_np, y_tf.eval()) def testL2NormalizeGradient(self): @@ -335,14 +322,14 @@ class L2NormalizeTest(test_util.TensorFlowTestCase): x_np = np.random.random_sample(x_shape).astype(np.float64) for dim in range(len(x_shape)): with self.test_session(): - x_tf = constant_op.constant(x_np, name="x") - y_tf = nn.l2_normalize(x_tf, dim) - err = gc.ComputeGradientError(x_tf, x_shape, y_tf, x_shape) + x_tf = tf.constant(x_np, name="x") + y_tf = tf.nn.l2_normalize(x_tf, dim) + err = tf.test.compute_gradient_error(x_tf, x_shape, y_tf, x_shape) print("L2Normalize gradient err = %g " % err) self.assertLess(err, 1e-4) -class DropoutTest(test_util.TensorFlowTestCase): +class DropoutTest(tf.test.TestCase): def testDropout(self): # Runs dropout with 0-1 tensor 10 times, sum the number of ones and validate @@ -353,10 +340,8 @@ class DropoutTest(test_util.TensorFlowTestCase): num_iter = 10 for keep_prob in [0.1, 0.5, 0.8]: with self.test_session(): - t = constant_op.constant(1.0, - shape=[x_dim, y_dim], - dtype=dtypes.float32) - dropout = nn.dropout(t, keep_prob) + t = tf.constant(1.0, shape=[x_dim, y_dim], dtype=tf.float32) + dropout = tf.nn.dropout(t, keep_prob) final_count = 0 self.assertEqual([x_dim, y_dim], dropout.get_shape()) for _ in xrange(0, num_iter): @@ -382,10 +367,8 @@ class DropoutTest(test_util.TensorFlowTestCase): num_iter = 10 for keep_prob in [0.1, 0.5, 0.8]: with self.test_session(): - t = constant_op.constant(1.0, - shape=[x_dim, y_dim], - dtype=dtypes.float32) - dropout = nn.dropout(t, keep_prob, noise_shape=[x_dim, 1]) + t = tf.constant(1.0, shape=[x_dim, y_dim], dtype=tf.float32) + dropout = tf.nn.dropout(t, keep_prob, noise_shape=[x_dim, 1]) self.assertEqual([x_dim, y_dim], dropout.get_shape()) final_count = 0 for _ in xrange(0, num_iter): @@ -408,10 +391,8 @@ class DropoutTest(test_util.TensorFlowTestCase): num_iter = 10 for keep_prob in [0.1, 0.5, 0.8]: with self.test_session(): - t = constant_op.constant(1.0, - shape=[x_dim, y_dim], - dtype=dtypes.float32) - dropout = nn.dropout(t, keep_prob, noise_shape=[x_dim, 1]) + t = tf.constant(1.0, shape=[x_dim, y_dim], dtype=tf.float32) + dropout = tf.nn.dropout(t, keep_prob, noise_shape=[x_dim, 1]) self.assertEqual([x_dim, y_dim], dropout.get_shape()) for _ in xrange(0, num_iter): value = dropout.eval() @@ -429,11 +410,9 @@ class DropoutTest(test_util.TensorFlowTestCase): num_iter = 10 for keep_prob in [0.1, 0.5, 0.8]: with self.test_session(): - t = constant_op.constant(1.0, - shape=[x_dim, y_dim], - dtype=dtypes.float32) - keep_prob_placeholder = 
array_ops.placeholder(dtypes.float32) - dropout = nn.dropout(t, keep_prob_placeholder) + t = tf.constant(1.0, shape=[x_dim, y_dim], dtype=tf.float32) + keep_prob_placeholder = tf.placeholder(tf.float32) + dropout = tf.nn.dropout(t, keep_prob_placeholder) final_count = 0 self.assertEqual([x_dim, y_dim], dropout.get_shape()) for _ in xrange(0, num_iter): @@ -453,52 +432,49 @@ class DropoutTest(test_util.TensorFlowTestCase): x_dim = 40 y_dim = 30 keep_prob = 0.5 - x = constant_op.constant(1.0, shape=[x_dim, y_dim], dtype=dtypes.float32) - dropout_x = nn.dropout( - x, keep_prob, noise_shape=array_ops.placeholder(dtypes.int32)) + x = tf.constant(1.0, shape=[x_dim, y_dim], dtype=tf.float32) + dropout_x = tf.nn.dropout(x, + keep_prob, + noise_shape=tf.placeholder(tf.int32)) self.assertEqual(x.get_shape(), dropout_x.get_shape()) def testInvalidKeepProb(self): x_dim = 40 y_dim = 30 - t = constant_op.constant(1.0, - shape=[x_dim, y_dim], - dtype=dtypes.float32) + t = tf.constant(1.0, shape=[x_dim, y_dim], dtype=tf.float32) with self.assertRaises(ValueError): - nn.dropout(t, -1.0) + tf.nn.dropout(t, -1.0) with self.assertRaises(ValueError): - nn.dropout(t, 1.1) + tf.nn.dropout(t, 1.1) with self.assertRaises(ValueError): - nn.dropout(t, [0.0, 1.0]) + tf.nn.dropout(t, [0.0, 1.0]) with self.assertRaises(ValueError): - nn.dropout(t, array_ops.placeholder(dtypes.float64)) + tf.nn.dropout(t, tf.placeholder(tf.float64)) with self.assertRaises(ValueError): - nn.dropout(t, array_ops.placeholder(dtypes.float32, shape=[2])) + tf.nn.dropout(t, tf.placeholder(tf.float32, shape=[2])) def testShapedDropoutShapeError(self): # Runs shaped dropout and verifies an error is thrown on misshapen noise. x_dim = 40 y_dim = 30 keep_prob = 0.5 - t = constant_op.constant(1.0, - shape=[x_dim, y_dim], - dtype=dtypes.float32) + t = tf.constant(1.0, shape=[x_dim, y_dim], dtype=tf.float32) with self.assertRaises(ValueError): - _ = nn.dropout(t, keep_prob, noise_shape=[x_dim, y_dim + 10]) + _ = tf.nn.dropout(t, keep_prob, noise_shape=[x_dim, y_dim + 10]) with self.assertRaises(ValueError): - _ = nn.dropout(t, keep_prob, noise_shape=[x_dim, y_dim, 5]) + _ = tf.nn.dropout(t, keep_prob, noise_shape=[x_dim, y_dim, 5]) with self.assertRaises(ValueError): - _ = nn.dropout(t, keep_prob, noise_shape=[x_dim + 3]) + _ = tf.nn.dropout(t, keep_prob, noise_shape=[x_dim + 3]) with self.assertRaises(ValueError): - _ = nn.dropout(t, keep_prob, noise_shape=[x_dim]) + _ = tf.nn.dropout(t, keep_prob, noise_shape=[x_dim]) # test that broadcasting proceeds - _ = nn.dropout(t, keep_prob, noise_shape=[y_dim]) - _ = nn.dropout(t, keep_prob, noise_shape=[1, y_dim]) - _ = nn.dropout(t, keep_prob, noise_shape=[x_dim, 1]) - _ = nn.dropout(t, keep_prob, noise_shape=[1, 1]) + _ = tf.nn.dropout(t, keep_prob, noise_shape=[y_dim]) + _ = tf.nn.dropout(t, keep_prob, noise_shape=[1, y_dim]) + _ = tf.nn.dropout(t, keep_prob, noise_shape=[x_dim, 1]) + _ = tf.nn.dropout(t, keep_prob, noise_shape=[1, 1]) -class BatchNormWithGlobalNormalizationTest(test_util.TensorFlowTestCase): +class BatchNormWithGlobalNormalizationTest(tf.test.TestCase): def _npBatchNorm(self, x, m, v, beta, gamma, epsilon, scale_after_normalization): @@ -509,7 +485,7 @@ class BatchNormWithGlobalNormalizationTest(test_util.TensorFlowTestCase): def _opsBatchNorm(self, x, m, v, beta, gamma, epsilon, scale_after_normalization): - y = (x - m) * math_ops.rsqrt(v + epsilon) + y = (x - m) * tf.rsqrt(v + epsilon) if scale_after_normalization: y = gamma * y y += beta @@ -525,14 +501,14 @@ class 
BatchNormWithGlobalNormalizationTest(test_util.TensorFlowTestCase): gamma_val = np.random.random_sample(param_shape).astype(np.float32) for use_gpu in [True, False]: with self.test_session(use_gpu=use_gpu) as sess: - x = constant_op.constant(x_val, name="x") - m = constant_op.constant(m_val, name="m") - v = constant_op.constant(v_val, name="v") - beta = constant_op.constant(beta_val, name="beta") - gamma = constant_op.constant(gamma_val, name="gamma") + x = tf.constant(x_val, name="x") + m = tf.constant(m_val, name="m") + v = tf.constant(v_val, name="v") + beta = tf.constant(beta_val, name="beta") + gamma = tf.constant(gamma_val, name="gamma") epsilon = 0.001 for scale_after_normalization in [True, False]: - bn = nn.batch_norm_with_global_normalization( + bn = tf.nn.batch_norm_with_global_normalization( x, m, v, beta, gamma, epsilon, scale_after_normalization) on = self._opsBatchNorm( x, m, v, beta, gamma, epsilon, scale_after_normalization) @@ -555,20 +531,20 @@ class BatchNormWithGlobalNormalizationTest(test_util.TensorFlowTestCase): beta_val = np.random.random_sample(param_shape).astype(np.float64) gamma_val = np.random.random_sample(param_shape).astype(np.float64) with self.test_session(): - x = constant_op.constant(x_val, name="x") - m = constant_op.constant(m_val, name="m") - v = constant_op.constant(v_val, name="v") - beta = constant_op.constant(beta_val, name="beta") - gamma = constant_op.constant(gamma_val, name="gamma") + x = tf.constant(x_val, name="x") + m = tf.constant(m_val, name="m") + v = tf.constant(v_val, name="v") + beta = tf.constant(beta_val, name="beta") + gamma = tf.constant(gamma_val, name="gamma") epsilon = 0.001 # If scale_after_normalization is False, backprop for gamma # will be 0. gamma is unchanged. - output = nn.batch_norm_with_global_normalization( + output = tf.nn.batch_norm_with_global_normalization( x, m, v, beta, gamma, epsilon, scale_after_normalization) all_params = [x, m, v, beta, gamma] all_shapes = [x_shape, param_shape, param_shape, param_shape, param_shape] - err = gc.ComputeGradientError(all_params[param_index], - all_shapes[param_index], output, x_shape) + err = tf.test.compute_gradient_error( + all_params[param_index], all_shapes[param_index], output, x_shape) print("Batch normalization %s gradient %s scale err = " % (tag, "with" if scale_after_normalization else "without"), err) self.assertLess(err, err_tolerance) @@ -606,12 +582,12 @@ class BatchNormWithGlobalNormalizationTest(test_util.TensorFlowTestCase): backprop_val = np.random.random_sample(x_shape).astype(np.float32) for use_gpu in [False, True]: with self.test_session(use_gpu=use_gpu) as sess: - x = constant_op.constant(x_val, name="x") - m = constant_op.constant(m_val, name="m") - v = constant_op.constant(v_val, name="v") - beta = constant_op.constant(beta_val, name="beta") - gamma = constant_op.constant(gamma_val, name="gamma") - backprop = constant_op.constant(backprop_val, name="backprop") + x = tf.constant(x_val, name="x") + m = tf.constant(m_val, name="m") + v = tf.constant(v_val, name="v") + beta = tf.constant(beta_val, name="beta") + gamma = tf.constant(gamma_val, name="gamma") + backprop = tf.constant(backprop_val, name="backprop") epsilon = 0.001 for scale_after_normalization in [True, False]: dx, dm, dv, db, dg = ( @@ -619,7 +595,7 @@ class BatchNormWithGlobalNormalizationTest(test_util.TensorFlowTestCase): x, m, v, gamma, backprop, epsilon, scale_after_normalization)) on = self._opsBatchNorm( x, m, v, beta, gamma, epsilon, scale_after_normalization) - odx, odm, odv, odb, 
odg = gradients.gradients( + odx, odm, odv, odb, odg = tf.gradients( [on], [x, m, v, beta, gamma], [backprop]) if scale_after_normalization: all_grads = sess.run([dx, dm, dv, db, dg, odx, odm, odv, odb, odg]) @@ -633,7 +609,7 @@ class BatchNormWithGlobalNormalizationTest(test_util.TensorFlowTestCase): all_grads[i + len(to_check)], all_grads[i], atol=0.000001) -class MomentsTest(test_util.TensorFlowTestCase): +class MomentsTest(tf.test.TestCase): def RunMomentTestWithDynamicShape(self, shape, global_norm): with self.test_session(): @@ -641,10 +617,10 @@ class MomentsTest(test_util.TensorFlowTestCase): assert len(shape) == 4 x_numpy = np.random.normal(size=shape).astype(np.float32) - x = array_ops.placeholder(dtypes.float32, shape=[None] * len(shape)) + x = tf.placeholder(tf.float32, shape=[None] * len(shape)) axes = [0, 1, 2] if global_norm else [0] - mean, var = nn.moments(x, axes) + mean, var = tf.nn.moments(x, axes) num_elements = np.prod([shape[i] for i in axes]) @@ -665,10 +641,10 @@ class MomentsTest(test_util.TensorFlowTestCase): assert len(shape) == 4 x_numpy = np.random.normal(size=shape).astype(np.float32) - x = constant_op.constant(x_numpy) + x = tf.constant(x_numpy) axes = [0, 1, 2] if global_norm else [0] - mean, var = nn.moments(x, axes) + mean, var = tf.nn.moments(x, axes) num_elements = np.prod([shape[i] for i in axes]) @@ -695,17 +671,17 @@ class MomentsTest(test_util.TensorFlowTestCase): with self.test_session(): x_shape = [3, 5, 4, 2] x_val = np.random.random_sample(x_shape).astype(np.float64) - x = constant_op.constant(x_val) + x = tf.constant(x_val) x.set_shape(x_shape) axes = [0, 1, 2] y_shape = [2] # Depth of x - out_mean, out_var = nn.moments(x, axes) + out_mean, out_var = tf.nn.moments(x, axes) if from_y == "mean": y = out_mean elif from_y == "var": y = out_var - err = gc.ComputeGradientError(x, x_shape, y, y_shape) + err = tf.test.compute_gradient_error(x, x_shape, y, y_shape) print("Moments %s gradient err = %g" % (from_y, err)) self.assertLess(err, 1e-11) @@ -716,7 +692,7 @@ class MomentsTest(test_util.TensorFlowTestCase): self._testGlobalGradient(from_y="var") -class ComputeSampledLogitsTest(test_util.TensorFlowTestCase): +class ComputeSampledLogitsTest(tf.test.TestCase): def setUp(self): self._num_classes = 5 @@ -768,18 +744,25 @@ class ComputeSampledLogitsTest(test_util.TensorFlowTestCase): name="sampled_loss_TF"): # Should be called from within a `with test_session():` block if isinstance(weights, list): - weights_tf = [constant_op.constant(shard) for shard in weights] + weights_tf = [tf.constant(shard) for shard in weights] else: - weights_tf = constant_op.constant(weights) - biases_tf = constant_op.constant(biases) - hidden_acts_tf = constant_op.constant(hidden_acts, - shape=(self._batch_size, self._dim)) - labels_tf = constant_op.constant(labels, dtype=dtypes.int64, - shape=(self._batch_size, num_true)) + weights_tf = tf.constant(weights) + biases_tf = tf.constant(biases) + hidden_acts_tf = tf.constant(hidden_acts, + shape=(self._batch_size, self._dim)) + labels_tf = tf.constant(labels, + dtype=tf.int64, + shape=(self._batch_size, num_true)) - pred_logits_tf, pred_labels_tf = nn._compute_sampled_logits( - weights_tf, biases_tf, hidden_acts_tf, labels_tf, num_sampled, - num_classes, num_true, sampled_vals, + pred_logits_tf, pred_labels_tf = tf.nn._compute_sampled_logits( + weights_tf, + biases_tf, + hidden_acts_tf, + labels_tf, + num_sampled, + num_classes, + num_true, + sampled_vals, subtract_log_q=subtract_log_q, 
remove_accidental_hits=remove_accidental_hits, name=name) @@ -942,24 +925,28 @@ class ComputeSampledLogitsTest(test_util.TensorFlowTestCase): nce_loss_np = np.sum( _SigmoidCrossEntropyWithLogits(logits_np, labels_np), 1) - labels_tf = constant_op.constant(labels, shape=(self._batch_size, 1)) - weights_tf = constant_op.constant(weights) - biases_tf = constant_op.constant(biases) - inputs_tf = constant_op.constant(hidden_acts) + labels_tf = tf.constant(labels, shape=(self._batch_size, 1)) + weights_tf = tf.constant(weights) + biases_tf = tf.constant(biases) + inputs_tf = tf.constant(hidden_acts) - nce_loss_tf = nn.nce_loss( - weights_tf, biases_tf, inputs_tf, labels_tf, - num_sampled=1, - num_classes=self._num_classes, - num_true=1, - sampled_values=test_sampled_vals) + nce_loss_tf = tf.nn.nce_loss(weights_tf, + biases_tf, + inputs_tf, + labels_tf, + num_sampled=1, + num_classes=self._num_classes, + num_true=1, + sampled_values=test_sampled_vals) self.assertAllClose(nce_loss_np, nce_loss_tf.eval(), 1e-4) # Test with sharded weights - nce_loss_tf = nn.nce_loss( - [constant_op.constant(shard) for shard in sharded_weights], - biases_tf, inputs_tf, labels_tf, + nce_loss_tf = tf.nn.nce_loss( + [tf.constant(shard) for shard in sharded_weights], + biases_tf, + inputs_tf, + labels_tf, num_sampled=1, num_classes=self._num_classes, num_true=1, @@ -996,13 +983,16 @@ class ComputeSampledLogitsTest(test_util.TensorFlowTestCase): sampled_softmax_loss_np = _SoftmaxCrossEntropyWithLogits(logits_np, labels_np) - labels_tf = constant_op.constant(labels, shape=(self._batch_size, 1)) - weights_tf = constant_op.constant(weights) - biases_tf = constant_op.constant(biases) - inputs_tf = constant_op.constant(hidden_acts) + labels_tf = tf.constant(labels, shape=(self._batch_size, 1)) + weights_tf = tf.constant(weights) + biases_tf = tf.constant(biases) + inputs_tf = tf.constant(hidden_acts) - sampled_softmax_loss_tf = nn.sampled_softmax_loss( - weights_tf, biases_tf, inputs_tf, labels_tf, + sampled_softmax_loss_tf = tf.nn.sampled_softmax_loss( + weights_tf, + biases_tf, + inputs_tf, + labels_tf, num_sampled=1, num_classes=self._num_classes, num_true=1, @@ -1013,9 +1003,11 @@ class ComputeSampledLogitsTest(test_util.TensorFlowTestCase): sampled_softmax_loss_np, sampled_softmax_loss_tf.eval(), 1e-4) # Test with sharded weights - sampled_softmax_loss_tf = nn.sampled_softmax_loss( - [constant_op.constant(shard) for shard in sharded_weights], - biases_tf, inputs_tf, labels_tf, + sampled_softmax_loss_tf = tf.nn.sampled_softmax_loss( + [tf.constant(shard) for shard in sharded_weights], + biases_tf, + inputs_tf, + labels_tf, num_sampled=1, num_classes=self._num_classes, num_true=1, @@ -1027,4 +1019,4 @@ class ComputeSampledLogitsTest(test_util.TensorFlowTestCase): if __name__ == "__main__": - googletest.main() + tf.test.main() diff --git a/tensorflow/python/ops/op_def_library.py b/tensorflow/python/ops/op_def_library.py index ad0406d43dc..c2ad3bdb582 100644 --- a/tensorflow/python/ops/op_def_library.py +++ b/tensorflow/python/ops/op_def_library.py @@ -378,14 +378,18 @@ class OpDefLibrary(object): break try: + if not input_arg.is_ref and dtype: + dtype = dtypes.as_dtype(dtype).base_dtype values = ops.convert_n_to_tensor_or_indexed_slices( values, name=input_arg.name, - dtype=dtypes.as_dtype(dtype).base_dtype if dtype else None) + dtype=dtype if dtype else None, + as_ref=input_arg.is_ref) except (TypeError, ValueError): assert dtype is not None, "Should not fail if dtype is None" assert input_arg.number_attr, "Should be 
number_attr case" # What types does the conversion function think values have? - values = ops.convert_n_to_tensor_or_indexed_slices(values) + values = ops.convert_n_to_tensor_or_indexed_slices( + values, as_ref=input_arg.is_ref) observed = ", ".join(v.dtype.base_dtype.name for v in values) prefix = ( @@ -393,11 +397,11 @@ class OpDefLibrary(object): (input_name, op_type_name, observed)) if input_arg.type != types_pb2.DT_INVALID: raise TypeError("%s that do not match expected type %s." % - (prefix, dtypes.as_dtype(dtype).name)) + (prefix, dtype.name)) elif input_arg.type_attr in attrs: raise TypeError("%s that do not match type %s inferred from " "earlier arguments." % - (prefix, dtypes.as_dtype(dtype).name)) + (prefix, dtype.name)) else: raise TypeError("%s that don't all match." % prefix) @@ -411,13 +415,14 @@ class OpDefLibrary(object): dtype = input_arg.type elif input_arg.type_attr in attrs: dtype = attrs[input_arg.type_attr] - try: values = ops.convert_to_tensor( - values, name=input_arg.name, dtype=dtype) + values, name=input_arg.name, dtype=dtype, + as_ref=input_arg.is_ref) except ValueError: # What type does convert_to_tensor think it has? - observed = ops.convert_to_tensor(values).dtype.name + observed = ops.convert_to_tensor(values, + as_ref=input_arg.is_ref).dtype.name prefix = ("Input '%s' of '%s' Op has type %s that does not match" % (input_name, op_type_name, observed)) if input_arg.type != types_pb2.DT_INVALID: diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py new file mode 100644 index 00000000000..e7d70ea79e3 --- /dev/null +++ b/tensorflow/python/ops/rnn.py @@ -0,0 +1,150 @@ +# Copyright 2015 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""RNN helpers for TensorFlow models.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import rnn_cell +from tensorflow.python.ops import variable_scope as vs + + +def rnn(cell, inputs, initial_state=None, dtype=None, + sequence_length=None, scope=None): + """Creates a recurrent neural network specified by RNNCell "cell". + + The simplest form of RNN network generated is: + state = cell.zero_state(...) + outputs = [] + states = [] + for input_ in inputs: + output, state = cell(input_, state) + outputs.append(output) + states.append(state) + return (outputs, states) + + However, a few other options are available: + + An initial state can be provided. + If sequence_length is provided, dynamic calculation is performed. + + Dynamic calculation returns, at time t: + (t >= max(sequence_length) + ? 
(zeros(output_shape), zeros(state_shape)) + : cell(input, state) + + Thus saving computational time when unrolling past the max sequence length. + + Args: + cell: An instance of RNNCell. + inputs: A length T list of inputs, each a vector with shape [batch_size]. + initial_state: (optional) An initial state for the RNN. This must be + a tensor of appropriate type and shape [batch_size x cell.state_size]. + dtype: (optional) The data type for the initial state. Required if + initial_state is not provided. + sequence_length: An int64 vector (tensor) size [batch_size]. + scope: VariableScope for the created subgraph; defaults to "RNN". + + Returns: + A pair (outputs, states) where: + outputs is a length T list of outputs (one for each input) + states is a length T list of states (one state following each input) + + Raises: + TypeError: If "cell" is not an instance of RNNCell. + ValueError: If inputs is None or an empty list. + """ + + if not isinstance(cell, rnn_cell.RNNCell): + raise TypeError("cell must be an instance of RNNCell") + if not isinstance(inputs, list): + raise TypeError("inputs must be a list") + if not inputs: + raise ValueError("inputs must not be empty") + + outputs = [] + states = [] + with vs.variable_scope(scope or "RNN"): + batch_size = array_ops.shape(inputs[0])[0] + if initial_state is not None: + state = initial_state + else: + if not dtype: + raise ValueError("If no initial_state is provided, dtype must be.") + state = cell.zero_state(batch_size, dtype) + + if sequence_length: # Prepare variables + zero_output_state = ( + array_ops.zeros(array_ops.pack([batch_size, cell.output_size]), + inputs[0].dtype), + array_ops.zeros(array_ops.pack([batch_size, cell.state_size]), + state.dtype)) + max_sequence_length = math_ops.reduce_max(sequence_length) + + for time, input_ in enumerate(inputs): + if time > 0: vs.get_variable_scope().reuse_variables() + # pylint: disable=cell-var-from-loop + def output_state(): + return cell(input_, state) + # pylint: enable=cell-var-from-loop + if sequence_length: + (output, state) = control_flow_ops.cond( + time >= max_sequence_length, + lambda: zero_output_state, output_state) + else: + (output, state) = output_state() + + outputs.append(output) + states.append(state) + + return (outputs, states) + + +def state_saving_rnn(cell, inputs, state_saver, state_name, + sequence_length=None, scope=None): + """RNN that accepts a state saver for time-truncated RNN calculation. + + Args: + cell: An instance of RNNCell. + inputs: A length T list of inputs, each a vector with shape [batch_size]. + state_saver: A state saver object with methods `state` and `save_state`. + state_name: The name to use with the state_saver. + sequence_length: (optional) An int64 vector (tensor) size [batch_size]. + See the documentation for rnn() for more details about sequence_length. + scope: VariableScope for the created subgraph; defaults to "RNN". + + Returns: + A pair (outputs, states) where: + outputs is a length T list of outputs (one for each input) + states is a length T list of states (one state following each input) + + Raises: + TypeError: If "cell" is not an instance of RNNCell. + ValueError: If inputs is None or an empty list. 
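
A brief usage sketch of the new `rnn()` helper, written against the 0.x-era API this diff targets and assuming, per the `nn.py` import changes above, that `rnn` and `rnn_cell` are re-exported under `tf.nn`. Illustrative only, not taken from the change:

```python
import tensorflow as tf

batch_size, input_size, num_units, num_steps = 32, 20, 50, 10

# rnn() unrolls the cell over a Python list of per-timestep inputs.
inputs = [tf.placeholder(tf.float32, [batch_size, input_size])
          for _ in range(num_steps)]
cell = tf.nn.rnn_cell.GRUCell(num_units)

# dtype is required here because no initial_state is given; zero_state() is used.
outputs, states = tf.nn.rnn(cell, inputs, dtype=tf.float32)

# outputs: list of num_steps tensors, each [batch_size, cell.output_size]
# states:  list of num_steps tensors, each [batch_size, cell.state_size]
```

Passing `sequence_length` instead would switch to the dynamic calculation described above, returning zeros for steps past the longest sequence in the batch.
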
+ """ + initial_state = state_saver.state(state_name) + (outputs, states) = rnn(cell, inputs, initial_state=initial_state, + sequence_length=sequence_length, scope=scope) + save_state = state_saver.save_state(state_name, states[-1]) + with ops.control_dependencies([save_state]): + outputs[-1] = array_ops.identity(outputs[-1]) + + return (outputs, states) diff --git a/tensorflow/python/ops/rnn_cell.py b/tensorflow/python/ops/rnn_cell.py new file mode 100644 index 00000000000..584849236a9 --- /dev/null +++ b/tensorflow/python/ops/rnn_cell.py @@ -0,0 +1,685 @@ +# Copyright 2015 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Module for constructing RNN Cells.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import clip_ops +from tensorflow.python.ops import embedding_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn_ops +from tensorflow.python.ops import variable_scope as vs + +from tensorflow.python.ops.math_ops import sigmoid +from tensorflow.python.ops.math_ops import tanh + + +class RNNCell(object): + """Abstract object representing an RNN cell. + + An RNN cell, in the most abstract setting, is anything that has + a state -- a vector of floats of size self.state_size -- and performs some + operation that takes inputs of size self.input_size. This operation + results in an output of size self.output_size and a new state. + + This module provides a number of basic commonly used RNN cells, such as + LSTM (Long Short Term Memory) or GRU (Gated Recurrent Unit), and a number + of operators that allow add dropouts, projections, or embeddings for inputs. + Constructing multi-layer cells is supported by a super-class, MultiRNNCell, + defined later. Every RNNCell must have the properties below and and + implement __call__ with the following signature. + """ + + def __call__(self, inputs, state, scope=None): + """Run this RNN cell on inputs, starting from the given state. + + Args: + inputs: 2D Tensor with shape [batch_size x self.input_size]. + state: 2D Tensor with shape [batch_size x self.state_size]. + scope: VariableScope for the created subgraph; defaults to class name. + + Returns: + A pair containing: + - Output: A 2D Tensor with shape [batch_size x self.output_size] + - New state: A 2D Tensor with shape [batch_size x self.state_size]. 
+ """ + raise NotImplementedError("Abstract method") + + @property + def input_size(self): + """Integer: size of inputs accepted by this cell.""" + raise NotImplementedError("Abstract method") + + @property + def output_size(self): + """Integer: size of outputs produced by this cell.""" + raise NotImplementedError("Abstract method") + + @property + def state_size(self): + """Integer: size of state used by this cell.""" + raise NotImplementedError("Abstract method") + + def zero_state(self, batch_size, dtype): + """Return state tensor (shape [batch_size x state_size]) filled with 0. + + Args: + batch_size: int, float, or unit Tensor representing the batch size. + dtype: the data type to use for the state. + + Returns: + A 2D Tensor of shape [batch_size x state_size] filled with zeros. + """ + zeros = array_ops.zeros( + array_ops.pack([batch_size, self.state_size]), dtype=dtype) + zeros.set_shape([None, self.state_size]) + return zeros + + +class BasicRNNCell(RNNCell): + """The most basic RNN cell.""" + + def __init__(self, num_units): + self._num_units = num_units + + @property + def input_size(self): + return self._num_units + + @property + def output_size(self): + return self._num_units + + @property + def state_size(self): + return self._num_units + + def __call__(self, inputs, state, scope=None): + """Most basic RNN: output = new_state = tanh(W * input + U * state + B).""" + with vs.variable_scope(scope or type(self).__name__): # "BasicRNNCell" + output = tanh(linear([inputs, state], self._num_units, True)) + return output, output + + +class GRUCell(RNNCell): + """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078).""" + + def __init__(self, num_units): + self._num_units = num_units + + @property + def input_size(self): + return self._num_units + + @property + def output_size(self): + return self._num_units + + @property + def state_size(self): + return self._num_units + + def __call__(self, inputs, state, scope=None): + """Gated recurrent unit (GRU) with nunits cells.""" + with vs.variable_scope(scope or type(self).__name__): # "GRUCell" + with vs.variable_scope("Gates"): # Reset gate and update gate. + # We start with bias of 1.0 to not reset and not udpate. + r, u = array_ops.split(1, 2, linear([inputs, state], + 2 * self._num_units, True, 1.0)) + r, u = sigmoid(r), sigmoid(u) + with vs.variable_scope("Candidate"): + c = tanh(linear([inputs, r * state], self._num_units, True)) + new_h = u * state + (1 - u) * c + return new_h, new_h + + +class BasicLSTMCell(RNNCell): + """Basic LSTM recurrent network cell. + + The implementation is based on: http://arxiv.org/pdf/1409.2329v5.pdf. + + It does not allow cell clipping, a projection layer, and does not + use peep-hole connections: it is the basic baseline. + + Biases of the forget gate are initialized by default to 1 in order to reduce + the scale of forgetting in the beginning of the training. + """ + + def __init__(self, num_units, forget_bias=1.0): + self._num_units = num_units + self._forget_bias = forget_bias + + @property + def input_size(self): + return self._num_units + + @property + def output_size(self): + return self._num_units + + @property + def state_size(self): + return 2 * self._num_units + + def __call__(self, inputs, state, scope=None): + """Long short-term memory cell (LSTM).""" + with vs.variable_scope(scope or type(self).__name__): # "BasicLSTMCell" + # Parameters of gates are concatenated into one multiply for efficiency. 
+ c, h = array_ops.split(1, 2, state) + concat = linear([inputs, h], 4 * self._num_units, True) + + # i = input_gate, j = new_input, f = forget_gate, o = output_gate + i, j, f, o = array_ops.split(1, 4, concat) + + new_c = c * sigmoid(f + self._forget_bias) + sigmoid(i) * tanh(j) + new_h = tanh(new_c) * sigmoid(o) + + return new_h, array_ops.concat(1, [new_c, new_h]) + + +class LSTMCell(RNNCell): + """Long short-term memory unit (LSTM) recurrent network cell. + + This implementation is based on: + + https://research.google.com/pubs/archive/43905.pdf + + Hasim Sak, Andrew Senior, and Francoise Beaufays. + "Long short-term memory recurrent neural network architectures for + large scale acoustic modeling." INTERSPEECH, 2014. + + It uses peep-hole connections, optional cell clipping, and an optional + projection layer. + """ + + def __init__(self, num_units, input_size, + use_peepholes=False, cell_clip=None, + initializer=None, num_proj=None, + num_unit_shards=1, num_proj_shards=1): + """Initialize the parameters for an LSTM cell. + + Args: + num_units: int, The number of units in the LSTM cell + input_size: int, The dimensionality of the inputs into the LSTM cell + use_peepholes: bool, set True to enable diagonal/peephole connections. + cell_clip: (optional) A float value, if provided the cell state is clipped + by this value prior to the cell output activation. + initializer: (optional) The initializer to use for the weight and + projection matrices. + num_proj: (optional) int, The output dimensionality for the projection + matrices. If None, no projection is performed. + num_unit_shards: How to split the weight matrix. If >1, the weight + matrix is stored across num_unit_shards. + Note that num_unit_shards must evenly divide num_units * 4. + num_proj_shards: How to split the projection matrix. If >1, the + projection matrix is stored across num_proj_shards. + Note that num_proj_shards must evenly divide num_proj + (if num_proj is not None). + + Raises: + ValueError: if num_unit_shards doesn't divide 4 * num_units or + num_proj_shards doesn't divide num_proj + """ + self._num_units = num_units + self._input_size = input_size + self._use_peepholes = use_peepholes + self._cell_clip = cell_clip + self._initializer = initializer + self._num_proj = num_proj + self._num_unit_shards = num_unit_shards + self._num_proj_shards = num_proj_shards + + if (num_units * 4) % num_unit_shards != 0: + raise ValueError("num_unit_shards must evently divide 4 * num_units") + if num_proj and num_proj % num_proj_shards != 0: + raise ValueError("num_proj_shards must evently divide num_proj") + + if num_proj: + self._state_size = num_units + num_proj + self._output_size = num_proj + else: + self._state_size = 2 * num_units + self._output_size = num_units + + @property + def input_size(self): + return self._input_size + + @property + def output_size(self): + return self._output_size + + @property + def state_size(self): + return self._state_size + + def __call__(self, input_, state, scope=None): + """Run one step of LSTM. + + Args: + input_: input Tensor, 2D, batch x num_units. + state: state Tensor, 2D, batch x state_size. + scope: VariableScope for the created subgraph; defaults to "LSTMCell". + + Returns: + A tuple containing: + - A 2D, batch x output_dim, Tensor representing the output of the LSTM + after reading "input_" when previous state was "state". + Here output_dim is: + num_proj if num_proj was set, + num_units otherwise. 
+ - A 2D, batch x state_size, Tensor representing the new state of LSTM + after reading "input_" when previous state was "state". + """ + num_proj = self._num_units if self._num_proj is None else self._num_proj + + c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units]) + m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj]) + + dtype = input_.dtype + + unit_shard_size = (4 * self._num_units) // self._num_unit_shards + + with vs.variable_scope(scope or type(self).__name__): # "LSTMCell" + w = array_ops.concat( + 1, + [vs.get_variable("W_%d" % i, + shape=[self.input_size + num_proj, unit_shard_size], + initializer=self._initializer, + dtype=dtype) for i in xrange(self._num_unit_shards)]) + + b = vs.get_variable( + "B", shape=[4 * self._num_units], + initializer=array_ops.zeros_initializer, dtype=dtype) + + # i = input_gate, j = new_input, f = forget_gate, o = output_gate + cell_inputs = array_ops.concat(1, [input_, m_prev]) + i, j, f, o = array_ops.split( + 1, 4, nn_ops.bias_add(math_ops.matmul(cell_inputs, w), b)) + + # Diagonal connections + if self._use_peepholes: + w_f_diag = vs.get_variable( + "W_F_diag", shape=[self._num_units], dtype=dtype) + w_i_diag = vs.get_variable( + "W_I_diag", shape=[self._num_units], dtype=dtype) + w_o_diag = vs.get_variable( + "W_O_diag", shape=[self._num_units], dtype=dtype) + + if self._use_peepholes: + c = (sigmoid(f + 1 + w_f_diag * c_prev) * c_prev + + sigmoid(i + w_i_diag * c_prev) * tanh(j)) + else: + c = (sigmoid(f + 1) * c_prev + sigmoid(i) * tanh(j)) + + if self._cell_clip is not None: + c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip) + + if self._use_peepholes: + m = sigmoid(o + w_o_diag * c) * tanh(c) + else: + m = sigmoid(o) * tanh(c) + + if self._num_proj is not None: + proj_shard_size = self._num_proj // self._num_proj_shards + w_proj = array_ops.concat( + 1, + [vs.get_variable("W_P_%d" % i, + shape=[self._num_units, proj_shard_size], + initializer=self._initializer, + dtype=dtype) + for i in xrange(self._num_proj_shards)]) + # TODO(ebrevdo), use matmulsum + m = math_ops.matmul(m, w_proj) + + return m, array_ops.concat(1, [c, m]) + + +class OutputProjectionWrapper(RNNCell): + """Operator adding an output projection to the given cell. + + Note: in many cases it may be more efficient to not use this wrapper, + but instead concatenate the whole sequence of your outputs in time, + do the projection on this batch-concated sequence, then split it + if needed or directly feed into a softmax. + """ + + def __init__(self, cell, output_size): + """Create a cell with output projection. + + Args: + cell: an RNNCell, a projection to output_size is added to it. + output_size: integer, the size of the output after projection. + + Raises: + TypeError: if cell is not an RNNCell. + ValueError: if output_size is not positive. + """ + if not isinstance(cell, RNNCell): + raise TypeError("The parameter cell is not RNNCell.") + if output_size < 1: + raise ValueError("Parameter output_size must be > 0: %d." 
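The wrapper above adds a learned linear projection on top of the wrapped cell's output while leaving the state untouched; a hedged usage sketch (import path assumed). As the docstring notes, projecting the time-concatenated outputs once is often cheaper than projecting every step.

```python
from tensorflow.python.ops import rnn_cell

base = rnn_cell.GRUCell(256)
cell = rnn_cell.OutputProjectionWrapper(base, output_size=10000)
# Calling cell(inputs, state) returns (projection of the GRU output, unchanged GRU state).
assert cell.output_size == 10000 and cell.state_size == base.state_size
```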
% output_size) + self._cell = cell + self._output_size = output_size + + @property + def input_size(self): + return self._cell.input_size + + @property + def output_size(self): + return self._output_size + + @property + def state_size(self): + return self._cell.state_size + + def __call__(self, inputs, state, scope=None): + """Run the cell and output projection on inputs, starting from state.""" + output, res_state = self._cell(inputs, state) + # Default scope: "OutputProjectionWrapper" + with vs.variable_scope(scope or type(self).__name__): + projected = linear(output, self._output_size, True) + return projected, res_state + + +class InputProjectionWrapper(RNNCell): + """Operator adding an input projection to the given cell. + + Note: in many cases it may be more efficient to not use this wrapper, + but instead concatenate the whole sequence of your inputs in time, + do the projection on this batch-concated sequence, then split it. + """ + + def __init__(self, cell, input_size): + """Create a cell with input projection. + + Args: + cell: an RNNCell, a projection of inputs is added before it. + input_size: integer, the size of the inputs before projection. + + Raises: + TypeError: if cell is not an RNNCell. + ValueError: if input_size is not positive. + """ + if not isinstance(cell, RNNCell): + raise TypeError("The parameter cell is not RNNCell.") + if input_size < 1: + raise ValueError("Parameter input_size must be > 0: %d." % input_size) + self._cell = cell + self._input_size = input_size + + @property + def input_size(self): + return self._input_size + + @property + def output_size(self): + return self._cell.output_size + + @property + def state_size(self): + return self._cell.state_size + + def __call__(self, inputs, state, scope=None): + """Run the input projection and then the cell.""" + # Default scope: "InputProjectionWrapper" + with vs.variable_scope(scope or type(self).__name__): + projected = linear(inputs, self._cell.input_size, True) + return self._cell(projected, state) + + +class DropoutWrapper(RNNCell): + """Operator adding dropout to inputs and outputs of the given cell.""" + + def __init__(self, cell, input_keep_prob=1.0, output_keep_prob=1.0, + seed=None): + """Create a cell with added input and/or output dropout. + + Dropout is never used on the state. + + Args: + cell: an RNNCell, a projection to output_size is added to it. + input_keep_prob: unit Tensor or float between 0 and 1, input keep + probability; if it is float and 1, no input dropout will be added. + output_keep_prob: unit Tensor or float between 0 and 1, output keep + probability; if it is float and 1, no output dropout will be added. + seed: (optional) integer, the randomness seed. + + Raises: + TypeError: if cell is not an RNNCell. + ValueError: if keep_prob is not between 0 and 1. 
+ """ + if not isinstance(cell, RNNCell): + raise TypeError("The parameter cell is not a RNNCell.") + if (isinstance(input_keep_prob, float) and + not (input_keep_prob >= 0.0 and input_keep_prob <= 1.0)): + raise ValueError("Parameter input_keep_prob must be between 0 and 1: %d" + % input_keep_prob) + if (isinstance(output_keep_prob, float) and + not (output_keep_prob >= 0.0 and output_keep_prob <= 1.0)): + raise ValueError("Parameter input_keep_prob must be between 0 and 1: %d" + % output_keep_prob) + self._cell = cell + self._input_keep_prob = input_keep_prob + self._output_keep_prob = output_keep_prob + self._seed = seed + + @property + def input_size(self): + return self._cell.input_size + + @property + def output_size(self): + return self._cell.output_size + + @property + def state_size(self): + return self._cell.state_size + + def __call__(self, inputs, state): + """Run the cell with the declared dropouts.""" + if (not isinstance(self._input_keep_prob, float) or + self._input_keep_prob < 1): + inputs = nn_ops.dropout(inputs, self._input_keep_prob, seed=self._seed) + output, new_state = self._cell(inputs, state) + if (not isinstance(self._output_keep_prob, float) or + self._output_keep_prob < 1): + output = nn_ops.dropout(output, self._output_keep_prob, seed=self._seed) + return output, new_state + + +class EmbeddingWrapper(RNNCell): + """Operator adding input embedding to the given cell. + + Note: in many cases it may be more efficient to not use this wrapper, + but instead concatenate the whole sequence of your inputs in time, + do the embedding on this batch-concated sequence, then split it and + feed into your RNN. + """ + + def __init__(self, cell, embedding_classes=0, embedding=None, + initializer=None): + """Create a cell with an added input embedding. + + Args: + cell: an RNNCell, an embedding will be put before its inputs. + embedding_classes: integer, how many symbols will be embedded. + embedding: Variable, the embedding to use; if None, a new embedding + will be created; if set, then embedding_classes is not required. + initializer: an initializer to use when creating the embedding; + if None, the initializer from variable scope or a default one is used. + + Raises: + TypeError: if cell is not an RNNCell. + ValueError: if embedding_classes is not positive. + """ + if not isinstance(cell, RNNCell): + raise TypeError("The parameter cell is not RNNCell.") + if embedding_classes < 1 and embedding is None: + raise ValueError("Pass embedding or embedding_classes must be > 0: %d." + % embedding_classes) + if embedding_classes > 0 and embedding is not None: + if embedding.size[0] != embedding_classes: + raise ValueError("You declared embedding_classes=%d but passed an " + "embedding for %d classes." % (embedding.size[0], + embedding_classes)) + if embedding.size[1] != cell.input_size: + raise ValueError("You passed embedding with output size %d and a cell" + " that accepts size %d." 
% (embedding.size[1], + cell.input_size)) + self._cell = cell + self._embedding_classes = embedding_classes + self._embedding = embedding + self._initializer = initializer + + @property + def input_size(self): + return 1 + + @property + def output_size(self): + return self._cell.output_size + + @property + def state_size(self): + return self._cell.state_size + + def __call__(self, inputs, state, scope=None): + """Run the cell on embedded inputs.""" + with vs.variable_scope(scope or type(self).__name__): # "EmbeddingWrapper" + with ops.device("/cpu:0"): + if self._embedding: + embedding = self._embedding + else: + if self._initializer: + initializer = self._initializer + elif vs.get_variable_scope().initializer: + initializer = vs.get_variable_scope().initializer + else: + # Default initializer for embeddings should have variance=1. + sqrt3 = math.sqrt(3) # Uniform(-sqrt(3), sqrt(3)) has variance=1. + initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3) + embedding = vs.get_variable("embedding", [self._embedding_classes, + self._cell.input_size], + initializer=initializer) + embedded = embedding_ops.embedding_lookup( + embedding, array_ops.reshape(inputs, [-1])) + return self._cell(embedded, state) + + +class MultiRNNCell(RNNCell): + """RNN cell composed sequentially of multiple simple cells.""" + + def __init__(self, cells): + """Create a RNN cell composed sequentially of a number of RNNCells. + + Args: + cells: list of RNNCells that will be composed in this order. + + Raises: + ValueError: if cells is empty (not allowed) or if their sizes don't match. + """ + if not cells: + raise ValueError("Must specify at least one cell for MultiRNNCell.") + for i in xrange(len(cells) - 1): + if cells[i + 1].input_size != cells[i].output_size: + raise ValueError("In MultiRNNCell, the input size of each next" + " cell must match the output size of the previous one." + " Mismatched output size in cell %d." % i) + self._cells = cells + + @property + def input_size(self): + return self._cells[0].input_size + + @property + def output_size(self): + return self._cells[-1].output_size + + @property + def state_size(self): + return sum([cell.state_size for cell in self._cells]) + + def __call__(self, inputs, state, scope=None): + """Run this multi-layer cell on inputs, starting from state.""" + with vs.variable_scope(scope or type(self).__name__): # "MultiRNNCell" + cur_state_pos = 0 + cur_inp = inputs + new_states = [] + for i, cell in enumerate(self._cells): + with vs.variable_scope("Cell%d" % i): + cur_state = array_ops.slice( + state, [0, cur_state_pos], [-1, cell.state_size]) + cur_state_pos += cell.state_size + cur_inp, new_state = cell(cur_inp, cur_state) + new_states.append(new_state) + return cur_inp, array_ops.concat(1, new_states) + + +def linear(args, output_size, bias, bias_start=0.0, scope=None): + """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. + + Args: + args: a 2D Tensor or a list of 2D, batch x n, Tensors. + output_size: int, second dimension of W[i]. + bias: boolean, whether to add a bias term or not. + bias_start: starting value to initialize the bias; 0 by default. + scope: VariableScope for the created subgraph; defaults to "Linear". + + Returns: + A 2D Tensor with shape [batch x output_size] equal to + sum_i(args[i] * W[i]), where W[i]s are newly created matrices. + + Raises: + ValueError: if some of the arguments has unspecified or wrong shape. 
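Since every cell exposes the same input_size/output_size/state_size interface, the wrappers and MultiRNNCell compose freely; a hedged sketch of a stacked cell (import path assumed):

```python
from tensorflow.python.ops import rnn_cell

layers = [rnn_cell.BasicLSTMCell(512) for _ in range(3)]
stacked = rnn_cell.MultiRNNCell(layers)
# Per-layer states are concatenated along dimension 1, so:
assert stacked.state_size == sum(c.state_size for c in layers)  # 3 * 2 * 512
assert stacked.output_size == layers[-1].output_size            # 512
```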
+ """ + assert args + if not isinstance(args, (list, tuple)): + args = [args] + + # Calculate the total size of arguments on dimension 1. + total_arg_size = 0 + shapes = [a.get_shape().as_list() for a in args] + for shape in shapes: + if len(shape) != 2: + raise ValueError("Linear is expecting 2D arguments: %s" % str(shapes)) + if not shape[1]: + raise ValueError("Linear expects shape[1] of arguments: %s" % str(shapes)) + else: + total_arg_size += shape[1] + + # Now the computation. + with vs.variable_scope(scope or "Linear"): + matrix = vs.get_variable("Matrix", [total_arg_size, output_size]) + if len(args) == 1: + res = math_ops.matmul(args[0], matrix) + else: + res = math_ops.matmul(array_ops.concat(1, args), matrix) + if not bias: + return res + bias_term = vs.get_variable( + "Bias", [output_size], + initializer=init_ops.constant_initializer(bias_start)) + return res + bias_term diff --git a/tensorflow/python/ops/seq2seq.py b/tensorflow/python/ops/seq2seq.py new file mode 100644 index 00000000000..131524b77c5 --- /dev/null +++ b/tensorflow/python/ops/seq2seq.py @@ -0,0 +1,784 @@ +# Copyright 2015 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Library for creating sequence-to-sequence models.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import embedding_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn_ops +from tensorflow.python.ops import rnn +from tensorflow.python.ops import rnn_cell +from tensorflow.python.ops import sparse_ops +from tensorflow.python.ops import variable_scope as vs + + +def rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None, + scope=None): + """RNN decoder for the sequence-to-sequence model. + + Args: + decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. + initial_state: 2D Tensor with shape [batch_size x cell.state_size]. + cell: rnn_cell.RNNCell defining the cell function and size. + loop_function: if not None, this function will be applied to i-th output + in order to generate i+1-th input, and decoder_inputs will be ignored, + except for the first element ("GO" symbol). This can be used for decoding, + but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf. + Signature -- loop_function(prev, i) = next + * prev is a 2D Tensor of shape [batch_size x cell.output_size], + * i is an integer, the step number (when advanced control is needed), + * next is a 2D Tensor of shape [batch_size x cell.input_size]. + scope: VariableScope for the created subgraph; defaults to "rnn_decoder". 
+ + Returns: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x cell.output_size] containing generated outputs. + states: The state of each cell in each time-step. This is a list with + length len(decoder_inputs) -- one item for each time-step. + Each item is a 2D Tensor of shape [batch_size x cell.state_size]. + (Note that in some cases, like basic RNN cell or GRU cell, outputs and + states can be the same. They are different for LSTM cells though.) + """ + with vs.variable_scope(scope or "rnn_decoder"): + states = [initial_state] + outputs = [] + prev = None + for i in xrange(len(decoder_inputs)): + inp = decoder_inputs[i] + if loop_function is not None and prev is not None: + with vs.variable_scope("loop_function", reuse=True): + # We do not propagate gradients over the loop function. + inp = array_ops.stop_gradient(loop_function(prev, i)) + if i > 0: + vs.get_variable_scope().reuse_variables() + output, new_state = cell(inp, states[-1]) + outputs.append(output) + states.append(new_state) + if loop_function is not None: + prev = array_ops.stop_gradient(output) + return outputs, states + + +def basic_rnn_seq2seq( + encoder_inputs, decoder_inputs, cell, dtype=dtypes.float32, scope=None): + """Basic RNN sequence-to-sequence model. + + This model first runs an RNN to encode encoder_inputs into a state vector, and + then runs decoder, initialized with the last encoder state, on decoder_inputs. + Encoder and decoder use the same RNN cell type, but don't share parameters. + + Args: + encoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. + decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. + cell: rnn_cell.RNNCell defining the cell function and size. + dtype: The dtype of the initial state of the RNN cell (default: tf.float32). + scope: VariableScope for the created subgraph; default: "basic_rnn_seq2seq". + + Returns: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x cell.output_size] containing the generated outputs. + states: The state of each decoder cell in each time-step. This is a list + with length len(decoder_inputs) -- one item for each time-step. + Each item is a 2D Tensor of shape [batch_size x cell.state_size]. + """ + with vs.variable_scope(scope or "basic_rnn_seq2seq"): + _, enc_states = rnn.rnn(cell, encoder_inputs, dtype=dtype) + return rnn_decoder(decoder_inputs, enc_states[-1], cell) + + +def tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, + loop_function=None, dtype=dtypes.float32, scope=None): + """RNN sequence-to-sequence model with tied encoder and decoder parameters. + + This model first runs an RNN to encode encoder_inputs into a state vector, and + then runs decoder, initialized with the last encoder state, on decoder_inputs. + Encoder and decoder use the same RNN cell and share parameters. + + Args: + encoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. + decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. + cell: rnn_cell.RNNCell defining the cell function and size. + loop_function: if not None, this function will be applied to i-th output + in order to generate i+1-th input, and decoder_inputs will be ignored, + except for the first element ("GO" symbol), see rnn_decoder for details. + dtype: The dtype of the initial state of the rnn cell (default: tf.float32). + scope: VariableScope for the created subgraph; default: "tied_rnn_seq2seq". 
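A hedged end-to-end sketch of basic_rnn_seq2seq as defined above (import paths, placeholder shapes, and sequence lengths are illustrative):

```python
import tensorflow as tf
from tensorflow.python.ops import rnn_cell
from tensorflow.python.ops import seq2seq

cell = rnn_cell.GRUCell(128)
enc_inp = [tf.placeholder(tf.float32, [None, 128]) for _ in range(5)]
dec_inp = [tf.placeholder(tf.float32, [None, 128]) for _ in range(6)]
outputs, states = seq2seq.basic_rnn_seq2seq(enc_inp, dec_inp, cell)
# outputs: one [batch, 128] Tensor per decoder step; encoder and decoder do not share weights.
```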
+ + Returns: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x cell.output_size] containing the generated outputs. + states: The state of each decoder cell in each time-step. This is a list + with length len(decoder_inputs) -- one item for each time-step. + Each item is a 2D Tensor of shape [batch_size x cell.state_size]. + """ + with vs.variable_scope("combined_tied_rnn_seq2seq"): + scope = scope or "tied_rnn_seq2seq" + _, enc_states = rnn.rnn( + cell, encoder_inputs, dtype=dtype, scope=scope) + vs.get_variable_scope().reuse_variables() + return rnn_decoder(decoder_inputs, enc_states[-1], cell, + loop_function=loop_function, scope=scope) + + +def embedding_rnn_decoder(decoder_inputs, initial_state, cell, num_symbols, + output_projection=None, feed_previous=False, + scope=None): + """RNN decoder with embedding and a pure-decoding option. + + Args: + decoder_inputs: a list of 1D batch-sized int32-Tensors (decoder inputs). + initial_state: 2D Tensor [batch_size x cell.state_size]. + cell: rnn_cell.RNNCell defining the cell function. + num_symbols: integer, how many symbols come into the embedding. + output_projection: None or a pair (W, B) of output projection weights and + biases; W has shape [cell.output_size x num_symbols] and B has + shape [num_symbols]; if provided and feed_previous=True, each fed + previous output will first be multiplied by W and added B. + feed_previous: Boolean; if True, only the first of decoder_inputs will be + used (the "GO" symbol), and all other decoder inputs will be generated by: + next = embedding_lookup(embedding, argmax(previous_output)), + In effect, this implements a greedy decoder. It can also be used + during training to emulate http://arxiv.org/pdf/1506.03099v2.pdf. + If False, decoder_inputs are used as given (the standard decoder case). + scope: VariableScope for the created subgraph; defaults to + "embedding_rnn_decoder". + + Returns: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x cell.output_size] containing the generated outputs. + states: The state of each decoder cell in each time-step. This is a list + with length len(decoder_inputs) -- one item for each time-step. + Each item is a 2D Tensor of shape [batch_size x cell.state_size]. + + Raises: + ValueError: when output_projection has the wrong shape. 
+ """ + if output_projection is not None: + proj_weights = ops.convert_to_tensor( + output_projection[0], dtype=dtypes.float32) + proj_weights.get_shape().assert_is_compatible_with([cell.output_size, + num_symbols]) + proj_biases = ops.convert_to_tensor( + output_projection[1], dtype=dtypes.float32) + proj_biases.get_shape().assert_is_compatible_with([num_symbols]) + + with vs.variable_scope(scope or "embedding_rnn_decoder"): + with ops.device("/cpu:0"): + embedding = vs.get_variable("embedding", [num_symbols, cell.input_size]) + + def extract_argmax_and_embed(prev, _): + """Loop_function that extracts the symbol from prev and embeds it.""" + if output_projection is not None: + prev = nn_ops.xw_plus_b( + prev, output_projection[0], output_projection[1]) + prev_symbol = array_ops.stop_gradient(math_ops.argmax(prev, 1)) + return embedding_ops.embedding_lookup(embedding, prev_symbol) + + loop_function = None + if feed_previous: + loop_function = extract_argmax_and_embed + + emb_inp = [ + embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs] + return rnn_decoder(emb_inp, initial_state, cell, + loop_function=loop_function) + + +def embedding_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, + num_encoder_symbols, num_decoder_symbols, + output_projection=None, feed_previous=False, + dtype=dtypes.float32, scope=None): + """Embedding RNN sequence-to-sequence model. + + This model first embeds encoder_inputs by a newly created embedding (of shape + [num_encoder_symbols x cell.input_size]). Then it runs an RNN to encode + embedded encoder_inputs into a state vector. Next, it embeds decoder_inputs + by another newly created embedding (of shape [num_decoder_symbols x + cell.input_size]). Then it runs RNN decoder, initialized with the last + encoder state, on embedded decoder_inputs. + + Args: + encoder_inputs: a list of 1D int32-Tensors of shape [batch_size]. + decoder_inputs: a list of 1D int32-Tensors of shape [batch_size]. + cell: rnn_cell.RNNCell defining the cell function and size. + num_encoder_symbols: integer; number of symbols on the encoder side. + num_decoder_symbols: integer; number of symbols on the decoder side. + output_projection: None or a pair (W, B) of output projection weights and + biases; W has shape [cell.output_size x num_decoder_symbols] and B has + shape [num_decoder_symbols]; if provided and feed_previous=True, each + fed previous output will first be multiplied by W and added B. + feed_previous: Boolean or scalar Boolean Tensor; if True, only the first + of decoder_inputs will be used (the "GO" symbol), and all other decoder + inputs will be taken from previous outputs (as in embedding_rnn_decoder). + If False, decoder_inputs are used as given (the standard decoder case). + dtype: The dtype of the initial state for both the encoder and encoder + rnn cells (default: tf.float32). + scope: VariableScope for the created subgraph; defaults to + "embedding_rnn_seq2seq" + + Returns: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x num_decoder_symbols] containing the generated outputs. + states: The state of each decoder cell in each time-step. This is a list + with length len(decoder_inputs) -- one item for each time-step. + Each item is a 2D Tensor of shape [batch_size x cell.state_size]. + """ + with vs.variable_scope(scope or "embedding_rnn_seq2seq"): + # Encoder. 
+ encoder_cell = rnn_cell.EmbeddingWrapper(cell, num_encoder_symbols) + _, encoder_states = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype) + + # Decoder. + if output_projection is None: + cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) + + if isinstance(feed_previous, bool): + return embedding_rnn_decoder(decoder_inputs, encoder_states[-1], cell, + num_decoder_symbols, output_projection, + feed_previous) + else: # If feed_previous is a Tensor, we construct 2 graphs and use cond. + outputs1, states1 = embedding_rnn_decoder( + decoder_inputs, encoder_states[-1], cell, num_decoder_symbols, + output_projection, True) + vs.get_variable_scope().reuse_variables() + outputs2, states2 = embedding_rnn_decoder( + decoder_inputs, encoder_states[-1], cell, num_decoder_symbols, + output_projection, False) + + outputs = control_flow_ops.cond(feed_previous, + lambda: outputs1, lambda: outputs2) + states = control_flow_ops.cond(feed_previous, + lambda: states1, lambda: states2) + return outputs, states + + +def embedding_tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, + num_symbols, output_projection=None, + feed_previous=False, dtype=dtypes.float32, + scope=None): + """Embedding RNN sequence-to-sequence model with tied (shared) parameters. + + This model first embeds encoder_inputs by a newly created embedding (of shape + [num_symbols x cell.input_size]). Then it runs an RNN to encode embedded + encoder_inputs into a state vector. Next, it embeds decoder_inputs using + the same embedding. Then it runs RNN decoder, initialized with the last + encoder state, on embedded decoder_inputs. + + Args: + encoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. + decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. + cell: rnn_cell.RNNCell defining the cell function and size. + num_symbols: integer; number of symbols for both encoder and decoder. + output_projection: None or a pair (W, B) of output projection weights and + biases; W has shape [cell.output_size x num_symbols] and B has + shape [num_symbols]; if provided and feed_previous=True, each + fed previous output will first be multiplied by W and added B. + feed_previous: Boolean or scalar Boolean Tensor; if True, only the first + of decoder_inputs will be used (the "GO" symbol), and all other decoder + inputs will be taken from previous outputs (as in embedding_rnn_decoder). + If False, decoder_inputs are used as given (the standard decoder case). + dtype: The dtype to use for the initial RNN states (default: tf.float32). + scope: VariableScope for the created subgraph; defaults to + "embedding_tied_rnn_seq2seq". + + Returns: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x num_decoder_symbols] containing the generated outputs. + states: The state of each decoder cell in each time-step. This is a list + with length len(decoder_inputs) -- one item for each time-step. + Each item is a 2D Tensor of shape [batch_size x cell.state_size]. + + Raises: + ValueError: when output_projection has the wrong shape. 
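For the embedding models the inputs are lists of int32 symbol-id Tensors rather than embedded vectors; a hedged usage sketch of embedding_rnn_seq2seq (paths, vocabulary sizes, and lengths are illustrative):

```python
import tensorflow as tf
from tensorflow.python.ops import rnn_cell
from tensorflow.python.ops import seq2seq

cell = rnn_cell.GRUCell(64)
enc_inp = [tf.placeholder(tf.int32, [None]) for _ in range(5)]
dec_inp = [tf.placeholder(tf.int32, [None]) for _ in range(6)]
outputs, states = seq2seq.embedding_rnn_seq2seq(
    enc_inp, dec_inp, cell, num_encoder_symbols=10000, num_decoder_symbols=10000,
    feed_previous=False)  # True would decode greedily from the "GO" symbol
# outputs: one [batch, 10000] Tensor per decoder step (OutputProjectionWrapper is added
# automatically because output_projection is None).
```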
+ """ + if output_projection is not None: + proj_weights = ops.convert_to_tensor(output_projection[0], dtype=dtype) + proj_weights.get_shape().assert_is_compatible_with([cell.output_size, + num_symbols]) + proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) + proj_biases.get_shape().assert_is_compatible_with([num_symbols]) + + with vs.variable_scope(scope or "embedding_tied_rnn_seq2seq"): + with ops.device("/cpu:0"): + embedding = vs.get_variable("embedding", [num_symbols, cell.input_size]) + + emb_encoder_inputs = [embedding_ops.embedding_lookup(embedding, x) + for x in encoder_inputs] + emb_decoder_inputs = [embedding_ops.embedding_lookup(embedding, x) + for x in decoder_inputs] + + def extract_argmax_and_embed(prev, _): + """Loop_function that extracts the symbol from prev and embeds it.""" + if output_projection is not None: + prev = nn_ops.xw_plus_b( + prev, output_projection[0], output_projection[1]) + prev_symbol = array_ops.stop_gradient(math_ops.argmax(prev, 1)) + return embedding_ops.embedding_lookup(embedding, prev_symbol) + + if output_projection is None: + cell = rnn_cell.OutputProjectionWrapper(cell, num_symbols) + + if isinstance(feed_previous, bool): + loop_function = extract_argmax_and_embed if feed_previous else None + return tied_rnn_seq2seq(emb_encoder_inputs, emb_decoder_inputs, cell, + loop_function=loop_function, dtype=dtype) + else: # If feed_previous is a Tensor, we construct 2 graphs and use cond. + outputs1, states1 = tied_rnn_seq2seq( + emb_encoder_inputs, emb_decoder_inputs, cell, + loop_function=extract_argmax_and_embed, dtype=dtype) + vs.get_variable_scope().reuse_variables() + outputs2, states2 = tied_rnn_seq2seq( + emb_encoder_inputs, emb_decoder_inputs, cell, dtype=dtype) + + outputs = control_flow_ops.cond(feed_previous, + lambda: outputs1, lambda: outputs2) + states = control_flow_ops.cond(feed_previous, + lambda: states1, lambda: states2) + return outputs, states + + +def attention_decoder(decoder_inputs, initial_state, attention_states, cell, + output_size=None, num_heads=1, loop_function=None, + dtype=dtypes.float32, scope=None): + """RNN decoder with attention for the sequence-to-sequence model. + + Args: + decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. + initial_state: 2D Tensor [batch_size x cell.state_size]. + attention_states: 3D Tensor [batch_size x attn_length x attn_size]. + cell: rnn_cell.RNNCell defining the cell function and size. + output_size: size of the output vectors; if None, we use cell.output_size. + num_heads: number of attention heads that read from attention_states. + loop_function: if not None, this function will be applied to i-th output + in order to generate i+1-th input, and decoder_inputs will be ignored, + except for the first element ("GO" symbol). This can be used for decoding, + but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf. + Signature -- loop_function(prev, i) = next + * prev is a 2D Tensor of shape [batch_size x cell.output_size], + * i is an integer, the step number (when advanced control is needed), + * next is a 2D Tensor of shape [batch_size x cell.input_size]. + dtype: The dtype to use for the RNN initial state (default: tf.float32). + scope: VariableScope for the created subgraph; default: "attention_decoder". + + Returns: + outputs: A list of the same length as decoder_inputs of 2D Tensors of shape + [batch_size x output_size]. These represent the generated outputs. 
+ Output i is computed from input i (which is either i-th decoder_inputs or + loop_function(output {i-1}, i)) as follows. First, we run the cell + on a combination of the input and previous attention masks: + cell_output, new_state = cell(linear(input, prev_attn), prev_state). + Then, we calculate new attention masks: + new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) + and then we calculate the output: + output = linear(cell_output, new_attn). + states: The state of each decoder cell in each time-step. This is a list + with length len(decoder_inputs) -- one item for each time-step. + Each item is a 2D Tensor of shape [batch_size x cell.state_size]. + + Raises: + ValueError: when num_heads is not positive, there are no inputs, or shapes + of attention_states are not set. + """ + if not decoder_inputs: + raise ValueError("Must provide at least 1 input to attention decoder.") + if num_heads < 1: + raise ValueError("With less than 1 heads, use a non-attention decoder.") + if not attention_states.get_shape()[1:2].is_fully_defined(): + raise ValueError("Shape[1] and [2] of attention_states must be known: %s" + % attention_states.get_shape()) + if output_size is None: + output_size = cell.output_size + + with vs.variable_scope(scope or "attention_decoder"): + batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. + attn_length = attention_states.get_shape()[1].value + attn_size = attention_states.get_shape()[2].value + + # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. + hidden = array_ops.reshape( + attention_states, [-1, attn_length, 1, attn_size]) + hidden_features = [] + v = [] + attention_vec_size = attn_size # Size of query vectors for attention. + for a in xrange(num_heads): + k = vs.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size]) + hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) + v.append(vs.get_variable("AttnV_%d" % a, [attention_vec_size])) + + states = [initial_state] + + def attention(query): + """Put attention masks on hidden using hidden_features and query.""" + ds = [] # Results of attention reads will be stored here. + for a in xrange(num_heads): + with vs.variable_scope("Attention_%d" % a): + y = rnn_cell.linear(query, attention_vec_size, True) + y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) + # Attention mask is a softmax of v^T * tanh(...). + s = math_ops.reduce_sum( + v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) + a = nn_ops.softmax(s) + # Now calculate the attention-weighted vector d. + d = math_ops.reduce_sum( + array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, + [1, 2]) + ds.append(array_ops.reshape(d, [-1, attn_size])) + return ds + + outputs = [] + prev = None + batch_attn_size = array_ops.pack([batch_size, attn_size]) + attns = [array_ops.zeros(batch_attn_size, dtype=dtype) + for _ in xrange(num_heads)] + for a in attns: # Ensure the second shape of attention vectors is set. + a.set_shape([None, attn_size]) + for i in xrange(len(decoder_inputs)): + if i > 0: + vs.get_variable_scope().reuse_variables() + inp = decoder_inputs[i] + # If loop_function is set, we use it instead of decoder_inputs. + if loop_function is not None and prev is not None: + with vs.variable_scope("loop_function", reuse=True): + inp = array_ops.stop_gradient(loop_function(prev, i)) + # Merge input and previous attentions into one vector of the right size. + x = rnn_cell.linear([inp] + attns, cell.input_size, True) + # Run the RNN. 
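The attention read coded above reduces, for one head and one example, to the following NumPy sketch (hypothetical names; hidden_feat stands for the 1x1-convolved encoder states and query_feat for the linear of the decoder state):

```python
import numpy as np

def attention_read(hidden, hidden_feat, query_feat, v):
  """hidden: [attn_length, attn_size]; hidden_feat: [attn_length, k]; query_feat, v: [k]."""
  scores = np.tanh(hidden_feat + query_feat).dot(v)  # v^T * tanh(W1*h_j + W2*state), per position
  a = np.exp(scores - scores.max())
  a /= a.sum()                                       # softmax attention mask over positions
  return a.dot(hidden)                               # attention-weighted read d, shape [attn_size]
```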
+ cell_output, new_state = cell(x, states[-1]) + states.append(new_state) + # Run the attention mechanism. + attns = attention(new_state) + with vs.variable_scope("AttnOutputProjection"): + output = rnn_cell.linear([cell_output] + attns, output_size, True) + if loop_function is not None: + # We do not propagate gradients over the loop function. + prev = array_ops.stop_gradient(output) + outputs.append(output) + + return outputs, states + + +def embedding_attention_decoder(decoder_inputs, initial_state, attention_states, + cell, num_symbols, num_heads=1, + output_size=None, output_projection=None, + feed_previous=False, dtype=dtypes.float32, + scope=None): + """RNN decoder with embedding and attention and a pure-decoding option. + + Args: + decoder_inputs: a list of 1D batch-sized int32-Tensors (decoder inputs). + initial_state: 2D Tensor [batch_size x cell.state_size]. + attention_states: 3D Tensor [batch_size x attn_length x attn_size]. + cell: rnn_cell.RNNCell defining the cell function. + num_symbols: integer, how many symbols come into the embedding. + num_heads: number of attention heads that read from attention_states. + output_size: size of the output vectors; if None, use cell.output_size. + output_projection: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_symbols] and B has shape + [num_symbols]; if provided and feed_previous=True, each fed previous + output will first be multiplied by W and added B. + feed_previous: Boolean; if True, only the first of decoder_inputs will be + used (the "GO" symbol), and all other decoder inputs will be generated by: + next = embedding_lookup(embedding, argmax(previous_output)), + In effect, this implements a greedy decoder. It can also be used + during training to emulate http://arxiv.org/pdf/1506.03099v2.pdf. + If False, decoder_inputs are used as given (the standard decoder case). + dtype: The dtype to use for the RNN initial states (default: tf.float32). + scope: VariableScope for the created subgraph; defaults to + "embedding_attention_decoder". + + Returns: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x output_size] containing the generated outputs. + states: The state of each decoder cell in each time-step. This is a list + with length len(decoder_inputs) -- one item for each time-step. + Each item is a 2D Tensor of shape [batch_size x cell.state_size]. + + Raises: + ValueError: when output_projection has the wrong shape. 
+ """ + if output_size is None: + output_size = cell.output_size + if output_projection is not None: + proj_weights = ops.convert_to_tensor(output_projection[0], dtype=dtype) + proj_weights.get_shape().assert_is_compatible_with([cell.output_size, + num_symbols]) + proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) + proj_biases.get_shape().assert_is_compatible_with([num_symbols]) + + with vs.variable_scope(scope or "embedding_attention_decoder"): + with ops.device("/cpu:0"): + embedding = vs.get_variable("embedding", [num_symbols, cell.input_size]) + + def extract_argmax_and_embed(prev, _): + """Loop_function that extracts the symbol from prev and embeds it.""" + if output_projection is not None: + prev = nn_ops.xw_plus_b( + prev, output_projection[0], output_projection[1]) + prev_symbol = array_ops.stop_gradient(math_ops.argmax(prev, 1)) + emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol) + return emb_prev + + loop_function = None + if feed_previous: + loop_function = extract_argmax_and_embed + + emb_inp = [ + embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs] + return attention_decoder( + emb_inp, initial_state, attention_states, cell, output_size=output_size, + num_heads=num_heads, loop_function=loop_function) + + +def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell, + num_encoder_symbols, num_decoder_symbols, + num_heads=1, output_projection=None, + feed_previous=False, dtype=dtypes.float32, + scope=None): + """Embedding sequence-to-sequence model with attention. + + This model first embeds encoder_inputs by a newly created embedding (of shape + [num_encoder_symbols x cell.input_size]). Then it runs an RNN to encode + embedded encoder_inputs into a state vector. It keeps the outputs of this + RNN at every step to use for attention later. Next, it embeds decoder_inputs + by another newly created embedding (of shape [num_decoder_symbols x + cell.input_size]). Then it runs attention decoder, initialized with the last + encoder state, on embedded decoder_inputs and attending to encoder outputs. + + Args: + encoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. + decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]. + cell: rnn_cell.RNNCell defining the cell function and size. + num_encoder_symbols: integer; number of symbols on the encoder side. + num_decoder_symbols: integer; number of symbols on the decoder side. + num_heads: number of attention heads that read from attention_states. + output_projection: None or a pair (W, B) of output projection weights and + biases; W has shape [cell.output_size x num_decoder_symbols] and B has + shape [num_decoder_symbols]; if provided and feed_previous=True, each + fed previous output will first be multiplied by W and added B. + feed_previous: Boolean or scalar Boolean Tensor; if True, only the first + of decoder_inputs will be used (the "GO" symbol), and all other decoder + inputs will be taken from previous outputs (as in embedding_rnn_decoder). + If False, decoder_inputs are used as given (the standard decoder case). + dtype: The dtype of the initial RNN state (default: tf.float32). + scope: VariableScope for the created subgraph; defaults to + "embedding_attention_seq2seq". + + Returns: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x num_decoder_symbols] containing the generated outputs. + states: The state of each decoder cell in each time-step. 
This is a list + with length len(decoder_inputs) -- one item for each time-step. + Each item is a 2D Tensor of shape [batch_size x cell.state_size]. + """ + with vs.variable_scope(scope or "embedding_attention_seq2seq"): + # Encoder. + encoder_cell = rnn_cell.EmbeddingWrapper(cell, num_encoder_symbols) + encoder_outputs, encoder_states = rnn.rnn( + encoder_cell, encoder_inputs, dtype=dtype) + + # First calculate a concatenation of encoder outputs to put attention on. + top_states = [array_ops.reshape(e, [-1, 1, cell.output_size]) + for e in encoder_outputs] + attention_states = array_ops.concat(1, top_states) + + # Decoder. + output_size = None + if output_projection is None: + cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) + output_size = num_decoder_symbols + + if isinstance(feed_previous, bool): + return embedding_attention_decoder( + decoder_inputs, encoder_states[-1], attention_states, cell, + num_decoder_symbols, num_heads, output_size, output_projection, + feed_previous) + else: # If feed_previous is a Tensor, we construct 2 graphs and use cond. + outputs1, states1 = embedding_attention_decoder( + decoder_inputs, encoder_states[-1], attention_states, cell, + num_decoder_symbols, num_heads, output_size, output_projection, True) + vs.get_variable_scope().reuse_variables() + outputs2, states2 = embedding_attention_decoder( + decoder_inputs, encoder_states[-1], attention_states, cell, + num_decoder_symbols, num_heads, output_size, output_projection, False) + + outputs = control_flow_ops.cond(feed_previous, + lambda: outputs1, lambda: outputs2) + states = control_flow_ops.cond(feed_previous, + lambda: states1, lambda: states2) + return outputs, states + + +def sequence_loss_by_example(logits, targets, weights, num_decoder_symbols, + average_across_timesteps=True, + softmax_loss_function=None, name=None): + """Weighted cross-entropy loss for a sequence of logits (per example). + + Args: + logits: list of 2D Tensors of shape [batch_size x num_decoder_symbols]. + targets: list of 1D batch-sized int32-Tensors of the same length as logits. + weights: list of 1D batch-sized float-Tensors of the same length as logits. + num_decoder_symbols: integer, number of decoder symbols (output classes). + average_across_timesteps: If set, divide the returned cost by the total + label weight. + softmax_loss_function: function (inputs-batch, labels-batch) -> loss-batch + to be used instead of the standard softmax (the default if this is None). + name: optional name for this operation, default: "sequence_loss_by_example". + + Returns: + 1D batch-sized float Tensor: the log-perplexity for each sequence. + + Raises: + ValueError: if len(logits) is different from len(targets) or len(weights). + """ + if len(targets) != len(logits) or len(weights) != len(logits): + raise ValueError("Lengths of logits, weights, and targets must be the same " + "%d, %d, %d." % (len(logits), len(weights), len(targets))) + with ops.op_scope(logits + targets + weights, name, + "sequence_loss_by_example"): + batch_size = array_ops.shape(targets[0])[0] + log_perp_list = [] + length = batch_size * num_decoder_symbols + for i in xrange(len(logits)): + if softmax_loss_function is None: + # TODO(lukaszkaiser): There is no SparseCrossEntropy in TensorFlow, so + # we need to first cast targets into a dense representation, and as + # SparseToDense does not accept batched inputs, we need to do this by + # re-indexing and re-sizing. When TensorFlow adds SparseCrossEntropy, + # rewrite this method. 
+ indices = targets[i] + num_decoder_symbols * math_ops.range(batch_size) + with ops.device("/cpu:0"): # Sparse-to-dense must be on CPU for now. + dense = sparse_ops.sparse_to_dense( + indices, array_ops.expand_dims(length, 0), 1.0, + 0.0) + target = array_ops.reshape(dense, [-1, num_decoder_symbols]) + crossent = nn_ops.softmax_cross_entropy_with_logits( + logits[i], target, name="SequenceLoss/CrossEntropy{0}".format(i)) + else: + crossent = softmax_loss_function(logits[i], targets[i]) + log_perp_list.append(crossent * weights[i]) + log_perps = math_ops.add_n(log_perp_list) + if average_across_timesteps: + total_size = math_ops.add_n(weights) + total_size += 1e-12 # Just to avoid division by 0 for all-0 weights. + log_perps /= total_size + return log_perps + + +def sequence_loss(logits, targets, weights, num_decoder_symbols, + average_across_timesteps=True, average_across_batch=True, + softmax_loss_function=None, name=None): + """Weighted cross-entropy loss for a sequence of logits, batch-collapsed. + + Args: + logits: list of 2D Tensors os shape [batch_size x num_decoder_symbols]. + targets: list of 1D batch-sized int32-Tensors of the same length as logits. + weights: list of 1D batch-sized float-Tensors of the same length as logits. + num_decoder_symbols: integer, number of decoder symbols (output classes). + average_across_timesteps: If set, divide the returned cost by the total + label weight. + average_across_batch: If set, divide the returned cost by the batch size. + softmax_loss_function: function (inputs-batch, labels-batch) -> loss-batch + to be used instead of the standard softmax (the default if this is None). + name: optional name for this operation, defaults to "sequence_loss". + + Returns: + A scalar float Tensor: the average log-perplexity per symbol (weighted). + + Raises: + ValueError: if len(logits) is different from len(targets) or len(weights). + """ + with ops.op_scope(logits + targets + weights, name, "sequence_loss"): + cost = math_ops.reduce_sum(sequence_loss_by_example( + logits, targets, weights, num_decoder_symbols, + average_across_timesteps=average_across_timesteps, + softmax_loss_function=softmax_loss_function)) + if average_across_batch: + batch_size = array_ops.shape(targets[0])[0] + return cost / math_ops.cast(batch_size, dtypes.float32) + else: + return cost + + +def model_with_buckets(encoder_inputs, decoder_inputs, targets, weights, + buckets, num_decoder_symbols, seq2seq, + softmax_loss_function=None, name=None): + """Create a sequence-to-sequence model with support for bucketing. + + The seq2seq argument is a function that defines a sequence-to-sequence model, + e.g., seq2seq = lambda x, y: basic_rnn_seq2seq(x, y, rnn_cell.GRUCell(24)) + + Args: + encoder_inputs: a list of Tensors to feed the encoder; first seq2seq input. + decoder_inputs: a list of Tensors to feed the decoder; second seq2seq input. + targets: a list of 1D batch-sized int32-Tensors (desired output sequence). + weights: list of 1D batch-sized float-Tensors to weight the targets. + buckets: a list of pairs of (input size, output size) for each bucket. + num_decoder_symbols: integer, number of decoder symbols (output classes). + seq2seq: a sequence-to-sequence model function; it takes 2 input that + agree with encoder_inputs and decoder_inputs, and returns a pair + consisting of outputs and states (as, e.g., basic_rnn_seq2seq). + softmax_loss_function: function (inputs-batch, labels-batch) -> loss-batch + to be used instead of the standard softmax (the default if this is None). 
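A hedged sketch of wiring sequence_loss above to a decoder's outputs (shapes and lengths are illustrative; weights are typically 1.0 for real target positions and 0.0 for padding):

```python
import tensorflow as tf
from tensorflow.python.ops import seq2seq

num_symbols, steps = 10000, 6
logits = [tf.placeholder(tf.float32, [None, num_symbols]) for _ in range(steps)]
targets = [tf.placeholder(tf.int32, [None]) for _ in range(steps)]
weights = [tf.placeholder(tf.float32, [None]) for _ in range(steps)]
loss = seq2seq.sequence_loss(logits, targets, weights, num_symbols)
# Scalar log-perplexity per symbol, averaged over time-steps and over the batch.
```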
+ name: optional name for this operation, defaults to "model_with_buckets". + + Returns: + outputs: The outputs for each bucket. Its j'th element consists of a list + of 2D Tensors of shape [batch_size x num_decoder_symbols] (j'th outputs). + losses: List of scalar Tensors, representing losses for each bucket. + Raises: + ValueError: if length of encoder_inputsut, targets, or weights is smaller + than the largest (last) bucket. + """ + if len(encoder_inputs) < buckets[-1][0]: + raise ValueError("Length of encoder_inputs (%d) must be at least that of la" + "st bucket (%d)." % (len(encoder_inputs), buckets[-1][0])) + if len(targets) < buckets[-1][1]: + raise ValueError("Length of targets (%d) must be at least that of last" + "bucket (%d)." % (len(targets), buckets[-1][1])) + if len(weights) < buckets[-1][1]: + raise ValueError("Length of weights (%d) must be at least that of last" + "bucket (%d)." % (len(weights), buckets[-1][1])) + + all_inputs = encoder_inputs + decoder_inputs + targets + weights + losses = [] + outputs = [] + with ops.op_scope(all_inputs, name, "model_with_buckets"): + for j in xrange(len(buckets)): + if j > 0: + vs.get_variable_scope().reuse_variables() + bucket_encoder_inputs = [encoder_inputs[i] + for i in xrange(buckets[j][0])] + bucket_decoder_inputs = [decoder_inputs[i] + for i in xrange(buckets[j][1])] + bucket_outputs, _ = seq2seq(bucket_encoder_inputs, + bucket_decoder_inputs) + outputs.append(bucket_outputs) + + bucket_targets = [targets[i] for i in xrange(buckets[j][1])] + bucket_weights = [weights[i] for i in xrange(buckets[j][1])] + losses.append(sequence_loss( + outputs[-1], bucket_targets, bucket_weights, num_decoder_symbols, + softmax_loss_function=softmax_loss_function)) + + return outputs, losses diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py index fc37ac5ceb2..3840971d76a 100644 --- a/tensorflow/python/ops/variables.py +++ b/tensorflow/python/ops/variables.py @@ -139,6 +139,8 @@ class Variable(object): @@graph @@op """ + # TODO(touts): Add @@value and @@ref in the docstring above once they are + # ready for consumption. def __init__(self, initial_value, trainable=True, collections=None, validate_shape=True, name=None): @@ -199,6 +201,7 @@ class Variable(object): with ops.device(self._variable.device): self._initializer_op = state_ops.assign( self._variable, self._initial_value, validate_shape=False).op + self._snapshot = array_ops.identity(self._variable, name="read") else: self._variable = state_ops.variable_op( self._initial_value.get_shape(), @@ -207,6 +210,7 @@ class Variable(object): with ops.device(self._variable.device): self._initializer_op = state_ops.assign( self._variable, self._initial_value).op + self._snapshot = array_ops.identity(self._variable, name="read") for key in collections: ops.add_to_collection(key, self) self._save_slice_info = None @@ -216,7 +220,50 @@ class Variable(object): return self._variable def _AsTensor(self): - """Conversion function for ops.convert_to_tensor().""" + """Converts this variable to a Tensor. + + See [`value()`](#Variable.value). + + Returns: + A `Tensor` containing the value of the variable. + """ + return self._snapshot + + def value(self): + """Returns the last snapshot of this variable. + + You usually do not need to call this method as all ops that need the value + of the variable call it automatically through a `convert_to_tensor()` call. + + Returns a `Tensor` which holds the value of the variable. 
You can not + assign a new value to this tensor as it is not a reference to the variable. + See [`ref()`](#Variable.ref) if you want to get a reference to the + variable. + + To avoid copies, if the consumer of the returned value is on the same device + as the variable, this actually returns the live value of the variable, not + a copy. Updates to the variable are seen by the consumer. If the consumer + is on a different device it will get a copy of the variable. + + Returns: + A `Tensor` containing the value of the variable. + """ + return self._snapshot + + def ref(self): + """Returns a reference to this variable. + + You usually do not need to call this method as all ops that need a reference + to the variable call it automatically. + + Returns is a `Tensor` which holds a reference to the variable. You can + assign a new value to the variable by passing the tensor to an assign op. + See [`value()`](#Variable.value) if you want to get the value of the + variable. + + Returns: + A `Tensor` that is a reference to the variable. + """ return self._variable def eval(self, session=None): @@ -366,15 +413,17 @@ class Variable(object): # Conversion to tensor. @staticmethod - def _TensorConversionFunction(v, dtype=None, name=None): + def _TensorConversionFunction(v, dtype=None, name=None, as_ref=False): """Utility function for converting a Variable to a Tensor.""" _ = name - ret = v._AsTensor() # pylint: disable=protected-access if dtype and not dtype.is_compatible_with(v.dtype): raise ValueError( "Incompatible type conversion requested to type '%s' for variable " "of type '%s'" % (dtype.name, v.dtype.name)) - return ret + if as_ref: + return v.ref() + else: + return v.value() # Operator overloading. # diff --git a/tensorflow/python/platform/default/_flags.py b/tensorflow/python/platform/default/_flags.py index fcf78fde989..d7ae189c21c 100644 --- a/tensorflow/python/platform/default/_flags.py +++ b/tensorflow/python/platform/default/_flags.py @@ -94,7 +94,15 @@ def DEFINE_boolean(flag_name, default_value, docstring): default_value: The default value the flag should take as a boolean. docstring: A helpful message explaining the use of the flag. """ - _define_helper(flag_name, default_value, docstring, bool) + # Register a custom function for 'bool' so --flag=True works. + def str2bool(v): + return v.lower() in ('true', 't', '1') + _global_parser.add_argument('--' + flag_name, + nargs='?', + const=True, + help=docstring, + default=default_value, + type=str2bool) _global_parser.add_argument('--no' + flag_name, action='store_false', dest=flag_name) diff --git a/tensorflow/python/platform/default/_gfile.py b/tensorflow/python/platform/default/_gfile.py index 4ee28ca0123..44a09f0d9c3 100644 --- a/tensorflow/python/platform/default/_gfile.py +++ b/tensorflow/python/platform/default/_gfile.py @@ -358,3 +358,25 @@ def ListDirectory(directory, return_dotfiles=False): # pylint: disable=invalid- if not return_dotfiles: files = [f for f in files if not f.startswith('.')] return files + + +def Walk(top, topdown=1, onerror=None): + """Recursive directory tree generator. + + Args: + top: string, a pathname. + topdown: bool, should traversal be pre-order (True) or post-order (False) + onerror: function, optional callback for errors. + + By default, errors that occur when listing a directory are ignored. + (This is the same semantics as Python's os.walk() generator.) If the + optional argument "onerror" is specified, it should be a function. It + will be called with one argument, an os.error instance. 
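The DEFINE_boolean change means a boolean flag now accepts an explicit value as well as the bare and --no forms; the behaviour can be reproduced with plain argparse (a standalone sketch, flag name hypothetical; the flags_test changes below exercise the same cases):

```python
import argparse

def str2bool(v):
  return v.lower() in ('true', 't', '1')

parser = argparse.ArgumentParser()
parser.add_argument('--use_gpu', nargs='?', const=True, default=False, type=str2bool)
parser.add_argument('--nouse_gpu', action='store_false', dest='use_gpu')

assert parser.parse_args(['--use_gpu']).use_gpu is True             # bare flag -> True
assert parser.parse_args(['--use_gpu=True']).use_gpu is True        # explicit value via str2bool
assert parser.parse_args(['--use_gpu=gibberish']).use_gpu is False  # unrecognised strings -> False
assert parser.parse_args(['--nouse_gpu']).use_gpu is False          # negation flag
```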
It can return + to continue with the walk, or reraise the exception to abort the walk. + + Yields: + # Each yield is a 3-tuple: the pathname of a directory, followed + # by lists of all its subdirectories and leaf files. + (dirname, [subdirname, subdirname, ...], [filename, filename, ...]) + """ + return os.walk(top, topdown=topdown, onerror=onerror) diff --git a/tensorflow/python/platform/default/flags_test.py b/tensorflow/python/platform/default/flags_test.py index be32bb63bd9..3868576c2f8 100644 --- a/tensorflow/python/platform/default/flags_test.py +++ b/tensorflow/python/platform/default/flags_test.py @@ -26,10 +26,16 @@ from tensorflow.python.platform.default import _flags as flags flags.DEFINE_string("string_foo", "default_val", "HelpString") -flags.DEFINE_boolean("bool_foo", True, "HelpString") flags.DEFINE_integer("int_foo", 42, "HelpString") flags.DEFINE_float("float_foo", 42.0, "HelpString") +flags.DEFINE_boolean("bool_foo", True, "HelpString") +flags.DEFINE_boolean("bool_negation", True, "HelpString") +flags.DEFINE_boolean("bool_a", False, "HelpString") +flags.DEFINE_boolean("bool_c", False, "HelpString") +flags.DEFINE_boolean("bool_d", True, "HelpString") +flags.DEFINE_boolean("bool_e", True, "HelpString") + FLAGS = flags.FLAGS class FlagsTest(googletest.TestCase): @@ -46,14 +52,23 @@ class FlagsTest(googletest.TestCase): FLAGS.bool_foo = False self.assertFalse(FLAGS.bool_foo) - def testNoBool(self): - FLAGS.bool_foo = True - try: - sys.argv.append("--nobool_foo") - FLAGS._parse_flags() - self.assertFalse(FLAGS.bool_foo) - finally: - sys.argv.pop() + def testBoolCommandLines(self): + # Specified on command line with no args, sets to True, + # even if default is False. + self.assertEqual(True, FLAGS.bool_a) + + # --no before the flag forces it to False, even if the + # default is True + self.assertEqual(False, FLAGS.bool_negation) + + # --bool_flag=True sets to True + self.assertEqual(True, FLAGS.bool_c) + + # --bool_flag=False sets to False + self.assertEqual(False, FLAGS.bool_d) + + # --bool_flag=gibberish sets to False + self.assertEqual(False, FLAGS.bool_e) def testInt(self): res = FLAGS.int_foo @@ -69,4 +84,12 @@ class FlagsTest(googletest.TestCase): if __name__ == "__main__": - googletest.main() + # Test command lines + sys.argv.extend(["--bool_a", "--nobool_negation", "--bool_c=True", + "--bool_d=False", "--bool_e=gibberish"]) + + # googletest.main() tries to interpret the above flags, so use the + # direct functions instead. 
+ runner = googletest.TextTestRunner() + itersuite = googletest.TestLoader().loadTestsFromTestCase(FlagsTest) + runner.run(itersuite) diff --git a/tensorflow/python/platform/test.py b/tensorflow/python/platform/test.py index f985092b66e..209f730c8db 100644 --- a/tensorflow/python/platform/test.py +++ b/tensorflow/python/platform/test.py @@ -17,9 +17,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +# pylint: disable=unused-import from tensorflow.python.platform.googletest import GetTempDir from tensorflow.python.platform.googletest import main from tensorflow.python.framework.test_util import TensorFlowTestCase as TestCase from tensorflow.python.framework.test_util import IsGoogleCudaEnabled as IsBuiltWithCuda +from tensorflow.python.kernel_tests.gradient_checker import compute_gradient_error +from tensorflow.python.kernel_tests.gradient_checker import compute_gradient + get_temp_dir = GetTempDir +# pylint: enable=unused-import diff --git a/tensorflow/python/summary/event_multiplexer.py b/tensorflow/python/summary/event_multiplexer.py index a3ce42afb1e..46ee7523876 100644 --- a/tensorflow/python/summary/event_multiplexer.py +++ b/tensorflow/python/summary/event_multiplexer.py @@ -143,13 +143,15 @@ class EventMultiplexer(object): return self def AddRunsFromDirectory(self, path, name=None): - """Load runs from a directory, assuming each subdirectory is a run. + """Load runs from a directory; recursively walks subdirectories. If path doesn't exist, no-op. This ensures that it is safe to call `AddRunsFromDirectory` multiple times, even before the directory is made. - If the directory contains TensorFlow event files, it is itself treated as a - run. + If path is a directory, load event files in the directory (if any exist) and + recursively call AddRunsFromDirectory on any subdirectories. This means you + can call AddRunsFromDirectory at the root of a tree of event logs and + TensorBoard will load them all. If the `EventMultiplexer` is already loaded or autoupdating, this will cause the newly created accumulators to also `Reload()` or `AutoUpdate()`.
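The recursive loading behavior described in the docstring above can be illustrated with a short usage sketch. This is an editorial example rather than part of the patch: the log-directory layout is hypothetical, and it only uses the `EventMultiplexer` methods that appear in this diff (`AddRunsFromDirectory`, `Reload`, `Runs`).

```python
# Minimal sketch, assuming /tmp/logs/train and /tmp/logs/eval each contain
# *.tfevents.* files written by a SummaryWriter.
from tensorflow.python.summary import event_multiplexer

multiplexer = event_multiplexer.EventMultiplexer()
# Walks the whole tree under /tmp/logs and registers one run per
# directory that contains event files, e.g. 'train' and 'eval'.
multiplexer.AddRunsFromDirectory('/tmp/logs')
multiplexer.Reload()
print(multiplexer.Runs())
```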
@@ -171,25 +173,16 @@ class EventMultiplexer(object): if not gfile.Exists(path): return # Maybe it hasn't been created yet, fail silently to retry later if not gfile.IsDirectory(path): - raise ValueError('Path exists and is not a directory, %s' % path) - paths = gfile.ListDirectory(path) - is_directory = lambda x: gfile.IsDirectory(os.path.join(path, x)) - subdirectories = filter(is_directory, paths) - for s in subdirectories: - if name: - subname = '/'.join([name, s]) - else: - subname = s - self.AddRun(os.path.join(path, s), subname) + raise ValueError('AddRunsFromDirectory: path exists and is not a ' + 'directory, %s' % path) + + for (subdir, _, files) in gfile.Walk(path): + if list(filter(event_accumulator.IsTensorFlowEventsFile, files)): + logging.info('Adding events from directory %s', subdir) + rpath = os.path.relpath(subdir, path) + subname = os.path.join(name, rpath) if name else rpath + self.AddRun(subdir, name=subname) - if list(filter(event_accumulator.IsTensorFlowEventsFile, paths)): - directory_name = os.path.split(path)[1] - logging.info('Directory %s has event files; loading', directory_name) - if name: - dname = name - else: - dname = directory_name - self.AddRun(path, dname) return self def Reload(self): diff --git a/tensorflow/python/summary/event_multiplexer_test.py b/tensorflow/python/summary/event_multiplexer_test.py index 01749a16f5d..e7cecba1447 100644 --- a/tensorflow/python/summary/event_multiplexer_test.py +++ b/tensorflow/python/summary/event_multiplexer_test.py @@ -18,6 +18,7 @@ from __future__ import division from __future__ import print_function import os +import os.path import tensorflow.python.platform @@ -28,6 +29,20 @@ from tensorflow.python.summary import event_accumulator from tensorflow.python.summary import event_multiplexer +def _AddEvents(path): + if not gfile.IsDirectory(path): + gfile.MakeDirs(path) + fpath = os.path.join(path, 'hypothetical.tfevents.out') + with gfile.GFile(fpath, 'w'): + return fpath + + +def _CreateCleanDirectory(path): + if gfile.IsDirectory(path): + gfile.DeleteRecursively(path) + gfile.MkDir(path) + + class _FakeAccumulator(object): def __init__(self, path): @@ -137,34 +152,33 @@ class EventMultiplexerTest(test_util.TensorFlowTestCase): x.AddRunsFromDirectory(fakedir) self.assertEqual(x.Runs(), {}, 'loading fakedir had no effect') - if gfile.IsDirectory(realdir): - gfile.DeleteRecursively(realdir) - gfile.MkDir(realdir) + _CreateCleanDirectory(realdir) x.AddRunsFromDirectory(realdir) self.assertEqual(x.Runs(), {}, 'loading empty directory had no effect') path1 = join(realdir, 'path1') gfile.MkDir(path1) x.AddRunsFromDirectory(realdir) - self.assertEqual(sorted(x.Runs().keys()), ['path1'], 'loaded run: path1') + self.assertEqual(x.Runs(), {}, 'creating empty subdirectory had no effect') + + _AddEvents(path1) + x.AddRunsFromDirectory(realdir) + self.assertItemsEqual(x.Runs(), ['path1'], 'loaded run: path1') loader1 = x._GetAccumulator('path1') self.assertEqual(loader1._path, path1, 'has the correct path') path2 = join(realdir, 'path2') - gfile.MkDir(path2) + _AddEvents(path2) x.AddRunsFromDirectory(realdir) - self.assertItemsEqual(sorted(x.Runs().keys()), ['path1', 'path2']) + self.assertItemsEqual(x.Runs(), ['path1', 'path2']) self.assertEqual(x._GetAccumulator('path1'), loader1, 'loader1 not regenerated') - loader2 = x._GetAccumulator('path2') path2_2 = join(path2, 'path2') - gfile.MkDir(path2_2) - x.AddRunsFromDirectory(path2) - self.assertItemsEqual(sorted(x.Runs().keys()), ['path1', 'path2']) - 
self.assertNotEqual(loader2, x._GetAccumulator('path2'), - 'loader2 regenerated') - self.assertEqual(x._GetAccumulator('path2')._path, path2_2, + _AddEvents(path2_2) + x.AddRunsFromDirectory(realdir) + self.assertItemsEqual(x.Runs(), ['path1', 'path2', 'path2/path2']) + self.assertEqual(x._GetAccumulator('path2/path2')._path, path2_2, 'loader2 path correct') def testAddRunsFromDirectoryThatContainsEvents(self): @@ -173,21 +187,18 @@ class EventMultiplexerTest(test_util.TensorFlowTestCase): join = os.path.join realdir = join(tmpdir, 'event_containing_directory') - if gfile.IsDirectory(realdir): - gfile.DeleteRecursively(realdir) - gfile.MkDir(realdir) + _CreateCleanDirectory(realdir) self.assertEqual(x.Runs(), {}) - with gfile.GFile(join(realdir, 'hypothetical.tfevents.out'), 'w'): - pass + _AddEvents(realdir) x.AddRunsFromDirectory(realdir) - self.assertItemsEqual(x.Runs(), ['event_containing_directory']) + self.assertItemsEqual(x.Runs(), ['.']) subdir = join(realdir, 'subdir') - gfile.MkDir(subdir) + _AddEvents(subdir) x.AddRunsFromDirectory(realdir) - self.assertItemsEqual(x.Runs(), ['event_containing_directory', 'subdir']) + self.assertItemsEqual(x.Runs(), ['.', 'subdir']) def testAddRunsFromDirectoryWithRunNames(self): x = event_multiplexer.EventMultiplexer() @@ -195,30 +206,45 @@ class EventMultiplexerTest(test_util.TensorFlowTestCase): join = os.path.join realdir = join(tmpdir, 'event_containing_directory') - if gfile.IsDirectory(realdir): - gfile.DeleteRecursively(realdir) - gfile.MkDir(realdir) + _CreateCleanDirectory(realdir) self.assertEqual(x.Runs(), {}) - with gfile.GFile(join(realdir, 'hypothetical.tfevents.out'), 'w'): - pass + _AddEvents(realdir) x.AddRunsFromDirectory(realdir, 'foo') - self.assertItemsEqual(x.Runs(), ['foo']) + self.assertItemsEqual(x.Runs(), ['foo/.']) subdir = join(realdir, 'subdir') - gfile.MkDir(subdir) + _AddEvents(subdir) x.AddRunsFromDirectory(realdir, 'foo') - self.assertItemsEqual(x.Runs(), ['foo', 'foo/subdir']) + self.assertItemsEqual(x.Runs(), ['foo/.', 'foo/subdir']) + + def testAddRunsFromDirectoryWalksTree(self): + x = event_multiplexer.EventMultiplexer() + tmpdir = self.get_temp_dir() + join = os.path.join + realdir = join(tmpdir, 'event_containing_directory') + + _CreateCleanDirectory(realdir) + _AddEvents(realdir) + sub = join(realdir, 'subdirectory') + sub1 = join(sub, '1') + sub2 = join(sub, '2') + sub1_1 = join(sub1, '1') + _AddEvents(sub1) + _AddEvents(sub2) + _AddEvents(sub1_1) + x.AddRunsFromDirectory(realdir) + + self.assertItemsEqual(x.Runs(), ['.', + 'subdirectory/1', 'subdirectory/2', + 'subdirectory/1/1']) def testAddRunsFromDirectoryThrowsException(self): x = event_multiplexer.EventMultiplexer() tmpdir = self.get_temp_dir() - filepath = os.path.join(tmpdir, 'bad_file') - with gfile.GFile(filepath, 'w'): - pass - + filepath = _AddEvents(tmpdir) with self.assertRaises(ValueError): x.AddRunsFromDirectory(filepath) diff --git a/tensorflow/python/training/adagrad_test.py b/tensorflow/python/training/adagrad_test.py index d226d672abd..1057ec947e7 100644 --- a/tensorflow/python/training/adagrad_test.py +++ b/tensorflow/python/training/adagrad_test.py @@ -47,6 +47,28 @@ class AdagradOptimizerTest(tf.test.TestCase): self.assertAllClose(np.array([2.715679168701172, 3.715679168701172]), var1.eval()) + def testTensorLearningRate(self): + with self.test_session(): + var0 = tf.Variable([1.0, 2.0]) + var1 = tf.Variable([3.0, 4.0]) + grads0 = tf.constant([0.1, 0.1]) + grads1 = tf.constant([0.01, 0.01]) + ada_opt = tf.train.AdagradOptimizer( + 
tf.constant(3.0), initial_accumulator_value=0.1) + ada_update = ada_opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + tf.initialize_all_variables().run() + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Run 3 steps of adagrad + for _ in range(3): + ada_update.run() + # Validate updated params + self.assertAllClose(np.array([-1.6026098728179932, -0.6026098728179932]), + var0.eval()) + self.assertAllClose(np.array([2.715679168701172, 3.715679168701172]), + var1.eval()) + def testFloat64(self): with self.test_session(): opt = tf.train.AdagradOptimizer(3.0, initial_accumulator_value=0.1) diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py index 41fa64e6d71..6729394083f 100644 --- a/tensorflow/python/training/adam.py +++ b/tensorflow/python/training/adam.py @@ -69,9 +69,9 @@ class AdamOptimizer(optimizer.Optimizer): beta1: A float value or a constant float tensor. The exponential decay rate for the 1st moment estimates. beta2: A float value or a constant float tensor. - The exponential decay rate for the 2st moment estimates. + The exponential decay rate for the 2nd moment estimates. epsilon: A small constant for numerical stability. - use_locking: If True use locks for update operation.s + use_locking: If True use locks for update operations. name: Optional name for the operations created when applying gradients. Defaults to "Adam". """ @@ -143,8 +143,8 @@ class AdamOptimizer(optimizer.Optimizer): use_locking=self._use_locking) v_sqrt = math_ops.sqrt(v_t) var_update = state_ops.assign_sub(var, - lr * m_t / (v_sqrt + self._epsilon_t), - use_locking=self._use_locking) + lr * m_t / (v_sqrt + self._epsilon_t), + use_locking=self._use_locking) return control_flow_ops.group(*[var_update, m_t, v_t]) def _finish(self, update_ops, name_scope): diff --git a/tensorflow/python/training/adam_test.py b/tensorflow/python/training/adam_test.py index f9ea6c22f55..d6e18146912 100644 --- a/tensorflow/python/training/adam_test.py +++ b/tensorflow/python/training/adam_test.py @@ -115,6 +115,42 @@ class AdamOptimizerTest(tf.test.TestCase): self.assertAllClose(var0_np, var0.eval()) self.assertAllClose(var1_np, var1.eval()) + def testTensorLearningRate(self): + with self.test_session(): + # Initialize variables for numpy implementation. 
+ m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=np.float32) + grads0_np = np.array([0.1, 0.1], dtype=np.float32) + var1_np = np.array([3.0, 4.0], dtype=np.float32) + grads1_np = np.array([0.01, 0.01], dtype=np.float32) + + var0 = tf.Variable(var0_np) + var1 = tf.Variable(var1_np) + grads0 = tf.constant(grads0_np) + grads1 = tf.constant(grads1_np) + opt = tf.train.AdamOptimizer(tf.constant(0.001)) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + tf.initialize_all_variables().run() + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + + beta1_power, beta2_power = opt._get_beta_accumulators() + + # Run 3 steps of Adam + for t in range(1, 4): + self.assertAllClose(0.9 ** t, beta1_power.eval()) + self.assertAllClose(0.999 ** t, beta2_power.eval()) + update.run() + + var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0) + var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1) + + # Validate updated params + self.assertAllClose(var0_np, var0.eval()) + self.assertAllClose(var1_np, var1.eval()) + def testFloat64(self): with self.test_session(): opt = tf.train.AdamOptimizer() diff --git a/tensorflow/python/training/gradient_descent_test.py b/tensorflow/python/training/gradient_descent_test.py index 68378cef9ec..dd4e391196c 100644 --- a/tensorflow/python/training/gradient_descent_test.py +++ b/tensorflow/python/training/gradient_descent_test.py @@ -44,6 +44,25 @@ class GradientDescentOptimizerTest(tf.test.TestCase): self.assertAllClose([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1], var0.eval()) self.assertAllClose([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01], var1.eval()) + def testTensorLearningRate(self): + with self.test_session(): + var0 = tf.Variable([1.0, 2.0]) + var1 = tf.Variable([3.0, 4.0]) + grads0 = tf.constant([0.1, 0.1]) + grads1 = tf.constant([0.01, 0.01]) + lrate = tf.constant(3.0) + sgd_op = tf.train.GradientDescentOptimizer(lrate).apply_gradients( + zip([grads0, grads1], [var0, var1])) + tf.initialize_all_variables().run() + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Run 1 step of sgd + sgd_op.run() + # Validate updated params + self.assertAllClose([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1], var0.eval()) + self.assertAllClose([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01], var1.eval()) + def testFloat64(self): with self.test_session(): opt = tf.train.GradientDescentOptimizer(3.0) diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py index e580e80eb27..53bca00756c 100644 --- a/tensorflow/python/training/input.py +++ b/tensorflow/python/training/input.py @@ -32,6 +32,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import constant_op from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import io_ops +from tensorflow.python.ops import logging_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import summary_ops @@ -114,8 +115,21 @@ def string_input_producer(string_tensor, num_epochs=None, shuffle=True, Returns: A queue with the output strings. A `QueueRunner` for the Queue is added to the current `Graph`'s `QUEUE_RUNNER` collection. + + Raises: + ValueError: If the string_tensor is a null Python list. At runtime, + will fail with an assertion if string_tensor becomes a null tensor. 
""" + not_null_err = "string_input_producer requires a non-null input tensor" + if not string_tensor: + raise ValueError(not_null_err) + with ops.op_scope([string_tensor], name, "input_producer") as name: + string_tensor = ops.convert_to_tensor(string_tensor, dtype=dtypes.string) + with ops.control_dependencies([ + logging_ops.Assert(math_ops.greater(array_ops.size(string_tensor), 0), + [not_null_err])]): + string_tensor = array_ops.identity(string_tensor) return _input_producer( string_tensor, dtypes.string, num_epochs, shuffle, seed, capacity, name, "fraction_of_%d_full" % capacity) diff --git a/tensorflow/python/training/input_test.py b/tensorflow/python/training/input_test.py index 80961abdc38..ab17a6be495 100644 --- a/tensorflow/python/training/input_test.py +++ b/tensorflow/python/training/input_test.py @@ -132,6 +132,28 @@ class StringInputProducerTest(tf.test.TestCase): for thread in threads: thread.join() + def testNullStringPython(self): + # Graph-construction time check for empty string list: + with self.test_session(): + with self.assertRaises(ValueError): + _ = tf.train.string_input_producer([]) + + def testNullString(self): + # Runtime check for empty string list. This is slightly oblique: + # The queue runner should die with an assertion error on the null + # input tensor, causing the dequeue to fail with an OutOfRangeError. + with self.test_session(): + coord = tf.train.Coordinator() + queue = tf.train.string_input_producer(tf.constant([], dtype=tf.string)) + dequeue = queue.dequeue() + tf.initialize_all_variables().run() + threads = tf.train.start_queue_runners(coord=coord) + with self.assertRaises(tf.errors.OutOfRangeError): + dequeue.eval() + coord.request_stop() + for thread in threads: + thread.join() + class RangeInputProducerTest(tf.test.TestCase): diff --git a/tensorflow/python/training/momentum_test.py b/tensorflow/python/training/momentum_test.py index eee6f0300da..f7e1e3095c2 100644 --- a/tensorflow/python/training/momentum_test.py +++ b/tensorflow/python/training/momentum_test.py @@ -77,6 +77,57 @@ class MomentumOptimizerTest(tf.test.TestCase): 3.98 - ((0.9 * 0.01 + 0.01) * 2.0)]), var1.eval()) + def testTensorLearningRateAndMomentum(self): + with self.test_session(): + var0 = tf.Variable([1.0, 2.0]) + var1 = tf.Variable([3.0, 4.0]) + grads0 = tf.constant([0.1, 0.1]) + grads1 = tf.constant([0.01, 0.01]) + mom_opt = tf.train.MomentumOptimizer( + learning_rate=tf.constant(2.0), momentum=tf.constant(0.9)) + mom_update = mom_opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + tf.initialize_all_variables().run() + # Check we have slots + self.assertEqual(["momentum"], mom_opt.get_slot_names()) + slot0 = mom_opt.get_slot(var0, "momentum") + self.assertEquals(slot0.get_shape(), var0.get_shape()) + self.assertFalse(slot0 in tf.trainable_variables()) + slot1 = mom_opt.get_slot(var1, "momentum") + self.assertEquals(slot1.get_shape(), var1.get_shape()) + self.assertFalse(slot1 in tf.trainable_variables()) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Step 1: the momentum accumulators where 0. So we should see a normal + # update: v -= grad * learning_rate + mom_update.run() + # Check that the momentum accumulators have been updated. + self.assertAllClose(np.array([0.1, 0.1]), slot0.eval()) + self.assertAllClose(np.array([0.01, 0.01]), slot1.eval()) + # Check that the parameters have been updated. 
+ self.assertAllClose(np.array([1.0 - (0.1 * 2.0), + 2.0 - (0.1 * 2.0)]), + var0.eval()) + self.assertAllClose(np.array([3.0 - (0.01 * 2.0), + 4.0 - (0.01 * 2.0)]), + var1.eval()) + # Step 2: the momentum accumulators contain the previous update. + mom_update.run() + # Check that the momentum accumulators have been updated. + self.assertAllClose(np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), + slot0.eval()) + self.assertAllClose(np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), + slot1.eval()) + # Check that the parameters have been updated. + self.assertAllClose( + np.array([1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), + 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)]), + var0.eval()) + self.assertAllClose(np.array([2.98 - ((0.9 * 0.01 + 0.01) * 2.0), + 3.98 - ((0.9 * 0.01 + 0.01) * 2.0)]), + var1.eval()) + def testFloat64(self): with self.test_session(): opt = tf.train.MomentumOptimizer(learning_rate=2.0, momentum=0.9) diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py index dc2f700f816..d9b6062cb71 100644 --- a/tensorflow/python/training/optimizer.py +++ b/tensorflow/python/training/optimizer.py @@ -225,6 +225,8 @@ class Optimizer(object): for var in var_list: if not isinstance(var, variables.Variable): raise TypeError("Argument is not a variables.Variable: %s" % var) + if not var_list: + raise ValueError("No variables to optimize") grads = gradients.gradients( loss, var_list, gate_gradients=(gate_gradients == Optimizer.GATE_OP), aggregation_method=aggregation_method) @@ -254,6 +256,7 @@ class Optimizer(object): Raises: TypeError: if `grads_and_vars` is malformed. + ValueError: if none of the variables have gradients. """ # This is a default implementation of apply_gradients() that can be shared # by most optimizers. 
It relies on the subclass implementing the following @@ -268,7 +271,11 @@ class Optimizer(object): "Variable must be a variables.Variable: %s" % v) if g is not None: self._assert_valid_dtypes([g, v]) - self._create_slots([v for g, v in grads_and_vars if g is not None]) + var_list = [v for g, v in grads_and_vars if g is not None] + if not var_list: + raise ValueError("No gradients provided for any variable: %s" % + grads_and_vars) + self._create_slots(var_list) update_ops = [] with ops.op_scope([], name, self._name) as name: self._prepare() diff --git a/tensorflow/python/training/optimizer_test.py b/tensorflow/python/training/optimizer_test.py index b9f2b5fdef5..204d9e7d3b4 100644 --- a/tensorflow/python/training/optimizer_test.py +++ b/tensorflow/python/training/optimizer_test.py @@ -64,6 +64,26 @@ class OptimizerTest(tf.test.TestCase): self.assertAllClose([-14., -13.], var0.eval()) self.assertAllClose([-6., -5.], var1.eval()) + def testNoVariables(self): + with self.test_session(): + var0 = tf.Variable([1.0, 2.0], trainable=False) + var1 = tf.Variable([3.0, 4.0], trainable=False) + cost = 5 * var0 + var1 + sgd_op = tf.train.GradientDescentOptimizer(3.0) + with self.assertRaisesRegexp(ValueError, 'No variables'): + sgd_op.minimize(cost) + + def testNoGradients(self): + with self.test_session(): + var0 = tf.Variable([1.0, 2.0]) + var1 = tf.Variable([3.0, 4.0]) + cost = 5 * var0 + global_step = tf.Variable(tf.zeros([], tf.int64), name='global_step') + sgd_op = tf.train.GradientDescentOptimizer(3.0) + with self.assertRaisesRegexp(ValueError, 'No gradients'): + # var1 has no gradient + sgd_op.minimize(cost, global_step, [var1]) + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py index 4b86a08609d..08250dc750b 100644 --- a/tensorflow/python/training/saver.py +++ b/tensorflow/python/training/saver.py @@ -284,7 +284,7 @@ class BaseSaverBuilder(object): else: names_to_variables[name] = [var] else: - var = ops.convert_to_tensor(var) + var = ops.convert_to_tensor(var, as_ref=True) if not self._IsVariable(var): raise TypeError("Variable to save is not a Variable: %s" % var) name = var.op.name @@ -341,7 +341,7 @@ class BaseSaverBuilder(object): # pylint: enable=protected-access else: # A variable or tensor. - variable = ops.convert_to_tensor(v) + variable = ops.convert_to_tensor(v, as_ref=True) if not self._IsVariable(variable): raise TypeError("names_to_variables must be a dict mapping string " "names to Tensors/Variables. Not a variable: %s" % diff --git a/tensorflow/tensorboard/README.md b/tensorflow/tensorboard/README.md index eb85a1e4610..99279fa9ed5 100644 --- a/tensorflow/tensorboard/README.md +++ b/tensorflow/tensorboard/README.md @@ -1,13 +1,20 @@ # TensorBoard TensorBoard is a suite of web applications for inspecting and understanding your -TensorFlow runs and graphs. +TensorFlow runs and graphs. Before running TensorBoard, make sure you have +generated summary data in a log directory by creating a `SummaryWriter`: -Example Usage: +```python +# sess.graph_def is the graph definition. +summary_writer = tf.train.SummaryWriter('/path/to/logs', sess.graph_def) +``` + +For more details, see [this tutorial](http://www.tensorflow.org/how_tos/summaries_and_tensorboard/index.html#serializing-the-data). 
+Then run TensorBoard and provide the log directory: ``` python tensorflow/tensorboard/tensorboard.py --logdir=path/to/logs -# if installed via pip +# or if installed via pip, run: tensorboard --logdir=path/to/logs # if building from source @@ -26,7 +33,14 @@ includes a frontend (app/tf-tensorboard.html) that contains html and javascript for displaying this data in a UI. -## Building the TensorBoard frontend +## TensorBoard Development Instructions + +The following instructions are useful if you want to develop the TensorBoard +frontend in a lightweight frontend-only environment. It sets up gulp with +automatic recompiling and serves just the frontend assets without a connected +backend. + +If you just want to use TensorBoard, there is no need to read any further. ### Install Node, npm, gulp, bower, and tsd in your machine Get nodejs and npm through whatever package distribution system is appropriate @@ -43,24 +57,11 @@ run the following commands. bower install tsd install -### Run Gulp Vulcanize +### Run Gulp -Inside this directory, run `gulp vulcanize`. That will compile all of the -html/js/css dependencies for TensorBoard into a monolithic index.html file under -dist/. Once you've done this, you can locally run your own TensorBoard instance -and it will have a working frontend. +Inside this directory, run `gulp`. That will compile all of the +html/js/css dependencies for TensorBoard, and also spin up a server +(by default at port 8000). You can navigate to component-specific demo pages to +check out their behavior. -### Frontend General Dev Instructions - -To speed up the development process, we can run the frontend code independently -of the backend, and mock out the backend with static JSON files. This allows -testing the frontend's correctness without needing to find real data and spin -up a real server. Look at app/demo/index.html for an example. - -The following gulp commands are useful: - -* `gulp test` - build, test, and lint the code -* `gulp watch` - build, test, and rebuild on change -* `gulp server` - start a livereload server on localhost:8000 -* `gulp` - alias for `gulp watch` -* `gulp vulcanize` - +Running `gulp test` will run all unit tests, the linter, etc. 
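To complement the `SummaryWriter` snippet added to the README above, here is a minimal end-to-end sketch of producing scalar summary data for TensorBoard to display. This is an editorial illustration, assuming the summary APIs of this TensorFlow release (`tf.scalar_summary`, `tf.merge_all_summaries`, `tf.train.SummaryWriter`) and a made-up log directory:

```python
import tensorflow as tf

# Hypothetical quantity to track across training steps.
loss = tf.Variable(1.0, name='loss')
decay = loss.assign(loss * 0.9)

tf.scalar_summary('loss', loss)    # register a scalar summary for 'loss'
merged = tf.merge_all_summaries()  # single op that evaluates every summary

with tf.Session() as sess:
  writer = tf.train.SummaryWriter('/tmp/logs/demo', sess.graph_def)
  sess.run(tf.initialize_all_variables())
  for step in range(100):
    summary_str, _ = sess.run([merged, decay])
    writer.add_summary(summary_str, step)  # these events are what TensorBoard reads
  writer.close()
```

After running this, `tensorboard --logdir=/tmp/logs` should show a `loss` curve decaying over 100 steps.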
diff --git a/tensorflow/tensorboard/app/index.html b/tensorflow/tensorboard/app/index.html index bf466c79243..c4031c14d30 100644 --- a/tensorflow/tensorboard/app/index.html +++ b/tensorflow/tensorboard/app/index.html @@ -2,9 +2,16 @@ + - - + TensorBoard diff --git a/tensorflow/tensorboard/app/tf-tensorboard-demo.html b/tensorflow/tensorboard/app/tf-tensorboard-demo.html deleted file mode 100644 index 5f0ef5b00c7..00000000000 --- a/tensorflow/tensorboard/app/tf-tensorboard-demo.html +++ /dev/null @@ -1,72 +0,0 @@ - - - - - - - diff --git a/tensorflow/tensorboard/bower.json b/tensorflow/tensorboard/bower.json index 7d9f033d4dc..06a31968ab0 100644 --- a/tensorflow/tensorboard/bower.json +++ b/tensorflow/tensorboard/bower.json @@ -20,9 +20,11 @@ "es6-promise": "3.0.2", "graphlib": "1.0.7", "iron-ajax": "PolymerElements/iron-ajax#1.0.7", + "iron-behaviors": "PolymerElements/iron-behaviors#1.0.10", "iron-collapse": "PolymerElements/iron-collapse#1.0.4", "iron-list": "PolymerElements/iron-list#1.1.5", "iron-selector": "PolymerElements/iron-selector#1.0.7", + "paper-behaviors": "PolymerElements/paper-behaviors#1.0.9", "paper-button": "PolymerElements/paper-button#1.0.8", "paper-checkbox": "PolymerElements/paper-checkbox#1.0.13", "paper-dropdown-menu": "PolymerElements/paper-dropdown-menu#1.0.5", diff --git a/tensorflow/tensorboard/components/imports/README.md b/tensorflow/tensorboard/components/imports/README.md new file mode 100644 index 00000000000..695698bf237 --- /dev/null +++ b/tensorflow/tensorboard/components/imports/README.md @@ -0,0 +1,6 @@ +These files act as import routers for third party javascript libraries, +e.g. Plottable and D3. + +The "local-imports" folder contains alternate versions of the import routers +that load from `bower_components`; it exists to facilitate local development +with a gulp workflow.
diff --git a/tensorflow/tensorboard/components/imports/dagre.html b/tensorflow/tensorboard/components/imports/dagre.html new file mode 100644 index 00000000000..b75f137cb28 --- /dev/null +++ b/tensorflow/tensorboard/components/imports/dagre.html @@ -0,0 +1,2 @@ + + diff --git a/tensorflow/tensorboard/components/imports/graphlib.html b/tensorflow/tensorboard/components/imports/graphlib.html new file mode 100644 index 00000000000..189eff17201 --- /dev/null +++ b/tensorflow/tensorboard/components/imports/graphlib.html @@ -0,0 +1 @@ + diff --git a/tensorflow/tensorboard/components/imports/local-imports/d3.html b/tensorflow/tensorboard/components/imports/local-imports/d3.html new file mode 100644 index 00000000000..e2797c0a1a9 --- /dev/null +++ b/tensorflow/tensorboard/components/imports/local-imports/d3.html @@ -0,0 +1 @@ + diff --git a/tensorflow/tensorboard/components/imports/local-imports/dagre.html b/tensorflow/tensorboard/components/imports/local-imports/dagre.html new file mode 100644 index 00000000000..b685aea6c93 --- /dev/null +++ b/tensorflow/tensorboard/components/imports/local-imports/dagre.html @@ -0,0 +1,4 @@ +// hackhack for some reason getting graphlib via an import reference results in +// out of order script evaluation + + diff --git a/tensorflow/tensorboard/components/imports/local-imports/graphlib.html b/tensorflow/tensorboard/components/imports/local-imports/graphlib.html new file mode 100644 index 00000000000..a1e98e9089d --- /dev/null +++ b/tensorflow/tensorboard/components/imports/local-imports/graphlib.html @@ -0,0 +1 @@ + diff --git a/tensorflow/tensorboard/components/imports/local-imports/lodash.html b/tensorflow/tensorboard/components/imports/local-imports/lodash.html new file mode 100644 index 00000000000..95f8375a1d4 --- /dev/null +++ b/tensorflow/tensorboard/components/imports/local-imports/lodash.html @@ -0,0 +1 @@ + diff --git a/tensorflow/tensorboard/components/imports/local-imports/plottable.html b/tensorflow/tensorboard/components/imports/local-imports/plottable.html new file mode 100644 index 00000000000..dfbe77c8c44 --- /dev/null +++ b/tensorflow/tensorboard/components/imports/local-imports/plottable.html @@ -0,0 +1,3 @@ + + + diff --git a/tensorflow/tensorboard/test/index.html b/tensorflow/tensorboard/components/test/index.html similarity index 56% rename from tensorflow/tensorboard/test/index.html rename to tensorflow/tensorboard/components/test/index.html index e02aafc668f..bef954701fe 100644 --- a/tensorflow/tensorboard/test/index.html +++ b/tensorflow/tensorboard/components/test/index.html @@ -2,16 +2,15 @@ - - \ No newline at end of file + diff --git a/tensorflow/tensorboard/components/tf-graph-common/test/index.html b/tensorflow/tensorboard/components/tf-graph-common/test/index.html index fddcb2fde4e..c7694e75149 100644 --- a/tensorflow/tensorboard/components/tf-graph-common/test/index.html +++ b/tensorflow/tensorboard/components/tf-graph-common/test/index.html @@ -12,4 +12,4 @@ - \ No newline at end of file + diff --git a/tensorflow/tensorboard/components/tf-graph-common/tf-graph-common.html b/tensorflow/tensorboard/components/tf-graph-common/tf-graph-common.html index 107e3ab7a09..f42e7d27dd7 100644 --- a/tensorflow/tensorboard/components/tf-graph-common/tf-graph-common.html +++ b/tensorflow/tensorboard/components/tf-graph-common/tf-graph-common.html @@ -1,7 +1,7 @@ - - - - + + + + diff --git a/tensorflow/tensorboard/components/tf-graph-loader/test/index.html b/tensorflow/tensorboard/components/tf-graph-loader/test/index.html index 
ba31e22c5b7..e484b43822f 100644 --- a/tensorflow/tensorboard/components/tf-graph-loader/test/index.html +++ b/tensorflow/tensorboard/components/tf-graph-loader/test/index.html @@ -10,4 +10,4 @@ - \ No newline at end of file + diff --git a/tensorflow/tensorboard/app/demo/data/cos.json b/tensorflow/tensorboard/components/tf-tensorboard/demo/data/cos.json similarity index 100% rename from tensorflow/tensorboard/app/demo/data/cos.json rename to tensorflow/tensorboard/components/tf-tensorboard/demo/data/cos.json diff --git a/tensorflow/tensorboard/app/demo/data/cubic.json b/tensorflow/tensorboard/components/tf-tensorboard/demo/data/cubic.json similarity index 100% rename from tensorflow/tensorboard/app/demo/data/cubic.json rename to tensorflow/tensorboard/components/tf-tensorboard/demo/data/cubic.json diff --git a/tensorflow/tensorboard/app/demo/data/linear.json b/tensorflow/tensorboard/components/tf-tensorboard/demo/data/linear.json similarity index 100% rename from tensorflow/tensorboard/app/demo/data/linear.json rename to tensorflow/tensorboard/components/tf-tensorboard/demo/data/linear.json diff --git a/tensorflow/tensorboard/app/demo/data/poly5-graph.pbtxt b/tensorflow/tensorboard/components/tf-tensorboard/demo/data/poly5-graph.pbtxt similarity index 100% rename from tensorflow/tensorboard/app/demo/data/poly5-graph.pbtxt rename to tensorflow/tensorboard/components/tf-tensorboard/demo/data/poly5-graph.pbtxt diff --git a/tensorflow/tensorboard/app/demo/data/poly5.json b/tensorflow/tensorboard/components/tf-tensorboard/demo/data/poly5.json similarity index 100% rename from tensorflow/tensorboard/app/demo/data/poly5.json rename to tensorflow/tensorboard/components/tf-tensorboard/demo/data/poly5.json diff --git a/tensorflow/tensorboard/app/demo/data/runs.json b/tensorflow/tensorboard/components/tf-tensorboard/demo/data/runs.json similarity index 100% rename from tensorflow/tensorboard/app/demo/data/runs.json rename to tensorflow/tensorboard/components/tf-tensorboard/demo/data/runs.json diff --git a/tensorflow/tensorboard/app/demo/data/sin-graph.pbtxt b/tensorflow/tensorboard/components/tf-tensorboard/demo/data/sin-graph.pbtxt similarity index 100% rename from tensorflow/tensorboard/app/demo/data/sin-graph.pbtxt rename to tensorflow/tensorboard/components/tf-tensorboard/demo/data/sin-graph.pbtxt diff --git a/tensorflow/tensorboard/app/demo/data/sin.json b/tensorflow/tensorboard/components/tf-tensorboard/demo/data/sin.json similarity index 100% rename from tensorflow/tensorboard/app/demo/data/sin.json rename to tensorflow/tensorboard/components/tf-tensorboard/demo/data/sin.json diff --git a/tensorflow/tensorboard/app/demo/data/sq.json b/tensorflow/tensorboard/components/tf-tensorboard/demo/data/sq.json similarity index 100% rename from tensorflow/tensorboard/app/demo/data/sq.json rename to tensorflow/tensorboard/components/tf-tensorboard/demo/data/sq.json diff --git a/tensorflow/tensorboard/app/demo/index.html b/tensorflow/tensorboard/components/tf-tensorboard/demo/index.html similarity index 79% rename from tensorflow/tensorboard/app/demo/index.html rename to tensorflow/tensorboard/components/tf-tensorboard/demo/index.html index a12b5abc261..369032bae15 100644 --- a/tensorflow/tensorboard/app/demo/index.html +++ b/tensorflow/tensorboard/components/tf-tensorboard/demo/index.html @@ -1,7 +1,7 @@ - + - + TensorBoard Demo diff --git a/tensorflow/tensorboard/app/tf-tensorboard.html b/tensorflow/tensorboard/components/tf-tensorboard/tf-tensorboard.html similarity index 82% rename from 
tensorflow/tensorboard/app/tf-tensorboard.html rename to tensorflow/tensorboard/components/tf-tensorboard/tf-tensorboard.html index 0f5114143e1..9b9da223852 100644 --- a/tensorflow/tensorboard/app/tf-tensorboard.html +++ b/tensorflow/tensorboard/components/tf-tensorboard/tf-tensorboard.html @@ -1,12 +1,12 @@ - - - - - - - - - + + + + + + + + +