diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index cf8a668affc..0d2c9f2d195 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -416,16 +416,7 @@ tf_gen_op_wrappers_cc( "sparse_ops", "state_ops", "string_ops", - "adadelta_ops", - "adagrad_da_ops", - "adagrad_ops", - "adam_ops", - "ftrl_ops", - "momentum_ops", - "gradient_descent_ops", - "proximal_adagrad_ops", - "proximal_gradient_descent_ops", - "rms_prop_ops", + "training_ops", "user_ops", ], other_hdrs = [ diff --git a/tensorflow/cc/ops/standard_ops.h b/tensorflow/cc/ops/standard_ops.h index d0537dd79e1..0c021f0b3ac 100644 --- a/tensorflow/cc/ops/standard_ops.h +++ b/tensorflow/cc/ops/standard_ops.h @@ -16,34 +16,25 @@ limitations under the License. #ifndef THIRD_PARTY_TENSORFLOW_CC_OPS_STANDARD_OPS_H_ #define THIRD_PARTY_TENSORFLOW_CC_OPS_STANDARD_OPS_H_ -#include "tensorflow/cc/ops/adadelta_ops.h" -#include "tensorflow/cc/ops/adagrad_da_ops.h" -#include "tensorflow/cc/ops/adagrad_ops.h" -#include "tensorflow/cc/ops/adam_ops.h" #include "tensorflow/cc/ops/array_ops.h" #include "tensorflow/cc/ops/candidate_sampling_ops.h" #include "tensorflow/cc/ops/const_op.h" #include "tensorflow/cc/ops/control_flow_ops.h" #include "tensorflow/cc/ops/data_flow_ops.h" -#include "tensorflow/cc/ops/ftrl_ops.h" -#include "tensorflow/cc/ops/gradient_descent_ops.h" #include "tensorflow/cc/ops/image_ops.h" #include "tensorflow/cc/ops/io_ops.h" #include "tensorflow/cc/ops/linalg_ops.h" #include "tensorflow/cc/ops/logging_ops.h" #include "tensorflow/cc/ops/lookup_ops.h" #include "tensorflow/cc/ops/math_ops.h" -#include "tensorflow/cc/ops/momentum_ops.h" #include "tensorflow/cc/ops/nn_ops.h" #include "tensorflow/cc/ops/no_op.h" #include "tensorflow/cc/ops/parsing_ops.h" -#include "tensorflow/cc/ops/proximal_adagrad_ops.h" -#include "tensorflow/cc/ops/proximal_gradient_descent_ops.h" #include "tensorflow/cc/ops/random_ops.h" -#include "tensorflow/cc/ops/rms_prop_ops.h" #include "tensorflow/cc/ops/sparse_ops.h" #include "tensorflow/cc/ops/state_ops.h" #include "tensorflow/cc/ops/string_ops.h" +#include "tensorflow/cc/ops/training_ops.h" #include "tensorflow/cc/ops/user_ops.h" #endif // THIRD_PARTY_TENSORFLOW_CC_OPS_STANDARD_OPS_H_ diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 6f428d80458..87cb212ad0f 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -535,7 +535,6 @@ cc_library( # Generates library per group of ops. 
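For the tensorflow/cc changes above: existing C++ graph-construction code should keep compiling, because the single generated training_ops wrapper (pulled in through standard_ops.h) still exposes the same op wrappers that the per-optimizer headers did. A minimal sketch of such client code, assuming the usual generated wrapper signatures (variable names and values here are hypothetical, not part of this change):

#include "tensorflow/cc/client/client_session.h"
#include "tensorflow/cc/ops/standard_ops.h"  // now includes training_ops.h

using namespace tensorflow;
using namespace tensorflow::ops;

int main() {
  Scope root = Scope::NewRootScope();
  // A 2x2 float variable initialized to ones.
  auto var = Variable(root, {2, 2}, DT_FLOAT);
  auto init = Assign(root, var, Const(root, {{1.f, 1.f}, {1.f, 1.f}}));
  // Made-up gradient and learning rate for illustration.
  auto delta = Const(root, {{0.1f, 0.2f}, {0.3f, 0.4f}});
  auto alpha = Const(root, 0.5f);
  // ApplyGradientDescent is one of the wrappers generated from training_ops.
  auto sgd = ApplyGradientDescent(root, var, alpha, delta);

  ClientSession session(root);
  std::vector<Tensor> outputs;
  TF_CHECK_OK(session.Run({init}, &outputs));
  TF_CHECK_OK(session.Run({sgd}, &outputs));  // outputs[0] holds the updated var
  return 0;
}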
tf_gen_op_libs( - extra_srcs = ["ops/training_ops.h"], op_lib_names = [ "bitwise_ops", "candidate_sampling_ops", @@ -568,16 +567,7 @@ tf_gen_op_libs( "stateless_random_ops", "string_ops", "summary_ops", - "adadelta_ops", - "adagrad_da_ops", - "adagrad_ops", - "adam_ops", - "ftrl_ops", - "momentum_ops", - "gradient_descent_ops", - "proximal_adagrad_ops", - "proximal_gradient_descent_ops", - "rms_prop_ops", + "training_ops", ], ) @@ -655,16 +645,7 @@ cc_library( ":state_ops_op_lib", ":stateless_random_ops_op_lib", ":string_ops_op_lib", - ":adadelta_ops_op_lib", - ":adagrad_da_ops_op_lib", - ":adagrad_ops_op_lib", - ":adam_ops_op_lib", - ":ftrl_ops_op_lib", - ":momentum_ops_op_lib", - ":gradient_descent_ops_op_lib", - ":proximal_adagrad_ops_op_lib", - ":proximal_gradient_descent_ops_op_lib", - ":rms_prop_ops_op_lib", + ":training_ops_op_lib", ":user_ops_op_lib", ":word2vec_ops", ] + tf_additional_cloud_op_deps(), diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 765976c37c3..10f9e7344a5 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -3888,18 +3888,9 @@ tf_kernel_library( ":bounds_check", ":training_op_helpers", ":variable_ops", - "//tensorflow/core:adadelta_ops_op_lib", - "//tensorflow/core:adagrad_da_ops_op_lib", - "//tensorflow/core:adagrad_ops_op_lib", - "//tensorflow/core:adam_ops_op_lib", "//tensorflow/core:framework", - "//tensorflow/core:ftrl_ops_op_lib", - "//tensorflow/core:gradient_descent_ops_op_lib", "//tensorflow/core:lib", - "//tensorflow/core:momentum_ops_op_lib", - "//tensorflow/core:proximal_adagrad_ops_op_lib", - "//tensorflow/core:proximal_gradient_descent_ops_op_lib", - "//tensorflow/core:rms_prop_ops_op_lib", + "//tensorflow/core:training_ops_op_lib", "//third_party/eigen3", ], ) diff --git a/tensorflow/core/ops/adadelta_ops.cc b/tensorflow/core/ops/adadelta_ops.cc deleted file mode 100644 index b7dcff4a09e..00000000000 --- a/tensorflow/core/ops/adadelta_ops.cc +++ /dev/null @@ -1,163 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyAdadeltaShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum - TF_RETURN_IF_ERROR( - c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // accum update - TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // lr - TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // rho - TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // epsilon - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 6 /* grad_idx */, &s)); - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyAdadelta") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("accum_update: Ref(T)") - .Input("lr: T") - .Input("rho: T") - .Input("epsilon: T") - .Input("grad: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdadeltaShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the adadelta scheme. - -accum = rho() * accum + (1 - rho()) * grad.square(); -update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad; -update_accum = rho() * update_accum + (1 - rho()) * update.square(); -var -= update; - -var: Should be from a Variable(). -accum: Should be from a Variable(). -accum_update: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -rho: Decay factor. Must be a scalar. -epsilon: Constant factor. Must be a scalar. -grad: The gradient. -out: Same as "var". -use_locking: If True, updating of the var, accum and update_accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("SparseApplyAdadelta") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("accum_update: Ref(T)") - .Input("lr: T") - .Input("rho: T") - .Input("epsilon: T") - .Input("grad: T") - .Input("indices: Tindices") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdadeltaShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -var: Should be from a Variable(). -accum: Should be from a Variable(). -accum_update:: Should be from a Variable(). -lr: Learning rate. Must be a scalar. -rho: Decay factor. Must be a scalar. -epsilon: Constant factor. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -out: Same as "var". -use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("ResourceApplyAdadelta") - .Input("var: resource") - .Input("accum: resource") - .Input("accum_update: resource") - .Input("lr: T") - .Input("rho: T") - .Input("epsilon: T") - .Input("grad: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdadeltaShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the adadelta scheme. 
- -accum = rho() * accum + (1 - rho()) * grad.square(); -update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad; -update_accum = rho() * update_accum + (1 - rho()) * update.square(); -var -= update; - -var: Should be from a Variable(). -accum: Should be from a Variable(). -accum_update: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -rho: Decay factor. Must be a scalar. -epsilon: Constant factor. Must be a scalar. -grad: The gradient. -use_locking: If True, updating of the var, accum and update_accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("ResourceSparseApplyAdadelta") - .Input("var: resource") - .Input("accum: resource") - .Input("accum_update: resource") - .Input("lr: T") - .Input("rho: T") - .Input("epsilon: T") - .Input("grad: T") - .Input("indices: Tindices") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdadeltaShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -var: Should be from a Variable(). -accum: Should be from a Variable(). -accum_update:: Should be from a Variable(). -lr: Learning rate. Must be a scalar. -rho: Decay factor. Must be a scalar. -epsilon: Constant factor. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -} // namespace tensorflow diff --git a/tensorflow/core/ops/adagrad_da_ops.cc b/tensorflow/core/ops/adagrad_da_ops.cc deleted file mode 100644 index 997a0249904..00000000000 --- a/tensorflow/core/ops/adagrad_da_ops.cc +++ /dev/null @@ -1,168 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyAdagradDAShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR( - c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // grad_accumulator - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), - &s)); // gradient_squared_accumulator - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); - int idx = sparse ? 
5 : 4; - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l1 - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l2 - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // global step - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyAdagradDA") - .Input("var: Ref(T)") - .Input("gradient_accumulator: Ref(T)") - .Input("gradient_squared_accumulator: Ref(T)") - .Input("grad: T") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("global_step: int64") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdagradDAShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the proximal adagrad scheme. - -var: Should be from a Variable(). -gradient_accumulator: Should be from a Variable(). -gradient_squared_accumulator: Should be from a Variable(). -grad: The gradient. -lr: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -global_step: Training step number. Must be a scalar. -out: Same as "var". -use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("SparseApplyAdagradDA") - .Input("var: Ref(T)") - .Input("gradient_accumulator: Ref(T)") - .Input("gradient_squared_accumulator: Ref(T)") - .Input("grad: T") - .Input("indices: Tindices") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("global_step: int64") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdagradDAShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update entries in '*var' and '*accum' according to the proximal adagrad scheme. - -var: Should be from a Variable(). -gradient_accumulator: Should be from a Variable(). -gradient_squared_accumulator: Should be from a Variable(). -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -lr: Learning rate. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -global_step: Training step number. Must be a scalar. -out: Same as "var". -use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("ResourceApplyAdagradDA") - .Input("var: resource") - .Input("gradient_accumulator: resource") - .Input("gradient_squared_accumulator: resource") - .Input("grad: T") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("global_step: int64") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdagradDAShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the proximal adagrad scheme. - -var: Should be from a Variable(). -gradient_accumulator: Should be from a Variable(). -gradient_squared_accumulator: Should be from a Variable(). -grad: The gradient. -lr: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -global_step: Training step number. Must be a scalar. 
-use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("ResourceSparseApplyAdagradDA") - .Input("var: resource") - .Input("gradient_accumulator: resource") - .Input("gradient_squared_accumulator: resource") - .Input("grad: T") - .Input("indices: Tindices") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("global_step: int64") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdagradDAShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update entries in '*var' and '*accum' according to the proximal adagrad scheme. - -var: Should be from a Variable(). -gradient_accumulator: Should be from a Variable(). -gradient_squared_accumulator: Should be from a Variable(). -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -lr: Learning rate. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -global_step: Training step number. Must be a scalar. -use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -} // namespace tensorflow diff --git a/tensorflow/core/ops/adagrad_ops.cc b/tensorflow/core/ops/adagrad_ops.cc deleted file mode 100644 index 03dde949e41..00000000000 --- a/tensorflow/core/ops/adagrad_ops.cc +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyAdagradShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum - TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyAdagrad") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("lr: T") - .Input("grad: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdagradShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the adagrad scheme. - -accum += grad * grad -var -= lr * grad * (1 / sqrt(accum)) - -var: Should be from a Variable(). -accum: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -grad: The gradient. -out: Same as "var". 
-use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("ResourceApplyAdagrad") - .Input("var: resource") - .Input("accum: resource") - .Input("lr: T") - .Input("grad: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdagradShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the adagrad scheme. - -accum += grad * grad -var -= lr * grad * (1 / sqrt(accum)) - -var: Should be from a Variable(). -accum: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -grad: The gradient. -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("SparseApplyAdagrad") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("lr: T") - .Input("grad: T") - .Input("indices: Tindices") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdagradShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update relevant entries in '*var' and '*accum' according to the adagrad scheme. - -That is for rows we have grad for, we update var and accum as follows: -accum += grad * grad -var -= lr * grad * (1 / sqrt(accum)) - -var: Should be from a Variable(). -accum: Should be from a Variable(). -lr: Learning rate. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -out: Same as "var". -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("ResourceSparseApplyAdagrad") - .Input("var: resource") - .Input("accum: resource") - .Input("lr: T") - .Input("grad: T") - .Input("indices: Tindices") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdagradShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update relevant entries in '*var' and '*accum' according to the adagrad scheme. - -That is for rows we have grad for, we update var and accum as follows: -accum += grad * grad -var -= lr * grad * (1 / sqrt(accum)) - -var: Should be from a Variable(). -accum: Should be from a Variable(). -lr: Learning rate. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -} // namespace tensorflow diff --git a/tensorflow/core/ops/adam_ops.cc b/tensorflow/core/ops/adam_ops.cc deleted file mode 100644 index 74cf2f5267a..00000000000 --- a/tensorflow/core/ops/adam_ops.cc +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // m - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // v - TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // beta1_power - TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // beta2_power - TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // lr - TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // beta1 - TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); // beta2 - TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused)); // epsilon - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 9 /* grad_idx */, &s)); - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyAdam") - .Input("var: Ref(T)") - .Input("m: Ref(T)") - .Input("v: Ref(T)") - .Input("beta1_power: T") - .Input("beta2_power: T") - .Input("lr: T") - .Input("beta1: T") - .Input("beta2: T") - .Input("epsilon: T") - .Input("grad: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .Attr("use_nesterov: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdamShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the Adam algorithm. - -lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t) -m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t -v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t -variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon) - -var: Should be from a Variable(). -m: Should be from a Variable(). -v: Should be from a Variable(). -beta1_power: Must be a scalar. -beta2_power: Must be a scalar. -lr: Scaling factor. Must be a scalar. -beta1: Momentum factor. Must be a scalar. -beta2: Momentum factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -grad: The gradient. -out: Same as "var". -use_locking: If `True`, updating of the var, m, and v tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -use_nesterov: If `True`, uses the nesterov update. -)doc"); - -REGISTER_OP("ResourceApplyAdam") - .Input("var: resource") - .Input("m: resource") - .Input("v: resource") - .Input("beta1_power: T") - .Input("beta2_power: T") - .Input("lr: T") - .Input("beta1: T") - .Input("beta2: T") - .Input("epsilon: T") - .Input("grad: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .Attr("use_nesterov: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdamShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the Adam algorithm. 
- -lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t) -m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t -v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t -variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon) - -var: Should be from a Variable(). -m: Should be from a Variable(). -v: Should be from a Variable(). -beta1_power: Must be a scalar. -beta2_power: Must be a scalar. -lr: Scaling factor. Must be a scalar. -beta1: Momentum factor. Must be a scalar. -beta2: Momentum factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -grad: The gradient. -use_locking: If `True`, updating of the var, m, and v tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -use_nesterov: If `True`, uses the nesterov update. -)doc"); - -} // namespace tensorflow diff --git a/tensorflow/core/ops/ftrl_ops.cc b/tensorflow/core/ops/ftrl_ops.cc deleted file mode 100644 index a40823eb539..00000000000 --- a/tensorflow/core/ops/ftrl_ops.cc +++ /dev/null @@ -1,368 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyFtrlShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // linear - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); - int idx = sparse ? 5 : 4; - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l1 - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l2 - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr_power - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyFtrl") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("linear: Ref(T)") - .Input("grad: T") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("lr_power: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyFtrlShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the Ftrl-proximal scheme. - -accum_new = accum + grad * grad -linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -accum = accum_new - -var: Should be from a Variable(). -accum: Should be from a Variable(). -linear: Should be from a Variable(). -grad: The gradient. -lr: Scaling factor. Must be a scalar. 
-l1: L1 regulariation. Must be a scalar. -l2: L2 regulariation. Must be a scalar. -lr_power: Scaling factor. Must be a scalar. -out: Same as "var". -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("SparseApplyFtrl") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("linear: Ref(T)") - .Input("grad: T") - .Input("indices: Tindices") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("lr_power: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyFtrlShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update relevant entries in '*var' according to the Ftrl-proximal scheme. - -That is for rows we have grad for, we update var, accum and linear as follows: -accum_new = accum + grad * grad -linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -accum = accum_new - -var: Should be from a Variable(). -accum: Should be from a Variable(). -linear: Should be from a Variable(). -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -lr: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -lr_power: Scaling factor. Must be a scalar. -out: Same as "var". -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("ResourceApplyFtrl") - .Input("var: resource") - .Input("accum: resource") - .Input("linear: resource") - .Input("grad: T") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("lr_power: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyFtrlShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the Ftrl-proximal scheme. - -accum_new = accum + grad * grad -linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -accum = accum_new - -var: Should be from a Variable(). -accum: Should be from a Variable(). -linear: Should be from a Variable(). -grad: The gradient. -lr: Scaling factor. Must be a scalar. -l1: L1 regulariation. Must be a scalar. -l2: L2 regulariation. Must be a scalar. -lr_power: Scaling factor. Must be a scalar. -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("ResourceSparseApplyFtrl") - .Input("var: resource") - .Input("accum: resource") - .Input("linear: resource") - .Input("grad: T") - .Input("indices: Tindices") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("lr_power: T") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyFtrlShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update relevant entries in '*var' according to the Ftrl-proximal scheme. 
- -That is for rows we have grad for, we update var, accum and linear as follows: -accum_new = accum + grad * grad -linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -accum = accum_new - -var: Should be from a Variable(). -accum: Should be from a Variable(). -linear: Should be from a Variable(). -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -lr: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -lr_power: Scaling factor. Must be a scalar. -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("ApplyFtrlV2") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("linear: Ref(T)") - .Input("grad: T") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("l2_shrinkage: T") - .Input("lr_power: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyFtrlShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the Ftrl-proximal scheme. - -grad_with_shrinkage = grad + 2 * l2_shrinkage * var -accum_new = accum + grad_with_shrinkage * grad_with_shrinkage -linear += grad_with_shrinkage + - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -accum = accum_new - -var: Should be from a Variable(). -accum: Should be from a Variable(). -linear: Should be from a Variable(). -grad: The gradient. -lr: Scaling factor. Must be a scalar. -l1: L1 regulariation. Must be a scalar. -l2: online L2 regulariation. Must be a scalar. -l2: L2 shrinkage regulariation. Must be a scalar. -lr_power: Scaling factor. Must be a scalar. -out: Same as "var". -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("SparseApplyFtrlV2") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("linear: Ref(T)") - .Input("grad: T") - .Input("indices: Tindices") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("l2_shrinkage: T") - .Input("lr_power: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyFtrlShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update relevant entries in '*var' according to the Ftrl-proximal scheme. - -That is for rows we have grad for, we update var, accum and linear as follows: -grad_with_shrinkage = grad + 2 * l2_shrinkage * var -accum_new = accum + grad_with_shrinkage * grad_with_shrinkage -linear += grad_with_shrinkage + - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -accum = accum_new - -var: Should be from a Variable(). -accum: Should be from a Variable(). -linear: Should be from a Variable(). -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -lr: Scaling factor. Must be a scalar. 
-l1: L1 regularization. Must be a scalar. -l2: onine L2 regularization. Must be a scalar. -l2: L2 shrinkage regulariation. Must be a scalar. -lr_power: Scaling factor. Must be a scalar. -out: Same as "var". -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("ResourceApplyFtrlV2") - .Input("var: resource") - .Input("accum: resource") - .Input("linear: resource") - .Input("grad: T") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("l2_shrinkage: T") - .Input("lr_power: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyFtrlShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the Ftrl-proximal scheme. - -grad_with_shrinkage = grad + 2 * l2_shrinkage * var -accum_new = accum + grad_with_shrinkage * grad_with_shrinkage -linear += grad_with_shrinkage + - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -accum = accum_new - -var: Should be from a Variable(). -accum: Should be from a Variable(). -linear: Should be from a Variable(). -grad: The gradient. -lr: Scaling factor. Must be a scalar. -l1: L1 regulariation. Must be a scalar. -l2: onine L2 regularization. Must be a scalar. -l2: L2 shrinkage regulariation. Must be a scalar. -lr_power: Scaling factor. Must be a scalar. -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("ResourceSparseApplyFtrlV2") - .Input("var: resource") - .Input("accum: resource") - .Input("linear: resource") - .Input("grad: T") - .Input("indices: Tindices") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("l2_shrinkage: T") - .Input("lr_power: T") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyFtrlShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update relevant entries in '*var' according to the Ftrl-proximal scheme. - -That is for rows we have grad for, we update var, accum and linear as follows: -grad_with_shrinkage = grad + 2 * l2_shrinkage * var -accum_new = accum + grad_with_shrinkage * grad_with_shrinkage -linear += grad_with_shrinkage + - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -accum = accum_new - -var: Should be from a Variable(). -accum: Should be from a Variable(). -linear: Should be from a Variable(). -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -lr: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: onine L2 regularization. Must be a scalar. -l2: L2 shrinkage regulariation. Must be a scalar. -lr_power: Scaling factor. Must be a scalar. -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. 
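Every shape function in the deleted files (here and below) calls two helpers, ShapeOrHandleShape and HandleGradAndIndicesInputs, which come from tensorflow/core/ops/training_ops.h and are not shown in this diff. A rough sketch of what they plausibly do, inferred only from how they are called above; this is an approximation, not the library's actual implementation:

#include "tensorflow/core/framework/shape_inference.h"

namespace tensorflow {

using shape_inference::DimensionHandle;
using shape_inference::InferenceContext;
using shape_inference::ShapeHandle;

// If the input is a resource handle with a known value shape, use that shape;
// otherwise fall back to the input's own shape. This lets one shape function
// serve both the Ref(T) and the resource variants of an op.
static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) {
  const auto* handle_data = c->input_handle_shapes_and_types(input);
  if (handle_data != nullptr && !handle_data->empty() &&
      (*handle_data)[0].dtype != DT_INVALID) {
    return (*handle_data)[0].shape;
  }
  return c->input(input);
}

// Dense case: merge grad into the accumulated var shape. Sparse case: indices
// must be a rank-1 tensor whose length matches grad's first dimension, and
// grad's remaining dimensions must match var's.
static Status HandleGradAndIndicesInputs(InferenceContext* c, bool sparse,
                                         int grad_idx, ShapeHandle* s) {
  ShapeHandle grad = ShapeOrHandleShape(c, grad_idx);
  if (!sparse) {
    TF_RETURN_IF_ERROR(c->Merge(*s, grad, s));
    return Status::OK();
  }
  ShapeHandle indices;
  TF_RETURN_IF_ERROR(c->WithRank(c->input(grad_idx + 1), 1, &indices));
  DimensionHandle unused;
  TF_RETURN_IF_ERROR(c->Merge(c->Dim(indices, 0), c->Dim(grad, 0), &unused));
  ShapeHandle grad_unknown_first;
  TF_RETURN_IF_ERROR(
      c->ReplaceDim(grad, 0, c->UnknownDim(), &grad_unknown_first));
  TF_RETURN_IF_ERROR(c->Merge(*s, grad_unknown_first, s));
  return Status::OK();
}

}  // namespace tensorflow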
-)doc"); - -} // namespace tensorflow diff --git a/tensorflow/core/ops/gradient_descent_ops.cc b/tensorflow/core/ops/gradient_descent_ops.cc deleted file mode 100644 index c94a91d275c..00000000000 --- a/tensorflow/core/ops/gradient_descent_ops.cc +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyGradientDescentShapeFn(InferenceContext* c) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); // alpha - TF_RETURN_IF_ERROR(c->Merge(s, c->input(2), &s)); // delta - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyGradientDescent") - .Input("var: Ref(T)") - .Input("alpha: T") - .Input("delta: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn(ApplyGradientDescentShapeFn) - .Doc(R"doc( -Update '*var' by subtracting 'alpha' * 'delta' from it. - -var: Should be from a Variable(). -alpha: Scaling factor. Must be a scalar. -delta: The change. -out: Same as "var". -use_locking: If `True`, the subtraction will be protected by a lock; - otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("ResourceApplyGradientDescent") - .Input("var: resource") - .Input("alpha: T") - .Input("delta: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn(ApplyGradientDescentShapeFn) - .Doc(R"doc( -Update '*var' by subtracting 'alpha' * 'delta' from it. - -var: Should be from a Variable(). -alpha: Scaling factor. Must be a scalar. -delta: The change. -use_locking: If `True`, the subtraction will be protected by a lock; - otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -} // namespace tensorflow diff --git a/tensorflow/core/ops/momentum_ops.cc b/tensorflow/core/ops/momentum_ops.cc deleted file mode 100644 index f9701bd4881..00000000000 --- a/tensorflow/core/ops/momentum_ops.cc +++ /dev/null @@ -1,179 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyMomentumShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum - TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); - int idx = sparse ? 5 : 4; - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // momentum - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyMomentum") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("lr: T") - .Input("grad: T") - .Input("momentum: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .Attr("use_nesterov: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyMomentumShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the momentum scheme. Set use_nesterov = True if you -want to use Nesterov momentum. - -accum = accum * momentum + grad -var -= lr * accum - -var: Should be from a Variable(). -accum: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -grad: The gradient. -momentum: Momentum. Must be a scalar. -out: Same as "var". -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -use_nesterov: If `True`, the tensor passed to compute grad will be -var - lr * momentum * accum, so in the end, the var you get is actually -var - lr * momentum * accum. -)doc"); - -REGISTER_OP("SparseApplyMomentum") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("lr: T") - .Input("grad: T") - .Input("indices: Tindices") - .Input("momentum: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .Attr("use_nesterov: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyMomentumShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update relevant entries in '*var' and '*accum' according to the momentum scheme. -Set use_nesterov = True if you want to use Nesterov momentum. - -That is for rows we have grad for, we update var and accum as follows: - -accum = accum * momentum + grad -var -= lr * accum - -var: Should be from a Variable(). -accum: Should be from a Variable(). -lr: Learning rate. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -momentum: Momentum. Must be a scalar. -out: Same as "var". -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -use_nesterov: If `True`, the tensor passed to compute grad will be -var - lr * momentum * accum, so in the end, the var you get is actually -var - lr * momentum * accum. 
-)doc"); - -REGISTER_OP("ResourceApplyMomentum") - .Input("var: resource") - .Input("accum: resource") - .Input("lr: T") - .Input("grad: T") - .Input("momentum: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .Attr("use_nesterov: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyMomentumShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the momentum scheme. Set use_nesterov = True if you -want to use Nesterov momentum. - -accum = accum * momentum + grad -var -= lr * accum - -var: Should be from a Variable(). -accum: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -grad: The gradient. -momentum: Momentum. Must be a scalar. -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -use_nesterov: If `True`, the tensor passed to compute grad will be -var - lr * momentum * accum, so in the end, the var you get is actually -var - lr * momentum * accum. -)doc"); - -REGISTER_OP("ResourceSparseApplyMomentum") - .Input("var: resource") - .Input("accum: resource") - .Input("lr: T") - .Input("grad: T") - .Input("indices: Tindices") - .Input("momentum: T") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .Attr("use_nesterov: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyMomentumShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update relevant entries in '*var' and '*accum' according to the momentum scheme. -Set use_nesterov = True if you want to use Nesterov momentum. - -That is for rows we have grad for, we update var and accum as follows: - -accum = accum * momentum + grad -var -= lr * accum - -var: Should be from a Variable(). -accum: Should be from a Variable(). -lr: Learning rate. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -momentum: Momentum. Must be a scalar. -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -use_nesterov: If `True`, the tensor passed to compute grad will be -var - lr * momentum * accum, so in the end, the var you get is actually -var - lr * momentum * accum. -)doc"); - -} // namespace tensorflow diff --git a/tensorflow/core/ops/proximal_adagrad_ops.cc b/tensorflow/core/ops/proximal_adagrad_ops.cc deleted file mode 100644 index a618519d82b..00000000000 --- a/tensorflow/core/ops/proximal_adagrad_ops.cc +++ /dev/null @@ -1,183 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyProximalAdagradShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum - TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr - TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // l1 - TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // l2 - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 5 /* grad_idx */, &s)); - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyProximalAdagrad") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("grad: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyProximalAdagradShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' and '*accum' according to FOBOS with Adagrad learning rate. -accum += grad * grad -prox_v = var - lr * grad * (1 / sqrt(accum)) -var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} - -var: Should be from a Variable(). -accum: Should be from a Variable(). -grad: The gradient. -lr: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -out: Same as "var". -use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("ResourceApplyProximalAdagrad") - .Input("var: resource") - .Input("accum: resource") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("grad: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyProximalAdagradShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' and '*accum' according to FOBOS with Adagrad learning rate. -accum += grad * grad -prox_v = var - lr * grad * (1 / sqrt(accum)) -var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} - -var: Should be from a Variable(). -accum: Should be from a Variable(). -grad: The gradient. -lr: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("SparseApplyProximalAdagrad") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("grad: T") - .Input("indices: Tindices") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyProximalAdagradShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Sparse update entries in '*var' and '*accum' according to FOBOS algorithm. - -That is for rows we have grad for, we update var and accum as follows: -accum += grad * grad -prox_v = var -prox_v -= lr * grad * (1 / sqrt(accum)) -var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} - -var: Should be from a Variable(). 
-accum: Should be from a Variable(). -lr: Learning rate. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -out: Same as "var". -use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("ResourceSparseApplyProximalAdagrad") - .Input("var: resource") - .Input("accum: resource") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("grad: T") - .Input("indices: Tindices") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyProximalAdagradShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Sparse update entries in '*var' and '*accum' according to FOBOS algorithm. - -That is for rows we have grad for, we update var and accum as follows: -accum += grad * grad -prox_v = var -prox_v -= lr * grad * (1 / sqrt(accum)) -var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} - -var: Should be from a Variable(). -accum: Should be from a Variable(). -lr: Learning rate. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -static Status ApplyFtrlShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // linear - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); - int idx = sparse ? 5 : 4; - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l1 - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l2 - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr_power - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -} // namespace tensorflow diff --git a/tensorflow/core/ops/proximal_gradient_descent_ops.cc b/tensorflow/core/ops/proximal_gradient_descent_ops.cc deleted file mode 100644 index 42e762c0529..00000000000 --- a/tensorflow/core/ops/proximal_gradient_descent_ops.cc +++ /dev/null @@ -1,151 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyProximalGradientDescentShapeFn(InferenceContext* c, - bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); // alpha - TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // l1 - TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // l2 - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 4 /* grad_idx */, &s)); - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyProximalGradientDescent") - .Input("var: Ref(T)") - .Input("alpha: T") - .Input("l1: T") - .Input("l2: T") - .Input("delta: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyProximalGradientDescentShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' as FOBOS algorithm with fixed learning rate. -prox_v = var - alpha * delta -var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} - -var: Should be from a Variable(). -alpha: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -delta: The change. -out: Same as "var". -use_locking: If True, the subtraction will be protected by a lock; - otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("SparseApplyProximalGradientDescent") - .Input("var: Ref(T)") - .Input("alpha: T") - .Input("l1: T") - .Input("l2: T") - .Input("grad: T") - .Input("indices: Tindices") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyProximalGradientDescentShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Sparse update '*var' as FOBOS algorithm with fixed learning rate. - -That is for rows we have grad for, we update var as follows: -prox_v = var - alpha * grad -var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} - -var: Should be from a Variable(). -alpha: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -out: Same as "var". -use_locking: If True, the subtraction will be protected by a lock; - otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("ResourceApplyProximalGradientDescent") - .Input("var: resource") - .Input("alpha: T") - .Input("l1: T") - .Input("l2: T") - .Input("delta: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyProximalGradientDescentShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' as FOBOS algorithm with fixed learning rate. -prox_v = var - alpha * delta -var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} - -var: Should be from a Variable(). -alpha: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -delta: The change. 
-use_locking: If True, the subtraction will be protected by a lock; - otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("ResourceSparseApplyProximalGradientDescent") - .Input("var: resource") - .Input("alpha: T") - .Input("l1: T") - .Input("l2: T") - .Input("grad: T") - .Input("indices: Tindices") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyProximalGradientDescentShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Sparse update '*var' as FOBOS algorithm with fixed learning rate. - -That is for rows we have grad for, we update var as follows: -prox_v = var - alpha * grad -var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} - -var: Should be from a Variable(). -alpha: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -use_locking: If True, the subtraction will be protected by a lock; - otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -} // namespace tensorflow diff --git a/tensorflow/core/ops/rms_prop_ops.cc b/tensorflow/core/ops/rms_prop_ops.cc deleted file mode 100644 index d13cef6413d..00000000000 --- a/tensorflow/core/ops/rms_prop_ops.cc +++ /dev/null @@ -1,425 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyRMSPropShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // ms - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // mom - TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // lr - TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // rho - TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // momentum - TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // epsilon - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 7 /* grad_idx */, &s)); - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -static Status ApplyCenteredRMSPropShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // ms - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // mg - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 3), &s)); // mom - TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // lr - TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // rho - TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // momentum - TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); // epsilon - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 8 /* grad_idx */, &s)); - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyRMSProp") - .Input("var: Ref(T)") - .Input("ms: Ref(T)") - .Input("mom: Ref(T)") - .Input("lr: T") - .Input("rho: T") - .Input("momentum: T") - .Input("epsilon: T") - .Input("grad: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyRMSPropShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the RMSProp algorithm. -Note that in dense implementation of this algorithm, ms and mom will -update even if the grad is zero, but in this sparse implementation, ms -and mom will not update in iterations during which the grad is zero. - -mean_square = decay * mean_square + (1-decay) * gradient ** 2 -Delta = learning_rate * gradient / sqrt(mean_square + epsilon) - -ms <- rho * ms_{t-1} + (1-rho) * grad * grad -mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) -var <- var - mom - -var: Should be from a Variable(). -ms: Should be from a Variable(). -mom: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -rho: Decay rate. Must be a scalar. -grad: The gradient. -out: Same as "var". -use_locking: If `True`, updating of the var, ms, and mom tensors is protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. 
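For quick reference, the dense RMSProp update described in the docstring above boils down to the following standalone sketch (plain std::vector arithmetic over flat buffers, not the TensorFlow kernel; the function name is only illustrative):

#include <cmath>
#include <vector>

// Dense RMSProp step, mirroring the docstring:
//   ms  <- rho * ms + (1 - rho) * grad^2
//   mom <- momentum * mom + lr * grad / sqrt(ms + epsilon)
//   var <- var - mom
void RmsPropStep(std::vector<float>& var, std::vector<float>& ms,
                 std::vector<float>& mom, const std::vector<float>& grad,
                 float lr, float rho, float momentum, float epsilon) {
  for (size_t i = 0; i < var.size(); ++i) {
    ms[i] = rho * ms[i] + (1.0f - rho) * grad[i] * grad[i];
    mom[i] = momentum * mom[i] + lr * grad[i] / std::sqrt(ms[i] + epsilon);
    var[i] -= mom[i];
  }
}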
-)doc"); - -REGISTER_OP("ApplyCenteredRMSProp") - .Input("var: Ref(T)") - .Input("mg: Ref(T)") - .Input("ms: Ref(T)") - .Input("mom: Ref(T)") - .Input("lr: T") - .Input("rho: T") - .Input("momentum: T") - .Input("epsilon: T") - .Input("grad: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyCenteredRMSPropShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the centered RMSProp algorithm. -The centered RMSProp algorithm uses an estimate of the centered second moment -(i.e., the variance) for normalization, as opposed to regular RMSProp, which -uses the (uncentered) second moment. This often helps with training, but is -slightly more expensive in terms of computation and memory. - -Note that in dense implementation of this algorithm, mg, ms, and mom will -update even if the grad is zero, but in this sparse implementation, mg, ms, -and mom will not update in iterations during which the grad is zero. - -mean_square = decay * mean_square + (1-decay) * gradient ** 2 -mean_grad = decay * mean_grad + (1-decay) * gradient - -Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) - -mg <- rho * mg_{t-1} + (1-rho) * grad -ms <- rho * ms_{t-1} + (1-rho) * grad * grad -mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon) -var <- var - mom - -var: Should be from a Variable(). -mg: Should be from a Variable(). -ms: Should be from a Variable(). -mom: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -rho: Decay rate. Must be a scalar. -grad: The gradient. -out: Same as "var". -use_locking: If `True`, updating of the var, mg, ms, and mom tensors is - protected by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("SparseApplyRMSProp") - .Input("var: Ref(T)") - .Input("ms: Ref(T)") - .Input("mom: Ref(T)") - .Input("lr: T") - .Input("rho: T") - .Input("momentum: T") - .Input("epsilon: T") - .Input("grad: T") - .Input("indices: Tindices") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyRMSPropShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the RMSProp algorithm. -Note that in dense implementation of this algorithm, ms and mom will -update even if the grad is zero, but in this sparse implementation, ms -and mom will not update in iterations during which the grad is zero. - -mean_square = decay * mean_square + (1-decay) * gradient ** 2 -Delta = learning_rate * gradient / sqrt(mean_square + epsilon) - -ms <- rho * ms_{t-1} + (1-rho) * grad * grad -mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) -var <- var - mom - -var: Should be from a Variable(). -ms: Should be from a Variable(). -mom: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -rho: Decay rate. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var, ms and mom. -out: Same as "var". -use_locking: If `True`, updating of the var, ms, and mom tensors is protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. 
-)doc"); - -REGISTER_OP("SparseApplyCenteredRMSProp") - .Input("var: Ref(T)") - .Input("mg: Ref(T)") - .Input("ms: Ref(T)") - .Input("mom: Ref(T)") - .Input("lr: T") - .Input("rho: T") - .Input("momentum: T") - .Input("epsilon: T") - .Input("grad: T") - .Input("indices: Tindices") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyCenteredRMSPropShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the centered RMSProp algorithm. -The centered RMSProp algorithm uses an estimate of the centered second moment -(i.e., the variance) for normalization, as opposed to regular RMSProp, which -uses the (uncentered) second moment. This often helps with training, but is -slightly more expensive in terms of computation and memory. - -Note that in dense implementation of this algorithm, mg, ms, and mom will -update even if the grad is zero, but in this sparse implementation, mg, ms, -and mom will not update in iterations during which the grad is zero. - -mean_square = decay * mean_square + (1-decay) * gradient ** 2 -mean_grad = decay * mean_grad + (1-decay) * gradient -Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) - -ms <- rho * ms_{t-1} + (1-rho) * grad * grad -mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) -var <- var - mom - -var: Should be from a Variable(). -mg: Should be from a Variable(). -ms: Should be from a Variable(). -mom: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -rho: Decay rate. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var, ms and mom. -out: Same as "var". -use_locking: If `True`, updating of the var, mg, ms, and mom tensors is - protected by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("ResourceApplyRMSProp") - .Input("var: resource") - .Input("ms: resource") - .Input("mom: resource") - .Input("lr: T") - .Input("rho: T") - .Input("momentum: T") - .Input("epsilon: T") - .Input("grad: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyRMSPropShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the RMSProp algorithm. -Note that in dense implementation of this algorithm, ms and mom will -update even if the grad is zero, but in this sparse implementation, ms -and mom will not update in iterations during which the grad is zero. - -mean_square = decay * mean_square + (1-decay) * gradient ** 2 -Delta = learning_rate * gradient / sqrt(mean_square + epsilon) - -ms <- rho * ms_{t-1} + (1-rho) * grad * grad -mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) -var <- var - mom - -var: Should be from a Variable(). -ms: Should be from a Variable(). -mom: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -rho: Decay rate. Must be a scalar. -grad: The gradient. -use_locking: If `True`, updating of the var, ms, and mom tensors is protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. 
-)doc"); - -REGISTER_OP("ResourceApplyCenteredRMSProp") - .Input("var: resource") - .Input("mg: resource") - .Input("ms: resource") - .Input("mom: resource") - .Input("lr: T") - .Input("rho: T") - .Input("momentum: T") - .Input("epsilon: T") - .Input("grad: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyCenteredRMSPropShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the centered RMSProp algorithm. -The centered RMSProp algorithm uses an estimate of the centered second moment -(i.e., the variance) for normalization, as opposed to regular RMSProp, which -uses the (uncentered) second moment. This often helps with training, but is -slightly more expensive in terms of computation and memory. - -Note that in dense implementation of this algorithm, mg, ms, and mom will -update even if the grad is zero, but in this sparse implementation, mg, ms, -and mom will not update in iterations during which the grad is zero. - -mean_square = decay * mean_square + (1-decay) * gradient ** 2 -mean_grad = decay * mean_grad + (1-decay) * gradient - -Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) - -mg <- rho * mg_{t-1} + (1-rho) * grad -ms <- rho * ms_{t-1} + (1-rho) * grad * grad -mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon) -var <- var - mom - -var: Should be from a Variable(). -mg: Should be from a Variable(). -ms: Should be from a Variable(). -mom: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -rho: Decay rate. Must be a scalar. -grad: The gradient. -use_locking: If `True`, updating of the var, mg, ms, and mom tensors is - protected by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("ResourceSparseApplyRMSProp") - .Input("var: resource") - .Input("ms: resource") - .Input("mom: resource") - .Input("lr: T") - .Input("rho: T") - .Input("momentum: T") - .Input("epsilon: T") - .Input("grad: T") - .Input("indices: Tindices") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyRMSPropShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the RMSProp algorithm. -Note that in dense implementation of this algorithm, ms and mom will -update even if the grad is zero, but in this sparse implementation, ms -and mom will not update in iterations during which the grad is zero. - -mean_square = decay * mean_square + (1-decay) * gradient ** 2 -Delta = learning_rate * gradient / sqrt(mean_square + epsilon) - -ms <- rho * ms_{t-1} + (1-rho) * grad * grad -mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) -var <- var - mom - -var: Should be from a Variable(). -ms: Should be from a Variable(). -mom: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -rho: Decay rate. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var, ms and mom. -use_locking: If `True`, updating of the var, ms, and mom tensors is protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. 
-)doc"); - -REGISTER_OP("ResourceSparseApplyCenteredRMSProp") - .Input("var: resource") - .Input("mg: resource") - .Input("ms: resource") - .Input("mom: resource") - .Input("lr: T") - .Input("rho: T") - .Input("momentum: T") - .Input("epsilon: T") - .Input("grad: T") - .Input("indices: Tindices") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyCenteredRMSPropShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the centered RMSProp algorithm. -The centered RMSProp algorithm uses an estimate of the centered second moment -(i.e., the variance) for normalization, as opposed to regular RMSProp, which -uses the (uncentered) second moment. This often helps with training, but is -slightly more expensive in terms of computation and memory. - -Note that in dense implementation of this algorithm, mg, ms, and mom will -update even if the grad is zero, but in this sparse implementation, mg, ms, -and mom will not update in iterations during which the grad is zero. - -mean_square = decay * mean_square + (1-decay) * gradient ** 2 -mean_grad = decay * mean_grad + (1-decay) * gradient -Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) - -ms <- rho * ms_{t-1} + (1-rho) * grad * grad -mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) -var <- var - mom - -var: Should be from a Variable(). -mg: Should be from a Variable(). -ms: Should be from a Variable(). -mom: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -rho: Decay rate. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var, ms and mom. -use_locking: If `True`, updating of the var, mg, ms, and mom tensors is - protected by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -} // namespace tensorflow diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc new file mode 100644 index 00000000000..6f06b87d589 --- /dev/null +++ b/tensorflow/core/ops/training_ops.cc @@ -0,0 +1,1799 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) { + auto* handle_data = c->input_handle_shapes_and_types(input); + if (handle_data != nullptr && !handle_data->empty() && + (*handle_data)[0].dtype != DT_INVALID) { + return (*handle_data)[0].shape; + } + return c->input(input); +} + +// Handle the gradient and, if , indices inputs. 
+// is an input+output parameter, containing the current known input shape to +// the gradient. +static Status HandleGradAndIndicesInputs(InferenceContext* c, bool sparse, + int grad_idx, ShapeHandle* s) { + ShapeHandle grad = ShapeOrHandleShape(c, grad_idx); + if (!sparse) { + TF_RETURN_IF_ERROR(c->Merge(*s, grad, s)); + return Status::OK(); + } + // Indices is a vector where indices.dim[0].rank == grad[0].rank. + ShapeHandle indices; + TF_RETURN_IF_ERROR(c->WithRank(c->input(grad_idx + 1), 1, &indices)); + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->Merge(c->Dim(indices, 0), c->Dim(grad, 0), &unused)); + + // Trailing part of grad matches trailing part of *s. + ShapeHandle grad_unknown_first; + TF_RETURN_IF_ERROR( + c->ReplaceDim(grad, 0, c->UnknownDim(), &grad_unknown_first)); + TF_RETURN_IF_ERROR(c->Merge(*s, grad_unknown_first, s)); + + return Status::OK(); +} + +static Status ApplyGradientDescentShapeFn(InferenceContext* c) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); // alpha + TF_RETURN_IF_ERROR(c->Merge(s, c->input(2), &s)); // delta + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyGradientDescent") + .Input("var: Ref(T)") + .Input("alpha: T") + .Input("delta: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn(ApplyGradientDescentShapeFn) + .Doc(R"doc( +Update '*var' by subtracting 'alpha' * 'delta' from it. + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +delta: The change. +out: Same as "var". +use_locking: If `True`, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceApplyGradientDescent") + .Input("var: resource") + .Input("alpha: T") + .Input("delta: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn(ApplyGradientDescentShapeFn) + .Doc(R"doc( +Update '*var' by subtracting 'alpha' * 'delta' from it. + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +delta: The change. +use_locking: If `True`, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +static Status ApplyProximalGradientDescentShapeFn(InferenceContext* c, + bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); // alpha + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // l1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // l2 + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 4 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyProximalGradientDescent") + .Input("var: Ref(T)") + .Input("alpha: T") + .Input("l1: T") + .Input("l2: T") + .Input("delta: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalGradientDescentShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' as FOBOS algorithm with fixed learning rate. +prox_v = var - alpha * delta +var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. 
Must be a scalar. +delta: The change. +out: Same as "var". +use_locking: If True, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("SparseApplyProximalGradientDescent") + .Input("var: Ref(T)") + .Input("alpha: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Input("indices: Tindices") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalGradientDescentShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Sparse update '*var' as FOBOS algorithm with fixed learning rate. + +That is for rows we have grad for, we update var as follows: +prox_v = var - alpha * grad +var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +out: Same as "var". +use_locking: If True, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceApplyProximalGradientDescent") + .Input("var: resource") + .Input("alpha: T") + .Input("l1: T") + .Input("l2: T") + .Input("delta: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalGradientDescentShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' as FOBOS algorithm with fixed learning rate. +prox_v = var - alpha * delta +var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +delta: The change. +use_locking: If True, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyProximalGradientDescent") + .Input("var: resource") + .Input("alpha: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalGradientDescentShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Sparse update '*var' as FOBOS algorithm with fixed learning rate. + +That is for rows we have grad for, we update var as follows: +prox_v = var - alpha * grad +var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +use_locking: If True, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. 
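The four ProximalGradientDescent ops above all implement the same FOBOS step: a plain gradient move followed by L1 soft-thresholding and L2 shrinkage. A minimal standalone sketch mirroring the docstring equations (plain buffers, not the TF kernel):

#include <algorithm>
#include <cmath>
#include <vector>

// One FOBOS step with fixed learning rate alpha:
//   prox_v = var - alpha * delta
//   var    = sign(prox_v) / (1 + alpha * l2) * max(|prox_v| - alpha * l1, 0)
void ProximalGradientDescentStep(std::vector<float>& var,
                                 const std::vector<float>& delta, float alpha,
                                 float l1, float l2) {
  for (size_t i = 0; i < var.size(); ++i) {
    const float prox_v = var[i] - alpha * delta[i];
    const float shrunk = std::max(std::fabs(prox_v) - alpha * l1, 0.0f);
    var[i] = (prox_v > 0 ? 1.0f : (prox_v < 0 ? -1.0f : 0.0f)) * shrunk /
             (1.0f + alpha * l2);
  }
}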
+)doc"); + +static Status ApplyAdadeltaShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum + TF_RETURN_IF_ERROR( + c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // accum update + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // rho + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // epsilon + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 6 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyAdadelta") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("accum_update: Ref(T)") + .Input("lr: T") + .Input("rho: T") + .Input("epsilon: T") + .Input("grad: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdadeltaShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the adadelta scheme. + +accum = rho() * accum + (1 - rho()) * grad.square(); +update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad; +update_accum = rho() * update_accum + (1 - rho()) * update.square(); +var -= update; + +var: Should be from a Variable(). +accum: Should be from a Variable(). +accum_update: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +rho: Decay factor. Must be a scalar. +epsilon: Constant factor. Must be a scalar. +grad: The gradient. +out: Same as "var". +use_locking: If True, updating of the var, accum and update_accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("SparseApplyAdadelta") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("accum_update: Ref(T)") + .Input("lr: T") + .Input("rho: T") + .Input("epsilon: T") + .Input("grad: T") + .Input("indices: Tindices") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdadeltaShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +var: Should be from a Variable(). +accum: Should be from a Variable(). +accum_update:: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +rho: Decay factor. Must be a scalar. +epsilon: Constant factor. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +out: Same as "var". +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceApplyAdadelta") + .Input("var: resource") + .Input("accum: resource") + .Input("accum_update: resource") + .Input("lr: T") + .Input("rho: T") + .Input("epsilon: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdadeltaShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the adadelta scheme. + +accum = rho() * accum + (1 - rho()) * grad.square(); +update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad; +update_accum = rho() * update_accum + (1 - rho()) * update.square(); +var -= update; + +var: Should be from a Variable(). +accum: Should be from a Variable(). 
+accum_update: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +rho: Decay factor. Must be a scalar. +epsilon: Constant factor. Must be a scalar. +grad: The gradient. +use_locking: If True, updating of the var, accum and update_accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyAdadelta") + .Input("var: resource") + .Input("accum: resource") + .Input("accum_update: resource") + .Input("lr: T") + .Input("rho: T") + .Input("epsilon: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdadeltaShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +var: Should be from a Variable(). +accum: Should be from a Variable(). +accum_update:: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +rho: Decay factor. Must be a scalar. +epsilon: Constant factor. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +static Status ApplyAdagradShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyAdagrad") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("lr: T") + .Input("grad: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the adagrad scheme. + +accum += grad * grad +var -= lr * grad * (1 / sqrt(accum)) + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +grad: The gradient. +out: Same as "var". +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceApplyAdagrad") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the adagrad scheme. + +accum += grad * grad +var -= lr * grad * (1 / sqrt(accum)) + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +grad: The gradient. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. 
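The Adagrad ops above keep a per-element sum of squared gradients and scale each step by its inverse square root. As a standalone sketch (plain buffers, not the TF kernel):

#include <cmath>
#include <vector>

// Adagrad step: accum += grad^2; var -= lr * grad / sqrt(accum)
void AdagradStep(std::vector<float>& var, std::vector<float>& accum,
                 const std::vector<float>& grad, float lr) {
  for (size_t i = 0; i < var.size(); ++i) {
    accum[i] += grad[i] * grad[i];
    var[i] -= lr * grad[i] / std::sqrt(accum[i]);
  }
}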
+)doc"); + +static Status ApplyProximalAdagradShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // l1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // l2 + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 5 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyProximalAdagrad") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalAdagradShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' and '*accum' according to FOBOS with Adagrad learning rate. +accum += grad * grad +prox_v = var - lr * grad * (1 / sqrt(accum)) +var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} + +var: Should be from a Variable(). +accum: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +out: Same as "var". +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceApplyProximalAdagrad") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalAdagradShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' and '*accum' according to FOBOS with Adagrad learning rate. +accum += grad * grad +prox_v = var - lr * grad * (1 / sqrt(accum)) +var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} + +var: Should be from a Variable(). +accum: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("SparseApplyAdagrad") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("lr: T") + .Input("grad: T") + .Input("indices: Tindices") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' and '*accum' according to the adagrad scheme. + +That is for rows we have grad for, we update var and accum as follows: +accum += grad * grad +var -= lr * grad * (1 / sqrt(accum)) + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +out: Same as "var". +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. 
+)doc"); + +REGISTER_OP("ResourceSparseApplyAdagrad") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' and '*accum' according to the adagrad scheme. + +That is for rows we have grad for, we update var and accum as follows: +accum += grad * grad +var -= lr * grad * (1 / sqrt(accum)) + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +static Status ApplyAdagradDAShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR( + c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // grad_accumulator + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), + &s)); // gradient_squared_accumulator + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); + int idx = sparse ? 5 : 4; + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l2 + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // global step + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyAdagradDA") + .Input("var: Ref(T)") + .Input("gradient_accumulator: Ref(T)") + .Input("gradient_squared_accumulator: Ref(T)") + .Input("grad: T") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("global_step: int64") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradDAShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the proximal adagrad scheme. + +var: Should be from a Variable(). +gradient_accumulator: Should be from a Variable(). +gradient_squared_accumulator: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +global_step: Training step number. Must be a scalar. +out: Same as "var". +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("SparseApplyAdagradDA") + .Input("var: Ref(T)") + .Input("gradient_accumulator: Ref(T)") + .Input("gradient_squared_accumulator: Ref(T)") + .Input("grad: T") + .Input("indices: Tindices") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("global_step: int64") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradDAShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update entries in '*var' and '*accum' according to the proximal adagrad scheme. + +var: Should be from a Variable(). +gradient_accumulator: Should be from a Variable(). 
+gradient_squared_accumulator: Should be from a Variable(). +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +lr: Learning rate. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +global_step: Training step number. Must be a scalar. +out: Same as "var". +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("SparseApplyProximalAdagrad") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Input("indices: Tindices") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalAdagradShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Sparse update entries in '*var' and '*accum' according to FOBOS algorithm. + +That is for rows we have grad for, we update var and accum as follows: +accum += grad * grad +prox_v = var +prox_v -= lr * grad * (1 / sqrt(accum)) +var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +out: Same as "var". +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceApplyAdagradDA") + .Input("var: resource") + .Input("gradient_accumulator: resource") + .Input("gradient_squared_accumulator: resource") + .Input("grad: T") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("global_step: int64") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradDAShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the proximal adagrad scheme. + +var: Should be from a Variable(). +gradient_accumulator: Should be from a Variable(). +gradient_squared_accumulator: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +global_step: Training step number. Must be a scalar. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyAdagradDA") + .Input("var: resource") + .Input("gradient_accumulator: resource") + .Input("gradient_squared_accumulator: resource") + .Input("grad: T") + .Input("indices: Tindices") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("global_step: int64") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradDAShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update entries in '*var' and '*accum' according to the proximal adagrad scheme. + +var: Should be from a Variable(). +gradient_accumulator: Should be from a Variable(). +gradient_squared_accumulator: Should be from a Variable(). +grad: The gradient. 
+indices: A vector of indices into the first dimension of var and accum. +lr: Learning rate. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +global_step: Training step number. Must be a scalar. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyProximalAdagrad") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalAdagradShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Sparse update entries in '*var' and '*accum' according to FOBOS algorithm. + +That is for rows we have grad for, we update var and accum as follows: +accum += grad * grad +prox_v = var +prox_v -= lr * grad * (1 / sqrt(accum)) +var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +static Status ApplyFtrlShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // linear + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); + int idx = sparse ? 5 : 4; + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l2 + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr_power + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyFtrl") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("linear: Ref(T)") + .Input("grad: T") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("lr_power: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyFtrlShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the Ftrl-proximal scheme. + +accum_new = accum + grad * grad +linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + +var: Should be from a Variable(). +accum: Should be from a Variable(). +linear: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regulariation. Must be a scalar. +l2: L2 regulariation. Must be a scalar. +lr_power: Scaling factor. Must be a scalar. +out: Same as "var". 
+use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("SparseApplyFtrl") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("linear: Ref(T)") + .Input("grad: T") + .Input("indices: Tindices") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("lr_power: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyFtrlShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' according to the Ftrl-proximal scheme. + +That is for rows we have grad for, we update var, accum and linear as follows: +accum_new = accum + grad * grad +linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + +var: Should be from a Variable(). +accum: Should be from a Variable(). +linear: Should be from a Variable(). +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +lr_power: Scaling factor. Must be a scalar. +out: Same as "var". +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceApplyFtrl") + .Input("var: resource") + .Input("accum: resource") + .Input("linear: resource") + .Input("grad: T") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("lr_power: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyFtrlShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the Ftrl-proximal scheme. + +accum_new = accum + grad * grad +linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + +var: Should be from a Variable(). +accum: Should be from a Variable(). +linear: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regulariation. Must be a scalar. +l2: L2 regulariation. Must be a scalar. +lr_power: Scaling factor. Must be a scalar. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyFtrl") + .Input("var: resource") + .Input("accum: resource") + .Input("linear: resource") + .Input("grad: T") + .Input("indices: Tindices") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("lr_power: T") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyFtrlShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' according to the Ftrl-proximal scheme. 
+ +That is for rows we have grad for, we update var, accum and linear as follows: +accum_new = accum + grad * grad +linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + +var: Should be from a Variable(). +accum: Should be from a Variable(). +linear: Should be from a Variable(). +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +lr_power: Scaling factor. Must be a scalar. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ApplyFtrlV2") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("linear: Ref(T)") + .Input("grad: T") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("l2_shrinkage: T") + .Input("lr_power: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyFtrlShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the Ftrl-proximal scheme. + +grad_with_shrinkage = grad + 2 * l2_shrinkage * var +accum_new = accum + grad_with_shrinkage * grad_with_shrinkage +linear += grad_with_shrinkage + + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + +var: Should be from a Variable(). +accum: Should be from a Variable(). +linear: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regulariation. Must be a scalar. +l2: online L2 regulariation. Must be a scalar. +l2: L2 shrinkage regulariation. Must be a scalar. +lr_power: Scaling factor. Must be a scalar. +out: Same as "var". +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("SparseApplyFtrlV2") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("linear: Ref(T)") + .Input("grad: T") + .Input("indices: Tindices") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("l2_shrinkage: T") + .Input("lr_power: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyFtrlShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' according to the Ftrl-proximal scheme. + +That is for rows we have grad for, we update var, accum and linear as follows: +grad_with_shrinkage = grad + 2 * l2_shrinkage * var +accum_new = accum + grad_with_shrinkage * grad_with_shrinkage +linear += grad_with_shrinkage + + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + +var: Should be from a Variable(). +accum: Should be from a Variable(). +linear: Should be from a Variable(). +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +lr: Scaling factor. Must be a scalar. 
+l1: L1 regularization. Must be a scalar.
+l2: online L2 regularization. Must be a scalar.
+l2_shrinkage: L2 shrinkage regularization. Must be a scalar.
+lr_power: Scaling factor. Must be a scalar.
+out: Same as "var".
+use_locking: If `True`, updating of the var and accum tensors will be protected
+  by a lock; otherwise the behavior is undefined, but may exhibit less
+  contention.
+)doc");
+
+REGISTER_OP("ResourceApplyFtrlV2")
+    .Input("var: resource")
+    .Input("accum: resource")
+    .Input("linear: resource")
+    .Input("grad: T")
+    .Input("lr: T")
+    .Input("l1: T")
+    .Input("l2: T")
+    .Input("l2_shrinkage: T")
+    .Input("lr_power: T")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyFtrlShapeFn(c, false /* sparse */);
+    })
+    .Doc(R"doc(
+Update '*var' according to the Ftrl-proximal scheme.
+
+grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+linear += grad_with_shrinkage +
+    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+accum = accum_new
+
+var: Should be from a Variable().
+accum: Should be from a Variable().
+linear: Should be from a Variable().
+grad: The gradient.
+lr: Scaling factor. Must be a scalar.
+l1: L1 regularization. Must be a scalar.
+l2: online L2 regularization. Must be a scalar.
+l2_shrinkage: L2 shrinkage regularization. Must be a scalar.
+lr_power: Scaling factor. Must be a scalar.
+use_locking: If `True`, updating of the var and accum tensors will be protected
+  by a lock; otherwise the behavior is undefined, but may exhibit less
+  contention.
+)doc");
+
+REGISTER_OP("ResourceSparseApplyFtrlV2")
+    .Input("var: resource")
+    .Input("accum: resource")
+    .Input("linear: resource")
+    .Input("grad: T")
+    .Input("indices: Tindices")
+    .Input("lr: T")
+    .Input("l1: T")
+    .Input("l2: T")
+    .Input("l2_shrinkage: T")
+    .Input("lr_power: T")
+    .Attr("T: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyFtrlShapeFn(c, true /* sparse */);
+    })
+    .Doc(R"doc(
+Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+
+That is for rows we have grad for, we update var, accum and linear as follows:
+grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+linear += grad_with_shrinkage +
+    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+accum = accum_new
+
+var: Should be from a Variable().
+accum: Should be from a Variable().
+linear: Should be from a Variable().
+grad: The gradient.
+indices: A vector of indices into the first dimension of var and accum.
+lr: Scaling factor. Must be a scalar.
+l1: L1 regularization. Must be a scalar.
+l2: online L2 regularization. Must be a scalar.
+l2_shrinkage: L2 shrinkage regularization. Must be a scalar.
+lr_power: Scaling factor. Must be a scalar.
+use_locking: If `True`, updating of the var and accum tensors will be protected
+  by a lock; otherwise the behavior is undefined, but may exhibit less
+  contention.
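The FtrlV2 ops above add an L2 shrinkage term to the base FTRL-proximal update. A standalone sketch of one step follows; note that it subtracts the (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var term from linear, matching the standard FTRL-proximal formulation and the ResourceApplyFtrl docstring earlier in this file (a couple of the docstrings above print that term with a plus sign).

#include <algorithm>
#include <cmath>
#include <vector>

// One FTRL-proximal step with L2 shrinkage (plain buffers, not the TF kernel).
void FtrlV2Step(std::vector<float>& var, std::vector<float>& accum,
                std::vector<float>& linear, const std::vector<float>& grad,
                float lr, float l1, float l2, float l2_shrinkage,
                float lr_power) {
  for (size_t i = 0; i < var.size(); ++i) {
    const float g_shrink = grad[i] + 2.0f * l2_shrinkage * var[i];
    const float accum_new = accum[i] + g_shrink * g_shrink;
    const float sigma =
        (std::pow(accum_new, -lr_power) - std::pow(accum[i], -lr_power)) / lr;
    linear[i] += g_shrink - sigma * var[i];
    const float quadratic =
        1.0f / (std::pow(accum_new, lr_power) * lr) + 2.0f * l2;
    var[i] = std::fabs(linear[i]) > l1
                 ? (std::copysign(l1, linear[i]) - linear[i]) / quadratic
                 : 0.0f;
    accum[i] = accum_new;
  }
}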
+)doc"); + +static Status ApplyMomentumShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); + int idx = sparse ? 5 : 4; + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // momentum + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyMomentum") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("lr: T") + .Input("grad: T") + .Input("momentum: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .Attr("use_nesterov: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyMomentumShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the momentum scheme. Set use_nesterov = True if you +want to use Nesterov momentum. + +accum = accum * momentum + grad +var -= lr * accum + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +grad: The gradient. +momentum: Momentum. Must be a scalar. +out: Same as "var". +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +use_nesterov: If `True`, the tensor passed to compute grad will be +var - lr * momentum * accum, so in the end, the var you get is actually +var - lr * momentum * accum. +)doc"); + +REGISTER_OP("SparseApplyMomentum") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("lr: T") + .Input("grad: T") + .Input("indices: Tindices") + .Input("momentum: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .Attr("use_nesterov: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyMomentumShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' and '*accum' according to the momentum scheme. +Set use_nesterov = True if you want to use Nesterov momentum. + +That is for rows we have grad for, we update var and accum as follows: + +accum = accum * momentum + grad +var -= lr * accum + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +momentum: Momentum. Must be a scalar. +out: Same as "var". +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +use_nesterov: If `True`, the tensor passed to compute grad will be +var - lr * momentum * accum, so in the end, the var you get is actually +var - lr * momentum * accum. +)doc"); + +REGISTER_OP("ResourceApplyMomentum") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("grad: T") + .Input("momentum: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .Attr("use_nesterov: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyMomentumShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the momentum scheme. Set use_nesterov = True if you +want to use Nesterov momentum. 
+ +accum = accum * momentum + grad +var -= lr * accum + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +grad: The gradient. +momentum: Momentum. Must be a scalar. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +use_nesterov: If `True`, the tensor passed to compute grad will be +var - lr * momentum * accum, so in the end, the var you get is actually +var - lr * momentum * accum. +)doc"); + +REGISTER_OP("ResourceSparseApplyMomentum") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("grad: T") + .Input("indices: Tindices") + .Input("momentum: T") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .Attr("use_nesterov: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyMomentumShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' and '*accum' according to the momentum scheme. +Set use_nesterov = True if you want to use Nesterov momentum. + +That is for rows we have grad for, we update var and accum as follows: + +accum = accum * momentum + grad +var -= lr * accum + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +momentum: Momentum. Must be a scalar. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +use_nesterov: If `True`, the tensor passed to compute grad will be +var - lr * momentum * accum, so in the end, the var you get is actually +var - lr * momentum * accum. +)doc"); + +static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // m + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // v + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // beta1_power + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // beta2_power + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // beta1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); // beta2 + TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused)); // epsilon + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 9 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyAdam") + .Input("var: Ref(T)") + .Input("m: Ref(T)") + .Input("v: Ref(T)") + .Input("beta1_power: T") + .Input("beta2_power: T") + .Input("lr: T") + .Input("beta1: T") + .Input("beta2: T") + .Input("epsilon: T") + .Input("grad: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .Attr("use_nesterov: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdamShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the Adam algorithm. + +lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t) +m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t +v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t +variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon) + +var: Should be from a Variable(). 
+m: Should be from a Variable().
+v: Should be from a Variable().
+beta1_power: Must be a scalar.
+beta2_power: Must be a scalar.
+lr: Scaling factor. Must be a scalar.
+beta1: Momentum factor. Must be a scalar.
+beta2: Momentum factor. Must be a scalar.
+epsilon: Ridge term. Must be a scalar.
+grad: The gradient.
+out: Same as "var".
+use_locking: If `True`, updating of the var, m, and v tensors will be protected
+  by a lock; otherwise the behavior is undefined, but may exhibit less
+  contention.
+use_nesterov: If `True`, uses the Nesterov update.
+)doc");
+
+REGISTER_OP("ResourceApplyAdam")
+    .Input("var: resource")
+    .Input("m: resource")
+    .Input("v: resource")
+    .Input("beta1_power: T")
+    .Input("beta2_power: T")
+    .Input("lr: T")
+    .Input("beta1: T")
+    .Input("beta2: T")
+    .Input("epsilon: T")
+    .Input("grad: T")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .Attr("use_nesterov: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyAdamShapeFn(c, false /* sparse */);
+    })
+    .Doc(R"doc(
+Update '*var' according to the Adam algorithm.
+
+lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
+v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
+variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+
+var: Should be from a Variable().
+m: Should be from a Variable().
+v: Should be from a Variable().
+beta1_power: Must be a scalar.
+beta2_power: Must be a scalar.
+lr: Scaling factor. Must be a scalar.
+beta1: Momentum factor. Must be a scalar.
+beta2: Momentum factor. Must be a scalar.
+epsilon: Ridge term. Must be a scalar.
+grad: The gradient.
+use_locking: If `True`, updating of the var, m, and v tensors will be protected
+  by a lock; otherwise the behavior is undefined, but may exhibit less
+  contention.
+use_nesterov: If `True`, uses the Nesterov update.
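The Adam docstring above is compact, so a NumPy sketch of one dense step may help. Note that the op takes beta1_power and beta2_power (that is, beta1^t and beta2^t) as explicit scalar inputs, whereas this illustrative helper derives them from a step counter t; the helper name and signature are assumptions, not the TensorFlow API.

import numpy as np

def adam_step(var, m, v, grad, lr, beta1, beta2, epsilon, t):
    # Bias-corrected learning rate, as in the docstring:
    #   lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
    lr_t = lr * np.sqrt(1.0 - beta2 ** t) / (1.0 - beta1 ** t)
    m = beta1 * m + (1.0 - beta1) * grad           # first-moment estimate
    v = beta2 * v + (1.0 - beta2) * grad * grad    # second-moment estimate
    var = var - lr_t * m / (np.sqrt(v) + epsilon)
    return var, m, v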
+)doc"); + +static Status ApplyRMSPropShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // ms + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // mom + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // rho + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // momentum + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // epsilon + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 7 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +static Status ApplyCenteredRMSPropShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // ms + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // mg + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 3), &s)); // mom + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // rho + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // momentum + TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); // epsilon + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 8 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyRMSProp") + .Input("var: Ref(T)") + .Input("ms: Ref(T)") + .Input("mom: Ref(T)") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyRMSPropShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the RMSProp algorithm. +Note that in dense implementation of this algorithm, ms and mom will +update even if the grad is zero, but in this sparse implementation, ms +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +Delta = learning_rate * gradient / sqrt(mean_square + epsilon) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + +var: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +out: Same as "var". +use_locking: If `True`, updating of the var, ms, and mom tensors is protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ApplyCenteredRMSProp") + .Input("var: Ref(T)") + .Input("mg: Ref(T)") + .Input("ms: Ref(T)") + .Input("mom: Ref(T)") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyCenteredRMSPropShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the centered RMSProp algorithm. 
+The centered RMSProp algorithm uses an estimate of the centered second moment +(i.e., the variance) for normalization, as opposed to regular RMSProp, which +uses the (uncentered) second moment. This often helps with training, but is +slightly more expensive in terms of computation and memory. + +Note that in dense implementation of this algorithm, mg, ms, and mom will +update even if the grad is zero, but in this sparse implementation, mg, ms, +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +mean_grad = decay * mean_grad + (1-decay) * gradient + +Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) + +mg <- rho * mg_{t-1} + (1-rho) * grad +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon) +var <- var - mom + +var: Should be from a Variable(). +mg: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +out: Same as "var". +use_locking: If `True`, updating of the var, mg, ms, and mom tensors is + protected by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("SparseApplyRMSProp") + .Input("var: Ref(T)") + .Input("ms: Ref(T)") + .Input("mom: Ref(T)") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Input("indices: Tindices") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyRMSPropShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the RMSProp algorithm. +Note that in dense implementation of this algorithm, ms and mom will +update even if the grad is zero, but in this sparse implementation, ms +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +Delta = learning_rate * gradient / sqrt(mean_square + epsilon) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + +var: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var, ms and mom. +out: Same as "var". +use_locking: If `True`, updating of the var, ms, and mom tensors is protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("SparseApplyCenteredRMSProp") + .Input("var: Ref(T)") + .Input("mg: Ref(T)") + .Input("ms: Ref(T)") + .Input("mom: Ref(T)") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Input("indices: Tindices") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyCenteredRMSPropShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the centered RMSProp algorithm. 
+The centered RMSProp algorithm uses an estimate of the centered second moment +(i.e., the variance) for normalization, as opposed to regular RMSProp, which +uses the (uncentered) second moment. This often helps with training, but is +slightly more expensive in terms of computation and memory. + +Note that in dense implementation of this algorithm, mg, ms, and mom will +update even if the grad is zero, but in this sparse implementation, mg, ms, +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +mean_grad = decay * mean_grad + (1-decay) * gradient +Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + +var: Should be from a Variable(). +mg: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var, ms and mom. +out: Same as "var". +use_locking: If `True`, updating of the var, mg, ms, and mom tensors is + protected by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceApplyRMSProp") + .Input("var: resource") + .Input("ms: resource") + .Input("mom: resource") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyRMSPropShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the RMSProp algorithm. +Note that in dense implementation of this algorithm, ms and mom will +update even if the grad is zero, but in this sparse implementation, ms +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +Delta = learning_rate * gradient / sqrt(mean_square + epsilon) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + +var: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +use_locking: If `True`, updating of the var, ms, and mom tensors is protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceApplyCenteredRMSProp") + .Input("var: resource") + .Input("mg: resource") + .Input("ms: resource") + .Input("mom: resource") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyCenteredRMSPropShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the centered RMSProp algorithm. +The centered RMSProp algorithm uses an estimate of the centered second moment +(i.e., the variance) for normalization, as opposed to regular RMSProp, which +uses the (uncentered) second moment. This often helps with training, but is +slightly more expensive in terms of computation and memory. 
+ +Note that in dense implementation of this algorithm, mg, ms, and mom will +update even if the grad is zero, but in this sparse implementation, mg, ms, +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +mean_grad = decay * mean_grad + (1-decay) * gradient + +Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) + +mg <- rho * mg_{t-1} + (1-rho) * grad +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon) +var <- var - mom + +var: Should be from a Variable(). +mg: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +use_locking: If `True`, updating of the var, mg, ms, and mom tensors is + protected by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyRMSProp") + .Input("var: resource") + .Input("ms: resource") + .Input("mom: resource") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyRMSPropShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the RMSProp algorithm. +Note that in dense implementation of this algorithm, ms and mom will +update even if the grad is zero, but in this sparse implementation, ms +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +Delta = learning_rate * gradient / sqrt(mean_square + epsilon) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + +var: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var, ms and mom. +use_locking: If `True`, updating of the var, ms, and mom tensors is protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyCenteredRMSProp") + .Input("var: resource") + .Input("mg: resource") + .Input("ms: resource") + .Input("mom: resource") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyCenteredRMSPropShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the centered RMSProp algorithm. +The centered RMSProp algorithm uses an estimate of the centered second moment +(i.e., the variance) for normalization, as opposed to regular RMSProp, which +uses the (uncentered) second moment. This often helps with training, but is +slightly more expensive in terms of computation and memory. 
+ +Note that in dense implementation of this algorithm, mg, ms, and mom will +update even if the grad is zero, but in this sparse implementation, mg, ms, +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +mean_grad = decay * mean_grad + (1-decay) * gradient +Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + +var: Should be from a Variable(). +mg: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var, ms and mom. +use_locking: If `True`, updating of the var, mg, ms, and mom tensors is + protected by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +} // namespace tensorflow diff --git a/tensorflow/core/ops/training_ops.h b/tensorflow/core/ops/training_ops.h deleted file mode 100644 index a61c41e2a7f..00000000000 --- a/tensorflow/core/ops/training_ops.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef THIRD_PARTY_TENSORFLOW_CORE_OPS_TRAINING_OPS_H_ -#define THIRD_PARTY_TENSORFLOW_CORE_OPS_TRAINING_OPS_H_ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -using shape_inference::DimensionHandle; -using shape_inference::InferenceContext; -using shape_inference::ShapeHandle; - -static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) { - auto* handle_data = c->input_handle_shapes_and_types(input); - if (handle_data != nullptr && !handle_data->empty() && - (*handle_data)[0].dtype != DT_INVALID) { - return (*handle_data)[0].shape; - } - return c->input(input); -} - -// Handle the gradient and, if , indices inputs. -// is an input+output parameter, containing the current known input shape to -// the gradient. -static Status HandleGradAndIndicesInputs(InferenceContext* c, bool sparse, - int grad_idx, ShapeHandle* s) { - ShapeHandle grad = ShapeOrHandleShape(c, grad_idx); - if (!sparse) { - TF_RETURN_IF_ERROR(c->Merge(*s, grad, s)); - return Status::OK(); - } - // Indices is a vector where indices.dim[0].rank == grad[0].rank. - ShapeHandle indices; - TF_RETURN_IF_ERROR(c->WithRank(c->input(grad_idx + 1), 1, &indices)); - DimensionHandle unused; - TF_RETURN_IF_ERROR(c->Merge(c->Dim(indices, 0), c->Dim(grad, 0), &unused)); - - // Trailing part of grad matches trailing part of *s. 
- ShapeHandle grad_unknown_first; - TF_RETURN_IF_ERROR( - c->ReplaceDim(grad, 0, c->UnknownDim(), &grad_unknown_first)); - TF_RETURN_IF_ERROR(c->Merge(*s, grad_unknown_first, s)); - - return Status::OK(); -} - -} // namespace tensorflow - -#endif // THIRD_PARTY_TENSORFLOW_CORE_OPS_TRAINING_OPS_H_ diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD index 26ece1d7416..4680e3ba160 100644 --- a/tensorflow/java/BUILD +++ b/tensorflow/java/BUILD @@ -65,16 +65,7 @@ tf_java_op_gen_srcjar( "sparse_ops", "state_ops", "string_ops", - "adadelta_ops", - "adagrad_da_ops", - "adagrad_ops", - "adam_ops", - "ftrl_ops", - "momentum_ops", - "gradient_descent_ops", - "proximal_adagrad_ops", - "proximal_gradient_descent_ops", - "rms_prop_ops", + "training_ops", "user_ops", ], ) diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index d9c3fab0cf7..7ea5cedd8a4 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -1363,53 +1363,8 @@ tf_gen_op_wrapper_private_py( ) tf_gen_op_wrapper_private_py( - name = "adagrad_ops_gen", - out = "training/gen_adagrad_ops.py", -) - -tf_gen_op_wrapper_private_py( - name = "adagrad_da_ops_gen", - out = "training/gen_adagrad_da_ops.py", -) - -tf_gen_op_wrapper_private_py( - name = "adadelta_ops_gen", - out = "training/gen_adadelta_ops.py", -) - -tf_gen_op_wrapper_private_py( - name = "adam_ops_gen", - out = "training/gen_adam_ops.py", -) - -tf_gen_op_wrapper_private_py( - name = "ftrl_ops_gen", - out = "training/gen_ftrl_ops.py", -) - -tf_gen_op_wrapper_private_py( - name = "gradient_descent_ops_gen", - out = "training/gen_gradient_descent_ops.py", -) - -tf_gen_op_wrapper_private_py( - name = "momentum_ops_gen", - out = "training/gen_momentum_ops.py", -) - -tf_gen_op_wrapper_private_py( - name = "proximal_adagrad_ops_gen", - out = "training/gen_proximal_adagrad_ops.py", -) - -tf_gen_op_wrapper_private_py( - name = "proximal_gradient_descent_ops_gen", - out = "training/gen_proximal_gradient_descent_ops.py", -) - -tf_gen_op_wrapper_private_py( - name = "rms_prop_ops_gen", - out = "training/gen_rms_prop_ops.py", + name = "training_ops_gen", + out = "training/gen_training_ops.py", ) py_library( @@ -2640,10 +2595,6 @@ py_library( ), srcs_version = "PY2AND3", deps = [ - ":adadelta_ops_gen", - ":adagrad_da_ops_gen", - ":adagrad_ops_gen", - ":adam_ops_gen", ":array_ops", ":checkpoint_ops_gen", ":client", @@ -2652,8 +2603,6 @@ py_library( ":errors", ":framework", ":framework_for_generated_wrappers", - ":ftrl_ops_gen", - ":gradient_descent_ops_gen", ":gradients", ":init_ops", ":io_ops", @@ -2661,21 +2610,18 @@ py_library( ":lib", ":lookup_ops", ":math_ops", - ":momentum_ops_gen", ":platform", ":protos_all_py", - ":proximal_adagrad_ops_gen", - ":proximal_gradient_descent_ops_gen", ":pywrap_tensorflow", ":random_ops", ":resource_variable_ops", ":resources", - ":rms_prop_ops_gen", ":sdca_ops", ":sparse_ops", ":state_ops", ":string_ops", ":summary", + ":training_ops_gen", ":util", ":variable_scope", ":variables", diff --git a/tensorflow/python/training/training_ops.py b/tensorflow/python/training/training_ops.py index b33e7cb2749..e98c32b6144 100644 --- a/tensorflow/python/training/training_ops.py +++ b/tensorflow/python/training/training_ops.py @@ -19,16 +19,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.training import gen_training_ops # go/tf-wildcard-import # pylint: disable=wildcard-import -from tensorflow.python.training.gen_adadelta_ops import * -from 
tensorflow.python.training.gen_adagrad_da_ops import * -from tensorflow.python.training.gen_adagrad_ops import * -from tensorflow.python.training.gen_adam_ops import * -from tensorflow.python.training.gen_ftrl_ops import * -from tensorflow.python.training.gen_gradient_descent_ops import * -from tensorflow.python.training.gen_momentum_ops import * -from tensorflow.python.training.gen_proximal_adagrad_ops import * -from tensorflow.python.training.gen_proximal_gradient_descent_ops import * -from tensorflow.python.training.gen_rms_prop_ops import * +from tensorflow.python.training.gen_training_ops import * # pylint: enable=wildcard-import diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 5c0c507c9f3..5c156e7ee26 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -196,7 +196,7 @@ def tf_opts_nortti_if_android(): # Given a list of "op_lib_names" (a list of files in the ops directory # without their .cc extensions), generate a library for that file. -def tf_gen_op_libs(op_lib_names, deps=None, extra_srcs=[]): +def tf_gen_op_libs(op_lib_names, deps=None): # Make library out of each op so it can also be used to generate wrappers # for various languages. if not deps: @@ -205,7 +205,7 @@ def tf_gen_op_libs(op_lib_names, deps=None, extra_srcs=[]): native.cc_library( name=n + "_op_lib", copts=tf_copts(), - srcs=extra_srcs + ["ops/" + n + ".cc"], + srcs=["ops/" + n + ".cc"], deps=deps + [clean_dep("//tensorflow/core:framework")], visibility=["//visibility:public"], alwayslink=1,
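The RMSProp and centered-RMSProp registrations earlier in this change document their update rules only as pseudocode. For reference, here is a NumPy sketch of the dense centered variant; plain RMSProp is the same with the mg term dropped. The helper name is illustrative and is not part of this change or of the TensorFlow API.

import numpy as np

def centered_rms_prop_step(var, mg, ms, mom, grad, lr, rho, momentum, epsilon):
    mg = rho * mg + (1.0 - rho) * grad             # running mean of gradients
    ms = rho * ms + (1.0 - rho) * grad * grad      # running mean of squared gradients
    # Centering: subtracting mg^2 approximates the gradient variance.
    mom = momentum * mom + lr * grad / np.sqrt(ms - mg * mg + epsilon)
    var = var - mom
    return var, mg, ms, mom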