diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index cf8a668affc..0d2c9f2d195 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -416,16 +416,7 @@ tf_gen_op_wrappers_cc( "sparse_ops", "state_ops", "string_ops", - "adadelta_ops", - "adagrad_da_ops", - "adagrad_ops", - "adam_ops", - "ftrl_ops", - "momentum_ops", - "gradient_descent_ops", - "proximal_adagrad_ops", - "proximal_gradient_descent_ops", - "rms_prop_ops", + "training_ops", "user_ops", ], other_hdrs = [ diff --git a/tensorflow/cc/ops/standard_ops.h b/tensorflow/cc/ops/standard_ops.h index d0537dd79e1..0c021f0b3ac 100644 --- a/tensorflow/cc/ops/standard_ops.h +++ b/tensorflow/cc/ops/standard_ops.h @@ -16,34 +16,25 @@ limitations under the License. #ifndef THIRD_PARTY_TENSORFLOW_CC_OPS_STANDARD_OPS_H_ #define THIRD_PARTY_TENSORFLOW_CC_OPS_STANDARD_OPS_H_ -#include "tensorflow/cc/ops/adadelta_ops.h" -#include "tensorflow/cc/ops/adagrad_da_ops.h" -#include "tensorflow/cc/ops/adagrad_ops.h" -#include "tensorflow/cc/ops/adam_ops.h" #include "tensorflow/cc/ops/array_ops.h" #include "tensorflow/cc/ops/candidate_sampling_ops.h" #include "tensorflow/cc/ops/const_op.h" #include "tensorflow/cc/ops/control_flow_ops.h" #include "tensorflow/cc/ops/data_flow_ops.h" -#include "tensorflow/cc/ops/ftrl_ops.h" -#include "tensorflow/cc/ops/gradient_descent_ops.h" #include "tensorflow/cc/ops/image_ops.h" #include "tensorflow/cc/ops/io_ops.h" #include "tensorflow/cc/ops/linalg_ops.h" #include "tensorflow/cc/ops/logging_ops.h" #include "tensorflow/cc/ops/lookup_ops.h" #include "tensorflow/cc/ops/math_ops.h" -#include "tensorflow/cc/ops/momentum_ops.h" #include "tensorflow/cc/ops/nn_ops.h" #include "tensorflow/cc/ops/no_op.h" #include "tensorflow/cc/ops/parsing_ops.h" -#include "tensorflow/cc/ops/proximal_adagrad_ops.h" -#include "tensorflow/cc/ops/proximal_gradient_descent_ops.h" #include "tensorflow/cc/ops/random_ops.h" -#include "tensorflow/cc/ops/rms_prop_ops.h" #include "tensorflow/cc/ops/sparse_ops.h" #include "tensorflow/cc/ops/state_ops.h" #include "tensorflow/cc/ops/string_ops.h" +#include "tensorflow/cc/ops/training_ops.h" #include "tensorflow/cc/ops/user_ops.h" #endif // THIRD_PARTY_TENSORFLOW_CC_OPS_STANDARD_OPS_H_ diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 6f428d80458..87cb212ad0f 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -535,7 +535,6 @@ cc_library( # Generates library per group of ops. 
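For the tensorflow/cc changes above: existing C++ graph-construction code should keep compiling, because the single generated training_ops wrapper (pulled in through standard_ops.h) still exposes the same op wrappers that the per-optimizer headers did. A minimal sketch of such client code, assuming the usual generated wrapper signatures (variable names and values here are hypothetical, not part of this change):

#include "tensorflow/cc/client/client_session.h"
#include "tensorflow/cc/ops/standard_ops.h"  // now includes training_ops.h

using namespace tensorflow;
using namespace tensorflow::ops;

int main() {
  Scope root = Scope::NewRootScope();
  // A 2x2 float variable initialized to ones.
  auto var = Variable(root, {2, 2}, DT_FLOAT);
  auto init = Assign(root, var, Const(root, {{1.f, 1.f}, {1.f, 1.f}}));
  // Made-up gradient and learning rate for illustration.
  auto delta = Const(root, {{0.1f, 0.2f}, {0.3f, 0.4f}});
  auto alpha = Const(root, 0.5f);
  // ApplyGradientDescent is one of the wrappers generated from training_ops.
  auto sgd = ApplyGradientDescent(root, var, alpha, delta);

  ClientSession session(root);
  std::vector<Tensor> outputs;
  TF_CHECK_OK(session.Run({init}, &outputs));
  TF_CHECK_OK(session.Run({sgd}, &outputs));  // outputs[0] holds the updated var
  return 0;
}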
tf_gen_op_libs( - extra_srcs = ["ops/training_ops.h"], op_lib_names = [ "bitwise_ops", "candidate_sampling_ops", @@ -568,16 +567,7 @@ tf_gen_op_libs( "stateless_random_ops", "string_ops", "summary_ops", - "adadelta_ops", - "adagrad_da_ops", - "adagrad_ops", - "adam_ops", - "ftrl_ops", - "momentum_ops", - "gradient_descent_ops", - "proximal_adagrad_ops", - "proximal_gradient_descent_ops", - "rms_prop_ops", + "training_ops", ], ) @@ -655,16 +645,7 @@ cc_library( ":state_ops_op_lib", ":stateless_random_ops_op_lib", ":string_ops_op_lib", - ":adadelta_ops_op_lib", - ":adagrad_da_ops_op_lib", - ":adagrad_ops_op_lib", - ":adam_ops_op_lib", - ":ftrl_ops_op_lib", - ":momentum_ops_op_lib", - ":gradient_descent_ops_op_lib", - ":proximal_adagrad_ops_op_lib", - ":proximal_gradient_descent_ops_op_lib", - ":rms_prop_ops_op_lib", + ":training_ops_op_lib", ":user_ops_op_lib", ":word2vec_ops", ] + tf_additional_cloud_op_deps(), diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 765976c37c3..10f9e7344a5 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -3888,18 +3888,9 @@ tf_kernel_library( ":bounds_check", ":training_op_helpers", ":variable_ops", - "//tensorflow/core:adadelta_ops_op_lib", - "//tensorflow/core:adagrad_da_ops_op_lib", - "//tensorflow/core:adagrad_ops_op_lib", - "//tensorflow/core:adam_ops_op_lib", "//tensorflow/core:framework", - "//tensorflow/core:ftrl_ops_op_lib", - "//tensorflow/core:gradient_descent_ops_op_lib", "//tensorflow/core:lib", - "//tensorflow/core:momentum_ops_op_lib", - "//tensorflow/core:proximal_adagrad_ops_op_lib", - "//tensorflow/core:proximal_gradient_descent_ops_op_lib", - "//tensorflow/core:rms_prop_ops_op_lib", + "//tensorflow/core:training_ops_op_lib", "//third_party/eigen3", ], ) diff --git a/tensorflow/core/ops/adadelta_ops.cc b/tensorflow/core/ops/adadelta_ops.cc deleted file mode 100644 index b7dcff4a09e..00000000000 --- a/tensorflow/core/ops/adadelta_ops.cc +++ /dev/null @@ -1,163 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyAdadeltaShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum - TF_RETURN_IF_ERROR( - c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // accum update - TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // lr - TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // rho - TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // epsilon - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 6 /* grad_idx */, &s)); - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyAdadelta") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("accum_update: Ref(T)") - .Input("lr: T") - .Input("rho: T") - .Input("epsilon: T") - .Input("grad: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdadeltaShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the adadelta scheme. - -accum = rho() * accum + (1 - rho()) * grad.square(); -update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad; -update_accum = rho() * update_accum + (1 - rho()) * update.square(); -var -= update; - -var: Should be from a Variable(). -accum: Should be from a Variable(). -accum_update: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -rho: Decay factor. Must be a scalar. -epsilon: Constant factor. Must be a scalar. -grad: The gradient. -out: Same as "var". -use_locking: If True, updating of the var, accum and update_accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("SparseApplyAdadelta") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("accum_update: Ref(T)") - .Input("lr: T") - .Input("rho: T") - .Input("epsilon: T") - .Input("grad: T") - .Input("indices: Tindices") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdadeltaShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -var: Should be from a Variable(). -accum: Should be from a Variable(). -accum_update:: Should be from a Variable(). -lr: Learning rate. Must be a scalar. -rho: Decay factor. Must be a scalar. -epsilon: Constant factor. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -out: Same as "var". -use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("ResourceApplyAdadelta") - .Input("var: resource") - .Input("accum: resource") - .Input("accum_update: resource") - .Input("lr: T") - .Input("rho: T") - .Input("epsilon: T") - .Input("grad: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdadeltaShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the adadelta scheme. 
- -accum = rho() * accum + (1 - rho()) * grad.square(); -update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad; -update_accum = rho() * update_accum + (1 - rho()) * update.square(); -var -= update; - -var: Should be from a Variable(). -accum: Should be from a Variable(). -accum_update: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -rho: Decay factor. Must be a scalar. -epsilon: Constant factor. Must be a scalar. -grad: The gradient. -use_locking: If True, updating of the var, accum and update_accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("ResourceSparseApplyAdadelta") - .Input("var: resource") - .Input("accum: resource") - .Input("accum_update: resource") - .Input("lr: T") - .Input("rho: T") - .Input("epsilon: T") - .Input("grad: T") - .Input("indices: Tindices") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdadeltaShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -var: Should be from a Variable(). -accum: Should be from a Variable(). -accum_update:: Should be from a Variable(). -lr: Learning rate. Must be a scalar. -rho: Decay factor. Must be a scalar. -epsilon: Constant factor. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -} // namespace tensorflow diff --git a/tensorflow/core/ops/adagrad_da_ops.cc b/tensorflow/core/ops/adagrad_da_ops.cc deleted file mode 100644 index 997a0249904..00000000000 --- a/tensorflow/core/ops/adagrad_da_ops.cc +++ /dev/null @@ -1,168 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyAdagradDAShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR( - c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // grad_accumulator - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), - &s)); // gradient_squared_accumulator - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); - int idx = sparse ? 
5 : 4; - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l1 - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l2 - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // global step - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyAdagradDA") - .Input("var: Ref(T)") - .Input("gradient_accumulator: Ref(T)") - .Input("gradient_squared_accumulator: Ref(T)") - .Input("grad: T") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("global_step: int64") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdagradDAShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the proximal adagrad scheme. - -var: Should be from a Variable(). -gradient_accumulator: Should be from a Variable(). -gradient_squared_accumulator: Should be from a Variable(). -grad: The gradient. -lr: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -global_step: Training step number. Must be a scalar. -out: Same as "var". -use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("SparseApplyAdagradDA") - .Input("var: Ref(T)") - .Input("gradient_accumulator: Ref(T)") - .Input("gradient_squared_accumulator: Ref(T)") - .Input("grad: T") - .Input("indices: Tindices") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("global_step: int64") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdagradDAShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update entries in '*var' and '*accum' according to the proximal adagrad scheme. - -var: Should be from a Variable(). -gradient_accumulator: Should be from a Variable(). -gradient_squared_accumulator: Should be from a Variable(). -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -lr: Learning rate. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -global_step: Training step number. Must be a scalar. -out: Same as "var". -use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("ResourceApplyAdagradDA") - .Input("var: resource") - .Input("gradient_accumulator: resource") - .Input("gradient_squared_accumulator: resource") - .Input("grad: T") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("global_step: int64") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdagradDAShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the proximal adagrad scheme. - -var: Should be from a Variable(). -gradient_accumulator: Should be from a Variable(). -gradient_squared_accumulator: Should be from a Variable(). -grad: The gradient. -lr: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -global_step: Training step number. Must be a scalar. 
-use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("ResourceSparseApplyAdagradDA") - .Input("var: resource") - .Input("gradient_accumulator: resource") - .Input("gradient_squared_accumulator: resource") - .Input("grad: T") - .Input("indices: Tindices") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("global_step: int64") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdagradDAShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update entries in '*var' and '*accum' according to the proximal adagrad scheme. - -var: Should be from a Variable(). -gradient_accumulator: Should be from a Variable(). -gradient_squared_accumulator: Should be from a Variable(). -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -lr: Learning rate. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -global_step: Training step number. Must be a scalar. -use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -} // namespace tensorflow diff --git a/tensorflow/core/ops/adagrad_ops.cc b/tensorflow/core/ops/adagrad_ops.cc deleted file mode 100644 index 03dde949e41..00000000000 --- a/tensorflow/core/ops/adagrad_ops.cc +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyAdagradShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum - TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyAdagrad") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("lr: T") - .Input("grad: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdagradShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the adagrad scheme. - -accum += grad * grad -var -= lr * grad * (1 / sqrt(accum)) - -var: Should be from a Variable(). -accum: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -grad: The gradient. -out: Same as "var". 
-use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("ResourceApplyAdagrad") - .Input("var: resource") - .Input("accum: resource") - .Input("lr: T") - .Input("grad: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdagradShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the adagrad scheme. - -accum += grad * grad -var -= lr * grad * (1 / sqrt(accum)) - -var: Should be from a Variable(). -accum: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -grad: The gradient. -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("SparseApplyAdagrad") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("lr: T") - .Input("grad: T") - .Input("indices: Tindices") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdagradShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update relevant entries in '*var' and '*accum' according to the adagrad scheme. - -That is for rows we have grad for, we update var and accum as follows: -accum += grad * grad -var -= lr * grad * (1 / sqrt(accum)) - -var: Should be from a Variable(). -accum: Should be from a Variable(). -lr: Learning rate. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -out: Same as "var". -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("ResourceSparseApplyAdagrad") - .Input("var: resource") - .Input("accum: resource") - .Input("lr: T") - .Input("grad: T") - .Input("indices: Tindices") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdagradShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update relevant entries in '*var' and '*accum' according to the adagrad scheme. - -That is for rows we have grad for, we update var and accum as follows: -accum += grad * grad -var -= lr * grad * (1 / sqrt(accum)) - -var: Should be from a Variable(). -accum: Should be from a Variable(). -lr: Learning rate. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -} // namespace tensorflow diff --git a/tensorflow/core/ops/adam_ops.cc b/tensorflow/core/ops/adam_ops.cc deleted file mode 100644 index 74cf2f5267a..00000000000 --- a/tensorflow/core/ops/adam_ops.cc +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // m - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // v - TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // beta1_power - TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // beta2_power - TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // lr - TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // beta1 - TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); // beta2 - TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused)); // epsilon - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 9 /* grad_idx */, &s)); - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyAdam") - .Input("var: Ref(T)") - .Input("m: Ref(T)") - .Input("v: Ref(T)") - .Input("beta1_power: T") - .Input("beta2_power: T") - .Input("lr: T") - .Input("beta1: T") - .Input("beta2: T") - .Input("epsilon: T") - .Input("grad: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .Attr("use_nesterov: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdamShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the Adam algorithm. - -lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t) -m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t -v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t -variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon) - -var: Should be from a Variable(). -m: Should be from a Variable(). -v: Should be from a Variable(). -beta1_power: Must be a scalar. -beta2_power: Must be a scalar. -lr: Scaling factor. Must be a scalar. -beta1: Momentum factor. Must be a scalar. -beta2: Momentum factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -grad: The gradient. -out: Same as "var". -use_locking: If `True`, updating of the var, m, and v tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -use_nesterov: If `True`, uses the nesterov update. -)doc"); - -REGISTER_OP("ResourceApplyAdam") - .Input("var: resource") - .Input("m: resource") - .Input("v: resource") - .Input("beta1_power: T") - .Input("beta2_power: T") - .Input("lr: T") - .Input("beta1: T") - .Input("beta2: T") - .Input("epsilon: T") - .Input("grad: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .Attr("use_nesterov: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyAdamShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the Adam algorithm. 
- -lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t) -m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t -v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t -variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon) - -var: Should be from a Variable(). -m: Should be from a Variable(). -v: Should be from a Variable(). -beta1_power: Must be a scalar. -beta2_power: Must be a scalar. -lr: Scaling factor. Must be a scalar. -beta1: Momentum factor. Must be a scalar. -beta2: Momentum factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -grad: The gradient. -use_locking: If `True`, updating of the var, m, and v tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -use_nesterov: If `True`, uses the nesterov update. -)doc"); - -} // namespace tensorflow diff --git a/tensorflow/core/ops/ftrl_ops.cc b/tensorflow/core/ops/ftrl_ops.cc deleted file mode 100644 index a40823eb539..00000000000 --- a/tensorflow/core/ops/ftrl_ops.cc +++ /dev/null @@ -1,368 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyFtrlShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // linear - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); - int idx = sparse ? 5 : 4; - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l1 - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l2 - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr_power - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyFtrl") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("linear: Ref(T)") - .Input("grad: T") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("lr_power: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyFtrlShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the Ftrl-proximal scheme. - -accum_new = accum + grad * grad -linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -accum = accum_new - -var: Should be from a Variable(). -accum: Should be from a Variable(). -linear: Should be from a Variable(). -grad: The gradient. -lr: Scaling factor. Must be a scalar. 
-l1: L1 regulariation. Must be a scalar. -l2: L2 regulariation. Must be a scalar. -lr_power: Scaling factor. Must be a scalar. -out: Same as "var". -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("SparseApplyFtrl") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("linear: Ref(T)") - .Input("grad: T") - .Input("indices: Tindices") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("lr_power: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyFtrlShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update relevant entries in '*var' according to the Ftrl-proximal scheme. - -That is for rows we have grad for, we update var, accum and linear as follows: -accum_new = accum + grad * grad -linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -accum = accum_new - -var: Should be from a Variable(). -accum: Should be from a Variable(). -linear: Should be from a Variable(). -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -lr: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -lr_power: Scaling factor. Must be a scalar. -out: Same as "var". -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("ResourceApplyFtrl") - .Input("var: resource") - .Input("accum: resource") - .Input("linear: resource") - .Input("grad: T") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("lr_power: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyFtrlShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the Ftrl-proximal scheme. - -accum_new = accum + grad * grad -linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -accum = accum_new - -var: Should be from a Variable(). -accum: Should be from a Variable(). -linear: Should be from a Variable(). -grad: The gradient. -lr: Scaling factor. Must be a scalar. -l1: L1 regulariation. Must be a scalar. -l2: L2 regulariation. Must be a scalar. -lr_power: Scaling factor. Must be a scalar. -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("ResourceSparseApplyFtrl") - .Input("var: resource") - .Input("accum: resource") - .Input("linear: resource") - .Input("grad: T") - .Input("indices: Tindices") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("lr_power: T") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyFtrlShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update relevant entries in '*var' according to the Ftrl-proximal scheme. 
- -That is for rows we have grad for, we update var, accum and linear as follows: -accum_new = accum + grad * grad -linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -accum = accum_new - -var: Should be from a Variable(). -accum: Should be from a Variable(). -linear: Should be from a Variable(). -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -lr: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -lr_power: Scaling factor. Must be a scalar. -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("ApplyFtrlV2") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("linear: Ref(T)") - .Input("grad: T") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("l2_shrinkage: T") - .Input("lr_power: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyFtrlShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the Ftrl-proximal scheme. - -grad_with_shrinkage = grad + 2 * l2_shrinkage * var -accum_new = accum + grad_with_shrinkage * grad_with_shrinkage -linear += grad_with_shrinkage + - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -accum = accum_new - -var: Should be from a Variable(). -accum: Should be from a Variable(). -linear: Should be from a Variable(). -grad: The gradient. -lr: Scaling factor. Must be a scalar. -l1: L1 regulariation. Must be a scalar. -l2: online L2 regulariation. Must be a scalar. -l2: L2 shrinkage regulariation. Must be a scalar. -lr_power: Scaling factor. Must be a scalar. -out: Same as "var". -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("SparseApplyFtrlV2") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("linear: Ref(T)") - .Input("grad: T") - .Input("indices: Tindices") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("l2_shrinkage: T") - .Input("lr_power: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyFtrlShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update relevant entries in '*var' according to the Ftrl-proximal scheme. - -That is for rows we have grad for, we update var, accum and linear as follows: -grad_with_shrinkage = grad + 2 * l2_shrinkage * var -accum_new = accum + grad_with_shrinkage * grad_with_shrinkage -linear += grad_with_shrinkage + - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -accum = accum_new - -var: Should be from a Variable(). -accum: Should be from a Variable(). -linear: Should be from a Variable(). -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -lr: Scaling factor. Must be a scalar. 
-l1: L1 regularization. Must be a scalar. -l2: onine L2 regularization. Must be a scalar. -l2: L2 shrinkage regulariation. Must be a scalar. -lr_power: Scaling factor. Must be a scalar. -out: Same as "var". -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("ResourceApplyFtrlV2") - .Input("var: resource") - .Input("accum: resource") - .Input("linear: resource") - .Input("grad: T") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("l2_shrinkage: T") - .Input("lr_power: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyFtrlShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the Ftrl-proximal scheme. - -grad_with_shrinkage = grad + 2 * l2_shrinkage * var -accum_new = accum + grad_with_shrinkage * grad_with_shrinkage -linear += grad_with_shrinkage + - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -accum = accum_new - -var: Should be from a Variable(). -accum: Should be from a Variable(). -linear: Should be from a Variable(). -grad: The gradient. -lr: Scaling factor. Must be a scalar. -l1: L1 regulariation. Must be a scalar. -l2: onine L2 regularization. Must be a scalar. -l2: L2 shrinkage regulariation. Must be a scalar. -lr_power: Scaling factor. Must be a scalar. -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("ResourceSparseApplyFtrlV2") - .Input("var: resource") - .Input("accum: resource") - .Input("linear: resource") - .Input("grad: T") - .Input("indices: Tindices") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("l2_shrinkage: T") - .Input("lr_power: T") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyFtrlShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update relevant entries in '*var' according to the Ftrl-proximal scheme. - -That is for rows we have grad for, we update var, accum and linear as follows: -grad_with_shrinkage = grad + 2 * l2_shrinkage * var -accum_new = accum + grad_with_shrinkage * grad_with_shrinkage -linear += grad_with_shrinkage + - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -accum = accum_new - -var: Should be from a Variable(). -accum: Should be from a Variable(). -linear: Should be from a Variable(). -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -lr: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: onine L2 regularization. Must be a scalar. -l2: L2 shrinkage regulariation. Must be a scalar. -lr_power: Scaling factor. Must be a scalar. -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. 
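Every shape function in the deleted files (here and below) calls two helpers, ShapeOrHandleShape and HandleGradAndIndicesInputs, which come from tensorflow/core/ops/training_ops.h and are not shown in this diff. A rough sketch of what they plausibly do, inferred only from how they are called above; this is an approximation, not the library's actual implementation:

#include "tensorflow/core/framework/shape_inference.h"

namespace tensorflow {

using shape_inference::DimensionHandle;
using shape_inference::InferenceContext;
using shape_inference::ShapeHandle;

// If the input is a resource handle with a known value shape, use that shape;
// otherwise fall back to the input's own shape. This lets one shape function
// serve both the Ref(T) and the resource variants of an op.
static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) {
  const auto* handle_data = c->input_handle_shapes_and_types(input);
  if (handle_data != nullptr && !handle_data->empty() &&
      (*handle_data)[0].dtype != DT_INVALID) {
    return (*handle_data)[0].shape;
  }
  return c->input(input);
}

// Dense case: merge grad into the accumulated var shape. Sparse case: indices
// must be a rank-1 tensor whose length matches grad's first dimension, and
// grad's remaining dimensions must match var's.
static Status HandleGradAndIndicesInputs(InferenceContext* c, bool sparse,
                                         int grad_idx, ShapeHandle* s) {
  ShapeHandle grad = ShapeOrHandleShape(c, grad_idx);
  if (!sparse) {
    TF_RETURN_IF_ERROR(c->Merge(*s, grad, s));
    return Status::OK();
  }
  ShapeHandle indices;
  TF_RETURN_IF_ERROR(c->WithRank(c->input(grad_idx + 1), 1, &indices));
  DimensionHandle unused;
  TF_RETURN_IF_ERROR(c->Merge(c->Dim(indices, 0), c->Dim(grad, 0), &unused));
  ShapeHandle grad_unknown_first;
  TF_RETURN_IF_ERROR(
      c->ReplaceDim(grad, 0, c->UnknownDim(), &grad_unknown_first));
  TF_RETURN_IF_ERROR(c->Merge(*s, grad_unknown_first, s));
  return Status::OK();
}

}  // namespace tensorflow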
-)doc"); - -} // namespace tensorflow diff --git a/tensorflow/core/ops/gradient_descent_ops.cc b/tensorflow/core/ops/gradient_descent_ops.cc deleted file mode 100644 index c94a91d275c..00000000000 --- a/tensorflow/core/ops/gradient_descent_ops.cc +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyGradientDescentShapeFn(InferenceContext* c) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); // alpha - TF_RETURN_IF_ERROR(c->Merge(s, c->input(2), &s)); // delta - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyGradientDescent") - .Input("var: Ref(T)") - .Input("alpha: T") - .Input("delta: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn(ApplyGradientDescentShapeFn) - .Doc(R"doc( -Update '*var' by subtracting 'alpha' * 'delta' from it. - -var: Should be from a Variable(). -alpha: Scaling factor. Must be a scalar. -delta: The change. -out: Same as "var". -use_locking: If `True`, the subtraction will be protected by a lock; - otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("ResourceApplyGradientDescent") - .Input("var: resource") - .Input("alpha: T") - .Input("delta: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn(ApplyGradientDescentShapeFn) - .Doc(R"doc( -Update '*var' by subtracting 'alpha' * 'delta' from it. - -var: Should be from a Variable(). -alpha: Scaling factor. Must be a scalar. -delta: The change. -use_locking: If `True`, the subtraction will be protected by a lock; - otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -} // namespace tensorflow diff --git a/tensorflow/core/ops/momentum_ops.cc b/tensorflow/core/ops/momentum_ops.cc deleted file mode 100644 index f9701bd4881..00000000000 --- a/tensorflow/core/ops/momentum_ops.cc +++ /dev/null @@ -1,179 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyMomentumShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum - TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); - int idx = sparse ? 5 : 4; - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // momentum - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyMomentum") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("lr: T") - .Input("grad: T") - .Input("momentum: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .Attr("use_nesterov: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyMomentumShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the momentum scheme. Set use_nesterov = True if you -want to use Nesterov momentum. - -accum = accum * momentum + grad -var -= lr * accum - -var: Should be from a Variable(). -accum: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -grad: The gradient. -momentum: Momentum. Must be a scalar. -out: Same as "var". -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -use_nesterov: If `True`, the tensor passed to compute grad will be -var - lr * momentum * accum, so in the end, the var you get is actually -var - lr * momentum * accum. -)doc"); - -REGISTER_OP("SparseApplyMomentum") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("lr: T") - .Input("grad: T") - .Input("indices: Tindices") - .Input("momentum: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .Attr("use_nesterov: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyMomentumShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update relevant entries in '*var' and '*accum' according to the momentum scheme. -Set use_nesterov = True if you want to use Nesterov momentum. - -That is for rows we have grad for, we update var and accum as follows: - -accum = accum * momentum + grad -var -= lr * accum - -var: Should be from a Variable(). -accum: Should be from a Variable(). -lr: Learning rate. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -momentum: Momentum. Must be a scalar. -out: Same as "var". -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -use_nesterov: If `True`, the tensor passed to compute grad will be -var - lr * momentum * accum, so in the end, the var you get is actually -var - lr * momentum * accum. 
-)doc"); - -REGISTER_OP("ResourceApplyMomentum") - .Input("var: resource") - .Input("accum: resource") - .Input("lr: T") - .Input("grad: T") - .Input("momentum: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .Attr("use_nesterov: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyMomentumShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the momentum scheme. Set use_nesterov = True if you -want to use Nesterov momentum. - -accum = accum * momentum + grad -var -= lr * accum - -var: Should be from a Variable(). -accum: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -grad: The gradient. -momentum: Momentum. Must be a scalar. -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -use_nesterov: If `True`, the tensor passed to compute grad will be -var - lr * momentum * accum, so in the end, the var you get is actually -var - lr * momentum * accum. -)doc"); - -REGISTER_OP("ResourceSparseApplyMomentum") - .Input("var: resource") - .Input("accum: resource") - .Input("lr: T") - .Input("grad: T") - .Input("indices: Tindices") - .Input("momentum: T") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .Attr("use_nesterov: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyMomentumShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update relevant entries in '*var' and '*accum' according to the momentum scheme. -Set use_nesterov = True if you want to use Nesterov momentum. - -That is for rows we have grad for, we update var and accum as follows: - -accum = accum * momentum + grad -var -= lr * accum - -var: Should be from a Variable(). -accum: Should be from a Variable(). -lr: Learning rate. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -momentum: Momentum. Must be a scalar. -use_locking: If `True`, updating of the var and accum tensors will be protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -use_nesterov: If `True`, the tensor passed to compute grad will be -var - lr * momentum * accum, so in the end, the var you get is actually -var - lr * momentum * accum. -)doc"); - -} // namespace tensorflow diff --git a/tensorflow/core/ops/proximal_adagrad_ops.cc b/tensorflow/core/ops/proximal_adagrad_ops.cc deleted file mode 100644 index a618519d82b..00000000000 --- a/tensorflow/core/ops/proximal_adagrad_ops.cc +++ /dev/null @@ -1,183 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyProximalAdagradShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum - TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr - TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // l1 - TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // l2 - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 5 /* grad_idx */, &s)); - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyProximalAdagrad") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("grad: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyProximalAdagradShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' and '*accum' according to FOBOS with Adagrad learning rate. -accum += grad * grad -prox_v = var - lr * grad * (1 / sqrt(accum)) -var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} - -var: Should be from a Variable(). -accum: Should be from a Variable(). -grad: The gradient. -lr: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -out: Same as "var". -use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("ResourceApplyProximalAdagrad") - .Input("var: resource") - .Input("accum: resource") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("grad: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyProximalAdagradShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' and '*accum' according to FOBOS with Adagrad learning rate. -accum += grad * grad -prox_v = var - lr * grad * (1 / sqrt(accum)) -var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} - -var: Should be from a Variable(). -accum: Should be from a Variable(). -grad: The gradient. -lr: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("SparseApplyProximalAdagrad") - .Input("var: Ref(T)") - .Input("accum: Ref(T)") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("grad: T") - .Input("indices: Tindices") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyProximalAdagradShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Sparse update entries in '*var' and '*accum' according to FOBOS algorithm. - -That is for rows we have grad for, we update var and accum as follows: -accum += grad * grad -prox_v = var -prox_v -= lr * grad * (1 / sqrt(accum)) -var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} - -var: Should be from a Variable(). 
-accum: Should be from a Variable(). -lr: Learning rate. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -out: Same as "var". -use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("ResourceSparseApplyProximalAdagrad") - .Input("var: resource") - .Input("accum: resource") - .Input("lr: T") - .Input("l1: T") - .Input("l2: T") - .Input("grad: T") - .Input("indices: Tindices") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyProximalAdagradShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Sparse update entries in '*var' and '*accum' according to FOBOS algorithm. - -That is for rows we have grad for, we update var and accum as follows: -accum += grad * grad -prox_v = var -prox_v -= lr * grad * (1 / sqrt(accum)) -var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} - -var: Should be from a Variable(). -accum: Should be from a Variable(). -lr: Learning rate. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -use_locking: If True, updating of the var and accum tensors will be protected by -a lock; otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -static Status ApplyFtrlShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // linear - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); - int idx = sparse ? 5 : 4; - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l1 - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l2 - TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr_power - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -} // namespace tensorflow diff --git a/tensorflow/core/ops/proximal_gradient_descent_ops.cc b/tensorflow/core/ops/proximal_gradient_descent_ops.cc deleted file mode 100644 index 42e762c0529..00000000000 --- a/tensorflow/core/ops/proximal_gradient_descent_ops.cc +++ /dev/null @@ -1,151 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyProximalGradientDescentShapeFn(InferenceContext* c, - bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); // alpha - TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // l1 - TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // l2 - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 4 /* grad_idx */, &s)); - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyProximalGradientDescent") - .Input("var: Ref(T)") - .Input("alpha: T") - .Input("l1: T") - .Input("l2: T") - .Input("delta: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyProximalGradientDescentShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' as FOBOS algorithm with fixed learning rate. -prox_v = var - alpha * delta -var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} - -var: Should be from a Variable(). -alpha: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -delta: The change. -out: Same as "var". -use_locking: If True, the subtraction will be protected by a lock; - otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("SparseApplyProximalGradientDescent") - .Input("var: Ref(T)") - .Input("alpha: T") - .Input("l1: T") - .Input("l2: T") - .Input("grad: T") - .Input("indices: Tindices") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyProximalGradientDescentShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Sparse update '*var' as FOBOS algorithm with fixed learning rate. - -That is for rows we have grad for, we update var as follows: -prox_v = var - alpha * grad -var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} - -var: Should be from a Variable(). -alpha: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -out: Same as "var". -use_locking: If True, the subtraction will be protected by a lock; - otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("ResourceApplyProximalGradientDescent") - .Input("var: resource") - .Input("alpha: T") - .Input("l1: T") - .Input("l2: T") - .Input("delta: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyProximalGradientDescentShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' as FOBOS algorithm with fixed learning rate. -prox_v = var - alpha * delta -var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} - -var: Should be from a Variable(). -alpha: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -delta: The change. 
-use_locking: If True, the subtraction will be protected by a lock; - otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -REGISTER_OP("ResourceSparseApplyProximalGradientDescent") - .Input("var: resource") - .Input("alpha: T") - .Input("l1: T") - .Input("l2: T") - .Input("grad: T") - .Input("indices: Tindices") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyProximalGradientDescentShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Sparse update '*var' as FOBOS algorithm with fixed learning rate. - -That is for rows we have grad for, we update var as follows: -prox_v = var - alpha * grad -var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} - -var: Should be from a Variable(). -alpha: Scaling factor. Must be a scalar. -l1: L1 regularization. Must be a scalar. -l2: L2 regularization. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var and accum. -use_locking: If True, the subtraction will be protected by a lock; - otherwise the behavior is undefined, but may exhibit less contention. -)doc"); - -} // namespace tensorflow diff --git a/tensorflow/core/ops/rms_prop_ops.cc b/tensorflow/core/ops/rms_prop_ops.cc deleted file mode 100644 index d13cef6413d..00000000000 --- a/tensorflow/core/ops/rms_prop_ops.cc +++ /dev/null @@ -1,425 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -static Status ApplyRMSPropShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // ms - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // mom - TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // lr - TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // rho - TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // momentum - TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // epsilon - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 7 /* grad_idx */, &s)); - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -static Status ApplyCenteredRMSPropShapeFn(InferenceContext* c, bool sparse) { - ShapeHandle unused; - ShapeHandle s = ShapeOrHandleShape(c, 0); // var - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // ms - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // mg - TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 3), &s)); // mom - TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // lr - TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // rho - TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // momentum - TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); // epsilon - TF_RETURN_IF_ERROR( - HandleGradAndIndicesInputs(c, sparse, 8 /* grad_idx */, &s)); - if (c->num_outputs() > 0) { - c->set_output(0, s); - } - return Status::OK(); -} - -REGISTER_OP("ApplyRMSProp") - .Input("var: Ref(T)") - .Input("ms: Ref(T)") - .Input("mom: Ref(T)") - .Input("lr: T") - .Input("rho: T") - .Input("momentum: T") - .Input("epsilon: T") - .Input("grad: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyRMSPropShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the RMSProp algorithm. -Note that in dense implementation of this algorithm, ms and mom will -update even if the grad is zero, but in this sparse implementation, ms -and mom will not update in iterations during which the grad is zero. - -mean_square = decay * mean_square + (1-decay) * gradient ** 2 -Delta = learning_rate * gradient / sqrt(mean_square + epsilon) - -ms <- rho * ms_{t-1} + (1-rho) * grad * grad -mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) -var <- var - mom - -var: Should be from a Variable(). -ms: Should be from a Variable(). -mom: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -rho: Decay rate. Must be a scalar. -grad: The gradient. -out: Same as "var". -use_locking: If `True`, updating of the var, ms, and mom tensors is protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. 
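For quick reference, the dense RMSProp update described in the docstring above boils down to the following standalone sketch (plain std::vector arithmetic over flat buffers, not the TensorFlow kernel; the function name is only illustrative):

#include <cmath>
#include <vector>

// Dense RMSProp step, mirroring the docstring:
//   ms  <- rho * ms + (1 - rho) * grad^2
//   mom <- momentum * mom + lr * grad / sqrt(ms + epsilon)
//   var <- var - mom
void RmsPropStep(std::vector<float>& var, std::vector<float>& ms,
                 std::vector<float>& mom, const std::vector<float>& grad,
                 float lr, float rho, float momentum, float epsilon) {
  for (size_t i = 0; i < var.size(); ++i) {
    ms[i] = rho * ms[i] + (1.0f - rho) * grad[i] * grad[i];
    mom[i] = momentum * mom[i] + lr * grad[i] / std::sqrt(ms[i] + epsilon);
    var[i] -= mom[i];
  }
}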
-)doc"); - -REGISTER_OP("ApplyCenteredRMSProp") - .Input("var: Ref(T)") - .Input("mg: Ref(T)") - .Input("ms: Ref(T)") - .Input("mom: Ref(T)") - .Input("lr: T") - .Input("rho: T") - .Input("momentum: T") - .Input("epsilon: T") - .Input("grad: T") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyCenteredRMSPropShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the centered RMSProp algorithm. -The centered RMSProp algorithm uses an estimate of the centered second moment -(i.e., the variance) for normalization, as opposed to regular RMSProp, which -uses the (uncentered) second moment. This often helps with training, but is -slightly more expensive in terms of computation and memory. - -Note that in dense implementation of this algorithm, mg, ms, and mom will -update even if the grad is zero, but in this sparse implementation, mg, ms, -and mom will not update in iterations during which the grad is zero. - -mean_square = decay * mean_square + (1-decay) * gradient ** 2 -mean_grad = decay * mean_grad + (1-decay) * gradient - -Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) - -mg <- rho * mg_{t-1} + (1-rho) * grad -ms <- rho * ms_{t-1} + (1-rho) * grad * grad -mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon) -var <- var - mom - -var: Should be from a Variable(). -mg: Should be from a Variable(). -ms: Should be from a Variable(). -mom: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -rho: Decay rate. Must be a scalar. -grad: The gradient. -out: Same as "var". -use_locking: If `True`, updating of the var, mg, ms, and mom tensors is - protected by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("SparseApplyRMSProp") - .Input("var: Ref(T)") - .Input("ms: Ref(T)") - .Input("mom: Ref(T)") - .Input("lr: T") - .Input("rho: T") - .Input("momentum: T") - .Input("epsilon: T") - .Input("grad: T") - .Input("indices: Tindices") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyRMSPropShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the RMSProp algorithm. -Note that in dense implementation of this algorithm, ms and mom will -update even if the grad is zero, but in this sparse implementation, ms -and mom will not update in iterations during which the grad is zero. - -mean_square = decay * mean_square + (1-decay) * gradient ** 2 -Delta = learning_rate * gradient / sqrt(mean_square + epsilon) - -ms <- rho * ms_{t-1} + (1-rho) * grad * grad -mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) -var <- var - mom - -var: Should be from a Variable(). -ms: Should be from a Variable(). -mom: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -rho: Decay rate. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var, ms and mom. -out: Same as "var". -use_locking: If `True`, updating of the var, ms, and mom tensors is protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. 
-)doc"); - -REGISTER_OP("SparseApplyCenteredRMSProp") - .Input("var: Ref(T)") - .Input("mg: Ref(T)") - .Input("ms: Ref(T)") - .Input("mom: Ref(T)") - .Input("lr: T") - .Input("rho: T") - .Input("momentum: T") - .Input("epsilon: T") - .Input("grad: T") - .Input("indices: Tindices") - .Output("out: Ref(T)") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyCenteredRMSPropShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the centered RMSProp algorithm. -The centered RMSProp algorithm uses an estimate of the centered second moment -(i.e., the variance) for normalization, as opposed to regular RMSProp, which -uses the (uncentered) second moment. This often helps with training, but is -slightly more expensive in terms of computation and memory. - -Note that in dense implementation of this algorithm, mg, ms, and mom will -update even if the grad is zero, but in this sparse implementation, mg, ms, -and mom will not update in iterations during which the grad is zero. - -mean_square = decay * mean_square + (1-decay) * gradient ** 2 -mean_grad = decay * mean_grad + (1-decay) * gradient -Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) - -ms <- rho * ms_{t-1} + (1-rho) * grad * grad -mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) -var <- var - mom - -var: Should be from a Variable(). -mg: Should be from a Variable(). -ms: Should be from a Variable(). -mom: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -rho: Decay rate. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var, ms and mom. -out: Same as "var". -use_locking: If `True`, updating of the var, mg, ms, and mom tensors is - protected by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("ResourceApplyRMSProp") - .Input("var: resource") - .Input("ms: resource") - .Input("mom: resource") - .Input("lr: T") - .Input("rho: T") - .Input("momentum: T") - .Input("epsilon: T") - .Input("grad: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyRMSPropShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the RMSProp algorithm. -Note that in dense implementation of this algorithm, ms and mom will -update even if the grad is zero, but in this sparse implementation, ms -and mom will not update in iterations during which the grad is zero. - -mean_square = decay * mean_square + (1-decay) * gradient ** 2 -Delta = learning_rate * gradient / sqrt(mean_square + epsilon) - -ms <- rho * ms_{t-1} + (1-rho) * grad * grad -mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) -var <- var - mom - -var: Should be from a Variable(). -ms: Should be from a Variable(). -mom: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -rho: Decay rate. Must be a scalar. -grad: The gradient. -use_locking: If `True`, updating of the var, ms, and mom tensors is protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. 
-)doc"); - -REGISTER_OP("ResourceApplyCenteredRMSProp") - .Input("var: resource") - .Input("mg: resource") - .Input("ms: resource") - .Input("mom: resource") - .Input("lr: T") - .Input("rho: T") - .Input("momentum: T") - .Input("epsilon: T") - .Input("grad: T") - .Attr("T: numbertype") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyCenteredRMSPropShapeFn(c, false /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the centered RMSProp algorithm. -The centered RMSProp algorithm uses an estimate of the centered second moment -(i.e., the variance) for normalization, as opposed to regular RMSProp, which -uses the (uncentered) second moment. This often helps with training, but is -slightly more expensive in terms of computation and memory. - -Note that in dense implementation of this algorithm, mg, ms, and mom will -update even if the grad is zero, but in this sparse implementation, mg, ms, -and mom will not update in iterations during which the grad is zero. - -mean_square = decay * mean_square + (1-decay) * gradient ** 2 -mean_grad = decay * mean_grad + (1-decay) * gradient - -Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) - -mg <- rho * mg_{t-1} + (1-rho) * grad -ms <- rho * ms_{t-1} + (1-rho) * grad * grad -mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon) -var <- var - mom - -var: Should be from a Variable(). -mg: Should be from a Variable(). -ms: Should be from a Variable(). -mom: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -rho: Decay rate. Must be a scalar. -grad: The gradient. -use_locking: If `True`, updating of the var, mg, ms, and mom tensors is - protected by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -REGISTER_OP("ResourceSparseApplyRMSProp") - .Input("var: resource") - .Input("ms: resource") - .Input("mom: resource") - .Input("lr: T") - .Input("rho: T") - .Input("momentum: T") - .Input("epsilon: T") - .Input("grad: T") - .Input("indices: Tindices") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyRMSPropShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the RMSProp algorithm. -Note that in dense implementation of this algorithm, ms and mom will -update even if the grad is zero, but in this sparse implementation, ms -and mom will not update in iterations during which the grad is zero. - -mean_square = decay * mean_square + (1-decay) * gradient ** 2 -Delta = learning_rate * gradient / sqrt(mean_square + epsilon) - -ms <- rho * ms_{t-1} + (1-rho) * grad * grad -mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) -var <- var - mom - -var: Should be from a Variable(). -ms: Should be from a Variable(). -mom: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -rho: Decay rate. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var, ms and mom. -use_locking: If `True`, updating of the var, ms, and mom tensors is protected - by a lock; otherwise the behavior is undefined, but may exhibit less - contention. 
-)doc"); - -REGISTER_OP("ResourceSparseApplyCenteredRMSProp") - .Input("var: resource") - .Input("mg: resource") - .Input("ms: resource") - .Input("mom: resource") - .Input("lr: T") - .Input("rho: T") - .Input("momentum: T") - .Input("epsilon: T") - .Input("grad: T") - .Input("indices: Tindices") - .Attr("T: numbertype") - .Attr("Tindices: {int32, int64}") - .Attr("use_locking: bool = false") - .SetShapeFn([](InferenceContext* c) { - return ApplyCenteredRMSPropShapeFn(c, true /* sparse */); - }) - .Doc(R"doc( -Update '*var' according to the centered RMSProp algorithm. -The centered RMSProp algorithm uses an estimate of the centered second moment -(i.e., the variance) for normalization, as opposed to regular RMSProp, which -uses the (uncentered) second moment. This often helps with training, but is -slightly more expensive in terms of computation and memory. - -Note that in dense implementation of this algorithm, mg, ms, and mom will -update even if the grad is zero, but in this sparse implementation, mg, ms, -and mom will not update in iterations during which the grad is zero. - -mean_square = decay * mean_square + (1-decay) * gradient ** 2 -mean_grad = decay * mean_grad + (1-decay) * gradient -Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) - -ms <- rho * ms_{t-1} + (1-rho) * grad * grad -mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) -var <- var - mom - -var: Should be from a Variable(). -mg: Should be from a Variable(). -ms: Should be from a Variable(). -mom: Should be from a Variable(). -lr: Scaling factor. Must be a scalar. -epsilon: Ridge term. Must be a scalar. -rho: Decay rate. Must be a scalar. -grad: The gradient. -indices: A vector of indices into the first dimension of var, ms and mom. -use_locking: If `True`, updating of the var, mg, ms, and mom tensors is - protected by a lock; otherwise the behavior is undefined, but may exhibit less - contention. -)doc"); - -} // namespace tensorflow diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc new file mode 100644 index 00000000000..6f06b87d589 --- /dev/null +++ b/tensorflow/core/ops/training_ops.cc @@ -0,0 +1,1799 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) { + auto* handle_data = c->input_handle_shapes_and_types(input); + if (handle_data != nullptr && !handle_data->empty() && + (*handle_data)[0].dtype != DT_INVALID) { + return (*handle_data)[0].shape; + } + return c->input(input); +} + +// Handle the gradient and, if , indices inputs. 
+// is an input+output parameter, containing the current known input shape to +// the gradient. +static Status HandleGradAndIndicesInputs(InferenceContext* c, bool sparse, + int grad_idx, ShapeHandle* s) { + ShapeHandle grad = ShapeOrHandleShape(c, grad_idx); + if (!sparse) { + TF_RETURN_IF_ERROR(c->Merge(*s, grad, s)); + return Status::OK(); + } + // Indices is a vector where indices.dim[0].rank == grad[0].rank. + ShapeHandle indices; + TF_RETURN_IF_ERROR(c->WithRank(c->input(grad_idx + 1), 1, &indices)); + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->Merge(c->Dim(indices, 0), c->Dim(grad, 0), &unused)); + + // Trailing part of grad matches trailing part of *s. + ShapeHandle grad_unknown_first; + TF_RETURN_IF_ERROR( + c->ReplaceDim(grad, 0, c->UnknownDim(), &grad_unknown_first)); + TF_RETURN_IF_ERROR(c->Merge(*s, grad_unknown_first, s)); + + return Status::OK(); +} + +static Status ApplyGradientDescentShapeFn(InferenceContext* c) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); // alpha + TF_RETURN_IF_ERROR(c->Merge(s, c->input(2), &s)); // delta + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyGradientDescent") + .Input("var: Ref(T)") + .Input("alpha: T") + .Input("delta: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn(ApplyGradientDescentShapeFn) + .Doc(R"doc( +Update '*var' by subtracting 'alpha' * 'delta' from it. + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +delta: The change. +out: Same as "var". +use_locking: If `True`, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceApplyGradientDescent") + .Input("var: resource") + .Input("alpha: T") + .Input("delta: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn(ApplyGradientDescentShapeFn) + .Doc(R"doc( +Update '*var' by subtracting 'alpha' * 'delta' from it. + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +delta: The change. +use_locking: If `True`, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +static Status ApplyProximalGradientDescentShapeFn(InferenceContext* c, + bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); // alpha + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // l1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // l2 + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 4 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyProximalGradientDescent") + .Input("var: Ref(T)") + .Input("alpha: T") + .Input("l1: T") + .Input("l2: T") + .Input("delta: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalGradientDescentShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' as FOBOS algorithm with fixed learning rate. +prox_v = var - alpha * delta +var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. 
Must be a scalar. +delta: The change. +out: Same as "var". +use_locking: If True, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("SparseApplyProximalGradientDescent") + .Input("var: Ref(T)") + .Input("alpha: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Input("indices: Tindices") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalGradientDescentShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Sparse update '*var' as FOBOS algorithm with fixed learning rate. + +That is for rows we have grad for, we update var as follows: +prox_v = var - alpha * grad +var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +out: Same as "var". +use_locking: If True, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceApplyProximalGradientDescent") + .Input("var: resource") + .Input("alpha: T") + .Input("l1: T") + .Input("l2: T") + .Input("delta: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalGradientDescentShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' as FOBOS algorithm with fixed learning rate. +prox_v = var - alpha * delta +var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +delta: The change. +use_locking: If True, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyProximalGradientDescent") + .Input("var: resource") + .Input("alpha: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalGradientDescentShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Sparse update '*var' as FOBOS algorithm with fixed learning rate. + +That is for rows we have grad for, we update var as follows: +prox_v = var - alpha * grad +var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +use_locking: If True, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. 
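The four ProximalGradientDescent ops above all implement the same FOBOS step: a plain gradient move followed by L1 soft-thresholding and L2 shrinkage. A minimal standalone sketch mirroring the docstring equations (plain buffers, not the TF kernel):

#include <algorithm>
#include <cmath>
#include <vector>

// One FOBOS step with fixed learning rate alpha:
//   prox_v = var - alpha * delta
//   var    = sign(prox_v) / (1 + alpha * l2) * max(|prox_v| - alpha * l1, 0)
void ProximalGradientDescentStep(std::vector<float>& var,
                                 const std::vector<float>& delta, float alpha,
                                 float l1, float l2) {
  for (size_t i = 0; i < var.size(); ++i) {
    const float prox_v = var[i] - alpha * delta[i];
    const float shrunk = std::max(std::fabs(prox_v) - alpha * l1, 0.0f);
    var[i] = (prox_v > 0 ? 1.0f : (prox_v < 0 ? -1.0f : 0.0f)) * shrunk /
             (1.0f + alpha * l2);
  }
}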
+)doc"); + +static Status ApplyAdadeltaShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum + TF_RETURN_IF_ERROR( + c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // accum update + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // rho + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // epsilon + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 6 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyAdadelta") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("accum_update: Ref(T)") + .Input("lr: T") + .Input("rho: T") + .Input("epsilon: T") + .Input("grad: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdadeltaShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the adadelta scheme. + +accum = rho() * accum + (1 - rho()) * grad.square(); +update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad; +update_accum = rho() * update_accum + (1 - rho()) * update.square(); +var -= update; + +var: Should be from a Variable(). +accum: Should be from a Variable(). +accum_update: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +rho: Decay factor. Must be a scalar. +epsilon: Constant factor. Must be a scalar. +grad: The gradient. +out: Same as "var". +use_locking: If True, updating of the var, accum and update_accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("SparseApplyAdadelta") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("accum_update: Ref(T)") + .Input("lr: T") + .Input("rho: T") + .Input("epsilon: T") + .Input("grad: T") + .Input("indices: Tindices") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdadeltaShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +var: Should be from a Variable(). +accum: Should be from a Variable(). +accum_update:: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +rho: Decay factor. Must be a scalar. +epsilon: Constant factor. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +out: Same as "var". +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceApplyAdadelta") + .Input("var: resource") + .Input("accum: resource") + .Input("accum_update: resource") + .Input("lr: T") + .Input("rho: T") + .Input("epsilon: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdadeltaShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the adadelta scheme. + +accum = rho() * accum + (1 - rho()) * grad.square(); +update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad; +update_accum = rho() * update_accum + (1 - rho()) * update.square(); +var -= update; + +var: Should be from a Variable(). +accum: Should be from a Variable(). 
+accum_update: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +rho: Decay factor. Must be a scalar. +epsilon: Constant factor. Must be a scalar. +grad: The gradient. +use_locking: If True, updating of the var, accum and update_accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyAdadelta") + .Input("var: resource") + .Input("accum: resource") + .Input("accum_update: resource") + .Input("lr: T") + .Input("rho: T") + .Input("epsilon: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdadeltaShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +var: Should be from a Variable(). +accum: Should be from a Variable(). +accum_update:: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +rho: Decay factor. Must be a scalar. +epsilon: Constant factor. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +static Status ApplyAdagradShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyAdagrad") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("lr: T") + .Input("grad: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the adagrad scheme. + +accum += grad * grad +var -= lr * grad * (1 / sqrt(accum)) + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +grad: The gradient. +out: Same as "var". +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceApplyAdagrad") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the adagrad scheme. + +accum += grad * grad +var -= lr * grad * (1 / sqrt(accum)) + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +grad: The gradient. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. 
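The Adagrad ops above keep a per-element sum of squared gradients and scale each step by its inverse square root. As a standalone sketch (plain buffers, not the TF kernel):

#include <cmath>
#include <vector>

// Adagrad step: accum += grad^2; var -= lr * grad / sqrt(accum)
void AdagradStep(std::vector<float>& var, std::vector<float>& accum,
                 const std::vector<float>& grad, float lr) {
  for (size_t i = 0; i < var.size(); ++i) {
    accum[i] += grad[i] * grad[i];
    var[i] -= lr * grad[i] / std::sqrt(accum[i]);
  }
}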
+)doc"); + +static Status ApplyProximalAdagradShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // l1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // l2 + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 5 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyProximalAdagrad") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalAdagradShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' and '*accum' according to FOBOS with Adagrad learning rate. +accum += grad * grad +prox_v = var - lr * grad * (1 / sqrt(accum)) +var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} + +var: Should be from a Variable(). +accum: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +out: Same as "var". +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceApplyProximalAdagrad") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalAdagradShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' and '*accum' according to FOBOS with Adagrad learning rate. +accum += grad * grad +prox_v = var - lr * grad * (1 / sqrt(accum)) +var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} + +var: Should be from a Variable(). +accum: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("SparseApplyAdagrad") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("lr: T") + .Input("grad: T") + .Input("indices: Tindices") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' and '*accum' according to the adagrad scheme. + +That is for rows we have grad for, we update var and accum as follows: +accum += grad * grad +var -= lr * grad * (1 / sqrt(accum)) + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +out: Same as "var". +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. 
+)doc"); + +REGISTER_OP("ResourceSparseApplyAdagrad") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' and '*accum' according to the adagrad scheme. + +That is for rows we have grad for, we update var and accum as follows: +accum += grad * grad +var -= lr * grad * (1 / sqrt(accum)) + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +static Status ApplyAdagradDAShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR( + c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // grad_accumulator + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), + &s)); // gradient_squared_accumulator + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); + int idx = sparse ? 5 : 4; + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l2 + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // global step + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyAdagradDA") + .Input("var: Ref(T)") + .Input("gradient_accumulator: Ref(T)") + .Input("gradient_squared_accumulator: Ref(T)") + .Input("grad: T") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("global_step: int64") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradDAShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the proximal adagrad scheme. + +var: Should be from a Variable(). +gradient_accumulator: Should be from a Variable(). +gradient_squared_accumulator: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +global_step: Training step number. Must be a scalar. +out: Same as "var". +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("SparseApplyAdagradDA") + .Input("var: Ref(T)") + .Input("gradient_accumulator: Ref(T)") + .Input("gradient_squared_accumulator: Ref(T)") + .Input("grad: T") + .Input("indices: Tindices") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("global_step: int64") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradDAShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update entries in '*var' and '*accum' according to the proximal adagrad scheme. + +var: Should be from a Variable(). +gradient_accumulator: Should be from a Variable(). 
+gradient_squared_accumulator: Should be from a Variable(). +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +lr: Learning rate. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +global_step: Training step number. Must be a scalar. +out: Same as "var". +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("SparseApplyProximalAdagrad") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Input("indices: Tindices") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalAdagradShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Sparse update entries in '*var' and '*accum' according to FOBOS algorithm. + +That is for rows we have grad for, we update var and accum as follows: +accum += grad * grad +prox_v = var +prox_v -= lr * grad * (1 / sqrt(accum)) +var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +out: Same as "var". +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceApplyAdagradDA") + .Input("var: resource") + .Input("gradient_accumulator: resource") + .Input("gradient_squared_accumulator: resource") + .Input("grad: T") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("global_step: int64") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradDAShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the proximal adagrad scheme. + +var: Should be from a Variable(). +gradient_accumulator: Should be from a Variable(). +gradient_squared_accumulator: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +global_step: Training step number. Must be a scalar. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyAdagradDA") + .Input("var: resource") + .Input("gradient_accumulator: resource") + .Input("gradient_squared_accumulator: resource") + .Input("grad: T") + .Input("indices: Tindices") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("global_step: int64") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradDAShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update entries in '*var' and '*accum' according to the proximal adagrad scheme. + +var: Should be from a Variable(). +gradient_accumulator: Should be from a Variable(). +gradient_squared_accumulator: Should be from a Variable(). +grad: The gradient. 
+indices: A vector of indices into the first dimension of var and accum. +lr: Learning rate. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +global_step: Training step number. Must be a scalar. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyProximalAdagrad") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalAdagradShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Sparse update entries in '*var' and '*accum' according to FOBOS algorithm. + +That is for rows we have grad for, we update var and accum as follows: +accum += grad * grad +prox_v = var +prox_v -= lr * grad * (1 / sqrt(accum)) +var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +static Status ApplyFtrlShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // linear + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); + int idx = sparse ? 5 : 4; + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l2 + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr_power + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyFtrl") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("linear: Ref(T)") + .Input("grad: T") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("lr_power: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyFtrlShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the Ftrl-proximal scheme. + +accum_new = accum + grad * grad +linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + +var: Should be from a Variable(). +accum: Should be from a Variable(). +linear: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regulariation. Must be a scalar. +l2: L2 regulariation. Must be a scalar. +lr_power: Scaling factor. Must be a scalar. +out: Same as "var". 
+use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("SparseApplyFtrl") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("linear: Ref(T)") + .Input("grad: T") + .Input("indices: Tindices") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("lr_power: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyFtrlShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' according to the Ftrl-proximal scheme. + +That is for rows we have grad for, we update var, accum and linear as follows: +accum_new = accum + grad * grad +linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + +var: Should be from a Variable(). +accum: Should be from a Variable(). +linear: Should be from a Variable(). +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +lr_power: Scaling factor. Must be a scalar. +out: Same as "var". +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceApplyFtrl") + .Input("var: resource") + .Input("accum: resource") + .Input("linear: resource") + .Input("grad: T") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("lr_power: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyFtrlShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the Ftrl-proximal scheme. + +accum_new = accum + grad * grad +linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + +var: Should be from a Variable(). +accum: Should be from a Variable(). +linear: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regulariation. Must be a scalar. +l2: L2 regulariation. Must be a scalar. +lr_power: Scaling factor. Must be a scalar. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyFtrl") + .Input("var: resource") + .Input("accum: resource") + .Input("linear: resource") + .Input("grad: T") + .Input("indices: Tindices") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("lr_power: T") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyFtrlShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' according to the Ftrl-proximal scheme. 
+ +That is for rows we have grad for, we update var, accum and linear as follows: +accum_new = accum + grad * grad +linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + +var: Should be from a Variable(). +accum: Should be from a Variable(). +linear: Should be from a Variable(). +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +lr_power: Scaling factor. Must be a scalar. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ApplyFtrlV2") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("linear: Ref(T)") + .Input("grad: T") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("l2_shrinkage: T") + .Input("lr_power: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyFtrlShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the Ftrl-proximal scheme. + +grad_with_shrinkage = grad + 2 * l2_shrinkage * var +accum_new = accum + grad_with_shrinkage * grad_with_shrinkage +linear += grad_with_shrinkage + + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + +var: Should be from a Variable(). +accum: Should be from a Variable(). +linear: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regulariation. Must be a scalar. +l2: online L2 regulariation. Must be a scalar. +l2: L2 shrinkage regulariation. Must be a scalar. +lr_power: Scaling factor. Must be a scalar. +out: Same as "var". +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("SparseApplyFtrlV2") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("linear: Ref(T)") + .Input("grad: T") + .Input("indices: Tindices") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("l2_shrinkage: T") + .Input("lr_power: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyFtrlShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' according to the Ftrl-proximal scheme. + +That is for rows we have grad for, we update var, accum and linear as follows: +grad_with_shrinkage = grad + 2 * l2_shrinkage * var +accum_new = accum + grad_with_shrinkage * grad_with_shrinkage +linear += grad_with_shrinkage + + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + +var: Should be from a Variable(). +accum: Should be from a Variable(). +linear: Should be from a Variable(). +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +lr: Scaling factor. Must be a scalar. 
+l1: L1 regularization. Must be a scalar.
+l2: online L2 regularization. Must be a scalar.
+l2_shrinkage: L2 shrinkage regularization. Must be a scalar.
+lr_power: Scaling factor. Must be a scalar.
+out: Same as "var".
+use_locking: If `True`, updating of the var and accum tensors will be protected
+  by a lock; otherwise the behavior is undefined, but may exhibit less
+  contention.
+)doc");
+
+REGISTER_OP("ResourceApplyFtrlV2")
+    .Input("var: resource")
+    .Input("accum: resource")
+    .Input("linear: resource")
+    .Input("grad: T")
+    .Input("lr: T")
+    .Input("l1: T")
+    .Input("l2: T")
+    .Input("l2_shrinkage: T")
+    .Input("lr_power: T")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyFtrlShapeFn(c, false /* sparse */);
+    })
+    .Doc(R"doc(
+Update '*var' according to the Ftrl-proximal scheme.
+
+grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+linear += grad_with_shrinkage +
+    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+accum = accum_new
+
+var: Should be from a Variable().
+accum: Should be from a Variable().
+linear: Should be from a Variable().
+grad: The gradient.
+lr: Scaling factor. Must be a scalar.
+l1: L1 regularization. Must be a scalar.
+l2: online L2 regularization. Must be a scalar.
+l2_shrinkage: L2 shrinkage regularization. Must be a scalar.
+lr_power: Scaling factor. Must be a scalar.
+use_locking: If `True`, updating of the var and accum tensors will be protected
+  by a lock; otherwise the behavior is undefined, but may exhibit less
+  contention.
+)doc");
+
+REGISTER_OP("ResourceSparseApplyFtrlV2")
+    .Input("var: resource")
+    .Input("accum: resource")
+    .Input("linear: resource")
+    .Input("grad: T")
+    .Input("indices: Tindices")
+    .Input("lr: T")
+    .Input("l1: T")
+    .Input("l2: T")
+    .Input("l2_shrinkage: T")
+    .Input("lr_power: T")
+    .Attr("T: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyFtrlShapeFn(c, true /* sparse */);
+    })
+    .Doc(R"doc(
+Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+
+That is for rows we have grad for, we update var, accum and linear as follows:
+grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+linear += grad_with_shrinkage +
+    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+accum = accum_new
+
+var: Should be from a Variable().
+accum: Should be from a Variable().
+linear: Should be from a Variable().
+grad: The gradient.
+indices: A vector of indices into the first dimension of var and accum.
+lr: Scaling factor. Must be a scalar.
+l1: L1 regularization. Must be a scalar.
+l2: online L2 regularization. Must be a scalar.
+l2_shrinkage: L2 shrinkage regularization. Must be a scalar.
+lr_power: Scaling factor. Must be a scalar.
+use_locking: If `True`, updating of the var and accum tensors will be protected
+  by a lock; otherwise the behavior is undefined, but may exhibit less
+  contention.
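The FtrlV2 ops above add an L2 shrinkage term to the base FTRL-proximal update. A standalone sketch of one step follows; note that it subtracts the (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var term from linear, matching the standard FTRL-proximal formulation and the ResourceApplyFtrl docstring earlier in this file (a couple of the docstrings above print that term with a plus sign).

#include <algorithm>
#include <cmath>
#include <vector>

// One FTRL-proximal step with L2 shrinkage (plain buffers, not the TF kernel).
void FtrlV2Step(std::vector<float>& var, std::vector<float>& accum,
                std::vector<float>& linear, const std::vector<float>& grad,
                float lr, float l1, float l2, float l2_shrinkage,
                float lr_power) {
  for (size_t i = 0; i < var.size(); ++i) {
    const float g_shrink = grad[i] + 2.0f * l2_shrinkage * var[i];
    const float accum_new = accum[i] + g_shrink * g_shrink;
    const float sigma =
        (std::pow(accum_new, -lr_power) - std::pow(accum[i], -lr_power)) / lr;
    linear[i] += g_shrink - sigma * var[i];
    const float quadratic =
        1.0f / (std::pow(accum_new, lr_power) * lr) + 2.0f * l2;
    var[i] = std::fabs(linear[i]) > l1
                 ? (std::copysign(l1, linear[i]) - linear[i]) / quadratic
                 : 0.0f;
    accum[i] = accum_new;
  }
}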
+)doc"); + +static Status ApplyMomentumShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); + int idx = sparse ? 5 : 4; + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // momentum + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyMomentum") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("lr: T") + .Input("grad: T") + .Input("momentum: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .Attr("use_nesterov: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyMomentumShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the momentum scheme. Set use_nesterov = True if you +want to use Nesterov momentum. + +accum = accum * momentum + grad +var -= lr * accum + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +grad: The gradient. +momentum: Momentum. Must be a scalar. +out: Same as "var". +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +use_nesterov: If `True`, the tensor passed to compute grad will be +var - lr * momentum * accum, so in the end, the var you get is actually +var - lr * momentum * accum. +)doc"); + +REGISTER_OP("SparseApplyMomentum") + .Input("var: Ref(T)") + .Input("accum: Ref(T)") + .Input("lr: T") + .Input("grad: T") + .Input("indices: Tindices") + .Input("momentum: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .Attr("use_nesterov: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyMomentumShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' and '*accum' according to the momentum scheme. +Set use_nesterov = True if you want to use Nesterov momentum. + +That is for rows we have grad for, we update var and accum as follows: + +accum = accum * momentum + grad +var -= lr * accum + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +momentum: Momentum. Must be a scalar. +out: Same as "var". +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +use_nesterov: If `True`, the tensor passed to compute grad will be +var - lr * momentum * accum, so in the end, the var you get is actually +var - lr * momentum * accum. +)doc"); + +REGISTER_OP("ResourceApplyMomentum") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("grad: T") + .Input("momentum: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .Attr("use_nesterov: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyMomentumShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the momentum scheme. Set use_nesterov = True if you +want to use Nesterov momentum. 
+ +accum = accum * momentum + grad +var -= lr * accum + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +grad: The gradient. +momentum: Momentum. Must be a scalar. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +use_nesterov: If `True`, the tensor passed to compute grad will be +var - lr * momentum * accum, so in the end, the var you get is actually +var - lr * momentum * accum. +)doc"); + +REGISTER_OP("ResourceSparseApplyMomentum") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("grad: T") + .Input("indices: Tindices") + .Input("momentum: T") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .Attr("use_nesterov: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyMomentumShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' and '*accum' according to the momentum scheme. +Set use_nesterov = True if you want to use Nesterov momentum. + +That is for rows we have grad for, we update var and accum as follows: + +accum = accum * momentum + grad +var -= lr * accum + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +momentum: Momentum. Must be a scalar. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +use_nesterov: If `True`, the tensor passed to compute grad will be +var - lr * momentum * accum, so in the end, the var you get is actually +var - lr * momentum * accum. +)doc"); + +static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // m + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // v + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // beta1_power + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // beta2_power + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // beta1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); // beta2 + TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused)); // epsilon + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 9 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyAdam") + .Input("var: Ref(T)") + .Input("m: Ref(T)") + .Input("v: Ref(T)") + .Input("beta1_power: T") + .Input("beta2_power: T") + .Input("lr: T") + .Input("beta1: T") + .Input("beta2: T") + .Input("epsilon: T") + .Input("grad: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .Attr("use_nesterov: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdamShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the Adam algorithm. + +lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t) +m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t +v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t +variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon) + +var: Should be from a Variable(). 
+m: Should be from a Variable().
+v: Should be from a Variable().
+beta1_power: Must be a scalar.
+beta2_power: Must be a scalar.
+lr: Scaling factor. Must be a scalar.
+beta1: Momentum factor. Must be a scalar.
+beta2: Momentum factor. Must be a scalar.
+epsilon: Ridge term. Must be a scalar.
+grad: The gradient.
+out: Same as "var".
+use_locking: If `True`, updating of the var, m, and v tensors will be protected
+  by a lock; otherwise the behavior is undefined, but may exhibit less
+  contention.
+use_nesterov: If `True`, uses the Nesterov update.
+)doc");
+
+REGISTER_OP("ResourceApplyAdam")
+    .Input("var: resource")
+    .Input("m: resource")
+    .Input("v: resource")
+    .Input("beta1_power: T")
+    .Input("beta2_power: T")
+    .Input("lr: T")
+    .Input("beta1: T")
+    .Input("beta2: T")
+    .Input("epsilon: T")
+    .Input("grad: T")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .Attr("use_nesterov: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyAdamShapeFn(c, false /* sparse */);
+    })
+    .Doc(R"doc(
+Update '*var' according to the Adam algorithm.
+
+lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
+v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
+variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+
+var: Should be from a Variable().
+m: Should be from a Variable().
+v: Should be from a Variable().
+beta1_power: Must be a scalar.
+beta2_power: Must be a scalar.
+lr: Scaling factor. Must be a scalar.
+beta1: Momentum factor. Must be a scalar.
+beta2: Momentum factor. Must be a scalar.
+epsilon: Ridge term. Must be a scalar.
+grad: The gradient.
+use_locking: If `True`, updating of the var, m, and v tensors will be protected
+  by a lock; otherwise the behavior is undefined, but may exhibit less
+  contention.
+use_nesterov: If `True`, uses the Nesterov update.
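The Adam docstring above is compact, so a NumPy sketch of one dense step may help. Note that the op takes beta1_power and beta2_power (that is, beta1^t and beta2^t) as explicit scalar inputs, whereas this illustrative helper derives them from a step counter t; the helper name and signature are assumptions, not the TensorFlow API.

import numpy as np

def adam_step(var, m, v, grad, lr, beta1, beta2, epsilon, t):
    # Bias-corrected learning rate, as in the docstring:
    #   lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
    lr_t = lr * np.sqrt(1.0 - beta2 ** t) / (1.0 - beta1 ** t)
    m = beta1 * m + (1.0 - beta1) * grad           # first-moment estimate
    v = beta2 * v + (1.0 - beta2) * grad * grad    # second-moment estimate
    var = var - lr_t * m / (np.sqrt(v) + epsilon)
    return var, m, v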
+)doc"); + +static Status ApplyRMSPropShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // ms + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // mom + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // rho + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // momentum + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // epsilon + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 7 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +static Status ApplyCenteredRMSPropShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // ms + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // mg + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 3), &s)); // mom + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // rho + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // momentum + TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); // epsilon + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 8 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + +REGISTER_OP("ApplyRMSProp") + .Input("var: Ref(T)") + .Input("ms: Ref(T)") + .Input("mom: Ref(T)") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyRMSPropShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the RMSProp algorithm. +Note that in dense implementation of this algorithm, ms and mom will +update even if the grad is zero, but in this sparse implementation, ms +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +Delta = learning_rate * gradient / sqrt(mean_square + epsilon) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + +var: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +out: Same as "var". +use_locking: If `True`, updating of the var, ms, and mom tensors is protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ApplyCenteredRMSProp") + .Input("var: Ref(T)") + .Input("mg: Ref(T)") + .Input("ms: Ref(T)") + .Input("mom: Ref(T)") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyCenteredRMSPropShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the centered RMSProp algorithm. 
+The centered RMSProp algorithm uses an estimate of the centered second moment +(i.e., the variance) for normalization, as opposed to regular RMSProp, which +uses the (uncentered) second moment. This often helps with training, but is +slightly more expensive in terms of computation and memory. + +Note that in dense implementation of this algorithm, mg, ms, and mom will +update even if the grad is zero, but in this sparse implementation, mg, ms, +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +mean_grad = decay * mean_grad + (1-decay) * gradient + +Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) + +mg <- rho * mg_{t-1} + (1-rho) * grad +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon) +var <- var - mom + +var: Should be from a Variable(). +mg: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +out: Same as "var". +use_locking: If `True`, updating of the var, mg, ms, and mom tensors is + protected by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("SparseApplyRMSProp") + .Input("var: Ref(T)") + .Input("ms: Ref(T)") + .Input("mom: Ref(T)") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Input("indices: Tindices") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyRMSPropShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the RMSProp algorithm. +Note that in dense implementation of this algorithm, ms and mom will +update even if the grad is zero, but in this sparse implementation, ms +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +Delta = learning_rate * gradient / sqrt(mean_square + epsilon) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + +var: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var, ms and mom. +out: Same as "var". +use_locking: If `True`, updating of the var, ms, and mom tensors is protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("SparseApplyCenteredRMSProp") + .Input("var: Ref(T)") + .Input("mg: Ref(T)") + .Input("ms: Ref(T)") + .Input("mom: Ref(T)") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Input("indices: Tindices") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyCenteredRMSPropShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the centered RMSProp algorithm. 
+The centered RMSProp algorithm uses an estimate of the centered second moment +(i.e., the variance) for normalization, as opposed to regular RMSProp, which +uses the (uncentered) second moment. This often helps with training, but is +slightly more expensive in terms of computation and memory. + +Note that in dense implementation of this algorithm, mg, ms, and mom will +update even if the grad is zero, but in this sparse implementation, mg, ms, +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +mean_grad = decay * mean_grad + (1-decay) * gradient +Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + +var: Should be from a Variable(). +mg: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var, ms and mom. +out: Same as "var". +use_locking: If `True`, updating of the var, mg, ms, and mom tensors is + protected by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceApplyRMSProp") + .Input("var: resource") + .Input("ms: resource") + .Input("mom: resource") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyRMSPropShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the RMSProp algorithm. +Note that in dense implementation of this algorithm, ms and mom will +update even if the grad is zero, but in this sparse implementation, ms +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +Delta = learning_rate * gradient / sqrt(mean_square + epsilon) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + +var: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +use_locking: If `True`, updating of the var, ms, and mom tensors is protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceApplyCenteredRMSProp") + .Input("var: resource") + .Input("mg: resource") + .Input("ms: resource") + .Input("mom: resource") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyCenteredRMSPropShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the centered RMSProp algorithm. +The centered RMSProp algorithm uses an estimate of the centered second moment +(i.e., the variance) for normalization, as opposed to regular RMSProp, which +uses the (uncentered) second moment. This often helps with training, but is +slightly more expensive in terms of computation and memory. 
+ +Note that in dense implementation of this algorithm, mg, ms, and mom will +update even if the grad is zero, but in this sparse implementation, mg, ms, +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +mean_grad = decay * mean_grad + (1-decay) * gradient + +Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) + +mg <- rho * mg_{t-1} + (1-rho) * grad +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon) +var <- var - mom + +var: Should be from a Variable(). +mg: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +use_locking: If `True`, updating of the var, mg, ms, and mom tensors is + protected by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyRMSProp") + .Input("var: resource") + .Input("ms: resource") + .Input("mom: resource") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyRMSPropShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the RMSProp algorithm. +Note that in dense implementation of this algorithm, ms and mom will +update even if the grad is zero, but in this sparse implementation, ms +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +Delta = learning_rate * gradient / sqrt(mean_square + epsilon) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + +var: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var, ms and mom. +use_locking: If `True`, updating of the var, ms, and mom tensors is protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyCenteredRMSProp") + .Input("var: resource") + .Input("mg: resource") + .Input("ms: resource") + .Input("mom: resource") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyCenteredRMSPropShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the centered RMSProp algorithm. +The centered RMSProp algorithm uses an estimate of the centered second moment +(i.e., the variance) for normalization, as opposed to regular RMSProp, which +uses the (uncentered) second moment. This often helps with training, but is +slightly more expensive in terms of computation and memory. 
+ +Note that in dense implementation of this algorithm, mg, ms, and mom will +update even if the grad is zero, but in this sparse implementation, mg, ms, +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +mean_grad = decay * mean_grad + (1-decay) * gradient +Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + +var: Should be from a Variable(). +mg: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var, ms and mom. +use_locking: If `True`, updating of the var, mg, ms, and mom tensors is + protected by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +} // namespace tensorflow diff --git a/tensorflow/core/ops/training_ops.h b/tensorflow/core/ops/training_ops.h deleted file mode 100644 index a61c41e2a7f..00000000000 --- a/tensorflow/core/ops/training_ops.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef THIRD_PARTY_TENSORFLOW_CORE_OPS_TRAINING_OPS_H_ -#define THIRD_PARTY_TENSORFLOW_CORE_OPS_TRAINING_OPS_H_ - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/ops/training_ops.h" - -namespace tensorflow { - -using shape_inference::DimensionHandle; -using shape_inference::InferenceContext; -using shape_inference::ShapeHandle; - -static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) { - auto* handle_data = c->input_handle_shapes_and_types(input); - if (handle_data != nullptr && !handle_data->empty() && - (*handle_data)[0].dtype != DT_INVALID) { - return (*handle_data)[0].shape; - } - return c->input(input); -} - -// Handle the gradient and, if , indices inputs. -// is an input+output parameter, containing the current known input shape to -// the gradient. -static Status HandleGradAndIndicesInputs(InferenceContext* c, bool sparse, - int grad_idx, ShapeHandle* s) { - ShapeHandle grad = ShapeOrHandleShape(c, grad_idx); - if (!sparse) { - TF_RETURN_IF_ERROR(c->Merge(*s, grad, s)); - return Status::OK(); - } - // Indices is a vector where indices.dim[0].rank == grad[0].rank. - ShapeHandle indices; - TF_RETURN_IF_ERROR(c->WithRank(c->input(grad_idx + 1), 1, &indices)); - DimensionHandle unused; - TF_RETURN_IF_ERROR(c->Merge(c->Dim(indices, 0), c->Dim(grad, 0), &unused)); - - // Trailing part of grad matches trailing part of *s. 
- ShapeHandle grad_unknown_first; - TF_RETURN_IF_ERROR( - c->ReplaceDim(grad, 0, c->UnknownDim(), &grad_unknown_first)); - TF_RETURN_IF_ERROR(c->Merge(*s, grad_unknown_first, s)); - - return Status::OK(); -} - -} // namespace tensorflow - -#endif // THIRD_PARTY_TENSORFLOW_CORE_OPS_TRAINING_OPS_H_ diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD index 26ece1d7416..4680e3ba160 100644 --- a/tensorflow/java/BUILD +++ b/tensorflow/java/BUILD @@ -65,16 +65,7 @@ tf_java_op_gen_srcjar( "sparse_ops", "state_ops", "string_ops", - "adadelta_ops", - "adagrad_da_ops", - "adagrad_ops", - "adam_ops", - "ftrl_ops", - "momentum_ops", - "gradient_descent_ops", - "proximal_adagrad_ops", - "proximal_gradient_descent_ops", - "rms_prop_ops", + "training_ops", "user_ops", ], ) diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index d9c3fab0cf7..7ea5cedd8a4 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -1363,53 +1363,8 @@ tf_gen_op_wrapper_private_py( ) tf_gen_op_wrapper_private_py( - name = "adagrad_ops_gen", - out = "training/gen_adagrad_ops.py", -) - -tf_gen_op_wrapper_private_py( - name = "adagrad_da_ops_gen", - out = "training/gen_adagrad_da_ops.py", -) - -tf_gen_op_wrapper_private_py( - name = "adadelta_ops_gen", - out = "training/gen_adadelta_ops.py", -) - -tf_gen_op_wrapper_private_py( - name = "adam_ops_gen", - out = "training/gen_adam_ops.py", -) - -tf_gen_op_wrapper_private_py( - name = "ftrl_ops_gen", - out = "training/gen_ftrl_ops.py", -) - -tf_gen_op_wrapper_private_py( - name = "gradient_descent_ops_gen", - out = "training/gen_gradient_descent_ops.py", -) - -tf_gen_op_wrapper_private_py( - name = "momentum_ops_gen", - out = "training/gen_momentum_ops.py", -) - -tf_gen_op_wrapper_private_py( - name = "proximal_adagrad_ops_gen", - out = "training/gen_proximal_adagrad_ops.py", -) - -tf_gen_op_wrapper_private_py( - name = "proximal_gradient_descent_ops_gen", - out = "training/gen_proximal_gradient_descent_ops.py", -) - -tf_gen_op_wrapper_private_py( - name = "rms_prop_ops_gen", - out = "training/gen_rms_prop_ops.py", + name = "training_ops_gen", + out = "training/gen_training_ops.py", ) py_library( @@ -2640,10 +2595,6 @@ py_library( ), srcs_version = "PY2AND3", deps = [ - ":adadelta_ops_gen", - ":adagrad_da_ops_gen", - ":adagrad_ops_gen", - ":adam_ops_gen", ":array_ops", ":checkpoint_ops_gen", ":client", @@ -2652,8 +2603,6 @@ py_library( ":errors", ":framework", ":framework_for_generated_wrappers", - ":ftrl_ops_gen", - ":gradient_descent_ops_gen", ":gradients", ":init_ops", ":io_ops", @@ -2661,21 +2610,18 @@ py_library( ":lib", ":lookup_ops", ":math_ops", - ":momentum_ops_gen", ":platform", ":protos_all_py", - ":proximal_adagrad_ops_gen", - ":proximal_gradient_descent_ops_gen", ":pywrap_tensorflow", ":random_ops", ":resource_variable_ops", ":resources", - ":rms_prop_ops_gen", ":sdca_ops", ":sparse_ops", ":state_ops", ":string_ops", ":summary", + ":training_ops_gen", ":util", ":variable_scope", ":variables", diff --git a/tensorflow/python/training/training_ops.py b/tensorflow/python/training/training_ops.py index b33e7cb2749..e98c32b6144 100644 --- a/tensorflow/python/training/training_ops.py +++ b/tensorflow/python/training/training_ops.py @@ -19,16 +19,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.training import gen_training_ops # go/tf-wildcard-import # pylint: disable=wildcard-import -from tensorflow.python.training.gen_adadelta_ops import * -from 
tensorflow.python.training.gen_adagrad_da_ops import * -from tensorflow.python.training.gen_adagrad_ops import * -from tensorflow.python.training.gen_adam_ops import * -from tensorflow.python.training.gen_ftrl_ops import * -from tensorflow.python.training.gen_gradient_descent_ops import * -from tensorflow.python.training.gen_momentum_ops import * -from tensorflow.python.training.gen_proximal_adagrad_ops import * -from tensorflow.python.training.gen_proximal_gradient_descent_ops import * -from tensorflow.python.training.gen_rms_prop_ops import * +from tensorflow.python.training.gen_training_ops import * # pylint: enable=wildcard-import diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 5c0c507c9f3..5c156e7ee26 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -196,7 +196,7 @@ def tf_opts_nortti_if_android(): # Given a list of "op_lib_names" (a list of files in the ops directory # without their .cc extensions), generate a library for that file. -def tf_gen_op_libs(op_lib_names, deps=None, extra_srcs=[]): +def tf_gen_op_libs(op_lib_names, deps=None): # Make library out of each op so it can also be used to generate wrappers # for various languages. if not deps: @@ -205,7 +205,7 @@ def tf_gen_op_libs(op_lib_names, deps=None, extra_srcs=[]): native.cc_library( name=n + "_op_lib", copts=tf_copts(), - srcs=extra_srcs + ["ops/" + n + ".cc"], + srcs=["ops/" + n + ".cc"], deps=deps + [clean_dep("//tensorflow/core:framework")], visibility=["//visibility:public"], alwayslink=1,
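The RMSProp and centered-RMSProp registrations earlier in this change document their update rules only as pseudocode. For reference, here is a NumPy sketch of the dense centered variant; plain RMSProp is the same with the mg term dropped. The helper name is illustrative and is not part of this change or of the TensorFlow API.

import numpy as np

def centered_rms_prop_step(var, mg, ms, mom, grad, lr, rho, momentum, epsilon):
    mg = rho * mg + (1.0 - rho) * grad             # running mean of gradients
    ms = rho * ms + (1.0 - rho) * grad * grad      # running mean of squared gradients
    # Centering: subtracting mg^2 approximates the gradient variance.
    mom = momentum * mom + lr * grad / np.sqrt(ms - mg * mg + epsilon)
    var = var - mom
    return var, mg, ms, mom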