parent 84f091dff8
commit b9310932ce
@@ -25,11 +25,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import adam
 
@@ -48,12 +46,7 @@ class LazyAdamOptimizer(adam.AdamOptimizer):
   may lead to different empirical results.
   """
 
-  def _apply_sparse_shared(self,
-                           grad,
-                           var,
-                           indices,
-                           scatter_update,
-                           scatter_sub):
+  def _apply_sparse(self, grad, var):
     beta1_power, beta2_power = self._get_beta_accumulators()
     beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
     beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
@@ -65,51 +58,23 @@ class LazyAdamOptimizer(adam.AdamOptimizer):
 
     # \\(m := beta1 * m + (1 - beta1) * g_t\\)
     m = self.get_slot(var, "m")
-    m_t = scatter_update(m, indices,
-                         beta1_t * array_ops.gather(m, indices) +
-                         (1 - beta1_t) * grad)
+    m_t = state_ops.scatter_update(m, grad.indices,
+                                   beta1_t * array_ops.gather(m, grad.indices) +
+                                   (1 - beta1_t) * grad.values,
+                                   use_locking=self._use_locking)
 
     # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\)
     v = self.get_slot(var, "v")
-    v_t = scatter_update(v, indices,
-                         beta2_t * array_ops.gather(v, indices) +
-                         (1 - beta2_t) * math_ops.square(grad))
+    v_t = state_ops.scatter_update(v, grad.indices,
+                                   beta2_t * array_ops.gather(v, grad.indices) +
+                                   (1 - beta2_t) * math_ops.square(grad.values),
+                                   use_locking=self._use_locking)
 
     # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\)
-    m_t_slice = array_ops.gather(m_t, indices)
-    v_t_slice = array_ops.gather(v_t, indices)
+    m_t_slice = array_ops.gather(m_t, grad.indices)
+    v_t_slice = array_ops.gather(v_t, grad.indices)
     denominator_slice = math_ops.sqrt(v_t_slice) + epsilon_t
-    var_update = scatter_sub(var, indices,
-                             lr * m_t_slice / denominator_slice)
+    var_update = state_ops.scatter_sub(var, grad.indices,
+                                       lr * m_t_slice / denominator_slice,
+                                       use_locking=self._use_locking)
     return control_flow_ops.group(var_update, m_t, v_t)
-
-  def _apply_sparse(self, grad, var):
-    return self._apply_sparse_shared(
-        grad.values, var, grad.indices,
-        self._scatter_update,
-        self._scatter_sub)
-
-  def _resource_apply_sparse(self, grad, var, indices):
-    return self._apply_sparse_shared(
-        grad, var, indices,
-        self._resource_scatter_update,
-        self._resource_scatter_sub)
-
-  # Utility functions for updating resource or non-resource variables.
-  def _scatter_update(self, x, i, v):
-    return state_ops.scatter_update(
-        x, i, v, use_locking=self._use_locking)
-
-  def _scatter_sub(self, x, i, v):
-    return state_ops.scatter_sub(
-        x, i, v, use_locking=self._use_locking)
-
-  def _resource_scatter_update(self, x, i, v):
-    update_op = resource_variable_ops.resource_scatter_update(x.handle, i, v)
-    with ops.control_dependencies([update_op]):
-      return x.value()
-
-  def _resource_scatter_sub(self, x, i, v):
-    sub_op = resource_variable_ops.resource_scatter_sub(x.handle, i, v)
-    with ops.control_dependencies([sub_op]):
-      return x.value()
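For orientation, the update that the new _apply_sparse assembles can be written in a few lines of NumPy. This is an illustrative sketch only, not part of the change: the helper name lazy_adam_step is made up, and the bias-corrected step size lr (together with beta1_t, beta2_t, epsilon_t) is computed by the optimizer just above the lines shown in the hunk, so it is taken here as a plain argument. The point is that only the rows listed in the gradient's indices are read or written, which is what makes the optimizer "lazy" compared with dense Adam.

import numpy as np

def lazy_adam_step(var, m, v, indices, grad_values,
                   lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
  # m := beta1 * m + (1 - beta1) * g_t, only at the gradient's rows.
  m[indices] = beta1 * m[indices] + (1 - beta1) * grad_values
  # v := beta2 * v + (1 - beta2) * g_t^2, only at the gradient's rows.
  v[indices] = beta2 * v[indices] + (1 - beta2) * np.square(grad_values)
  # var -= lr * m_t / (sqrt(v_t) + epsilon), again only at those rows.
  var[indices] -= lr * m[indices] / (np.sqrt(v[indices]) + epsilon)
  return var, m, v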
@@ -27,7 +27,6 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
@@ -52,7 +51,7 @@ def adam_update_numpy(param,
 
 class AdamOptimizerTest(test.TestCase):
 
-  def doTestSparse(self, use_resource=False):
+  def testSparse(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
         # Initialize variables for numpy implementation.
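The adam_update_numpy helper named in the hunk header above is the test's dense NumPy reference; its body is not part of this diff. For readers without the file at hand, a standard bias-corrected Adam step of the kind such a reference implements looks like the following (an assumed, self-contained version, not necessarily the helper's exact code):

import numpy as np

def adam_step_numpy(param, g_t, t, m, v,
                    alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
  # Bias-corrected learning rate for step t (t starts at 1).
  alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t)
  m_t = beta1 * m + (1 - beta1) * g_t
  v_t = beta2 * v + (1 - beta2) * g_t * g_t
  param_t = param - alpha_t * m_t / (np.sqrt(v_t) + epsilon)
  return param_t, m_t, v_t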
@@ -62,10 +61,6 @@ class AdamOptimizerTest(test.TestCase):
         var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
         grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
 
-        if use_resource:
-          var0 = resource_variable_ops.ResourceVariable(var0_np)
-          var1 = resource_variable_ops.ResourceVariable(var1_np)
-        else:
-          var0 = variables.Variable(var0_np)
-          var1 = variables.Variable(var1_np)
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
         grads0_np_indices = np.array([0, 1], dtype=np.int32)
@@ -99,12 +94,6 @@ class AdamOptimizerTest(test.TestCase):
           self.assertAllCloseAccordingToType(var0_np, var0.eval())
           self.assertAllCloseAccordingToType(var1_np, var1.eval())
 
-  def testSparse(self):
-    self.doTestSparse(use_resource=False)
-
-  def testResourceSparse(self):
-    self.doTestSparse(use_resource=True)
-
   def testSparseDevicePlacement(self):
     for index_dtype in [dtypes.int32, dtypes.int64]:
       with self.test_session(force_gpu=test.is_gpu_available()):
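Finally, a hedged usage sketch of the optimizer under test, assuming a TF 1.x graph-mode environment where tf.contrib.opt is available; variable names and shapes are illustrative only. LazyAdamOptimizer is intended as a drop-in replacement for the regular Adam optimizer when gradients arrive as sparse IndexedSlices, for example from an embedding lookup:

import tensorflow as tf

embeddings = tf.get_variable("embeddings", shape=[10000, 64])
ids = tf.placeholder(tf.int32, shape=[None])
targets = tf.placeholder(tf.float32, shape=[None, 64])

looked_up = tf.nn.embedding_lookup(embeddings, ids)  # gradient is IndexedSlices
loss = tf.reduce_mean(tf.square(looked_up - targets))

opt = tf.contrib.opt.LazyAdamOptimizer(learning_rate=0.001)
train_op = opt.minimize(loss)  # only the rows gathered via `ids` get m/v/var updates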