Adam optimizer with learning rate multipliers
30 Apr 2018. Below is my implementation of the Adam optimizer with learning rate multipliers, implemented and tested with the TensorFlow backend.
from keras.legacy import interfaces
import keras.backend as K
from keras.optimizers import Optimizer
class Adam_lr_mult(Optimizer):
    """Adam optimizer with per-weight learning rate multipliers.

    Built on the Keras Adam implementation: identical update rule, except
    that the bias-corrected step size for a weight is scaled by a multiplier
    looked up by substring match against the weight's name.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1: float, 0 < beta < 1. Generally close to 1.
        beta_2: float, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
        decay: float >= 0. Learning rate decay over each update.
        amsgrad: boolean. Whether to apply the AMSGrad variant of this
            algorithm from the paper "On the Convergence of Adam and
            Beyond".
        multipliers: dict mapping a substring of a weight name (typically a
            layer name) to a float multiplier for that weight's learning
            rate, or `None` for no per-weight scaling.
        debug_verbose: boolean. If True, print the learning rate chosen for
            each weight while the update ops are being built.

    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
        - [On the Convergence of Adam and Beyond](https://openreview.net/forum?id=ryQu7f-RZ)

    AUTHOR: Erik Brorson
    """

    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999,
                 epsilon=None, decay=0., amsgrad=False,
                 multipliers=None, debug_verbose=False, **kwargs):
        super(Adam_lr_mult, self).__init__(**kwargs)
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.lr = K.variable(lr, name='lr')
            self.beta_1 = K.variable(beta_1, name='beta_1')
            self.beta_2 = K.variable(beta_2, name='beta_2')
            self.decay = K.variable(decay, name='decay')
        if epsilon is None:
            epsilon = K.epsilon()
        self.epsilon = epsilon
        self.initial_decay = decay
        self.amsgrad = amsgrad
        self.multipliers = multipliers
        self.debug_verbose = debug_verbose

    def _matching_multiplier_key(self, param_name):
        """Return the multipliers key matching `param_name`, or None.

        Keys are matched as substrings of the weight name. When several
        keys match (e.g. 'layer_1' is a substring of 'layer_10/kernel'),
        the longest key wins so the most specific entry takes precedence;
        previously the first match in dict order was used, which silently
        applied the wrong multiplier on such name collisions.
        """
        if not self.multipliers:
            return None
        matches = [key for key in self.multipliers if key in param_name]
        if not matches:
            return None
        return max(matches, key=len)

    @interfaces.legacy_get_updates_support
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            # Time-based decay, matching the stock Keras Adam.
            lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

        # Bias-corrected step size at step t (Kingma & Ba, Algorithm 1).
        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        # First/second moment accumulators, one per parameter.
        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            # Dummy slots so `self.weights` has the same layout either way.
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            # Per-weight learning rate: scale by the configured multiplier
            # whose key best matches this weight's name, if any.
            key = self._matching_multiplier_key(p.name)
            if key is not None:
                new_lr_t = lr_t * self.multipliers[key]
                if self.debug_verbose:
                    print('Setting {} to learning rate {}'.format(key, new_lr_t))
                    print(K.get_value(new_lr_t))
            else:
                new_lr_t = lr_t
                if self.debug_verbose:
                    print('No change in learning rate {}'.format(p.name))
                    print(K.get_value(new_lr_t))

            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            if self.amsgrad:
                # AMSGrad: normalize by the running max of v_t instead.
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - new_lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - new_lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        """Return the full constructor config so the optimizer round-trips
        through serialization (previously `debug_verbose` was dropped)."""
        config = {'lr': float(K.get_value(self.lr)),
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon,
                  'amsgrad': self.amsgrad,
                  'multipliers': self.multipliers,
                  'debug_verbose': self.debug_verbose}
        base_config = super(Adam_lr_mult, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
In addition to the normal parameters, the optimizer takes the `multipliers` argument, which is a dictionary mapping layer names to learning rate multipliers. It works as follows: imagine we have a network with three layers named layer_1, layer_2, and layer_3. The multiplier dictionary would then be:
learning_rate_multipliers = {}
learning_rate_multipliers['layer_1'] = 1
learning_rate_multipliers['layer_2'] = 0.5
learning_rate_multipliers['layer_3'] = 0.1
We then create the optimizer like so:
adam_with_lr_multipliers = Adam_lr_mult(multipliers=learning_rate_multipliers)
This optimizer can then be used to train a model as usual.