for input, target in dataset:
    def closure():
        optimizer.zero_grad()
        output = model(input)
        loss = loss_fn(output, target)
        loss.backward()
        return loss
    optimizer.step(closure)
With that out of the way, let's get started.
SGD
Let's look at SGD first. SGD has no notion of momentum, that is, $m_t = g_t$. Substituting into step 3, the descent step is simply $\eta_t = \alpha \cdot g_t$.

SGD's biggest drawbacks are that it descends slowly, and that it can keep oscillating between the two walls of a ravine, getting stuck at a local optimum.
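As a toy illustration of the update rule above (illustrative names and numbers, not library code):

import math

def sgd_step(w, grad, lr=0.1):
    # Plain SGD: the step is just the learning rate times the gradient.
    return w - lr * grad

# Minimizing f(w) = w**2 (gradient 2w) from w = 5.0:
w = 5.0
for _ in range(3):
    w = sgd_step(w, grad=2 * w)
# w goes 5.0 -> 4.0 -> 3.2 -> 2.56, creeping towards the minimum at 0.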
SGD with Momentum
To suppress SGD's oscillation, SGDM reasons that gradient descent can carry inertia: when going downhill, if the slope is steep, the momentum lets you run a bit faster. SGDM is short for SGD with momentum; it adds first-order momentum on top of SGD: $m_t = \beta_1 \cdot m_{t-1} + (1 - \beta_1) \cdot g_t$.
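Written out as plain Python, that recurrence is just two lines per step (a sketch with illustrative names, not the PyTorch source):

def sgdm_step(w, m, grad, lr=0.1, beta=0.9):
    # First-order momentum: exponential moving average of past gradients.
    m = beta * m + (1 - beta) * grad
    # Step in the direction of the smoothed gradient.
    w = w - lr * m
    return w, m

torch.optim.SGD implements this recurrence, with a configurable dampening factor on the gradient term and an optional Nesterov correction; its source follows: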
def __init__(self, params, lr=required, momentum=0, dampening=0,
             weight_decay=0, nesterov=False):
    if lr is not required and lr < 0.0:
        raise ValueError("Invalid learning rate: {}".format(lr))
    if momentum < 0.0:
        raise ValueError("Invalid momentum value: {}".format(momentum))
    if weight_decay < 0.0:
        raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

    defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                    weight_decay=weight_decay, nesterov=nesterov)
    if nesterov and (momentum <= 0 or dampening != 0):
        raise ValueError("Nesterov momentum requires a momentum and zero dampening")
    super(SGD, self).__init__(params, defaults)

def __setstate__(self, state):
    super(SGD, self).__setstate__(state)
    for group in self.param_groups:
        group.setdefault('nesterov', False)

@torch.no_grad()
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        weight_decay = group['weight_decay']
        momentum = group['momentum']
        dampening = group['dampening']
        nesterov = group['nesterov']

        for p in group['params']:
            if p.grad is None:
                continue
            d_p = p.grad
            if weight_decay != 0:
                d_p = d_p.add(p, alpha=weight_decay)
            if momentum != 0:
                param_state = self.state[p]
                if 'momentum_buffer' not in param_state:
                    buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                else:
                    buf = param_state['momentum_buffer']
                    buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                if nesterov:
                    d_p = d_p.add(buf, alpha=momentum)
                else:
                    d_p = buf

            # Final parameter update: step in the direction of -lr * d_p.
            p.add_(d_p, alpha=-group['lr'])

    return loss
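For completeness, this is how the optimizer is typically constructed and driven; the tiny model and random batch below are placeholders, only the torch.optim.SGD call reflects the code above:

import torch

model = torch.nn.Linear(10, 1)                      # placeholder model
optimizer = torch.optim.SGD(model.parameters(),
                            lr=0.1, momentum=0.9, nesterov=True)

x, y = torch.randn(4, 10), torch.randn(4, 1)        # placeholder batch
loss = torch.nn.functional.mse_loss(model(x), y)
optimizer.zero_grad()
loss.backward()
optimizer.step()   # applies the momentum update from step() above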
Adagrad

The tail of Adagrad's __init__ creates the per-parameter accumulator state['sum'], initialized to initial_accumulator_value:

for group in self.param_groups:
    for p in group['params']:
        state = self.state[p]
        state['step'] = 0
        state['sum'] = torch.full_like(p, initial_accumulator_value,
                                       memory_format=torch.preserve_format)

def share_memory(self):
    for group in self.param_groups:
        for p in group['params']:
            state = self.state[p]
            state['sum'].share_memory_()
@torch.no_grad()
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        params_with_grad = []
        grads = []
        state_sums = []
        state_steps = []

        for p in group['params']:
            if p.grad is not None:
                params_with_grad.append(p)
                grads.append(p.grad)
                state = self.state[p]
                state_sums.append(state['sum'])
                # update the steps for each param group update
                state['step'] += 1
                # record the step after step update
                state_steps.append(state['step'])
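The remainder of step is cut off above; it appears to hand these lists to a functional Adagrad update. The core rule that update applies, sketched in plain Python (illustrative names, not the PyTorch source):

def adagrad_step(w, grad, sum_sq, lr=1e-2, eps=1e-10):
    # Accumulate squared gradients over the whole run ...
    sum_sq = sum_sq + grad * grad
    # ... and divide the step by their square root, so frequently and
    # strongly updated parameters get ever smaller effective learning rates.
    w = w - lr * grad / (sum_sq ** 0.5 + eps)
    return w, sum_sq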
RMSprop

CLASS torch.optim.RMSprop(params, lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)

Parameters:

params (iterable) – the model parameters to optimize.

lr (float) – learning rate; this is the global step size in the unified framework.

momentum (float, optional) – momentum factor (default: 0).

alpha (float, optional) – smoothing constant (default: 0.99).

centered (bool, optional) – if True, compute the centered RMSprop: the gradient is normalized by an estimate of its variance.
The centered version first appears in `Generating Sequences With Recurrent Neural Networks `_.
The implementation here takes the square root of the gradient average before adding epsilon (note that TensorFlow interchanges these two operations). The effective learning rate is thus :math:`\alpha/(\sqrt{v} + \epsilon)` where :math:`\alpha` is the scheduled learning rate and :math:`v` is the weighted moving average of the squared gradient.
Arguments:
    params (iterable): iterable of parameters to optimize or dicts defining
        parameter groups
    lr (float, optional): learning rate (default: 1e-2)
    momentum (float, optional): momentum factor (default: 0)
    alpha (float, optional): smoothing constant (default: 0.99)
    eps (float, optional): term added to the denominator to improve
        numerical stability (default: 1e-8)
    centered (bool, optional) : if ``True``, compute the centered RMSProp,
        the gradient is normalized by an estimation of its variance
    weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
"""
def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0,
             momentum=0, centered=False):
    if not 0.0 <= lr:
        raise ValueError("Invalid learning rate: {}".format(lr))
    if not 0.0 <= eps:
        raise ValueError("Invalid epsilon value: {}".format(eps))
    if not 0.0 <= momentum:
        raise ValueError("Invalid momentum value: {}".format(momentum))
    if not 0.0 <= weight_decay:
        raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
    if not 0.0 <= alpha:
        raise ValueError("Invalid alpha value: {}".format(alpha))

    defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps,
                    centered=centered, weight_decay=weight_decay)
    super(RMSprop, self).__init__(params, defaults)
def __setstate__(self, state):
    super(RMSprop, self).__setstate__(state)
    for group in self.param_groups:
        group.setdefault('momentum', 0)
        group.setdefault('centered', False)
@torch.no_grad()
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad
            if grad.is_sparse:
                raise RuntimeError('RMSprop does not support sparse gradients')
            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                state['square_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                if group['momentum'] > 0:
                    state['momentum_buffer'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                if group['centered']:
                    state['grad_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
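The rest of RMSprop's step is cut off above. Sketched in plain Python (illustrative names, momentum omitted, not the source), the rule keeps an exponential moving average of squared gradients; note that eps is added after the square root, as the docstring points out, and that the centered variant subtracts the squared mean gradient to approximate a variance:

def rmsprop_step(w, grad, square_avg, grad_avg, lr=1e-2, alpha=0.99,
                 eps=1e-8, centered=False):
    # Exponential moving average of the squared gradient.
    square_avg = alpha * square_avg + (1 - alpha) * grad * grad
    if centered:
        # Track the mean gradient as well and use the estimated variance.
        grad_avg = alpha * grad_avg + (1 - alpha) * grad
        denom = (square_avg - grad_avg ** 2) ** 0.5 + eps
    else:
        denom = square_avg ** 0.5 + eps   # eps added after the square root
    w = w - lr * grad / denom
    return w, square_avg, grad_avg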
Adadelta

It has been proposed in `ADADELTA: An Adaptive Learning Rate Method`__.
Arguments:
    params (iterable): iterable of parameters to optimize or dicts defining
        parameter groups
    rho (float, optional): coefficient used for computing a running average
        of squared gradients (default: 0.9)
    eps (float, optional): term added to the denominator to improve
        numerical stability (default: 1e-6)
    lr (float, optional): coefficient that scale delta before it is applied
        to the parameters (default: 1.0)
    weight_decay (float, optional): weight decay (L2 penalty) (default: 0)

__ https://arxiv.org/abs/1212.5701
"""
def __init__(self, params, lr=1.0, rho=0.9, eps=1e-6, weight_decay=0):
    if not 0.0 <= lr:
        raise ValueError("Invalid learning rate: {}".format(lr))
    if not 0.0 <= rho <= 1.0:
        raise ValueError("Invalid rho value: {}".format(rho))
    if not 0.0 <= eps:
        raise ValueError("Invalid epsilon value: {}".format(eps))
    if not 0.0 <= weight_decay:
        raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

    defaults = dict(lr=lr, rho=rho, eps=eps, weight_decay=weight_decay)
    super(Adadelta, self).__init__(params, defaults)
@torch.no_grad()
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()

    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad
            if grad.is_sparse:
                raise RuntimeError('Adadelta does not support sparse gradients')
            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                state['square_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                state['acc_delta'] = torch.zeros_like(p, memory_format=torch.preserve_format)
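The remainder of Adadelta's step is also cut off. A sketch of the rule it implements (illustrative names, not the source): two running averages are kept, one of squared gradients (square_avg) and one of squared updates (acc_delta), and their ratio makes the step size self-scaling:

def adadelta_step(w, grad, square_avg, acc_delta, lr=1.0, rho=0.9, eps=1e-6):
    # Running average of squared gradients.
    square_avg = rho * square_avg + (1 - rho) * grad * grad
    # The update is the gradient rescaled by the ratio of the two RMS terms.
    delta = ((acc_delta + eps) ** 0.5 / (square_avg + eps) ** 0.5) * grad
    # Running average of squared updates.
    acc_delta = rho * acc_delta + (1 - rho) * delta * delta
    w = w - lr * delta
    return w, square_avg, acc_delta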
Adam

import math
import torch
from .optimizer import Optimizer


class Adam(Optimizer):
    r"""Implements Adam algorithm.

    It has been proposed in `Adam: A Method for Stochastic Optimization`_.
    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Adam, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)
    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                if group['weight_decay'] != 0:
                    grad = grad.add(p, alpha=group['weight_decay'])
                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                p.addcdiv_(exp_avg, denom, value=-step_size)

        return loss
Here grad = p.grad retrieves each parameter's gradient, i.e. the $g_t$ of equation (1).

If weight_decay is used, it is as if the term $\frac{\lambda}{2}\lVert w\rVert_2^2$ were added to the objective, so the gradient picks up an extra $\lambda \cdot w$; hence grad = grad.add(p, alpha=group['weight_decay']).
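A quick check of that equivalence with toy numbers (illustrative, not part of the optimizer code): the gradient of the penalty $\frac{\lambda}{2}\lVert w\rVert_2^2$ is $\lambda \cdot w$, which is exactly what grad.add(p, alpha=weight_decay) contributes.

import torch

wd = 0.01
w = torch.tensor([1.0, -2.0], requires_grad=True)

# Gradient of the L2 penalty (wd / 2) * ||w||^2 ...
penalty = 0.5 * wd * (w ** 2).sum()
penalty.backward()

# ... equals wd * w, i.e. exactly the term grad.add(p, alpha=wd) adds.
assert torch.allclose(w.grad, wd * w.detach())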
AdamW

Just adding the square of the weights to the loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact with the m and v parameters in strange ways. Instead we want to decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent to adding the square of the weights to the loss with plain (non-momentum) SGD. Add weight decay at the end (fixed version).
The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
Arguments:
    params (iterable): iterable of parameters to optimize or dicts defining
        parameter groups
    lr (float, optional): learning rate (default: 1e-3)
    betas (Tuple[float, float], optional): coefficients used for computing
        running averages of gradient and its square (default: (0.9, 0.999))
    eps (float, optional): term added to the denominator to improve
        numerical stability (default: 1e-8)
    weight_decay (float, optional): weight decay coefficient (default: 1e-2)
    amsgrad (boolean, optional): whether to use the AMSGrad variant of this
        algorithm from the paper `On the Convergence of Adam and Beyond`_
        (default: False)

.. _Adam\: A Method for Stochastic Optimization:
    https://arxiv.org/abs/1412.6980
.. _Decoupled Weight Decay Regularization:
    https://arxiv.org/abs/1711.05101
.. _On the Convergence of Adam and Beyond:
    https://openreview.net/forum?id=ryQu7f-RZ
"""
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
             weight_decay=1e-2, amsgrad=False):
    if not 0.0 <= lr:
        raise ValueError("Invalid learning rate: {}".format(lr))
    if not 0.0 <= eps:
        raise ValueError("Invalid epsilon value: {}".format(eps))
    if not 0.0 <= betas[0] < 1.0:
        raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
    if not 0.0 <= betas[1] < 1.0:
        raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
    if not 0.0 <= weight_decay:
        raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
    defaults = dict(lr=lr, betas=betas, eps=eps,
                    weight_decay=weight_decay, amsgrad=amsgrad)
    super(AdamW, self).__init__(params, defaults)

def __setstate__(self, state):
    super(AdamW, self).__setstate__(state)
    for group in self.param_groups:
        group.setdefault('amsgrad', False)
@torch.no_grad()
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        with torch.enable_grad():
            loss = closure()
    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue

            # Perform stepweight decay
            p.mul_(1 - group['lr'] * group['weight_decay'])

            # Perform optimization step
            grad = p.grad
            if grad.is_sparse:
                raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
            amsgrad = group['amsgrad']

            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                # Exponential moving average of gradient values
                state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                # Exponential moving average of squared gradient values
                state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                if amsgrad:
                    # Maintains max of all exp. moving avg. of sq. grad. values
                    state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
            exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
            if amsgrad:
                max_exp_avg_sq = state['max_exp_avg_sq']
            beta1, beta2 = group['betas']

            state['step'] += 1
            bias_correction1 = 1 - beta1 ** state['step']
            bias_correction2 = 1 - beta2 ** state['step']

            # Decay the first and second moment running average coefficient
            exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
            exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
            if amsgrad:
                # Maintains the maximum of all 2nd moment running avg. till now
                torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                # Use the max. for normalizing running avg. of gradient
                denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
            else:
                denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

            step_size = group['lr'] / bias_correction1

            p.addcdiv_(exp_avg, denom, value=-step_size)

    return loss
The difference from Adam: with Adam, using weight_decay is equivalent to adding $\frac{\lambda}{2}\lVert w\rVert_2^2$ to the objective, so the gradient picks up an extra $\lambda \cdot w$, hence grad = grad.add(p, alpha=group['weight_decay']). AdamW instead decays the weights directly, before the Adam update and independently of the moment estimates: p.mul_(1 - group['lr'] * group['weight_decay']).
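Condensed side by side (a sketch with bias correction omitted and illustrative names, not the library code):

def adam_l2_step(w, grad, m, v, lr, wd, beta1=0.9, beta2=0.999, eps=1e-8):
    # Adam + weight_decay (coupled L2): the decay term is folded into the
    # gradient, so it also flows through m, v and the adaptive denominator.
    grad = grad + wd * w
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad * grad
    return w - lr * m / (v ** 0.5 + eps), m, v

def adamw_step(w, grad, m, v, lr, wd, beta1=0.9, beta2=0.999, eps=1e-8):
    # AdamW (decoupled): the weights are shrunk directly; the moment
    # estimates m and v only ever see the raw gradient.
    w = w * (1 - lr * wd)
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad * grad
    return w - lr * m / (v ** 0.5 + eps), m, v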