-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathCustomAdam.py
More file actions
73 lines (69 loc) · 4.76 KB
/
Copy pathCustomAdam.py
File metadata and controls
73 lines (69 loc) · 4.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import torch
from torch.optim import Optimizer
#Custom Adam Optimizer - extension of Optimizer class
class CustomAdam(Optimizer):
    """
    A custom implementation of the Adam optimizer (https://arxiv.org/abs/1412.6980).

    For every parameter the optimizer keeps exponential moving averages of the
    gradient (first moment) and of the squared gradient (second uncentered
    moment) and uses them to scale each update. All state (step count and both
    moment estimates) is stored per parameter in ``self.state[param]``.

    Params:
        params (iterable): parameters (or parameter-group dicts) to optimize.
        stepsize (float): the effective upper bound of the optimizer step in
            most cases (size of step). DEFAULT - 0.001.
        bias_m1 (float): decay coefficient for the first moment estimate.
            DEFAULT - 0.9.
        bias_m2 (float): decay coefficient for the second uncentered moment
            estimate. DEFAULT - 0.999.
        epsilon (float): small number added to the denominator to prevent
            division by zero. DEFAULT - 10e-8.
            NOTE(review): the literal 10e-8 equals 1e-7, whereas the paper
            suggests 10^-8 (1e-8); the default is kept unchanged for backward
            compatibility.
        bias_correction (bool): whether the moment estimates are corrected for
            their initialization-at-zero bias when taking a step.
            DEFAULT - True.

    Raises:
        ValueError: if ``stepsize``, ``bias_m1`` or ``bias_m2`` is negative.
    """

    def __init__(self, params, stepsize = 0.001, bias_m1 = 0.9, bias_m2 = 0.999, epsilon = 10e-8, bias_correction = True):
        # Reject negative hyperparameters up front.
        if stepsize < 0:
            raise ValueError("Invalid stepsize [{}]. Choose a positive stepsize".format(stepsize))
        # Negative decay coefficients are invalid whether or not bias
        # correction is enabled (the previous `or ... and ...` condition
        # silently let a negative bias_m2 through when bias_correction=False).
        if bias_m1 < 0 or bias_m2 < 0:
            raise ValueError("Invalid bias parameters [{}, {}]. Choose positive bias parameters.".format(bias_m1, bias_m2))
        # Per-group default hyperparameters consumed by step().
        defaults = dict(stepsize = stepsize, bias_m1 = bias_m1, bias_m2 = bias_m2, epsilon = epsilon, bias_correction = bias_correction)
        super(CustomAdam, self).__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure = None):
        """
        Perform a single optimization step.

        Params:
            closure (callable, optional): re-evaluates the model and returns
                the loss. It is invoked with gradients enabled.

        Returns:
            The value returned by ``closure``, or None if no closure was given.
        """
        loss = None
        if closure is not None:
            # step() runs under no_grad, so gradients must be re-enabled for
            # the closure in case it calls backward().
            with torch.enable_grad():
                loss = closure()

        for param_group in self.param_groups:
            stepsize = param_group["stepsize"]
            bias_m1 = param_group["bias_m1"]
            bias_m2 = param_group["bias_m2"]
            epsilon = param_group["epsilon"]
            bias_correction = param_group["bias_correction"]

            for param in param_group["params"]:
                # Parameters that did not take part in the backward pass have
                # no gradient and are left untouched.
                if param.grad is None:
                    continue
                gradients = param.grad

                # Lazily create this parameter's own state on its first update.
                state = self.state[param]
                if not state:
                    state["step"] = 0
                    state["first_moment_estimate"] = torch.zeros_like(param)
                    state["second_moment_estimate"] = torch.zeros_like(param)
                state["step"] += 1
                step = state["step"]

                first_moment_estimate = state["first_moment_estimate"]
                second_moment_estimate = state["second_moment_estimate"]

                # m_t = B_1 * m_{t-1} + (1 - B_1) * grad
                first_moment_estimate.mul_(bias_m1).add_(gradients, alpha = 1.0 - bias_m1)
                # v_t = B_2 * v_{t-1} + (1 - B_2) * grad^2
                # (addcmul_ squares the gradient without modifying it in place)
                second_moment_estimate.mul_(bias_m2).addcmul_(gradients, gradients, value = 1.0 - bias_m2)

                if bias_correction:
                    # Corrected estimates are computed out of place so the
                    # stored running averages stay uncorrected for later steps.
                    corrected_first = first_moment_estimate / (1.0 - bias_m1 ** step)
                    corrected_second = second_moment_estimate / (1.0 - bias_m2 ** step)
                else:
                    corrected_first = first_moment_estimate
                    corrected_second = second_moment_estimate

                # theta_t = theta_{t-1} - a * m_hat / (sqrt(v_hat) + epsilon)
                # sqrt() allocates a new tensor, so the in-place add_ below
                # never touches the stored second moment.
                denominator = corrected_second.sqrt().add_(epsilon)
                param.addcdiv_(corrected_first, denominator, value = -stepsize)

        return loss