# CustomAdam.py - from the repository thetechdude124/Adam-Optimization-From-Scratch (master branch)

# (Line-number gutter left over from the GitHub web view removed; it carried no content.)
import torch
from torch.optim import Optimizer

#Custom Adam Optimizer - extension of Optimizer class
class CustomAdam(Optimizer):

    """
    A custom implementation of the Adam optimizer (Kingma & Ba, https://arxiv.org/abs/1412.6980).

    For every parameter the optimizer keeps its own running averages of the gradient (first moment)
    and of the squared gradient (second uncentered moment), and moves the parameter by

        theta_t = theta_{t-1} - stepsize * m_hat / (sqrt(v_hat) + epsilon)

    where m_hat / v_hat are the bias-corrected moments (or the raw moments if bias_correction is False).

    Params:
    params (iterable): parameters to optimize, or dicts defining parameter groups.
    stepsize (float): the effective upperbound of the optimizer step in most cases (size of step). DEFAULT - 0.001.
    bias_m1 (float): decay rate for the first moment estimate, must be in [0, 1). DEFAULT - 0.9
    bias_m2 (float): decay rate for the second uncentered moment estimate, must be in [0, 1). DEFAULT - 0.999.
    epsilon (float): small number added to prevent division by zero. DEFAULT - 10e-8
        (note: 10e-8 == 1e-7, which is ten times the 1e-8 suggested in the paper; kept for compatibility).
    bias_correction (bool): whether the optimizer should correct the moment estimates for their
        initialization at zero when taking a step. DEFAULT - TRUE.

    Raises:
    ValueError: if stepsize or epsilon is negative, or if a decay rate is outside [0, 1).
    """
    #Initialize optimizer with parameters
    def __init__(self, params, stepsize = 0.001, bias_m1 = 0.9, bias_m2 = 0.999, epsilon = 10e-8, bias_correction = True):
        #Check if the stepsize is invalid (negative)
        if stepsize < 0:
            raise ValueError("Invalid stepsize [{}]. Choose a positive stepsize".format(stepsize))
        #Decay rates must lie in [0, 1) regardless of bias correction: negative values are meaningless,
        #and a value of 1 freezes the moments at zero and makes the bias correction divide by zero
        if not (0.0 <= bias_m1 < 1.0 and 0.0 <= bias_m2 < 1.0):
            raise ValueError("Invalid bias parameters [{}, {}]. Choose bias parameters in the range [0, 1).".format(bias_m1, bias_m2))
        if epsilon < 0:
            raise ValueError("Invalid epsilon [{}]. Choose a non-negative epsilon".format(epsilon))
        #Declare dictionary of default values for optimizer initialization
        DEFAULTS = dict(stepsize = stepsize, bias_m1 = bias_m1, bias_m2 = bias_m2, epsilon = epsilon, bias_correction = bias_correction)
        #Initialize the optimizer
        super(CustomAdam, self).__init__(params, DEFAULTS)

    #Step method (for updating parameters)
    def step(self, closure = None):
        """
        Perform a single optimization step.

        Params:
        closure (callable, optional): re-evaluates the model and returns the loss.

        Returns:
        The value returned by the closure, or None if no closure was given.
        """
        loss = None
        if closure is not None:
            #The closure typically calls backward(), so it must run with autograd enabled
            with torch.enable_grad():
                loss = closure()
        #The update itself must not be recorded by autograd
        with torch.no_grad():
            #Iterate over "groups" of parameters, each with its own hyperparameters
            for param_group in self.param_groups:
                stepsize = param_group["stepsize"]
                bias_m1 = param_group["bias_m1"]
                bias_m2 = param_group["bias_m2"]
                epsilon = param_group["epsilon"]
                #Iterate over individual parameters
                for param in param_group["params"]:
                    #Skip parameters for which no gradient has been computed
                    if param.grad is None:
                        continue
                    gradients = param.grad
                    #State is kept PER PARAMETER - each tensor needs its own moments (shapes differ)
                    state = self.state[param]
                    if len(state) == 0:
                        state["step"] = 0
                        state["first_moment_estimate"] = torch.zeros_like(param)
                        state["second_moment_estimate"] = torch.zeros_like(param)
                    state["step"] += 1
                    first_moment_estimate = state["first_moment_estimate"]
                    second_moment_estimate = state["second_moment_estimate"]
                    #Update the running averages in place (these ARE the persistent state);
                    #the gradient itself is only read, never modified
                    #First moment: m_t = B_1 * m_{t-1} + (1 - B_1) * grad
                    first_moment_estimate.mul_(bias_m1).add_(gradients, alpha = 1.0 - bias_m1)
                    #Second moment: v_t = B_2 * v_{t-1} + (1 - B_2) * grad^2
                    second_moment_estimate.mul_(bias_m2).addcmul_(gradients, gradients, value = 1.0 - bias_m2)
                    #Bias correction works on copies so the stored running averages stay uncorrected
                    if param_group["bias_correction"]:
                        #m_hat = m_t / (1 - B_1^t), v_hat = v_t / (1 - B_2^t)
                        corrected_first = first_moment_estimate / (1.0 - bias_m1 ** state["step"])
                        corrected_second = second_moment_estimate / (1.0 - bias_m2 ** state["step"])
                    else:
                        corrected_first = first_moment_estimate
                        corrected_second = second_moment_estimate
                    #sqrt() allocates a new tensor, so the in-place add_ does not touch the state
                    denominator = corrected_second.sqrt().add_(epsilon)
                    #theta = theta_{t-1} - a * m_hat / (sqrt(v_hat) + epsilon)
                    param.addcdiv_(corrected_first, denominator, value = -stepsize)
        #Return the loss
        return loss