models/neural_net.py · Multi-Layer-Neural-Networks

"""Neural network model."""

from typing import Sequence

import numpy as np


class NeuralNetwork:
    """A multi-layer fully-connected neural network. The net has an input
    dimension of N, a hidden layer dimension of H, and performs classification
    over C classes. We train the network with a softmax loss function and L2
    regularization on the weight matrices.

    The network uses a nonlinearity after each fully connected layer except for
    the last. The outputs of the last fully-connected layer are the scores for
    each class."""

    def __init__(self,input_size: int,hidden_sizes: Sequence[int],output_size: int,num_layers: int,):
        """Initialize the model. Weights are initialized to small random values
        and biases are initialized to zero. Weights and biases are stored in
        the variable self.params, which is a dictionary with the following
        keys:

        W1: 1st layer weights; has shape (D, H_1)
        b1: 1st layer biases; has shape (H_1,)
        ...
        Wk: kth layer weights; has shape (H_{k-1}, C)
        bk: kth layer biases; has shape (C,)

        Parameters:
            input_size: The dimension D of the input data
            hidden_size: List [H1,..., Hk] with the number of neurons Hi in the
                hidden layer i
            output_size: The number of classes C
            num_layers: Number of fully connected layers in the neural network
        """
        self.input_size = input_size
        self.hidden_sizes = hidden_sizes
        self.output_size = output_size
        self.num_layers = num_layers
        
        # print(self.input_size)
        # print(self.hidden_sizes)
        # print(self.output_size)


        assert len(hidden_sizes) == (num_layers - 1)
        sizes = [input_size] + hidden_sizes + [output_size]
        # print("the sizes list",sizes)

        self.params = {}
        for i in range(1, num_layers + 1):
            self.params["W" + str(i)] =np.random.randn(sizes[i - 1], sizes[i])/ np.sqrt(sizes[i - 1])
            self.params["b" + str(i)] = np.zeros(sizes[i])
        # print(self.params)
        
        # print('printing param')
        # print(self.params["W1"].shape)
        # print(self.params["W2"].shape)
        # print(self.params["b1"].shape)
        # print(self.params["b2"].shape)

    def linear(self, W: np.ndarray, X: np.ndarray, b: np.ndarray) -> np.ndarray:
        """Fully connected (linear) layer.

        Parameters:
            W: the weight matrix
            X: the input data
            b: the bias

        Returns:
            the output
        """
        
        tmp=np.dot(X, W) + b
        #print("shape of tmp",tmp.shape)
        return tmp

    def relu(self, X: np.ndarray) -> np.ndarray:
        """Rectified Linear Unit (ReLU).

        Parameters:
            X: the input data

        Returns:
            the output
        """
        # TODO: implement me
        return X * (X > 0) #elementwise multiplication

    def softmax(self, X: np.ndarray) -> np.ndarray:
        """The softmax function.

        Parameters:
            X: the input data

        Returns:
            the output
        """
        maxiscores = np.amax(X, axis=1) # this is a vector with the dimension (N,)
        expo_scores = np.exp(X - maxiscores[:,None]) #   this is a ndarray with the dimension (N, C)
        prob_class = expo_scores / np.sum(expo_scores, axis=1, keepdims=True) # this is a ndarray with the dimension (N, C)
 
        return prob_class

    def forward(self, X: np.ndarray) -> np.ndarray:
        """Compute the scores for each class for all of the data samples.

        Hint: this function is also used for prediction.

        Parameters:
            X: Input data of shape (N, D). Each X[i] is a training or
                testing sample

        Returns:
            Matrix of shape (N, C) where scores[i, c] is the score for class
                c on input X[i] outputted from the last layer of your network
        """
        self.outputs = {}
        # TODO: implement me. You'll want to store the output of each layer in
        # self.outputs as it will be used during back-propagation. You can use
        # the same keys as self.params. You can use functions like
        # self.linear, self.relu, and self.softmax in here.
        input_X=X
        
        # print("number of layers",self.num_layers)
        for i in range(1, self.num_layers + 1):
            # print(i, input_X.shape)
            after_linear=self.linear(self.params["W" + str(i)] , input_X, self.params['b' + str(i)])
            self.outputs["X"+str(i+1)]=after_linear #NOTICE ADDING ONE
            # print("linear happened")
            # print("after linear",self.outputs["X"+str(i+1)].shape)
            if(i < self.num_layers):
                # print("relu happened")
                after_relu=self.relu(after_linear)
                self.outputs["X"+str(i+1)+"_ReLU"]=after_relu  #NOTICE ADDING ONE
                input_X=after_relu # last one should be (N, C)  The outputs of the last fully-connected layer are the scores for each class
                # print("after relu",self.outputs["X"+str(i+1)+"_ReLU"].shape)
         
        scores_all_1= after_linear  
        scores_all=self.softmax(scores_all_1)
        # scores_all=self.softmax(after_linear)
        # print("softmax happened")
        # print(self.outputs)
        # maxiscores = np.amax(scores, axis=1) # this is a vector with the dimension (N,)
        # expo_scores = np.exp(scores - maxiscores[:,None]) #   this is a ndarray with the dimension (N, C)
        # prob_class = expo_scores / np.sum(expo_scores, axis=1, keepdims=True) # this is a ndarray with the dimension (N, C)
        # prob_class_log = -np.log(prob_class[range(N), y]) # (N,)
        # data_loss = np.mean(prob_class_log)
        #reg_loss = reg * (np.sum(W1**2) + np.sum(W2**2))
        
        return scores_all

    def backward(self, X: np.ndarray, y: np.ndarray, update: str ,lr: float, reg: float = 0.0) -> float:
        """Perform back-propagation and update the parameters using the
        gradients.

        Parameters:
            X: Input data of shape (N, D). Each X[i] is a training sample
            y: Vector of training labels. y[i] is the label for X[i], and each
                y[i] is an integer in the range 0 <= y[i] < C
            lr: Learning rate
            reg: Regularization strength

        Returns:
            Total loss for this batch of training samples
        """
        self.gradients = {}
        loss = 0.0
        # TODO: implement me. You'll want to store the gradient of each layer
        # in self.gradients if you want to be able to debug your gradients
        # later. You can use the same keys as self.params. You can add
        # functions like self.linear_grad, self.relu_grad, and
        # self.softmax_grad if it helps organize your code.
        
        ford=self.forward(X)
        
        #ford=self.softmax(ford0)
        # print("softmax happened")
        
        log_prob=-np.log(ford[range(X.shape[0]),y])
        data_loss=np.mean(log_prob)
        reg_loss=reg * (np.sum(self.params["W1"]**2) + np.sum(self.params["W2"]**2))
        
        
        if (self.num_layers == 2):
        # from softmax derivative
            after_softmax=self.soft_max_grad(ford,y)
            
            #print("shape of after_softmaxk derivative",after_softmax)
            
            # W2 gradient
            self.gradients["W2"]=np.dot(self.outputs["X2_ReLU"].T,after_softmax)
            self.gradients["W2"]+=2*reg*self.params["W2"]
            self.gradients["b2"]=np.sum(after_softmax,axis=0)
            
             # W1 gradient
            #first change X1
            dW1=np.dot(after_softmax,self.params["W2"].T)
            
            # ReLu layer
            dX1=self.relu_grad(self.outputs["X2"])
            new_dX1=dX1*dW1  #elementwise multiplication
            
            self.gradients["W1"]=X.T.dot(new_dX1)
            self.gradients["W1"]+= 2 * reg *self.params["W1"]
            
            #b1 grad
            self.gradients["b1"]=np.sum(new_dX1, axis=0)
            
            if (update=="SGD"):
            
                self.params["W1"]+= - lr*self.gradients["W1"]
                self.params["W2"]+= - lr*self.gradients["W2"]
                self.params["b1"]+= - lr*self.gradients["b1"]
                self.params["b2"]+= - lr*self.gradients["b2"]
        
        if (self.num_layers == 3):
            after_softmax=self.soft_max_grad(ford,y)
            
            #print("shape of after_softmaxk derivative",after_softmax)
            
            # W3 gradient
            self.gradients["W3"]=np.dot(self.outputs["X3_ReLU"].T,after_softmax)
            self.gradients["W3"]+=2*reg*self.params["W3"]
            self.gradients["b3"]=np.sum(after_softmax,axis=0)
            
             # X2 gradient
            dX3=np.dot(after_softmax,self.params["W3"].T)
            
            # ReLu layer
            dX33=self.relu_grad(self.outputs["X3"])
            new_dX3=dX33*dX3
            
            #W2 gradient
            self.gradients["W2"]=self.outputs["X2_ReLU"].T.dot(new_dX3)
            self.gradients["W2"]+= 2 * reg *self.params["W2"]
            
            #b2 grad
            self.gradients["b2"]=np.sum(new_dX3, axis=0)
            
            # X1 gradient
            dX1=np.dot(new_dX3 ,self.params["W2"].T)
            # ReLu layer
            dX11=self.relu_grad(self.outputs["X2"])
            new_dX1=dX11*dX1
            
            #W2 gradient
            self.gradients["W1"]=X.T.dot(new_dX1)
            self.gradients["W1"]+= 2 * reg *self.params["W1"]
            
            #b2 grad
            self.gradients["b1"]=np.sum(new_dX1, axis=0)
                  
            if (update=="SGD"):
                self.params["W1"]+= - lr*self.gradients["W1"]
                self.params["W2"]+= - lr*self.gradients["W2"]
                self.params["W3"]+= - lr*self.gradients["W3"]
                self.params["b1"]+= - lr*self.gradients["b1"]
                self.params["b2"]+= - lr*self.gradients["b2"]
                self.params["b3"]+= - lr*self.gradients["b3"]


        loss = data_loss + reg_loss
        return (loss,self.gradients)
    
    def soft_max_grad(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        X[range(X.shape[0]), y] -=1
        
        return X/X.shape[0]
    
    def relu_grad(self, X: np.ndarray)-> np.ndarray:
        
        X[X<=0]=0
        X[X>0]=1
        return X