"""Neural network model.""" from typing import Sequence import numpy as np class NeuralNetwork: """A multi-layer fully-connected neural network. The net has an input dimension of N, a hidden layer dimension of H, and performs classification over C classes. We train the network with a softmax loss function and L2 regularization on the weight matrices. The network uses a nonlinearity after each fully connected layer except for the last. The outputs of the last fully-connected layer are the scores for each class.""" def __init__(self,input_size: int,hidden_sizes: Sequence[int],output_size: int,num_layers: int,): """Initialize the model. Weights are initialized to small random values and biases are initialized to zero. Weights and biases are stored in the variable self.params, which is a dictionary with the following keys: W1: 1st layer weights; has shape (D, H_1) b1: 1st layer biases; has shape (H_1,) ... Wk: kth layer weights; has shape (H_{k-1}, C) bk: kth layer biases; has shape (C,) Parameters: input_size: The dimension D of the input data hidden_size: List [H1,..., Hk] with the number of neurons Hi in the hidden layer i output_size: The number of classes C num_layers: Number of fully connected layers in the neural network """ self.input_size = input_size self.hidden_sizes = hidden_sizes self.output_size = output_size self.num_layers = num_layers # print(self.input_size) # print(self.hidden_sizes) # print(self.output_size) assert len(hidden_sizes) == (num_layers - 1) sizes = [input_size] + hidden_sizes + [output_size] # print("the sizes list",sizes) self.params = {} for i in range(1, num_layers + 1): self.params["W" + str(i)] =np.random.randn(sizes[i - 1], sizes[i])/ np.sqrt(sizes[i - 1]) self.params["b" + str(i)] = np.zeros(sizes[i]) # print(self.params) # print('printing param') # print(self.params["W1"].shape) # print(self.params["W2"].shape) # print(self.params["b1"].shape) # print(self.params["b2"].shape) def linear(self, W: np.ndarray, X: np.ndarray, b: np.ndarray) -> np.ndarray: """Fully connected (linear) layer. Parameters: W: the weight matrix X: the input data b: the bias Returns: the output """ tmp=np.dot(X, W) + b #print("shape of tmp",tmp.shape) return tmp def relu(self, X: np.ndarray) -> np.ndarray: """Rectified Linear Unit (ReLU). Parameters: X: the input data Returns: the output """ # TODO: implement me return X * (X > 0) #elementwise multiplication def softmax(self, X: np.ndarray) -> np.ndarray: """The softmax function. Parameters: X: the input data Returns: the output """ maxiscores = np.amax(X, axis=1) # this is a vector with the dimension (N,) expo_scores = np.exp(X - maxiscores[:,None]) # this is a ndarray with the dimension (N, C) prob_class = expo_scores / np.sum(expo_scores, axis=1, keepdims=True) # this is a ndarray with the dimension (N, C) return prob_class def forward(self, X: np.ndarray) -> np.ndarray: """Compute the scores for each class for all of the data samples. Hint: this function is also used for prediction. Parameters: X: Input data of shape (N, D). Each X[i] is a training or testing sample Returns: Matrix of shape (N, C) where scores[i, c] is the score for class c on input X[i] outputted from the last layer of your network """ self.outputs = {} # TODO: implement me. You'll want to store the output of each layer in # self.outputs as it will be used during back-propagation. You can use # the same keys as self.params. You can use functions like # self.linear, self.relu, and self.softmax in here. 
        input_X = X
        for i in range(1, self.num_layers + 1):
            after_linear = self.linear(
                self.params["W" + str(i)], input_X, self.params["b" + str(i)]
            )
            # Note the index offset: the linear output of layer i is stored
            # under key "X{i+1}".
            self.outputs["X" + str(i + 1)] = after_linear
            if i < self.num_layers:
                after_relu = self.relu(after_linear)
                self.outputs["X" + str(i + 1) + "_ReLU"] = after_relu
                input_X = after_relu
        # The last fully-connected layer outputs one score per class, shape
        # (N, C); softmax converts the scores to class probabilities. Softmax
        # is monotonic, so argmax over the returned matrix still gives the
        # predicted class when this method is used for prediction.
        scores_all = self.softmax(after_linear)
        return scores_all

    def backward(
        self,
        X: np.ndarray,
        y: np.ndarray,
        update: str,
        lr: float,
        reg: float = 0.0,
    ) -> Tuple[float, dict]:
        """Perform back-propagation and update the parameters using the
        gradients.

        Parameters:
            X: Input data of shape (N, D). Each X[i] is a training sample
            y: Vector of training labels. y[i] is the label for X[i], and
                each y[i] is an integer in the range 0 <= y[i] < C
            update: Update rule to apply; "SGD" performs a vanilla gradient
                step on every parameter
            lr: Learning rate
            reg: Regularization strength

        Returns:
            A tuple (loss, gradients): the total loss for this batch of
            training samples and the dictionary of parameter gradients
        """
        self.gradients = {}
        loss = 0.0
        # The gradient of each parameter is stored in self.gradients (same
        # keys as self.params) so the gradients can be inspected or checked
        # numerically; self.soft_max_grad and self.relu_grad keep the
        # chain-rule steps organized.
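        # Background for the chain-rule steps below (standard results for
        # softmax cross-entropy with L2 regularization, stated as a sketch):
        #     dL/dZ_k     = (softmax(Z_k) - onehot(y)) / N   (soft_max_grad)
        #     dL/dW_i     = X_{i-1}^T dZ_i + 2 * reg * W_i
        #     dL/db_i     = sum of dZ_i over the batch
        #     dL/dX_{i-1} = (dZ_i W_i^T) masked by relu_grad(Z_{i-1})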
        ford = self.forward(X)  # softmax probabilities, shape (N, C)

        # Cross-entropy data loss plus L2 regularization over every weight
        # matrix in the network.
        log_prob = -np.log(ford[range(X.shape[0]), y])
        data_loss = np.mean(log_prob)
        reg_loss = reg * sum(
            np.sum(self.params["W" + str(i)] ** 2)
            for i in range(1, self.num_layers + 1)
        )

        if self.num_layers == 2:
            # Gradient of the loss with respect to the final scores.
            after_softmax = self.soft_max_grad(ford, y)

            # W2 / b2 gradients.
            self.gradients["W2"] = np.dot(self.outputs["X2_ReLU"].T, after_softmax)
            self.gradients["W2"] += 2 * reg * self.params["W2"]
            self.gradients["b2"] = np.sum(after_softmax, axis=0)

            # Back-propagate into the hidden activation and through its ReLU.
            dX1 = np.dot(after_softmax, self.params["W2"].T)
            relu_mask1 = self.relu_grad(self.outputs["X2"])
            new_dX1 = relu_mask1 * dX1  # elementwise multiplication

            # W1 / b1 gradients.
            self.gradients["W1"] = X.T.dot(new_dX1)
            self.gradients["W1"] += 2 * reg * self.params["W1"]
            self.gradients["b1"] = np.sum(new_dX1, axis=0)

            if update == "SGD":
                self.params["W1"] += -lr * self.gradients["W1"]
                self.params["W2"] += -lr * self.gradients["W2"]
                self.params["b1"] += -lr * self.gradients["b1"]
                self.params["b2"] += -lr * self.gradients["b2"]

        if self.num_layers == 3:
            # Gradient of the loss with respect to the final scores.
            after_softmax = self.soft_max_grad(ford, y)

            # W3 / b3 gradients.
            self.gradients["W3"] = np.dot(self.outputs["X3_ReLU"].T, after_softmax)
            self.gradients["W3"] += 2 * reg * self.params["W3"]
            self.gradients["b3"] = np.sum(after_softmax, axis=0)

            # Back-propagate into the second hidden layer, through its ReLU.
            dX3 = np.dot(after_softmax, self.params["W3"].T)
            relu_mask3 = self.relu_grad(self.outputs["X3"])
            new_dX3 = relu_mask3 * dX3

            # W2 / b2 gradients.
            self.gradients["W2"] = self.outputs["X2_ReLU"].T.dot(new_dX3)
            self.gradients["W2"] += 2 * reg * self.params["W2"]
            self.gradients["b2"] = np.sum(new_dX3, axis=0)

            # Back-propagate into the first hidden layer, through its ReLU.
            dX1 = np.dot(new_dX3, self.params["W2"].T)
            relu_mask1 = self.relu_grad(self.outputs["X2"])
            new_dX1 = relu_mask1 * dX1

            # W1 / b1 gradients.
            self.gradients["W1"] = X.T.dot(new_dX1)
            self.gradients["W1"] += 2 * reg * self.params["W1"]
            self.gradients["b1"] = np.sum(new_dX1, axis=0)

            if update == "SGD":
                self.params["W1"] += -lr * self.gradients["W1"]
                self.params["W2"] += -lr * self.gradients["W2"]
                self.params["W3"] += -lr * self.gradients["W3"]
                self.params["b1"] += -lr * self.gradients["b1"]
                self.params["b2"] += -lr * self.gradients["b2"]
                self.params["b3"] += -lr * self.gradients["b3"]

        loss = data_loss + reg_loss
        return (loss, self.gradients)

    def soft_max_grad(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Gradient of the softmax cross-entropy loss w.r.t. the scores.

        Parameters:
            X: softmax probabilities of shape (N, C)
            y: integer class labels of shape (N,)

        Returns:
            the gradient (probabilities minus one-hot labels), averaged over
            the batch; the input is copied rather than modified in place
        """
        dscores = X.copy()
        dscores[range(X.shape[0]), y] -= 1
        return dscores / X.shape[0]

    def relu_grad(self, X: np.ndarray) -> np.ndarray:
        """Gradient mask of the ReLU: 1 where the input is positive, else 0.

        Parameters:
            X: the pre-activation input to the ReLU

        Returns:
            a 0/1 mask with the same shape as X (X itself is not modified)
        """
        return (X > 0).astype(X.dtype)
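

# A minimal usage sketch (not part of the assignment code above): the
# hyperparameters, the random data, and the learning rate below are
# illustrative assumptions chosen only to show the expected call sequence of
# forward() and backward() with the "SGD" update rule.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    N, D, H, C = 5, 4, 10, 3  # assumed batch size, input dim, hidden dim, classes
    net = NeuralNetwork(input_size=D, hidden_sizes=[H], output_size=C, num_layers=2)

    X_demo = rng.standard_normal((N, D))
    y_demo = rng.integers(0, C, size=N)

    probs = net.forward(X_demo)          # (N, C) softmax probabilities
    preds = np.argmax(probs, axis=1)     # predicted classes
    loss, grads = net.backward(X_demo, y_demo, update="SGD", lr=1e-2, reg=1e-4)
    print("loss:", loss)
    print("predictions:", preds)
    print("W1 gradient shape:", grads["W1"].shape)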