{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import copy\n",
    "import random\n",
    "from random import shuffle\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "from kaggle_submission import output_submission_csv\n",
    "from models.neural_net import NeuralNetwork\n",
    "from utils.data_process import get_CIFAR10_data\n",
    "\n",
    "%matplotlib inline\n",
    "plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots\n",
    "\n",
    "# For auto-reloading external modules\n",
    "# See http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "# Seed the two global random generators this notebook draws from\n",
    "# (random.shuffle and np.random.*) so that Restart & Run All repeats the same runs.\n",
    "SEED = 0\n",
    "random.seed(SEED)\n",
    "np.random.seed(SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Loading CIFAR-10\n",
    "Now that you have implemented a neural network that passes gradient checks and works on toy data, you will test your network on the CIFAR-10 dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# You can change these numbers for experimentation\n",
    "# For submission be sure they are set to the default values\n",
    "TRAIN_IMAGES = 49000\n",
    "VAL_IMAGES = 1000\n",
    "TEST_IMAGES = 10000\n",
    "\n",
    "# The loader returns a dictionary; unpack the six arrays used by every cell below.\n",
    "data = get_CIFAR10_data(TRAIN_IMAGES, VAL_IMAGES, TEST_IMAGES)\n",
    "X_train, y_train = data['X_train'], data['y_train']\n",
    "X_val, y_val = data['X_val'], data['y_val']\n",
    "X_test, y_test = data['X_test'], data['y_test']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(49000,)\n",
      "(49000, 3072)\n",
      "(1000, 3072)\n",
      "(1000,)\n",
      "(10000, 3072)\n",
      "(10000,)\n"
     ]
    }
   ],
   "source": [
    "# Mean-image subtraction is left disabled:\n",
    "# mean_image = np.mean(X_train, axis=0)\n",
    "# X_train -= mean_image\n",
    "# X_val -= mean_image\n",
    "# X_test -= mean_image\n",
    "\n",
    "# Sanity check: print the shape of every split, one per line.\n",
    "for split in (y_train, X_train, X_val, y_val, X_test, y_test):\n",
    "    print(split.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
"metadata": {},
   "outputs": [],
   "source": [
    "# Element-wise helpers on dictionaries of arrays, needed to implement Adam.\n",
    "# Every helper builds and returns a new dictionary; its arguments are not modified.\n",
    "def set_zero(dictio):\n",
    "    \"\"\"Return a dict with the keys of `dictio`, each mapped to np.zeros of that value's shape.\"\"\"\n",
    "    return {name: np.zeros(value.shape) for name, value in dictio.items()}\n",
    "\n",
    "\n",
    "def dict_x_ele(dict1, ele):\n",
    "    \"\"\"Return a dict with every value of `dict1` multiplied by `ele` (np.multiply).\"\"\"\n",
    "    return {name: np.multiply(value, ele) for name, value in dict1.items()}\n",
    "\n",
    "\n",
    "def dict_by_ele(dict1, ele):\n",
    "    \"\"\"Return a dict with every value of `dict1` divided by `ele` (np.divide).\"\"\"\n",
    "    return {name: np.divide(value, ele) for name, value in dict1.items()}\n",
    "\n",
    "\n",
    "def dict_sqr(dict1):\n",
    "    \"\"\"Return a dict with the element-wise square of every value of `dict1`.\"\"\"\n",
    "    return {name: np.square(value) for name, value in dict1.items()}\n",
    "\n",
    "\n",
    "def dict_sqrt(dict1):\n",
    "    \"\"\"Return a dict with the element-wise square root of every value of `dict1`.\"\"\"\n",
    "    return {name: np.sqrt(value) for name, value in dict1.items()}\n",
    "\n",
    "\n",
    "def dict_add_ele(dict1, ele):\n",
    "    \"\"\"Return a dict with `ele` added to every value of `dict1`.\"\"\"\n",
    "    return {name: value + ele for name, value in dict1.items()}\n",
    "\n",
    "\n",
    "def dict_by_dict(dict1, dict2):\n",
    "    \"\"\"Return dict1[k] / dict2[k] for every key k of `dict1` (KeyError if `dict2` lacks k).\"\"\"\n",
    "    return {name: value / dict2[name] for name, value in dict1.items()}\n",
    "\n",
    "\n",
    "def dict_minus_dict(dict1, dict2):\n",
    "    \"\"\"Return dict1[k] - dict2[k] for every key k of `dict1` (KeyError if `dict2` lacks k).\"\"\"\n",
    "    return {name: value - dict2[name] for name, value in dict1.items()}\n",
    "\n",
    "\n",
    "def dict_add_dict(dict1, dict2):\n",
    "    \"\"\"Return dict1[k] + dict2[k] for every key k of `dict1` (KeyError if `dict2` lacks k).\"\"\"\n",
    "    return {name: value + dict2[name] for name, value in dict1.items()}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train using SGD\n",
    "To train our network we will use SGD. In addition, we will adjust the learning rate with an exponential learning rate schedule as optimization proceeds; after each epoch, we will reduce the learning rate by multiplying it by a decay rate.\n",
    "\n",
    "You can try different numbers of layers and other hyperparameters on the CIFAR-10 dataset below."
]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 0\n",
      "epoch: 1\n",
      "epoch: 2\n",
      "epoch: 3\n",
      "epoch: 4\n",
      "epoch: 5\n",
      "epoch: 6\n",
      "epoch: 7\n",
      "epoch: 8\n",
      "epoch: 9\n",
      "epoch: 10\n",
      "epoch: 11\n",
      "epoch: 12\n",
      "epoch: 13\n",
      "epoch: 14\n",
      "epoch: 15\n",
      "epoch: 16\n",
      "epoch: 17\n",
      "epoch: 18\n",
      "epoch: 19\n",
      "epoch: 20\n",
      "epoch: 21\n",
      "epoch: 22\n",
      "epoch: 23\n",
      "epoch: 24\n",
      "epoch: 25\n",
      "epoch: 26\n",
      "epoch: 27\n",
      "epoch: 28\n",
      "epoch: 29\n",
      "epoch: 30\n",
      "epoch: 31\n",
      "epoch: 32\n",
      "epoch: 33\n",
      "epoch: 34\n",
      "epoch: 35\n",
      "epoch: 36\n",
      "epoch: 37\n",
      "epoch: 38\n",
      "epoch: 39\n"
     ]
    }
   ],
   "source": [
    "from random import shuffle\n",
    "\n",
    "# Hyperparameters\n",
    "input_size = 32 * 32 * 3\n",
    "num_layers = 3\n",
    "hidden_size = 80\n",
    "hidden_sizes = [hidden_size] * (num_layers - 1)\n",
    "num_classes = 10\n",
    "epochs = 40\n",
    "batch_size = 200\n",
    "learning_rate = 1e-3\n",
    "learning_rate_decay = 0.95  # only referenced by the disabled decay line below\n",
    "regularization = 0.001\n",
    "\n",
    "# Initialize a new neural network model\n",
    "net = NeuralNetwork(input_size, hidden_sizes, num_classes, num_layers)\n",
    "\n",
    "# Variables to store performance for each epoch\n",
    "train_loss = np.zeros(epochs)\n",
    "train_accuracy = np.zeros(epochs)\n",
    "val_accuracy = np.zeros(epochs)\n",
    "\n",
    "num_train = X_train.shape[0]\n",
    "batches_per_epoch = num_train // batch_size\n",
    "\n",
    "# For each epoch...\n",
    "for epoch in range(epochs):\n",
    "    print('epoch:', epoch)\n",
    "\n",
    "    # Shuffle the sample indices instead of the arrays themselves: X_train / y_train\n",
    "    # stay untouched (no full copy of the training set per epoch), and consecutive\n",
    "    # slices of the shuffled indices use each sample at most once per epoch.\n",
    "    ind_list = list(range(num_train))\n",
    "    shuffle(ind_list)\n",
    "\n",
    "    #################################\n",
    "    #Implemented the linear decay in learning rate but found that results without it were more accurate\n",
    "    #learning_rate=learning_rate/(1+learning_rate_decay*epoch)\n",
    "    #################################\n",
    "\n",
    "    # Training\n",
    "    # For each mini-batch...\n",
    "    for batch in range(batches_per_epoch):\n",
    "        # Create a mini-batch of training data and labels\n",
    "        batch_start = batch * batch_size\n",
    "        batch_indices = ind_list[batch_start:batch_start + batch_size]\n",
    "        X_batch = X_train[batch_indices]\n",
    "        y_batch = y_train[batch_indices]\n",
    "        # Run the forward pass of the model to get a prediction and compute the accuracy\n",
    "        scores = net.forward(X_batch)\n",
    "        y_pred = np.argmax(scores, axis=1)\n",
    "        train_accuracy[epoch] += np.mean(y_batch == y_pred)\n",
    "        # Run the backward pass of the model to update the weights and compute the loss\n",
    "        loss, gradients = net.backward(X_batch, y_batch, \"SGD\", learning_rate, regularization)\n",
    "        train_loss[epoch] += loss\n",
    "\n",
    "    # Turn the per-batch sums into per-epoch averages\n",
    "    train_accuracy[epoch] /= batches_per_epoch\n",
    "    train_loss[epoch] /= batches_per_epoch\n",
    "\n",
    "    # Validation\n",
    "    # No need to run the backward pass here, just run the forward pass to compute accuracy\n",
    "    scores = net.forward(X_val)\n",
    "    y_pred = np.argmax(scores, axis=1)\n",
    "    val_accuracy[epoch] = np.mean(y_val == y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 720x576 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Plot the loss function and train / validation accuracies\n",
    "fig, (loss_ax, acc_ax) = plt.subplots(2, 1)\n",
    "\n",
    "loss_ax.plot(train_loss)\n",
    "loss_ax.set_title('Loss history')\n",
    "loss_ax.set_xlabel('Epoch')  # train_loss holds one averaged value per epoch\n",
    "loss_ax.set_ylabel('Loss')\n",
    "\n",
    "acc_ax.plot(train_accuracy, label='train')\n",
    "acc_ax.plot(val_accuracy, label='val')\n",
    "acc_ax.set_title('Classification accuracy history')\n",
    "acc_ax.set_xlabel('Epoch')\n",
    "acc_ax.set_ylabel('Classification accuracy')\n",
    "acc_ax.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train using Adam\n",
    "Next we will train the same model using the Adam optimizer. 
You should take the above code for SGD and modify it to use Adam instead. For implementation details, see the lecture slides. The original paper that introduced Adam is also a good reference, and contains suggestions for default values: https://arxiv.org/pdf/1412.6980.pdf" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "epoch: 0\n", "epoch: 1\n", "epoch: 2\n", "epoch: 3\n", "epoch: 4\n", "epoch: 5\n", "epoch: 6\n", "epoch: 7\n", "epoch: 8\n", "epoch: 9\n", "epoch: 10\n", "epoch: 11\n", "epoch: 12\n", "epoch: 13\n", "epoch: 14\n", "epoch: 15\n", "epoch: 16\n", "epoch: 17\n", "epoch: 18\n", "epoch: 19\n", "epoch: 20\n", "epoch: 21\n", "epoch: 22\n", "epoch: 23\n", "epoch: 24\n", "epoch: 25\n", "epoch: 26\n", "epoch: 27\n", "epoch: 28\n", "epoch: 29\n", "epoch: 30\n", "epoch: 31\n", "epoch: 32\n", "epoch: 33\n", "epoch: 34\n", "epoch: 35\n", "epoch: 36\n", "epoch: 37\n", "epoch: 38\n", "epoch: 39\n", "epoch: 40\n", "epoch: 41\n", "epoch: 42\n", "epoch: 43\n", "epoch: 44\n", "epoch: 45\n", "epoch: 46\n", "epoch: 47\n", "epoch: 48\n", "epoch: 49\n", "epoch: 50\n", "epoch: 51\n", "epoch: 52\n", "epoch: 53\n", "epoch: 54\n", "epoch: 55\n", "epoch: 56\n", "epoch: 57\n", "epoch: 58\n", "epoch: 59\n", "epoch: 60\n", "epoch: 61\n", "epoch: 62\n", "epoch: 63\n", "epoch: 64\n", "epoch: 65\n", "epoch: 66\n", "epoch: 67\n", "epoch: 68\n", "epoch: 69\n", "epoch: 70\n", "epoch: 71\n", "epoch: 72\n", "epoch: 73\n", "epoch: 74\n", "epoch: 75\n", "epoch: 76\n", "epoch: 77\n", "epoch: 78\n", "epoch: 79\n", "epoch: 80\n", "epoch: 81\n", "epoch: 82\n", "epoch: 83\n", "epoch: 84\n", "epoch: 85\n", "epoch: 86\n", "epoch: 87\n", "epoch: 88\n", "epoch: 89\n", "epoch: 90\n", "epoch: 91\n", "epoch: 92\n", "epoch: 93\n", "epoch: 94\n", "epoch: 95\n", "epoch: 96\n", "epoch: 97\n", "epoch: 98\n", "epoch: 99\n" ] } ], "source": [ " \n", "# TODO: implement me\n", "# Hyperparameters\n", 
"################################################\n",
    "# Implementing Adam for two and three layers\n",
    "################################################\n",
    "\n",
    "# Imported here so this cell does not rely on an earlier cell having imported it.\n",
    "from random import shuffle\n",
    "\n",
    "input_size = 32 * 32 * 3\n",
    "num_layers = 3\n",
    "hidden_size = 20\n",
    "hidden_sizes = [hidden_size] * (num_layers - 1)\n",
    "num_classes = 10\n",
    "epochs = 100\n",
    "batch_size = 200\n",
    "learning_rate = 1e-3 #0.025 #1e-3\n",
    "learning_rate_decay = 0.95  # not used in this cell\n",
    "regularization = 0.009\n",
    "betas = (0.9, 0.999)\n",
    "epsilon = 1e-8\n",
    "beta1, beta2 = betas\n",
    "\n",
    "# Initialize a new neural network model\n",
    "net = NeuralNetwork(input_size, hidden_sizes, num_classes, num_layers)\n",
    "\n",
    "# Variables to store performance for each epoch\n",
    "train_loss_adam = np.zeros(epochs)\n",
    "train_accuracy_adam = np.zeros(epochs)\n",
    "val_accuracy_adam = np.zeros(epochs)\n",
    "\n",
    "# Adam state. The weights start from the network's own initial parameters\n",
    "# (starting from zeros would throw that initialization away on the first step);\n",
    "# both moment estimates start at zero.\n",
    "w = copy.deepcopy(net.params)\n",
    "m = set_zero(w)  # first-moment estimate: running mean of the gradients\n",
    "v = set_zero(w)  # second-moment estimate: running mean of the squared gradients\n",
    "step = 0         # number of Adam updates so far (t in the paper)\n",
    "\n",
    "num_train = X_train.shape[0]\n",
    "batches_per_epoch = num_train // batch_size\n",
    "\n",
    "# For each epoch...\n",
    "for epoch in range(epochs):\n",
    "    print('epoch:', epoch)\n",
    "\n",
    "    # Shuffle the sample indices instead of the arrays themselves: X_train / y_train\n",
    "    # stay untouched, and consecutive slices use each sample at most once per epoch.\n",
    "    ind_list = list(range(num_train))\n",
    "    shuffle(ind_list)\n",
    "\n",
    "    for batch in range(batches_per_epoch):\n",
    "        # Create a mini-batch of training data and labels\n",
    "        batch_start = batch * batch_size\n",
    "        batch_indices = ind_list[batch_start:batch_start + batch_size]\n",
    "        X_batch = X_train[batch_indices]\n",
    "        y_batch = y_train[batch_indices]\n",
    "\n",
    "        # Run the forward pass of the model to get a prediction and compute the accuracy\n",
    "        scores = net.forward(X_batch)\n",
    "        y_pred = np.argmax(scores, axis=1)\n",
    "        train_accuracy_adam[epoch] += np.mean(y_batch == y_pred)\n",
    "\n",
    "        # Run the backward pass of the model to compute the loss and the gradients.\n",
    "        # NOTE(review): the SGD cells call net.backward(X, y, \"SGD\", lr, reg); this call\n",
    "        # has no optimizer argument. Confirm the argument list against\n",
    "        # NeuralNetwork.backward, and that backward() applies no weight update of its\n",
    "        # own here - the Adam step below is meant to be the only update.\n",
    "        loss, gradients = net.backward(X_batch, y_batch, learning_rate, regularization)\n",
    "        train_loss_adam[epoch] += loss\n",
    "\n",
    "        # Adam update (Kingma & Ba, Algorithm 1):\n",
    "        #   m <- beta1*m + (1-beta1)*g        v <- beta2*v + (1-beta2)*g^2\n",
    "        #   m_hat = m / (1 - beta1^t)         v_hat = v / (1 - beta2^t)\n",
    "        #   w <- w - lr * m_hat / (sqrt(v_hat) + eps)\n",
    "        # `gradients` must hold an entry for every key of net.params.\n",
    "        step += 1\n",
    "        m = dict_add_dict(dict_x_ele(m, beta1), dict_x_ele(gradients, 1 - beta1))\n",
    "        v = dict_add_dict(dict_x_ele(v, beta2), dict_x_ele(dict_sqr(gradients), 1 - beta2))\n",
    "\n",
    "        # Bias correction depends on the step count t; dividing by (1 - beta)\n",
    "        # is only correct for the very first update.\n",
    "        m_hat = dict_by_ele(m, 1 - beta1 ** step)\n",
    "        v_hat = dict_by_ele(v, 1 - beta2 ** step)\n",
    "\n",
    "        adam_step = dict_by_dict(dict_x_ele(m_hat, learning_rate),\n",
    "                                 dict_add_ele(dict_sqrt(v_hat), epsilon))\n",
    "        w = dict_minus_dict(w, adam_step)\n",
    "        net.params = copy.copy(w)\n",
    "\n",
    "    # Turn the per-batch sums into per-epoch averages\n",
    "    train_accuracy_adam[epoch] /= batches_per_epoch\n",
    "    train_loss_adam[epoch] /= batches_per_epoch\n",
    "\n",
    "    # Validation\n",
    "    # No need to run the backward pass here, just run the forward pass to compute accuracy\n",
    "    scores = net.forward(X_val)\n",
    "    y_pred = np.argmax(scores, axis=1)\n",
    "    val_accuracy_adam[epoch] = np.mean(y_val == y_pred)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Graph loss and train/val accuracies\n",
    "\n",
    "Examining the loss graph along with the train and val accuracy graphs should help you gain some intuition for the hyperparameters you should try in the hyperparameter tuning below. It should also help with debugging any issues you might have with your network."
]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 720x576 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Plot the loss function and train / validation accuracies\n",
    "fig, (loss_ax, acc_ax) = plt.subplots(2, 1)\n",
    "\n",
    "loss_ax.plot(train_loss_adam)\n",
    "loss_ax.set_title('Loss history')\n",
    "loss_ax.set_xlabel('Epoch')  # train_loss_adam holds one averaged value per epoch\n",
    "loss_ax.set_ylabel('Loss')\n",
    "\n",
    "acc_ax.plot(train_accuracy_adam, label='train')\n",
    "acc_ax.plot(val_accuracy_adam, label='val')\n",
    "acc_ax.set_title('Classification accuracy history')\n",
    "acc_ax.set_xlabel('Epoch')\n",
    "acc_ax.set_ylabel('Classification accuracy')\n",
    "acc_ax.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Tuning SGD 2 layers\n",
    "\n",
    "\n",
    "##################################\n",
    "# Tuning the various hyper parameters for the SGD update\n",
    "##################################\n",
    "from random import shuffle\n",
    "\n",
    "# Hyperparameters\n",
    "\n",
    "# hidden_size = 20\n",
    "# epochs = 10\n",
    "# batch_size = 200\n",
    "# learning_rate = 1e-3\n",
    "# regularization = 0.1\n",
    "\n",
    "###########################################\n",
    "epochs_list=[100,150]\n",
    "batch_size_list=[50,200,400,1000]\n",
    "hidden_size_list=[20,40,80,120]\n",
    "learning_rate_list =[1e-1,1e-2,1e-3]\n",
    "regularization_list =[0.95,0.4,0.05,0.005]\n",
    "###########################################\n",
    "\n",
    "\n",
    "input_size = 32 * 32 * 3\n",
    "num_layers = 2\n",
    "num_classes = 10\n",
    "learning_rate_decay = 0.95  # only referenced by the disabled decay line below\n",
    "\n",
    "# Metrics are printed after every epoch, so a single run of max(epochs_list) epochs\n",
    "# per configuration reports the result for every epoch budget in epochs_list.\n",
    "max_epochs = max(epochs_list)\n",
    "num_train = X_train.shape[0]\n",
    "\n",
    "# One printed row per data split and epoch; learning_rate is part of the row so\n",
    "# that runs with different learning rates can be told apart.\n",
    "print('Dataset batch_size hidden_size learning_rate regularization epoch accuracy')\n",
    "for batch_size in batch_size_list:\n",
    "    batches_per_epoch = num_train // batch_size\n",
    "    for hidden_size in hidden_size_list:\n",
    "        hidden_sizes = [hidden_size] * (num_layers - 1)\n",
    "        for learning_rate in learning_rate_list:\n",
    "            for regularization in regularization_list:\n",
    "                # One fresh network per configuration. It has to be created outside\n",
    "                # the epoch loop, otherwise every epoch restarts from new weights.\n",
    "                net = NeuralNetwork(input_size, hidden_sizes, num_classes, num_layers)\n",
    "                for epoch in range(max_epochs):\n",
    "                    train_loss = 0\n",
    "                    train_accuracy = 0\n",
    "                    #################################\n",
    "                    #Implemented the linear decay in learning rate but found that results without it were more accurate\n",
    "                    #learning_rate=learning_rate/(1+learning_rate_decay*epoch)\n",
    "                    #################################\n",
    "\n",
    "                    # Shuffle the sample indices instead of the arrays themselves:\n",
    "                    # X_train / y_train stay untouched, and consecutive slices use\n",
    "                    # each sample at most once per epoch.\n",
    "                    ind_list = list(range(num_train))\n",
    "                    shuffle(ind_list)\n",
    "\n",
    "                    # Training\n",
    "                    # For each mini-batch...\n",
    "                    for batch in range(batches_per_epoch):\n",
    "                        # Create a mini-batch of training data and labels\n",
    "                        batch_start = batch * batch_size\n",
    "                        batch_indices = ind_list[batch_start:batch_start + batch_size]\n",
    "                        X_batch = X_train[batch_indices]\n",
    "                        y_batch = y_train[batch_indices]\n",
    "                        # Run the forward pass of the model to get a prediction and compute the accuracy\n",
    "                        scores = net.forward(X_batch)\n",
    "                        y_pred = np.argmax(scores, axis=1)\n",
    "                        train_accuracy += np.mean(y_batch == y_pred)\n",
    "                        # Run the backward pass of the model to update the weights and compute the loss\n",
    "                        loss, gradients = net.backward(X_batch, y_batch, \"SGD\", learning_rate, regularization)\n",
    "                        train_loss += loss\n",
    "\n",
    "                    # Turn the per-batch sums into per-epoch averages\n",
    "                    train_accuracy /= batches_per_epoch\n",
    "                    train_loss /= batches_per_epoch\n",
    "\n",
    "                    # Validation\n",
    "                    # No need to run the backward pass here, just run the forward pass to compute accuracy\n",
    "                    val_accuracy = np.mean(y_val == np.argmax(net.forward(X_val), axis=1))\n",
    "                    # NOTE(review): test accuracy is reported for every configuration;\n",
    "                    # pick hyper-parameters by validation accuracy so the test set is not used for selection.\n",
    "                    test_accuracy = np.mean(y_test == np.argmax(net.forward(X_test), axis=1))\n",
    "\n",
    "                    config = (batch_size, hidden_size, learning_rate, regularization, epoch)\n",
    "                    print('Training %d %d %g %g %d %f' % (config + (train_accuracy,)))\n",
    "                    print('Validation %d %d %g %g %d %f' % (config + (val_accuracy,)))\n",
    "                    print('Test %d %d %g %g %d %f' % (config + (test_accuracy,)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 0\n",
      "0.2041020408163263 0.251 0.2647\n",
      "epoch: 1\n",
      "0.2922857142857144 0.306 0.3137\n",
      "epoch: 2\n",
      "0.324061224489796 0.336 0.344\n",
      "epoch: 3\n",
      "0.3506122448979591 0.361 0.3594\n",
      "epoch: 4\n",
      "0.3654285714285716 0.372 0.3723\n",
      "epoch: 5\n",
      "0.37651020408163277 0.386 0.3801\n",
      "epoch: 6\n",
      "0.38544897959183666 0.391 0.3878\n",
      "epoch: 7\n",
      "0.39863265306122464 0.4 0.3933\n",
      "epoch: 8\n",
      "0.40479591836734724 0.409 0.3981\n",
      "epoch: 9\n",
      "0.41183673469387755 0.406 0.4021\n",
      "epoch: 10\n",
      "0.41457142857142865 0.417 0.4078\n",
      "epoch: 11\n",
      "0.41761224489795923 0.418 0.4142\n",
      "epoch: 12\n",
      "0.42463265306122466 0.42 0.4181\n",
      "epoch: 13\n",
      "0.4261224489795919 0.42 0.4195\n",
      "epoch: 14\n",
      "0.42422448979591865 0.426 0.4215\n",
      "epoch: 15\n",
      "0.430061224489796 0.43 0.4246\n",
      "epoch: 16\n",
      "0.43473469387755104 0.432 0.4284\n",
      "epoch: 17\n",
      "0.43936734693877555 0.43 0.4321\n",
      "epoch: 18\n",
      "0.43771428571428556 0.429 0.434\n",
      "epoch: 19\n",
      "0.4422857142857143 0.435 0.4362\n",
      "epoch: 20\n",
      "0.4471632653061229 0.432 0.4376\n",
      "epoch: 21\n",
      "0.4463265306122449 0.431 0.4397\n",
      "epoch: 22\n",
      "0.4506734693877552 0.425 0.442\n",
      "epoch: 23\n",
      "0.44991836734693885 0.435 0.4407\n",
      "epoch: 24\n",
      "0.45220408163265274 0.436 0.444\n",
      "epoch: 25\n",
      "0.45530612244897933 0.439 0.4447\n",
      "epoch: 26\n",
      "0.4576734693877552 0.437 0.4468\n",
      "epoch: 27\n",
      "0.4588163265306126 0.442 0.4476\n",
      "epoch: 28\n",
      "0.4579999999999998 0.442 
0.449\n", "epoch: 29\n", "0.4572448979591832 0.442 0.4507\n", "epoch: 30\n", "0.46595918367346933 0.44 0.4539\n", "epoch: 31\n", "0.46289795918367344 0.439 0.4538\n", "epoch: 32\n", "0.46410204081632683 0.448 0.4558\n", "epoch: 33\n", "0.463142857142857 0.445 0.457\n", "epoch: 34\n", "0.4654897959183671 0.446 0.4581\n", "epoch: 35\n", "0.47089795918367316 0.451 0.4587\n", "epoch: 36\n", "0.4713469387755101 0.451 0.4588\n", "epoch: 37\n", "0.4708979591836729 0.446 0.4594\n", "epoch: 38\n", "0.4755714285714285 0.451 0.4609\n", "epoch: 39\n", "0.47193877551020424 0.452 0.4621\n", "epoch: 40\n", "0.47465306122448964 0.456 0.4625\n", "epoch: 41\n", "0.4757755102040817 0.459 0.462\n", "epoch: 42\n", "0.47279591836734675 0.459 0.4612\n", "epoch: 43\n", "0.47730612244897924 0.46 0.4644\n", "epoch: 44\n", "0.48104081632653034 0.458 0.4664\n", "epoch: 45\n", "0.47916326530612213 0.456 0.4665\n", "epoch: 46\n", "0.48048979591836694 0.461 0.466\n", "epoch: 47\n", "0.479734693877551 0.457 0.4684\n", "epoch: 48\n", "0.4755714285714285 0.466 0.4666\n", "epoch: 49\n", "0.48024489795918335 0.459 0.4668\n", "epoch: 50\n", "0.4800204081632653 0.463 0.4665\n", "epoch: 51\n", "0.4849795918367344 0.461 0.4693\n", "epoch: 52\n", "0.4856734693877547 0.465 0.4697\n", "epoch: 53\n", "0.48389795918367323 0.466 0.4693\n", "epoch: 54\n", "0.486530612244898 0.462 0.4699\n", "epoch: 55\n", "0.4885306122448983 0.469 0.4692\n", "epoch: 56\n", "0.48626530612244884 0.463 0.4712\n", "epoch: 57\n", "0.4895306122448981 0.466 0.4705\n", "epoch: 58\n", "0.48936734693877526 0.467 0.4699\n", "epoch: 59\n", "0.4908571428571428 0.472 0.4721\n", "epoch: 60\n", "0.49236734693877526 0.476 0.4718\n", "epoch: 61\n", "0.4887551020408162 0.474 0.4721\n", "epoch: 62\n", "0.492428571428571 0.476 0.473\n", "epoch: 63\n", "0.4893469387755097 0.476 0.4732\n", "epoch: 64\n", "0.4900816326530613 0.478 0.476\n", "epoch: 65\n", "0.48910204081632647 0.479 0.4741\n", "epoch: 66\n", "0.49491836734693856 0.479 0.4747\n", 
"epoch: 67\n", "0.49271428571428555 0.485 0.477\n", "epoch: 68\n", "0.4953673469387755 0.473 0.4767\n", "epoch: 69\n", "0.49430612244897976 0.482 0.4754\n", "epoch: 70\n", "0.49914285714285717 0.482 0.4759\n", "epoch: 71\n", "0.4960612244897959 0.484 0.477\n", "epoch: 72\n", "0.4959387755102039 0.479 0.4762\n", "epoch: 73\n", "0.4966938775510202 0.485 0.4786\n", "epoch: 74\n", "0.5010816326530613 0.483 0.4776\n", "epoch: 75\n", "0.4965306122448976 0.485 0.4803\n", "epoch: 76\n", "0.49840816326530624 0.486 0.4804\n", "epoch: 77\n", "0.5008571428571428 0.482 0.48\n", "epoch: 78\n", "0.503591836734694 0.484 0.4785\n", "epoch: 79\n", "0.5052653061224494 0.485 0.4809\n", "epoch: 80\n", "0.49826530612244896 0.481 0.4795\n", "epoch: 81\n", "0.5070000000000001 0.485 0.4809\n", "epoch: 82\n", "0.5025918367346941 0.484 0.482\n", "epoch: 83\n", "0.505204081632653 0.484 0.4814\n", "epoch: 84\n", "0.5076938775510205 0.485 0.48\n", "epoch: 85\n", "0.5027142857142856 0.484 0.483\n", "epoch: 86\n", "0.5053673469387753 0.485 0.4835\n", "epoch: 87\n", "0.5043265306122452 0.485 0.485\n", "epoch: 88\n", "0.5063265306122445 0.491 0.4832\n", "epoch: 89\n", "0.5072857142857149 0.492 0.484\n", "epoch: 90\n", "0.5053469387755102 0.498 0.4841\n", "epoch: 91\n", "0.5072448979591839 0.495 0.4866\n", "epoch: 92\n", "0.5118163265306123 0.493 0.4862\n", "epoch: 93\n", "0.5103469387755101 0.497 0.4883\n", "epoch: 94\n", "0.5114897959183673 0.494 0.4855\n", "epoch: 95\n", "0.5143877551020408 0.49 0.4863\n", "epoch: 96\n", "0.5137755102040817 0.492 0.4835\n", "epoch: 97\n", "0.5129387755102041 0.49 0.4839\n", "epoch: 98\n", "0.5122857142857141 0.499 0.487\n", "epoch: 99\n", "0.5127755102040817 0.488 0.484\n", "epoch: 100\n", "0.5122857142857143 0.492 0.4863\n", "epoch: 101\n", "0.5147142857142856 0.489 0.4851\n", "epoch: 102\n", "0.5141632653061226 0.489 0.4888\n", "epoch: 103\n", "0.5155510204081638 0.496 0.4877\n", "epoch: 104\n", "0.5175102040816327 0.494 0.4878\n", "epoch: 105\n", 
"0.5196938775510206 0.5 0.4889\n", "epoch: 106\n", "0.5107959183673473 0.499 0.4871\n", "epoch: 107\n", "0.5146122448979594 0.5 0.4908\n", "epoch: 108\n", "0.5151020408163267 0.497 0.4906\n", "epoch: 109\n", "0.5157142857142858 0.497 0.4894\n", "epoch: 110\n", "0.5190408163265308 0.498 0.4914\n", "epoch: 111\n", "0.5189591836734696 0.501 0.491\n", "epoch: 112\n", "0.5171632653061226 0.501 0.4891\n", "epoch: 113\n", "0.5213877551020408 0.496 0.4892\n", "epoch: 114\n", "0.5198367346938778 0.501 0.4899\n", "epoch: 115\n", "0.5218571428571428 0.504 0.4896\n", "epoch: 116\n", "0.5186938775510206 0.5 0.491\n", "epoch: 117\n", "0.5202653061224489 0.496 0.4924\n", "epoch: 118\n", "0.5245102040816326 0.502 0.4929\n", "epoch: 119\n", "0.5210204081632653 0.5 0.4917\n", "epoch: 120\n", "0.5221632653061226 0.505 0.4937\n", "epoch: 121\n", "0.5210204081632651 0.504 0.4937\n", "epoch: 122\n", "0.5235102040816327 0.5 0.4921\n", "epoch: 123\n", "0.5234897959183673 0.5 0.4927\n", "epoch: 124\n", "0.5235510204081634 0.499 0.4946\n", "epoch: 125\n", "0.5232653061224489 0.501 0.4964\n", "epoch: 126\n", "0.5236326530612248 0.503 0.4941\n", "epoch: 127\n", "0.5211020408163265 0.495 0.4957\n", "epoch: 128\n", "0.524 0.5 0.4959\n", "epoch: 129\n", "0.5253469387755101 0.502 0.4943\n", "epoch: 130\n", "0.5254081632653061 0.507 0.4949\n", "epoch: 131\n", "0.5219183673469389 0.504 0.4958\n", "epoch: 132\n", "0.5266938775510205 0.508 0.4933\n", "epoch: 133\n", "0.5268367346938777 0.505 0.4961\n", "epoch: 134\n", "0.5275510204081633 0.503 0.4961\n", "epoch: 135\n", "0.5196530612244902 0.501 0.4974\n", "epoch: 136\n", "0.525775510204082 0.51 0.4955\n", "epoch: 137\n", "0.5292857142857144 0.51 0.4978\n", "epoch: 138\n", "0.5243877551020409 0.504 0.4966\n", "epoch: 139\n", "0.5269387755102043 0.508 0.4974\n", "epoch: 140\n", "0.5236938775510205 0.511 0.4983\n", "epoch: 141\n", "0.5322244897959185 0.507 0.4981\n", "epoch: 142\n", "0.5298979591836734 0.506 0.4993\n", "epoch: 143\n", 
"0.5292857142857145 0.509 0.497\n",
      "epoch: 144\n",
      "0.5299387755102043 0.503 0.4977\n",
      "epoch: 145\n",
      "0.528326530612245 0.501 0.4994\n",
      "epoch: 146\n",
      "0.5264285714285714 0.504 0.4997\n",
      "epoch: 147\n",
      "0.5258775510204083 0.509 0.4998\n",
      "epoch: 148\n",
      "0.5314897959183674 0.512 0.4985\n",
      "epoch: 149\n",
      "0.5305510204081634 0.503 0.5001\n"
     ]
    }
   ],
   "source": [
    "#Tuning SGD\n",
    "\n",
    "##########################################\n",
    "# Best two layer SGD NN\n",
    "##########################################\n",
    "\n",
    "\n",
    "from random import shuffle\n",
    "\n",
    "# Hyperparameters\n",
    "input_size = 32 * 32 * 3\n",
    "num_layers = 2\n",
    "hidden_size = 80#80#20\n",
    "hidden_sizes = [hidden_size] * (num_layers - 1)\n",
    "num_classes = 10\n",
    "epochs = 150 #200\n",
    "batch_size = 200\n",
    "learning_rate = 1e-3#0.0001#1e-3 the best #0.5 NoWrk #0.05 NoWrk\n",
    "learning_rate_decay = 0.95  # only referenced by the disabled decay line below\n",
    "regularization = 0.009 #0.1 #0.009 worked really well with 1e-3 and 80 hidden sizes\n",
    "\n",
    "# Initialize a new neural network model\n",
    "net = NeuralNetwork(input_size, hidden_sizes, num_classes, num_layers)\n",
    "\n",
    "# Variables to store performance for each epoch\n",
    "train_loss = np.zeros(epochs)\n",
    "train_accuracy = np.zeros(epochs)\n",
    "val_accuracy = np.zeros(epochs)\n",
    "test_accuracy = np.zeros(epochs)\n",
    "\n",
    "num_train = X_train.shape[0]\n",
    "batches_per_epoch = num_train // batch_size\n",
    "\n",
    "# For each epoch...\n",
    "for epoch in range(epochs):\n",
    "    print('epoch:', epoch)\n",
    "\n",
    "    # Shuffle the sample indices instead of the arrays themselves: X_train / y_train\n",
    "    # stay untouched (no full copy of the training set per epoch), and consecutive\n",
    "    # slices of the shuffled indices use each sample at most once per epoch.\n",
    "    ind_list = list(range(num_train))\n",
    "    shuffle(ind_list)\n",
    "\n",
    "    #################################\n",
    "    #Implemented the linear decay in learning rate but found that results without it were more accurate\n",
    "    #learning_rate=learning_rate/(1+learning_rate_decay*epoch)\n",
    "    #################################\n",
    "\n",
    "    # Training\n",
    "    # For each mini-batch...\n",
    "    for batch in range(batches_per_epoch):\n",
    "        # Create a mini-batch of training data and labels\n",
    "        batch_start = batch * batch_size\n",
    "        batch_indices = ind_list[batch_start:batch_start + batch_size]\n",
    "        X_batch = X_train[batch_indices]\n",
    "        y_batch = y_train[batch_indices]\n",
    "        # Run the forward pass of the model to get a prediction and compute the accuracy\n",
    "        scores = net.forward(X_batch)\n",
    "        y_pred = np.argmax(scores, axis=1)\n",
    "        train_accuracy[epoch] += np.mean(y_batch == y_pred)\n",
    "        # Run the backward pass of the model to update the weights and compute the loss\n",
    "        loss, gradients = net.backward(X_batch, y_batch, \"SGD\", learning_rate, regularization)\n",
    "        train_loss[epoch] += loss\n",
    "\n",
    "    # Turn the per-batch sums into per-epoch averages\n",
    "    train_accuracy[epoch] /= batches_per_epoch\n",
    "    train_loss[epoch] /= batches_per_epoch\n",
    "\n",
    "    # Validation\n",
    "    # No need to run the backward pass here, just run the forward pass to compute accuracy\n",
    "    scores = net.forward(X_val)\n",
    "    y_pred = np.argmax(scores, axis=1)\n",
    "    val_accuracy[epoch] = np.mean(y_val == y_pred)\n",
    "\n",
    "    # Test accuracy is tracked per epoch as well (forward pass only)\n",
    "    scores = net.forward(X_test)\n",
    "    y_pred = np.argmax(scores, axis=1)\n",
    "    test_accuracy[epoch] = np.mean(y_test == y_pred)\n",
    "    print(train_accuracy[epoch],val_accuracy[epoch],test_accuracy[epoch])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 720x576 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.5322244897959185\n",
      "0.5001\n"
     ]
    }
   ],
   "source": [
    "# Plot the loss function and train / validation accuracies\n",
    "fig, (loss_ax, acc_ax) = plt.subplots(2, 1)\n",
    "\n",
    "loss_ax.plot(train_loss)\n",
    "loss_ax.set_title('Loss history')\n",
    "loss_ax.set_xlabel('Epoch')  # train_loss holds one averaged value per epoch\n",
    "loss_ax.set_ylabel('Loss')\n",
    "\n",
    "acc_ax.plot(train_accuracy, label='train')\n",
    "acc_ax.plot(val_accuracy, label='val')\n",
    "acc_ax.set_title('Classification accuracy history')\n",
    "acc_ax.set_xlabel('Epoch')\n",
    "acc_ax.set_ylabel('Classification accuracy')\n",
    "acc_ax.legend()\n",
    "plt.show()\n",
    "\n",
    "# Best per-epoch training and test accuracy of the run above\n",
    "print(max(train_accuracy))\n",
    "print(max(test_accuracy))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.5001\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the trained network on the test set and write the Kaggle submission.\n",
    "scores = net.forward(X_test)\n",
    "y_pred = np.argmax(scores, axis=1)\n",
    "# Stored under its own name: rebinding `test_accuracy` (the per-epoch array of the\n",
    "# training cell) to a scalar makes max(test_accuracy) in the plot cell fail on re-run.\n",
    "final_test_accuracy = np.mean(y_test == y_pred)\n",
    "print(final_test_accuracy)\n",
    "best_2layer_sgd_prediction = y_pred\n",
    "output_submission_csv('kaggle/nn_2layer_sgd_submission.csv', best_2layer_sgd_prediction)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 0\n",
      "0.19463265306122451 0.26 0.2444\n",
      "epoch: 1\n",
      "0.2728163265306122 0.317 0.2925\n",
      "epoch: 2\n",
      "0.3068571428571429 0.335 0.3243\n",
      "epoch: 3\n",
      "0.331938775510204 0.356 0.3442\n",
      "epoch: 4\n",
      "0.348918367346939 0.367 0.3548\n",
      "epoch: 5\n",
      "0.3596326530612247 0.384 0.361\n",
      "epoch: 6\n",
      "0.3712244897959183 0.389 0.3711\n",
      "epoch: 7\n",
      "0.3759387755102042 0.393 0.3775\n",
      "epoch: 8\n",
      "0.3809795918367347 0.393 0.3825\n",
      "epoch: 9\n",
      "0.39179591836734695 0.401 0.3885\n",
      "epoch: 10\n",
      "0.3929795918367347 0.4 0.3929\n",
      "epoch: 11\n",
      "0.3952653061224488 0.408 0.3987\n",
      "epoch: 12\n",
      "0.40508163265306135 0.412 0.403\n",
      "epoch: 13\n",
      "0.4095510204081631 0.415 0.4064\n",
      "epoch: 14\n",
      "0.4113265306122451 0.423 0.4083\n",
      "epoch: 15\n",
      "0.41936734693877575 0.419 0.413\n",
      "epoch: 16\n",
      "0.4257142857142858 0.419 0.4153\n",
      "epoch: 17\n",
      "0.42418367346938757 0.421 0.4158\n",
      "epoch: 18\n",
      "0.4258367346938774 0.419 0.4177\n",
      "epoch: 19\n",
      "0.4247551020408159 0.421 0.4225\n",
      "epoch: 20\n",
      "0.4336734693877548 0.433 0.4234\n",
      "epoch: 21\n",
      "0.4336122448979591 0.432 0.4265\n",
      "epoch: 22\n",
      "0.43891836734693895 0.427 0.4295\n",
      "epoch: 23\n",
      "0.44159183673469365 0.439 0.4296\n",
      "epoch: 24\n",
      "0.4410408163265303 0.435 
0.4319\n", "epoch: 25\n", "0.4465510204081629 0.435 0.4351\n", "epoch: 26\n", "0.4500816326530611 0.434 0.4369\n", "epoch: 27\n", "0.4462448979591838 0.449 0.4384\n", "epoch: 28\n", "0.45195918367346904 0.443 0.441\n", "epoch: 29\n", "0.4557959183673466 0.442 0.443\n", "epoch: 30\n", "0.45814285714285663 0.444 0.4455\n", "epoch: 31\n", "0.45593877551020406 0.444 0.4456\n", "epoch: 32\n", "0.460938775510204 0.441 0.4448\n", "epoch: 33\n", "0.4594285714285712 0.442 0.4469\n", "epoch: 34\n", "0.46640816326530615 0.446 0.449\n", "epoch: 35\n", "0.46861224489795916 0.45 0.4523\n", "epoch: 36\n", "0.4680612244897957 0.448 0.4525\n", "epoch: 37\n", "0.46816326530612223 0.447 0.4523\n", "epoch: 38\n", "0.4692244897959177 0.444 0.4507\n", "epoch: 39\n", "0.4698367346938773 0.45 0.4528\n", "epoch: 40\n", "0.4747142857142855 0.446 0.454\n", "epoch: 41\n", "0.47242857142857164 0.455 0.4536\n", "epoch: 42\n", "0.47306122448979576 0.457 0.4547\n", "epoch: 43\n", "0.47663265306122465 0.456 0.4542\n", "epoch: 44\n", "0.479204081632653 0.45 0.4561\n", "epoch: 45\n", "0.47479591836734686 0.447 0.4569\n", "epoch: 46\n", "0.47993877551020436 0.451 0.457\n", "epoch: 47\n", "0.4854489795918366 0.457 0.4592\n", "epoch: 48\n", "0.48048979591836716 0.468 0.4597\n", "epoch: 49\n", "0.4879591836734693 0.458 0.4601\n", "epoch: 50\n", "0.49012244897959156 0.461 0.4584\n", "epoch: 51\n", "0.48691836734693894 0.458 0.4606\n", "epoch: 52\n", "0.49002040816326503 0.458 0.4639\n", "epoch: 53\n", "0.4930408163265305 0.462 0.4653\n", "epoch: 54\n", "0.4922448979591837 0.464 0.4651\n", "epoch: 55\n", "0.48826530612244934 0.466 0.4661\n", "epoch: 56\n", "0.49557142857142833 0.465 0.4662\n", "epoch: 57\n", "0.4978367346938775 0.471 0.4666\n", "epoch: 58\n", "0.4997755102040817 0.46 0.4666\n", "epoch: 59\n", "0.4943265306122448 0.477 0.4686\n", "epoch: 60\n", "0.496489795918367 0.477 0.4672\n", "epoch: 61\n", "0.4982653061224488 0.48 0.4695\n", "epoch: 62\n", "0.503020408163265 0.483 0.4704\n", "epoch: 
63\n", "0.5017346938775507 0.474 0.4707\n", "epoch: 64\n", "0.5011224489795916 0.475 0.4704\n", "epoch: 65\n", "0.5058367346938772 0.475 0.4706\n", "epoch: 66\n", "0.5021020408163263 0.477 0.473\n", "epoch: 67\n", "0.5055714285714284 0.479 0.4735\n", "epoch: 68\n", "0.5046938775510204 0.478 0.4713\n", "epoch: 69\n", "0.5118367346938774 0.481 0.4717\n", "epoch: 70\n", "0.5102857142857142 0.475 0.4754\n", "epoch: 71\n", "0.5117551020408162 0.481 0.4742\n", "epoch: 72\n", "0.5113265306122448 0.488 0.4753\n", "epoch: 73\n", "0.5085510204081632 0.487 0.4756\n", "epoch: 74\n", "0.5156938775510206 0.479 0.4782\n", "epoch: 75\n", "0.5172653061224491 0.49 0.4772\n", "epoch: 76\n", "0.5164489795918371 0.489 0.4759\n", "epoch: 77\n", "0.5159591836734694 0.487 0.4775\n", "epoch: 78\n", "0.5187142857142856 0.49 0.4753\n", "epoch: 79\n", "0.5168775510204079 0.489 0.4783\n", "epoch: 80\n", "0.520326530612245 0.491 0.4755\n", "epoch: 81\n", "0.5216326530612246 0.491 0.4774\n", "epoch: 82\n", "0.5221428571428575 0.493 0.4801\n", "epoch: 83\n", "0.5206530612244898 0.494 0.4794\n", "epoch: 84\n", "0.5278979591836737 0.493 0.4816\n", "epoch: 85\n", "0.5325510204081634 0.494 0.4814\n", "epoch: 86\n", "0.5229183673469391 0.491 0.4804\n", "epoch: 87\n", "0.5280816326530614 0.493 0.4798\n", "epoch: 88\n", "0.5270612244897956 0.499 0.4823\n", "epoch: 89\n", "0.5332040816326531 0.493 0.4817\n", "epoch: 90\n", "0.5295306122448985 0.497 0.4821\n", "epoch: 91\n", "0.5300408163265309 0.502 0.4821\n", "epoch: 92\n", "0.5340612244897963 0.496 0.4836\n", "epoch: 93\n", "0.5356326530612243 0.501 0.4814\n", "epoch: 94\n", "0.5348163265306125 0.494 0.4849\n", "epoch: 95\n", "0.5375306122448983 0.494 0.4846\n", "epoch: 96\n", "0.535591836734694 0.502 0.486\n", "epoch: 97\n", "0.5373061224489797 0.505 0.4856\n", "epoch: 98\n", "0.5383061224489798 0.503 0.4868\n", "epoch: 99\n", "0.5400204081632658 0.501 0.4868\n", "epoch: 100\n", "0.539265306122449 0.506 0.4906\n", "epoch: 101\n", "0.5474081632653063 
0.502 0.488\n", "epoch: 102\n", "0.544918367346939 0.505 0.4895\n", "epoch: 103\n", "0.5397551020408164 0.496 0.4885\n", "epoch: 104\n", "0.5489183673469389 0.505 0.4902\n", "epoch: 105\n", "0.5432653061224489 0.508 0.4897\n", "epoch: 106\n", "0.5459387755102043 0.506 0.4924\n", "epoch: 107\n", "0.5462448979591837 0.507 0.4943\n", "epoch: 108\n", "0.5487346938775507 0.512 0.4935\n", "epoch: 109\n", "0.5507551020408166 0.517 0.493\n", "epoch: 110\n", "0.5485306122448982 0.51 0.4924\n", "epoch: 111\n", "0.5474285714285719 0.512 0.4937\n", "epoch: 112\n", "0.5536530612244899 0.512 0.4948\n", "epoch: 113\n", "0.5512448979591843 0.507 0.4958\n", "epoch: 114\n", "0.5536734693877555 0.508 0.4932\n", "epoch: 115\n", "0.5486530612244901 0.509 0.4961\n", "epoch: 116\n", "0.5563877551020405 0.509 0.4959\n", "epoch: 117\n", "0.5536938775510207 0.512 0.4956\n", "epoch: 118\n", "0.5575714285714285 0.514 0.497\n", "epoch: 119\n", "0.5513061224489794 0.516 0.4959\n", "epoch: 120\n", "0.5569591836734695 0.518 0.4994\n", "epoch: 121\n", "0.5545510204081631 0.515 0.4999\n", "epoch: 122\n", "0.5575510204081632 0.518 0.5007\n", "epoch: 123\n", "0.5585102040816328 0.516 0.5005\n", "epoch: 124\n", "0.5574285714285712 0.52 0.4998\n", "epoch: 125\n", "0.559612244897959 0.51 0.4993\n", "epoch: 126\n", "0.558795918367347 0.516 0.5004\n", "epoch: 127\n", "0.5639387755102041 0.513 0.5036\n", "epoch: 128\n", "0.561979591836735 0.52 0.5023\n", "epoch: 129\n", "0.5646122448979594 0.519 0.5025\n", "epoch: 130\n", "0.56630612244898 0.514 0.5035\n", "epoch: 131\n", "0.5675102040816329 0.513 0.503\n", "epoch: 132\n", "0.5657755102040818 0.515 0.5039\n", "epoch: 133\n", "0.5656938775510205 0.52 0.504\n", "epoch: 134\n", "0.564530612244898 0.52 0.5031\n", "epoch: 135\n", "0.5640408163265305 0.511 0.5046\n", "epoch: 136\n", "0.5699387755102046 0.52 0.5058\n", "epoch: 137\n", "0.5655918367346938 0.524 0.5044\n", "epoch: 138\n", "0.5703061224489797 0.517 0.506\n", "epoch: 139\n", "0.5700000000000002 0.526 
0.5057\n", "epoch: 140\n", "0.5725714285714285 0.52 0.506\n", "epoch: 141\n", "0.5726734693877551 0.517 0.5064\n", "epoch: 142\n", "0.5762448979591837 0.518 0.5081\n", "epoch: 143\n", "0.5736122448979591 0.52 0.5072\n", "epoch: 144\n", "0.57065306122449 0.516 0.5079\n", "epoch: 145\n", "0.5710612244897963 0.517 0.507\n", "epoch: 146\n", "0.5777346938775509 0.52 0.5083\n", "epoch: 147\n", "0.5754897959183672 0.524 0.5085\n", "epoch: 148\n", "0.5741428571428571 0.52 0.5079\n", "epoch: 149\n", "0.5753877551020407 0.522 0.5117\n", "epoch: 150\n", "0.5782857142857142 0.516 0.5072\n", "epoch: 151\n", "0.5834489795918368 0.52 0.5088\n", "epoch: 152\n", "0.5817551020408167 0.513 0.5083\n", "epoch: 153\n", "0.5826122448979594 0.523 0.5091\n", "epoch: 154\n", "0.5815918367346936 0.517 0.5113\n", "epoch: 155\n", "0.5810204081632654 0.518 0.5104\n", "epoch: 156\n", "0.584081632653061 0.519 0.5101\n", "epoch: 157\n", "0.5822244897959186 0.515 0.51\n", "epoch: 158\n", "0.579938775510204 0.515 0.5102\n", "epoch: 159\n", "0.5837551020408167 0.517 0.5105\n", "epoch: 160\n", "0.5811836734693875 0.516 0.5101\n", "epoch: 161\n", "0.589591836734694 0.516 0.5118\n", "epoch: 162\n", "0.5895714285714286 0.519 0.5125\n", "epoch: 163\n", "0.5923673469387755 0.525 0.5123\n", "epoch: 164\n", "0.5879795918367344 0.52 0.511\n", "epoch: 165\n", "0.5905510204081632 0.52 0.513\n", "epoch: 166\n", "0.5875510204081634 0.522 0.5106\n", "epoch: 167\n", "0.5869795918367351 0.526 0.5111\n", "epoch: 168\n", "0.5917755102040815 0.519 0.512\n", "epoch: 169\n", "0.5947142857142858 0.525 0.5099\n", "epoch: 170\n", "0.5914489795918363 0.525 0.5106\n", "epoch: 171\n", "0.5869387755102036 0.519 0.5121\n", "epoch: 172\n", "0.5905306122448974 0.522 0.5124\n", "epoch: 173\n", "0.5954693877551019 0.52 0.5122\n", "epoch: 174\n", "0.5942653061224493 0.521 0.5116\n", "epoch: 175\n", "0.5951836734693873 0.516 0.5123\n", "epoch: 176\n", "0.5937755102040814 0.521 0.512\n", "epoch: 177\n", "0.5973673469387751 0.519 
0.5106\n", "epoch: 178\n", "0.5935918367346937 0.518 0.5128\n", "epoch: 179\n", "0.5960408163265306 0.522 0.5128\n", "epoch: 180\n", "0.599734693877551 0.524 0.5119\n", "epoch: 181\n", "0.5979183673469388 0.523 0.5114\n", "epoch: 182\n", "0.6000612244897959 0.525 0.5119\n", "epoch: 183\n", "0.6015714285714288 0.527 0.5138\n", "epoch: 184\n", "0.6002244897959184 0.521 0.5138\n", "epoch: 185\n", "0.6002244897959178 0.523 0.5128\n", "epoch: 186\n", "0.6031224489795918 0.518 0.512\n", "epoch: 187\n", "0.5970816326530615 0.523 0.5129\n", "epoch: 188\n", "0.6057142857142859 0.525 0.513\n", "epoch: 189\n", "0.60569387755102 0.522 0.5134\n", "epoch: 190\n", "0.6065102040816324 0.521 0.5145\n", "epoch: 191\n", "0.6070816326530611 0.523 0.5152\n", "epoch: 192\n", "0.6036326530612242 0.527 0.5139\n", "epoch: 193\n", "0.6056938775510201 0.524 0.5146\n", "epoch: 194\n", "0.6052448979591833 0.524 0.5156\n", "epoch: 195\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0.6071020408163263 0.519 0.5153\n", "epoch: 196\n", "0.6111428571428565 0.524 0.5138\n", "epoch: 197\n", "0.6101836734693877 0.527 0.5152\n", "epoch: 198\n", "0.6054897959183677 0.526 0.5161\n", "epoch: 199\n", "0.6072448979591835 0.527 0.517\n" ] } ], "source": [ "##########################################\n", "# Best three layer SGD NN\n", "##########################################\n", "#Tuning SGD 3 layers\n", "from random import shuffle\n", "\n", "# Hyperparameters\n", "input_size = 32 * 32 * 3\n", "num_layers = 3\n", "hidden_size = 120#80#20\n", "hidden_sizes = [hidden_size] * (num_layers - 1)\n", "num_classes = 10\n", "epochs = 200 #200\n", "batch_size = 200\n", "learning_rate = 1e-3#0.0001#1e-3 the best #0.5 NoWrk #0.05 NoWrk\n", "learning_rate_decay = 0.95\n", "regularization = 0.0009 #0.1 #0.009 worked really well with 1e-3 and 80 hidden sizes\n", "\n", "# Initialize a new neural network model\n", "net = NeuralNetwork(input_size, hidden_sizes, num_classes, num_layers)\n", "\n", "# Variables 
to store performance for each epoch\n", "train_loss_three = np.zeros(epochs)\n", "train_accuracy_three = np.zeros(epochs)\n", "val_accuracy_three = np.zeros(epochs)\n", "test_accuracy_three = np.zeros(epochs)\n", "# For each epoch...\n", "for epoch in range(epochs):\n", " print('epoch:', epoch)\n", " \n", " # Shuffle the dataset\n", " ind_list = [i for i in range(X_train.shape[0])]\n", " shuffle(ind_list)\n", " X_train = X_train[ind_list,:]\n", " y_train = y_train[ind_list]\n", "\n", " \n", " # Training\n", " # For each mini-batch...\n", " #################################\n", " #Implemented the linear decay in learning rate but found that results without it were more accurate\n", " #learning_rate=learning_rate/(1+learning_rate_decay*epoch)\n", " #################################\n", " for batch in range(TRAIN_IMAGES // batch_size):\n", " batch_indices = np.random.choice(range(X_train.shape[0]), size=batch_size)\n", " # Create a mini-batch of training data and labels\n", " X_batch = X_train[batch_indices]\n", " y_batch = y_train[batch_indices]\n", " # Run the forward pass of the model to get a prediction and compute the accuracy\n", " scores=net.forward(X_batch)\n", " y_pred=np.argmax(scores, axis=1)\n", " train_accuracy_three[epoch] += (np.sum(y_batch==y_pred)/len(y_batch))\n", " # Run the backward pass of the model to update the weights and compute the loss\n", " loss,gradients=net.backward(X_batch,y_batch,\"SGD\",learning_rate,regularization) \n", " train_loss_three[epoch]+=loss\n", "\n", " # Validation\n", " # No need to run the backward pass here, just run the forward pass to compute accuracy\n", " \n", " train_accuracy_three[epoch]/=(TRAIN_IMAGES // batch_size)\n", " train_loss_three[epoch]/=(TRAIN_IMAGES // batch_size)\n", " \n", " scores=net.forward(X_val)\n", " y_pred=np.argmax(scores, axis=1)\n", " val_accuracy_three[epoch]= (np.sum(y_val==y_pred)/len(y_val))\n", " \n", " scores=net.forward(X_test)\n", " y_pred=np.argmax(scores, axis=1)\n", " 
test_accuracy_three[epoch]= (np.sum(y_test==y_pred)/len(y_test))\n", " print(train_accuracy_three[epoch],val_accuracy_three[epoch],test_accuracy_three[epoch])\n" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "<Figure size 720x576 with 2 Axes>" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "0.6111428571428565\n", "0.517\n" ] } ], "source": [ "# Plot the loss function and train / validation accuracies\n", "plt.subplot(2, 1, 1)\n", "plt.plot(train_loss_three)\n", "plt.title('Loss history')\n", "plt.xlabel('Iteration')\n", "plt.ylabel('Loss')\n", "plt.subplot(2, 1, 2)\n", "plt.plot(train_accuracy_three, label='train')\n", "plt.plot(val_accuracy_three, label='val')\n", "plt.title('Classification accuracy history')\n", "plt.xlabel('Epoch')\n", "plt.ylabel('Classification accuracy')\n", "plt.legend()\n", "plt.show()\n", "print(max(train_accuracy_three))\n", "print(max(test_accuracy_three))\n", "#print(train_accuracy)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.517\n" ] } ], "source": [ "scores=net.forward(X_test)\n", "y_pred=np.argmax(scores, axis=1)\n", "test_accuracy= (np.sum(y_test==y_pred)/len(y_test))\n", "print(test_accuracy)\n", "best_3layer_sgd_prediction=y_pred\n", "output_submission_csv('kaggle/nn_3layer_sgd_submission.csv', best_3layer_sgd_prediction)" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "epoch: 0\n", "0.41197959183673466 0.48 0.4625\n", "epoch: 1\n", "0.49 0.506 0.4845\n", "epoch: 2\n", "0.516122448979592 0.493 0.4862\n", "epoch: 3\n", "0.533387755102041 0.509 0.4985\n", "epoch: 4\n", "0.5498979591836731 0.504 0.4963\n", "epoch: 5\n", "0.5637755102040818 0.509 0.5041\n", "epoch: 6\n", 
"0.5829387755102042 0.512 0.5086\n", "epoch: 7\n", "0.5827959183673471 0.527 0.5117\n", "epoch: 8\n", "0.5976938775510203 0.521 0.5171\n", "epoch: 9\n", "0.6036122448979593 0.521 0.5142\n", "epoch: 10\n", "0.6110408163265307 0.517 0.5097\n", "epoch: 11\n", "0.617163265306123 0.515 0.5132\n", "epoch: 12\n", "0.6250612244897958 0.534 0.5122\n", "epoch: 13\n", "0.6329795918367349 0.528 0.5131\n", "epoch: 14\n", "0.6352857142857143 0.535 0.5147\n", "epoch: 15\n", "0.6426326530612243 0.527 0.5115\n", "epoch: 16\n", "0.6469999999999998 0.525 0.5116\n", "epoch: 17\n", "0.654408163265306 0.524 0.5152\n", "epoch: 18\n", "0.6526122448979594 0.538 0.5063\n", "epoch: 19\n", "0.6645714285714291 0.537 0.5124\n", "epoch: 20\n", "0.665979591836735 0.536 0.514\n", "epoch: 21\n", "0.6720816326530613 0.53 0.5144\n", "epoch: 22\n", "0.6744081632653057 0.532 0.5143\n", "epoch: 23\n", "0.679387755102041 0.542 0.5208\n", "epoch: 24\n", "0.6865714285714282 0.525 0.5138\n", "epoch: 25\n", "0.6859387755102043 0.542 0.5107\n", "epoch: 26\n", "0.6922857142857146 0.534 0.5121\n", "epoch: 27\n", "0.6992448979591834 0.537 0.5153\n", "epoch: 28\n", "0.7009387755102046 0.543 0.5096\n", "epoch: 29\n", "0.7011836734693874 0.548 0.5109\n", "epoch: 30\n", "0.707714285714286 0.54 0.5091\n", "epoch: 31\n", "0.7094693877551022 0.537 0.5069\n", "epoch: 32\n", "0.7188163265306128 0.542 0.5068\n", "epoch: 33\n", "0.721326530612245 0.538 0.5059\n", "epoch: 34\n", "0.7260204081632654 0.525 0.505\n", "epoch: 35\n", "0.7270816326530612 0.532 0.5066\n", "epoch: 36\n", "0.729224489795918 0.531 0.5016\n", "epoch: 37\n", "0.7356734693877551 0.525 0.5019\n", "epoch: 38\n", "0.7386734693877552 0.539 0.5104\n", "epoch: 39\n", "0.7412857142857141 0.527 0.5106\n", "epoch: 40\n", "0.7441428571428571 0.542 0.5065\n", "epoch: 41\n", "0.7421632653061219 0.547 0.5067\n", "epoch: 42\n", "0.7480612244897957 0.533 0.5086\n", "epoch: 43\n", "0.7520612244897954 0.531 0.5033\n", "epoch: 44\n", "0.7577142857142858 0.531 0.5062\n", 
"epoch: 45\n", "0.7569387755102044 0.527 0.5108\n", "epoch: 46\n", "0.7594693877551016 0.524 0.4986\n", "epoch: 47\n", "0.7630612244897957 0.526 0.5042\n", "epoch: 48\n", "0.7652857142857142 0.53 0.5003\n", "epoch: 49\n", "0.7696326530612247 0.526 0.5009\n" ] } ], "source": [ "#Tuning Adam 2 layers\n", "# TODO: implement me\n", "# Hyperparameters\n", "\n", "##########################################\n", "# Best two layer Adam NN\n", "##########################################\n", "import copy\n", "from random import shuffle\n", "input_size = 32 * 32 * 3\n", "num_layers = 2\n", "hidden_size = 100#20\n", "hidden_sizes = [hidden_size] * (num_layers - 1)\n", "num_classes = 10\n", "epochs = 50\n", "batch_size = 200\n", "learning_rate = 1e-3#1e-3 #0.025 #1e-3\n", "learning_rate_decay = 0.95\n", "regularization = 0.00009 #0.025\n", "betas=(0.9, 0.999)\n", "epsilon=1e-8\n", "# Initialize a new neural network model\n", "net = NeuralNetwork(input_size, hidden_sizes, num_classes, num_layers)\n", "\n", "# Variables to store performance for each epoch\n", "train_loss_adam_two = np.zeros(epochs)\n", "train_accuracy_adam_two = np.zeros(epochs)\n", "val_accuracy_adam_two = np.zeros(epochs)\n", "test_accuracy_adam_two = np.zeros(epochs) \n", "\n", "w = copy.deepcopy(net.params)\n", "m = set_zero(w) #creates a new dictionary which has the same size/elemets as the one passesed all set to zero\n", "v = set_zero(w)\n", "w_prev=set_zero(w)\n", "v_prev=set_zero(w)\n", "m_prev=set_zero(w)\n", "# For each epoch...\n", "for epoch in range(epochs):\n", " print('epoch:', epoch)\n", "\n", " ind_list = [i for i in range(X_train.shape[0])]\n", " shuffle(ind_list)\n", " X_train = X_train[ind_list,:]\n", " y_train = y_train[ind_list]\n", " #################################\n", " #Implemented the linear decay in learning rate but found that results without it were more accurate\n", " #learning_rate=learning_rate/(1+learning_rate_decay*epoch)\n", " #################################\n", " for batch 
in range(TRAIN_IMAGES // batch_size):\n", " # Create a mini-batch of training data and labels\n", " batch_indices = np.random.choice(range(X_train.shape[0]), size=batch_size)\n", " X_batch = X_train[batch_indices]\n", " y_batch = y_train[batch_indices]\n", " # Run the forward pass of the model to get a prediction and compute the accuracy\n", " scores=net.forward(X_batch)\n", " y_pred=np.argmax(scores, axis=1)\n", " train_accuracy_adam_two[epoch] += (np.sum(y_batch==y_pred)/len(y_batch))\n", " \n", " # Run the backward pass of the model to update the weights and compute the loss\n", " loss, gradients=net.backward(X_batch,y_batch,learning_rate,regularization)\n", " #gradients=dict_by_ele(gradients,batch_size)\n", " # Adam moment estimates: exponential moving averages of the gradient and of its square\n", " m=dict_add_dict(dict_x_ele(m_prev,betas[0]),dict_x_ele(gradients,(1-betas[0])))\n", " \n", " v=dict_add_dict(dict_x_ele(v_prev,betas[1]),dict_x_ele(dict_sqr(gradients),(1-betas[1])))\n", " \n", " # Bias correction divides by 1-beta**t, where t is the 1-based global step count\n", " m_new=dict_by_ele(m,(1-betas[0]**(epoch*(TRAIN_IMAGES // batch_size)+batch+1)))\n", " v_new=dict_by_ele(v,(1-betas[1]**(epoch*(TRAIN_IMAGES // batch_size)+batch+1)))\n", " \n", " # Step from the current weights w (initialised as a deep copy of net.params); w_prev starts\n", " # as all zeros, so stepping from it would discard the initial weights on the first update\n", " w=dict_minus_dict(w,dict_by_dict(dict_x_ele(m_new,learning_rate),(dict_add_ele(dict_sqrt(v_new),epsilon))))\n", " net.params=copy.copy(w)\n", " #w=w-(alpha*m_new/(np.sqrt(v_new)+epsilon))\n", " \n", " w_prev=copy.copy(w)\n", " v_prev=copy.copy(v)\n", " m_prev=copy.copy(m)\n", "\n", " train_loss_adam_two[epoch]+=loss\n", "\n", " # Validation\n", " # No need to run the backward pass here, just run the forward pass to compute accuracy\n", " train_accuracy_adam_two[epoch]/=(TRAIN_IMAGES // batch_size)\n", " train_loss_adam_two[epoch]/=(TRAIN_IMAGES // batch_size)\n", " #net.params=dict_by_ele(net.params,(TRAIN_IMAGES // batch_size))\n", "# scores=net.forward(X_val)\n", "# y_pred=np.argmax(scores, axis=1)\n", "# val_accuracy_adam[epoch]= (np.sum(y_val==y_pred)/len(y_val))\n", " scores=net.forward(X_val)\n", " y_pred=np.argmax(scores, axis=1)\n", " val_accuracy_adam_two[epoch]= (np.sum(y_val==y_pred)/len(y_val))\n", " \n", " scores=net.forward(X_test)\n", " 
y_pred=np.argmax(scores, axis=1)\n", " test_accuracy_adam_two[epoch]= (np.sum(y_test==y_pred)/len(y_test))\n", " print(train_accuracy_adam_two[epoch],val_accuracy_adam_two[epoch],test_accuracy_adam_two[epoch])" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "<Figure size 720x576 with 2 Axes>" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "0.7696326530612247\n", "0.5208\n" ] } ], "source": [ "# Plot the loss function and train / validation accuracies\n", "plt.subplot(2, 1, 1)\n", "plt.plot(train_loss_adam_two)\n", "plt.title('Loss history')\n", "plt.xlabel('Iteration')\n", "plt.ylabel('Loss')\n", "plt.subplot(2, 1, 2)\n", "plt.plot(train_accuracy_adam_two, label='train')\n", "plt.plot(val_accuracy_adam_two, label='val')\n", "plt.title('Classification accuracy history')\n", "plt.xlabel('Epoch')\n", "plt.ylabel('Classification accuracy')\n", "plt.legend()\n", "plt.show()\n", "print(max(train_accuracy_adam_two))\n", "print(max(test_accuracy_adam_two))" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.5009\n" ] } ], "source": [ "#############################################################################\n", "# For the Adam gradient updated 2-layer NN we observe that even for a very low\n", "# value of regularization as the training error decreases with the increasing \n", "# number of epochs the training accuracy keeps on increasing however, the \n", "# validation and testing accuracy increase up to a certain number of epochs \n", "# and then start decreasing. This might be due to the fact that Adam gradient method\n", "# causes faster convergence of results as compared to SGD and for lesser number of \n", "# epochs we get the best Adam results given the parameters. 
However, if we don't stop\n", "# the training the model overlearns or overfits the data which increases training \n", "# accuracy but decreases the validation and testing accuracy. So we do only 50 epochs\n", "# for Adam gradient methods as compared to 100-150 for SGD.\n", "#############################################################################\n", "scores=net.forward(X_test)\n", "y_pred=np.argmax(scores, axis=1)\n", "test_accuracy= (np.sum(y_test==y_pred)/len(y_test))\n", "print(test_accuracy)\n", "best_2layer_adam_prediction=y_pred\n", "# Write to the Adam file; the SGD file name here would overwrite the 2-layer SGD submission\n", "output_submission_csv('kaggle/nn_2layer_adam_submission.csv', best_2layer_adam_prediction)" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "epoch: 0\n", "0.3337142857142858 0.447 0.4181\n", "epoch: 1\n", "0.4507959183673469 0.453 0.4554\n", "epoch: 2\n", "0.48481632653061274 0.476 0.4781\n", "epoch: 3\n", "0.5112857142857143 0.509 0.4936\n", "epoch: 4\n", "0.5336734693877553 0.514 0.5024\n", "epoch: 5\n", "0.5390816326530612 0.524 0.5118\n", "epoch: 6\n", "0.5562040816326532 0.51 0.5072\n", "epoch: 7\n", "0.5686326530612242 0.524 0.5177\n", "epoch: 8\n", "0.5770000000000001 0.527 0.5188\n", "epoch: 9\n", "0.5889591836734694 0.532 0.5188\n", "epoch: 10\n", "0.5929591836734699 0.542 0.5256\n", "epoch: 11\n", "0.6007755102040816 0.537 0.5231\n", "epoch: 12\n", "0.6111428571428573 0.535 0.5251\n", "epoch: 13\n", "0.6195306122448979 0.535 0.5267\n", "epoch: 14\n", "0.6250816326530613 0.55 0.5285\n", "epoch: 15\n", "0.6325918367346943 0.548 0.5219\n", "epoch: 16\n", "0.6380816326530614 0.548 0.5252\n", "epoch: 17\n", "0.6433877551020408 0.543 0.5272\n", "epoch: 18\n", "0.6488571428571427 0.547 0.5237\n", "epoch: 19\n", "0.653918367346939 0.543 0.5281\n", "epoch: 20\n", "0.6598163265306124 0.541 0.5253\n", "epoch: 21\n", "0.6629591836734697 0.536 0.53\n", "epoch: 22\n", "0.6700612244897967 0.548 0.5246\n", "epoch: 23\n", "0.6785918367346936 0.533 
0.5217\n", "epoch: 24\n", "0.6804693877551019 0.539 0.5232\n", "epoch: 25\n", "0.6851020408163269 0.534 0.5246\n", "epoch: 26\n", "0.6866122448979594 0.542 0.5239\n", "epoch: 27\n", "0.694979591836735 0.538 0.5278\n", "epoch: 28\n", "0.696183673469388 0.536 0.5264\n", "epoch: 29\n", "0.7059387755102037 0.529 0.5241\n", "epoch: 30\n", "0.708857142857143 0.532 0.5258\n", "epoch: 31\n", "0.7103673469387755 0.535 0.5199\n", "epoch: 32\n", "0.7152653061224488 0.53 0.5199\n", "epoch: 33\n", "0.7184897959183675 0.525 0.5154\n", "epoch: 34\n", "0.7209183673469387 0.514 0.5123\n", "epoch: 35\n", "0.7247142857142854 0.52 0.5158\n", "epoch: 36\n", "0.725938775510204 0.53 0.5156\n", "epoch: 37\n", "0.7295510204081633 0.533 0.5169\n", "epoch: 38\n", "0.7348775510204085 0.535 0.5146\n", "epoch: 39\n", "0.739714285714286 0.536 0.5158\n" ] } ], "source": [ "#Tuning Adam 3 layers\n", "# TODO: implement me\n", "# Hyperparameters\n", "\n", "##########################################\n", "# Best three layer Adam NN\n", "##########################################\n", "import copy\n", "from random import shuffle\n", "input_size = 32 * 32 * 3\n", "num_layers = 3\n", "hidden_size = 100 #80 causes large distances between val and train\n", "hidden_sizes = [hidden_size] * (num_layers - 1)\n", "num_classes = 10\n", "epochs = 40\n", "batch_size = 200\n", "learning_rate = 1e-3 #0.025 #1e-3\n", "learning_rate_decay = 0.95\n", "regularization = 0.00009\n", "betas=(0.9, 0.999)\n", "epsilon=1e-8\n", "# Initialize a new neural network model\n", "net = NeuralNetwork(input_size, hidden_sizes, num_classes, num_layers)\n", "\n", "# Variables to store performance for each epoch\n", "train_loss_adam_three = np.zeros(epochs)\n", "train_accuracy_adam_three = np.zeros(epochs)\n", "val_accuracy_adam_three = np.zeros(epochs)\n", "test_accuracy_adam_three = np.zeros(epochs) \n", "w = copy.deepcopy(net.params)\n", "m = set_zero(w) #creates a new dictionary which has the same size/elemets as the one passesed all 
set to zero\n", "v = set_zero(w)\n", "w_prev=set_zero(w)\n", "v_prev=set_zero(w)\n", "m_prev=set_zero(w)\n", "# For each epoch...\n", "for epoch in range(epochs):\n", " print('epoch:', epoch)\n", "\n", " ind_list = [i for i in range(X_train.shape[0])]\n", " shuffle(ind_list)\n", " X_train = X_train[ind_list,:]\n", " y_train = y_train[ind_list]\n", " #################################\n", " #Implemented the linear decay in learning rate but found that results without it were more accurate\n", " #learning_rate=learning_rate/(1+learning_rate_decay*epoch)\n", " #################################\n", " \n", " \n", " for batch in range(TRAIN_IMAGES // batch_size):\n", " # Create a mini-batch of training data and labels\n", " batch_indices = np.random.choice(range(X_train.shape[0]), size=batch_size)\n", " X_batch = X_train[batch_indices]\n", " y_batch = y_train[batch_indices]\n", " # Run the forward pass of the model to get a prediction and compute the accuracy\n", " scores=net.forward(X_batch)\n", " y_pred=np.argmax(scores, axis=1)\n", " train_accuracy_adam_three[epoch] += (np.sum(y_batch==y_pred)/len(y_batch))\n", " \n", " # Run the backward pass of the model to update the weights and compute the loss\n", " loss, gradients=net.backward(X_batch,y_batch,learning_rate,regularization)\n", " \n", " # Adam moment estimates: exponential moving averages of the gradient and of its square\n", " m=dict_add_dict(dict_x_ele(m_prev,betas[0]),dict_x_ele(gradients,(1-betas[0])))\n", " \n", " v=dict_add_dict(dict_x_ele(v_prev,betas[1]),dict_x_ele(dict_sqr(gradients),(1-betas[1])))\n", " \n", " # Bias correction divides by 1-beta**t, where t is the 1-based global step count\n", " m_new=dict_by_ele(m,(1-betas[0]**(epoch*(TRAIN_IMAGES // batch_size)+batch+1)))\n", " v_new=dict_by_ele(v,(1-betas[1]**(epoch*(TRAIN_IMAGES // batch_size)+batch+1)))\n", " \n", " # Step from the current weights w (initialised as a deep copy of net.params); w_prev starts\n", " # as all zeros, so stepping from it would discard the initial weights on the first update\n", " w=dict_minus_dict(w,dict_by_dict(dict_x_ele(m_new,learning_rate),(dict_add_ele(dict_sqrt(v_new),epsilon))))\n", " net.params=copy.copy(w)\n", " #w=w-(alpha*m_new/(np.sqrt(v_new)+epsilon))\n", " \n", " w_prev=copy.copy(w)\n", " v_prev=copy.copy(v)\n", " m_prev=copy.copy(m)\n", "\n", " train_loss_adam_three[epoch]+=loss\n", "\n", " # Validation\n", " # No need to run the backward pass here, just 
run the forward pass to compute accuracy\n", " train_accuracy_adam_three[epoch]/=(TRAIN_IMAGES // batch_size)\n", " train_loss_adam_three[epoch]/=(TRAIN_IMAGES // batch_size)\n", " #net.params=dict_by_ele(net.params,(TRAIN_IMAGES // batch_size))\n", "# scores=net.forward(X_val)\n", "# y_pred=np.argmax(scores, axis=1)\n", "# val_accuracy_adam[epoch]= (np.sum(y_val==y_pred)/len(y_val))\n", " \n", " scores=net.forward(X_val)\n", " y_pred=np.argmax(scores, axis=1)\n", " val_accuracy_adam_three[epoch]= (np.sum(y_val==y_pred)/len(y_val))\n", " \n", " scores=net.forward(X_test)\n", " y_pred=np.argmax(scores, axis=1)\n", " test_accuracy_adam_three[epoch]= (np.sum(y_test==y_pred)/len(y_test))\n", " print(train_accuracy_adam_three[epoch],val_accuracy_adam_three[epoch],test_accuracy_adam_three[epoch])" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "<Figure size 720x576 with 2 Axes>" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "0.739714285714286\n", "0.53\n" ] } ], "source": [ "# Plot the loss function and train / validation accuracies\n", "plt.subplot(2, 1, 1)\n", "plt.plot(train_loss_adam_three)\n", "plt.title('Loss history')\n", "plt.xlabel('Iteration')\n", "plt.ylabel('Loss')\n", "plt.subplot(2, 1, 2)\n", "plt.plot(train_accuracy_adam_three, label='train')\n", "plt.plot(val_accuracy_adam_three, label='val')\n", "plt.title('Classification accuracy history')\n", "plt.xlabel('Epoch')\n", "plt.ylabel('Classification accuracy')\n", "plt.legend()\n", "plt.show()\n", "print(max(train_accuracy_adam_three))\n", "print(max(test_accuracy_adam_three))" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.5158\n" ] } ], "source": [ "#############################################################################\n", "# For the Adam 
gradient updated 3-layer NN we observe that even for a very low\n", "# value of regularization as the training error decreases with the increasing \n", "# number of epochs the training accuracy keeps on increasing however, the \n", "# validation and testing accuracy increase up to a certain number of epochs \n", "# and then start decreasing. This might be due to the fact that Adam gradient method\n", "# causes faster convergence of results as compared to SGD and for lesser number of \n", "# epochs we get the best Adam results given the parameters. However, if we don't stop\n", "# the training the model overlearns or overfits the data which increases training \n", "# accuracy but decreases the validation and testing accuracy. So we do only 40 epochs\n", "# for Adam gradient methods as compared to 100-150 for SGD.\n", "#############################################################################\n", "scores=net.forward(X_test)\n", "y_pred=np.argmax(scores, axis=1)\n", "test_accuracy= (np.sum(y_test==y_pred)/len(y_test))\n", "print(test_accuracy)\n", "best_3layer_adam_prediction=y_pred\n", "output_submission_csv('kaggle/best_3layer_adam_prediction.csv', best_3layer_adam_prediction)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Hyperparameter tuning\n", "\n", "Once you have successfully trained a network you can tune your hyperparameters to increase your accuracy.\n", "\n", "Based on the graphs of the loss function above you should be able to develop some intuition about what hyperparameter adjustments may be necessary. A very noisy loss implies that the learning rate might be too high, while a linearly decreasing loss would suggest that the learning rate may be too low. A large gap between training and validation accuracy would suggest overfitting due to a large model without much regularization. No gap between training and validation accuracy would indicate low model capacity. 
\n", "\n", "You will compare networks of two and three layers using the different optimization methods you implemented. \n", "\n", "The different hyperparameters you can experiment with are:\n", "- **Batch size**: We recommend you leave this at 200 initially which is the batch size we used. \n", "- **Number of iterations**: You can gain an intuition for how many iterations to run by checking when the validation accuracy plateaus in your train/val accuracy graph.\n", "- **Initialization**: Weight initialization is very important for neural networks. We used the initialization `W = np.random.randn(n) / sqrt(n)` where `n` is the input dimension for the layer corresponding to `W`. We recommend you stick with the given initializations, but you may explore modifying these. Typical initialization practices: http://cs231n.github.io/neural-networks-2/#init\n", "- **Learning rate**: Generally from around 1e-4 to 1e-1 is a good range to explore according to our implementation.\n", "- **Learning rate decay**: We recommend a 0.95 decay to start.\n", "- **Hidden layer size**: You should explore up to around 120 units per layer. For the three-layer network, we fixed the two hidden layers to be the same size when obtaining the target numbers. However, you may experiment with having different size hidden layers.\n", "- **Regularization coefficient**: We recommend trying values in the range 0 to 0.1. \n", "\n", "Hints:\n", "- After getting a sense of the parameters by trying a few values yourself, you will likely want to write a few for-loops to traverse over a set of hyperparameters.\n", "- If you find that your train loss is decreasing, but your train and val accuracy start to decrease rather than increase, your model likely started minimizing the regularization term. To prevent this you will need to decrease the regularization coefficient. 
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Run on the test set\n", "When you are done experimenting, you should evaluate your final trained networks on the test set." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#############################################################################\n", "# I have split this section into 4 different cells with their respective NN codes \n", "# and shown above\n", "#############################################################################\n", "\n", "# The four predictions are assigned in the cells above. Do not reset them to None here:\n", "# on a top-to-bottom run that would hand None to the Kaggle export cell below.\n", "assert best_2layer_sgd_prediction is not None\n", "assert best_3layer_sgd_prediction is not None\n", "assert best_2layer_adam_prediction is not None\n", "assert best_3layer_adam_prediction is not None" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Kaggle output\n", "\n", "Once you are satisfied with your solution and test accuracy, output a file to submit your test set predictions to Kaggle. Use the following code to do so:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#############################################################################\n", "# I have split this section into 4 different cells with their respective NN codes \n", "# and shown above\n", "#############################################################################\n", "\n", "output_submission_csv('nn_2layer_sgd_submission.csv', best_2layer_sgd_prediction)\n", "output_submission_csv('nn_3layer_sgd_submission.csv', best_3layer_sgd_prediction)\n", "output_submission_csv('nn_2layer_adam_submission.csv', best_2layer_adam_prediction)\n", "output_submission_csv('nn_3layer_adam_submission.csv', best_3layer_adam_prediction)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Compare SGD and Adam\n", "Create graphs to compare training loss and validation accuracy between SGD and Adam. The code is similar to the above code, but instead of comparing train and validation, we are comparing SGD and Adam." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Compare the three-layer SGD run with the three-layer Adam run, one point per epoch.\n", "# Every curve gets its own label so the two optimizers can be told apart in the legend.\n", "plt.subplot(2, 1, 1)\n", "plt.plot(train_loss_three, label='SGD')\n", "plt.plot(train_loss_adam_three, label='Adam')\n", "plt.title('Training loss: SGD vs Adam')\n", "plt.xlabel('Epoch')\n", "plt.ylabel('Loss')\n", "plt.legend()\n", "plt.subplot(2, 1, 2)\n", "plt.plot(train_accuracy_adam_three, label='Adam train')\n", "plt.plot(val_accuracy_adam_three, label='Adam val')\n", "plt.plot(train_accuracy_three, label='SGD train')\n", "plt.plot(val_accuracy_three, label='SGD val')\n", "plt.title('Classification accuracy: SGD vs Adam')\n", "plt.xlabel('Epoch')\n", "plt.ylabel('Classification accuracy')\n", "plt.legend()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 1 }