{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "CompStats_project.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "code", "metadata": { "id": "gbSJIO-nk4Do" }, "source": [ "import math\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from matplotlib.patches import Ellipse\n", "import matplotlib.transforms as transforms\n", "\n", "from scipy.stats import multivariate_normal\n", "\n", "import sklearn\n", "from sklearn.cluster import KMeans\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.preprocessing import StandardScaler" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "F-X9hWgUlxWC" }, "source": [ "def data_generator(cluster_nums, cluster_means, cluster_var):\n", " data = []\n", " for num, mu, var in zip(cluster_nums, cluster_means, cluster_var):\n", " data += [np.random.multivariate_normal(mu, np.diag(var), num)]\n", " data = np.vstack(data)\n", " return data\n", "\n", "mus= [[-6.1, 3.8],[-5.9, -4.2], [6, 0.1]]\n", "vs = [[1, 3],[2, 2],[6, 2]]\n", "\n", "X = data_generator(cluster_nums=[50,50,50],\n", " cluster_means=mus,\n", " cluster_var=vs)\n", "\n", "def confidence_ellipse(mu, cov, ax, nstd=3.0, **kwargs):\n", " def eigsorted(cov):\n", " vals, vecs = np.linalg.eigh(cov)\n", " order = vals.argsort()[::-1]\n", " return vals[order], vecs[:,order]\n", " if ax is None:\n", " ax = plt.gca()\n", " vals, vecs = eigsorted(cov)\n", " theta = np.degrees(np.arctan2(*vecs[:,0][::-1]))\n", " width, height = 2 * nstd * np.sqrt(vals)\n", " ellip = Ellipse(xy=mu, width=width, height=height, angle=theta, facecolor=\"none\", **kwargs)\n", " ax.add_artist(ellip)\n", " return ellip\n", "\n", "ax = plt.axes()\n", "plt.scatter(X[:, 0], X[:, 1])\n", "\n", "for j in range(len(mus)):\n", " confidence_ellipse(mus[j], np.diag(vs[j]), ax, edgecolor=\"r\")\n", "\n", "plt.show()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ED49Fq6Y4BTd" }, "source": [ "def Tk(k, a=0, b=-1, c=1, r=1):\n", " kappa = (k+r*c)/r\n", " return 1 + a**kappa + b*np.sin(kappa)/kappa\n", "\n", "x = np.arange(1,1000)\n", "plt.plot(x, Tk(x))" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "qlWwrIWLXpsr" }, "source": [ "class GaussianMixture():\n", "\n", " def __init__(self, n_components):\n", " \"\"\"Inits the GaussianMixture class by setting n_components and n_iter\"\"\"\n", " self.n_components = n_components\n", " \n", " \n", " def kmeans_init(self, X):\n", " n = X.shape[0]\n", " d = X.shape[1]\n", " m = self.n_components\n", "\n", " kms = KMeans(n_clusters=m)\n", " preds = kms.fit_predict(X)\n", " \n", " # sort X by clusters predicted from kmeans\n", " x_comp = [[] for _ in range(m)]\n", " for i,pred in enumerate(preds):\n", " x_comp[pred].append(X[i])\n", " x_comp = [np.array(x) for x in x_comp]\n", " \n", " # compute prior, mean and variance of each cluster \n", " alpha = np.zeros(m)\n", " mu = np.zeros((m, d))\n", " sigma = np.zeros((m, d, d))\n", " for i,x in enumerate(x_comp):\n", " alpha[i] = x.shape[0] / n\n", " mu[i] = np.mean(x)\n", " sigma[i] = np.cov(x.T)\n", "\n", " return alpha, mu, sigma\n", "\n", " # Appendix B.1 of the paper\n", " def fit_em(self, X, n_iter):\n", " n = X.shape[0]\n", " d = X.shape[1]\n", " m = self.n_components\n", " \n", " # init with KMeans\n", " alpha, mu, sigma = self.kmeans_init(X)\n", " \n", " # EM loop\n", " for k in range(n_iter):\n", "\n", " # Expectation\n", " tau = np.zeros((m, n))\n", " for j in range(m):\n", " tau[j] = alpha[j] * multivariate_normal.pdf(X, mu[j], sigma[j])\n", " tau /= tau.sum(axis=0)\n", " llikelihood = np.sum(np.log(np.sum(tau, axis=0)))\n", "\n", " # Maximization\n", " a = np.sum(tau, axis=1)\n", " alpha = a / np.sum(a)\n", "\n", " mu = np.sum(tau[:,:,None]*X, axis=1) / a[:, None]\n", "\n", " Xm = X - mu[:,None]\n", " cov = Xm[:,:,:,None] * Xm[:,:,None,:]\n", " sigma = np.sum(tau[:,:,None,None]*cov, axis=1) / a[:, None,None] + np.eye(d) * 1e-8 # the np.eye 1e-8 prevents some numerical issues\n", " \n", " self.alpha = alpha\n", " self.mu = mu\n", " self.sigma = sigma\n", " self.tau = tau\n", "\n", " return np.argmax(tau, axis=0)\n", "\n", "\n", " # Appendix B.2 of the paper\n", " def fit_saem(self, X, n_iter, params):\n", " n = X.shape[0]\n", " d = X.shape[1]\n", " m = self.n_components\n", "\n", " if 'burnin' in params and 'alpha_burnin' in params:\n", " burnin, alpha_burnin = params['burnin'], params['alpha_burnin']\n", " else:\n", " raise Exception(\"Params must contain: {'burnin': int, 'alpha_burnin': float}\")\n", "\n", " # init with KMeans\n", " alpha, mu, sigma = self.kmeans_init(X)\n", "\n", " # init the sufficient statistics\n", " S1 = np.zeros(m)\n", " S2 = np.zeros((m,d))\n", " S3 = np.zeros((m,d,d))\n", "\n", " # SAEM loop\n", " np.random.seed(42)\n", " for k in range(n_iter):\n", "\n", " # Simulation\n", " ## compute tau \n", " tau = np.zeros((m, n))\n", " for j in range(m):\n", " tau[j] = alpha[j] * multivariate_normal.pdf(X, mu[j], sigma[j])\n", " \n", " ## compute distribution\n", " tau /= tau.sum(axis=0)\n", " llikelihood = np.sum(np.log(np.sum(tau, axis=0)))\n", "\n", " ## simulate latent variables\n", " z = np.zeros(n, dtype=int)\n", " for i in range(n):\n", " z[i] = np.random.choice(range(m), p = tau[:,i])\n", "\n", " # Stochastic Approximation\n", " gamma = 1 if k <= burnin else (k-burnin)**(-alpha_burnin)\n", " S1_new = np.zeros(m)\n", " S2_new = np.zeros((m,d))\n", " S3_new = np.zeros((m,d,d))\n", " for i in range(n):\n", " S1_new[z[i]] += 1\n", " S2_new[z[i]] += X[i]\n", " S3_new[z[i]] += X[i, None] * X[i,None].T\n", " S1 = S1 + gamma*(S1_new-S1)\n", " S2 = S2 + gamma*(S2_new-S2)\n", " S3 = S3 + gamma*(S3_new-S3)\n", "\n", " # Maximization\n", " alpha = S1/n\n", " mu = S2 / S1[:,None]\n", " sigma = (S3-S2[:,:,None]*mu[:,None,:]) / S1[:,None,None]\n", " \n", " self.alpha = alpha\n", " self.mu = mu\n", " self.sigma = sigma\n", " self.tau = tau\n", "\n", " return np.argmax(tau, axis=0)\n", "\n", " # Appendix B.3 of the paper\n", " def fit_temp_saem(self, X, n_iter, params):\n", " n = X.shape[0]\n", " d = X.shape[1]\n", " m = self.n_components\n", "\n", " if 'burnin' in params and 'alpha_burnin' in params:\n", " burnin, alpha_burnin = params['burnin'], params['alpha_burnin']\n", " else:\n", " raise Exception(\"Params must contain: {'burnin': int, 'alpha_burnin': float}\")\n", "\n", " if 'decreasing_rate' in params and 'amplitude_rate' in params and 'scaling' in params and 'delay' in params:\n", " decreasing_rate = params['decreasing_rate']\n", " amplitude_rate = params['amplitude_rate']\n", " scaling = params['scaling']\n", " delay = params['delay']\n", " else:\n", " raise Exception(\"Params must contain: {'decreasing_rate', 'amplitude_rate', 'scaling', 'delay'}\")\n", " def Tk(k):\n", " kappa = (k+scaling*delay)/scaling\n", " return 1 + decreasing_rate**kappa + amplitude_rate*np.sin(kappa)/kappa\n", "\n", " # init with KMeans\n", " alpha, mu, sigma = self.kmeans_init(X)\n", "\n", " # init the sufficient statistics\n", " S1 = np.zeros(m)\n", " S2 = np.zeros((m,d))\n", " S3 = np.zeros((m,d,d))\n", "\n", " # SAEM loop\n", " np.random.seed(42)\n", " for k in range(n_iter):\n", "\n", " # Simulation\n", " ## compute tau \n", " tau = np.zeros((m, n))\n", " for j in range(m):\n", " tau[j] = alpha[j] * multivariate_normal.pdf(X, mu[j], sigma[j])\n", " \n", " ## compute tempered distribution\n", " T = Tk(k)\n", " tau = tau**(1/T)\n", " tau /= tau.sum(axis=0)\n", " llikelihood = np.sum(np.log(np.sum(tau, axis=0)))\n", "\n", " ## simulate latent variables\n", " z = np.zeros(n, dtype=int)\n", " for i in range(n):\n", " z[i] = np.random.choice(range(m), p = tau[:,i])\n", "\n", " # Stochastic Approximation\n", " gamma = 1 if k <= burnin else (k-burnin)**(-alpha_burnin)\n", " S1_new = np.zeros(m)\n", " S2_new = np.zeros((m,d))\n", " S3_new = np.zeros((m,d,d))\n", " for i in range(n):\n", " S1_new[z[i]] += 1\n", " S2_new[z[i]] += X[i]\n", " S3_new[z[i]] += X[i, None] * X[i,None].T\n", " S1 = S1 + gamma*(S1_new-S1)\n", " S2 = S2 + gamma*(S2_new-S2)\n", " S3 = S3 + gamma*(S3_new-S3)\n", "\n", " # Maximization\n", " alpha = S1/n\n", " mu = S2 / S1[:,None]\n", " sigma = (S3-S2[:,:,None]*mu[:,None,:]) / S1[:,None,None]\n", " \n", " self.alpha = alpha\n", " self.mu = mu\n", " self.sigma = sigma\n", " self.tau = tau\n", "\n", " return np.argmax(tau, axis=0)\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "TRgt_a0glhmC" }, "source": [ "gmm = GaussianMixture(n_components=3)\n", "# params_saem = {'burnin': 10, 'alpha_burnin': 0.8}\n", "# pred_gmm = gmm.fit_saem(X, n_iter= 100, params = params_saem)\n", "params_temp_saem = {'burnin': 10, 'alpha_burnin': 0.8, \n", " 'decreasing_rate': 0.0, 'amplitude_rate': -1, 'scaling': 1, 'delay': 1}\n", "pred_gmm = gmm.fit_temp_saem(X, n_iter= 100, params = params_temp_saem)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "eRbZvCfOlk5H" }, "source": [ "ax = plt.axes()\n", "plt.scatter(X[:, 0], X[:, 1], c=pred_gmm)\n", "plt.suptitle('GMM clusters')\n", "\n", "for j in range(len(gmm.mu)):\n", " confidence_ellipse(gmm.mu[j], gmm.sigma[j], ax, edgecolor=\"r\")\n", "for j in range(len(gmm.mu)):\n", " confidence_ellipse(mus[j], np.diag(vs[j]), ax, edgecolor=\"g\")\n", "\n", "plt.show()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "FbaSoeyGln1n" }, "source": [ "" ], "execution_count": null, "outputs": [] } ] }