{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "CompStats_project.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "code",
"metadata": {
"id": "gbSJIO-nk4Do"
},
"source": [
"import math\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib.patches import Ellipse\n",
"import matplotlib.transforms as transforms\n",
"\n",
"from scipy.stats import multivariate_normal\n",
"\n",
"import sklearn\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.preprocessing import StandardScaler"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "F-X9hWgUlxWC"
},
"source": [
"def data_generator(cluster_nums, cluster_means, cluster_var):\n",
" data = []\n",
" for num, mu, var in zip(cluster_nums, cluster_means, cluster_var):\n",
" data += [np.random.multivariate_normal(mu, np.diag(var), num)]\n",
" data = np.vstack(data)\n",
" return data\n",
"\n",
"mus= [[-6.1, 3.8],[-5.9, -4.2], [6, 0.1]]\n",
"vs = [[1, 3],[2, 2],[6, 2]]\n",
"\n",
"X = data_generator(cluster_nums=[50,50,50],\n",
" cluster_means=mus,\n",
" cluster_var=vs)\n",
"\n",
"def confidence_ellipse(mu, cov, ax, nstd=3.0, **kwargs):\n",
" def eigsorted(cov):\n",
" vals, vecs = np.linalg.eigh(cov)\n",
" order = vals.argsort()[::-1]\n",
" return vals[order], vecs[:,order]\n",
" if ax is None:\n",
" ax = plt.gca()\n",
" vals, vecs = eigsorted(cov)\n",
" theta = np.degrees(np.arctan2(*vecs[:,0][::-1]))\n",
" width, height = 2 * nstd * np.sqrt(vals)\n",
" ellip = Ellipse(xy=mu, width=width, height=height, angle=theta, facecolor=\"none\", **kwargs)\n",
" ax.add_artist(ellip)\n",
" return ellip\n",
"\n",
"ax = plt.axes()\n",
"plt.scatter(X[:, 0], X[:, 1])\n",
"\n",
"for j in range(len(mus)):\n",
" confidence_ellipse(mus[j], np.diag(vs[j]), ax, edgecolor=\"r\")\n",
"\n",
"plt.show()"
],
"execution_count": null,
"outputs": []
},
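{
"cell_type": "markdown",
"metadata": {},
"source": [
"A note on the `confidence_ellipse` helper above: writing the covariance as $\\Sigma = V \\Lambda V^\\top$, the ellipse axes point along the eigenvectors and the semi-axis lengths are $\\text{nstd} \\cdot \\sqrt{\\lambda_i}$, so the patch has width and height $2 \\cdot \\text{nstd} \\cdot \\sqrt{\\lambda_i}$ and is rotated by the angle of the leading eigenvector."
]
},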
{
"cell_type": "code",
"metadata": {
"id": "ED49Fq6Y4BTd"
},
"source": [
"def Tk(k, a=0, b=-1, c=1, r=1):\n",
" kappa = (k+r*c)/r\n",
" return 1 + a**kappa + b*np.sin(kappa)/kappa\n",
"\n",
"x = np.arange(1,1000)\n",
"plt.plot(x, Tk(x))"
],
"execution_count": null,
"outputs": []
},
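{
"cell_type": "markdown",
"metadata": {},
"source": [
"The curve plotted above is the tempering profile\n",
"\n",
"$$T_k = 1 + a^{\\kappa} + b \\, \\frac{\\sin(\\kappa)}{\\kappa}, \\qquad \\kappa = \\frac{k + rc}{r},$$\n",
"\n",
"where $a$ controls an exponentially decreasing term, $b$ the amplitude of the oscillations, $r$ the horizontal scaling, and $c$ the delay. Since $a^{\\kappa} \\to 0$ (for $0 \\le a < 1$) and $\\sin(\\kappa)/\\kappa \\to 0$, the temperature oscillates around and converges to $T = 1$, i.e. the untempered model."
]
},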
{
"cell_type": "code",
"metadata": {
"id": "qlWwrIWLXpsr"
},
"source": [
"class GaussianMixture():\n",
"\n",
" def __init__(self, n_components):\n",
" \"\"\"Inits the GaussianMixture class by setting n_components and n_iter\"\"\"\n",
" self.n_components = n_components\n",
" \n",
" \n",
" def kmeans_init(self, X):\n",
" n = X.shape[0]\n",
" d = X.shape[1]\n",
" m = self.n_components\n",
"\n",
" kms = KMeans(n_clusters=m)\n",
" preds = kms.fit_predict(X)\n",
" \n",
" # sort X by clusters predicted from kmeans\n",
" x_comp = [[] for _ in range(m)]\n",
" for i,pred in enumerate(preds):\n",
" x_comp[pred].append(X[i])\n",
" x_comp = [np.array(x) for x in x_comp]\n",
" \n",
" # compute prior, mean and variance of each cluster \n",
" alpha = np.zeros(m)\n",
" mu = np.zeros((m, d))\n",
" sigma = np.zeros((m, d, d))\n",
" for i,x in enumerate(x_comp):\n",
" alpha[i] = x.shape[0] / n\n",
" mu[i] = np.mean(x)\n",
" sigma[i] = np.cov(x.T)\n",
"\n",
" return alpha, mu, sigma\n",
"\n",
" # Appendix B.1 of the paper\n",
" def fit_em(self, X, n_iter):\n",
" n = X.shape[0]\n",
" d = X.shape[1]\n",
" m = self.n_components\n",
" \n",
" # init with KMeans\n",
" alpha, mu, sigma = self.kmeans_init(X)\n",
" \n",
" # EM loop\n",
" for k in range(n_iter):\n",
"\n",
" # Expectation\n",
" tau = np.zeros((m, n))\n",
" for j in range(m):\n",
" tau[j] = alpha[j] * multivariate_normal.pdf(X, mu[j], sigma[j])\n",
" tau /= tau.sum(axis=0)\n",
" llikelihood = np.sum(np.log(np.sum(tau, axis=0)))\n",
"\n",
" # Maximization\n",
" a = np.sum(tau, axis=1)\n",
" alpha = a / np.sum(a)\n",
"\n",
" mu = np.sum(tau[:,:,None]*X, axis=1) / a[:, None]\n",
"\n",
" Xm = X - mu[:,None]\n",
" cov = Xm[:,:,:,None] * Xm[:,:,None,:]\n",
" sigma = np.sum(tau[:,:,None,None]*cov, axis=1) / a[:, None,None] + np.eye(d) * 1e-8 # the np.eye 1e-8 prevents some numerical issues\n",
" \n",
" self.alpha = alpha\n",
" self.mu = mu\n",
" self.sigma = sigma\n",
" self.tau = tau\n",
"\n",
" return np.argmax(tau, axis=0)\n",
"\n",
"\n",
" # Appendix B.2 of the paper\n",
" def fit_saem(self, X, n_iter, params):\n",
" n = X.shape[0]\n",
" d = X.shape[1]\n",
" m = self.n_components\n",
"\n",
" if 'burnin' in params and 'alpha_burnin' in params:\n",
" burnin, alpha_burnin = params['burnin'], params['alpha_burnin']\n",
" else:\n",
" raise Exception(\"Params must contain: {'burnin': int, 'alpha_burnin': float}\")\n",
"\n",
" # init with KMeans\n",
" alpha, mu, sigma = self.kmeans_init(X)\n",
"\n",
" # init the sufficient statistics\n",
" S1 = np.zeros(m)\n",
" S2 = np.zeros((m,d))\n",
" S3 = np.zeros((m,d,d))\n",
"\n",
" # SAEM loop\n",
" np.random.seed(42)\n",
" for k in range(n_iter):\n",
"\n",
" # Simulation\n",
" ## compute tau \n",
" tau = np.zeros((m, n))\n",
" for j in range(m):\n",
" tau[j] = alpha[j] * multivariate_normal.pdf(X, mu[j], sigma[j])\n",
" \n",
" ## compute distribution\n",
" tau /= tau.sum(axis=0)\n",
" llikelihood = np.sum(np.log(np.sum(tau, axis=0)))\n",
"\n",
" ## simulate latent variables\n",
" z = np.zeros(n, dtype=int)\n",
" for i in range(n):\n",
" z[i] = np.random.choice(range(m), p = tau[:,i])\n",
"\n",
" # Stochastic Approximation\n",
" gamma = 1 if k <= burnin else (k-burnin)**(-alpha_burnin)\n",
" S1_new = np.zeros(m)\n",
" S2_new = np.zeros((m,d))\n",
" S3_new = np.zeros((m,d,d))\n",
" for i in range(n):\n",
" S1_new[z[i]] += 1\n",
" S2_new[z[i]] += X[i]\n",
" S3_new[z[i]] += X[i, None] * X[i,None].T\n",
" S1 = S1 + gamma*(S1_new-S1)\n",
" S2 = S2 + gamma*(S2_new-S2)\n",
" S3 = S3 + gamma*(S3_new-S3)\n",
"\n",
" # Maximization\n",
" alpha = S1/n\n",
" mu = S2 / S1[:,None]\n",
" sigma = (S3-S2[:,:,None]*mu[:,None,:]) / S1[:,None,None]\n",
" \n",
" self.alpha = alpha\n",
" self.mu = mu\n",
" self.sigma = sigma\n",
" self.tau = tau\n",
"\n",
" return np.argmax(tau, axis=0)\n",
"\n",
" # Appendix B.3 of the paper\n",
" def fit_temp_saem(self, X, n_iter, params):\n",
" n = X.shape[0]\n",
" d = X.shape[1]\n",
" m = self.n_components\n",
"\n",
" if 'burnin' in params and 'alpha_burnin' in params:\n",
" burnin, alpha_burnin = params['burnin'], params['alpha_burnin']\n",
" else:\n",
" raise Exception(\"Params must contain: {'burnin': int, 'alpha_burnin': float}\")\n",
"\n",
" if 'decreasing_rate' in params and 'amplitude_rate' in params and 'scaling' in params and 'delay' in params:\n",
" decreasing_rate = params['decreasing_rate']\n",
" amplitude_rate = params['amplitude_rate']\n",
" scaling = params['scaling']\n",
" delay = params['delay']\n",
" else:\n",
" raise Exception(\"Params must contain: {'decreasing_rate', 'amplitude_rate', 'scaling', 'delay'}\")\n",
" def Tk(k):\n",
" kappa = (k+scaling*delay)/scaling\n",
" return 1 + decreasing_rate**kappa + amplitude_rate*np.sin(kappa)/kappa\n",
"\n",
" # init with KMeans\n",
" alpha, mu, sigma = self.kmeans_init(X)\n",
"\n",
" # init the sufficient statistics\n",
" S1 = np.zeros(m)\n",
" S2 = np.zeros((m,d))\n",
" S3 = np.zeros((m,d,d))\n",
"\n",
" # SAEM loop\n",
" np.random.seed(42)\n",
" for k in range(n_iter):\n",
"\n",
" # Simulation\n",
" ## compute tau \n",
" tau = np.zeros((m, n))\n",
" for j in range(m):\n",
" tau[j] = alpha[j] * multivariate_normal.pdf(X, mu[j], sigma[j])\n",
" \n",
" ## compute tempered distribution\n",
" T = Tk(k)\n",
" tau = tau**(1/T)\n",
" tau /= tau.sum(axis=0)\n",
" llikelihood = np.sum(np.log(np.sum(tau, axis=0)))\n",
"\n",
" ## simulate latent variables\n",
" z = np.zeros(n, dtype=int)\n",
" for i in range(n):\n",
" z[i] = np.random.choice(range(m), p = tau[:,i])\n",
"\n",
" # Stochastic Approximation\n",
" gamma = 1 if k <= burnin else (k-burnin)**(-alpha_burnin)\n",
" S1_new = np.zeros(m)\n",
" S2_new = np.zeros((m,d))\n",
" S3_new = np.zeros((m,d,d))\n",
" for i in range(n):\n",
" S1_new[z[i]] += 1\n",
" S2_new[z[i]] += X[i]\n",
" S3_new[z[i]] += X[i, None] * X[i,None].T\n",
" S1 = S1 + gamma*(S1_new-S1)\n",
" S2 = S2 + gamma*(S2_new-S2)\n",
" S3 = S3 + gamma*(S3_new-S3)\n",
"\n",
" # Maximization\n",
" alpha = S1/n\n",
" mu = S2 / S1[:,None]\n",
" sigma = (S3-S2[:,:,None]*mu[:,None,:]) / S1[:,None,None]\n",
" \n",
" self.alpha = alpha\n",
" self.mu = mu\n",
" self.sigma = sigma\n",
" self.tau = tau\n",
"\n",
" return np.argmax(tau, axis=0)\n"
],
"execution_count": null,
"outputs": []
},
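{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check of the class above, the plain EM variant can be run on the synthetic data before moving to the tempered SAEM experiment. This is only a minimal sketch: the component labels returned by `fit_em` are an arbitrary permutation of the generating clusters, so the recovered means are sorted by first coordinate before being compared to `mus`."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"gmm_em = GaussianMixture(n_components=3)\n",
"pred_em = gmm_em.fit_em(X, n_iter=100)\n",
"\n",
"# components come back in arbitrary order; sort by first coordinate for readability\n",
"print(\"EM means (sorted):\\n\", gmm_em.mu[np.argsort(gmm_em.mu[:, 0])])\n",
"print(\"true means (sorted):\\n\", np.array(mus)[np.argsort(np.array(mus)[:, 0])])"
],
"execution_count": null,
"outputs": []
},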
{
"cell_type": "code",
"metadata": {
"id": "TRgt_a0glhmC"
},
"source": [
"gmm = GaussianMixture(n_components=3)\n",
"# params_saem = {'burnin': 10, 'alpha_burnin': 0.8}\n",
"# pred_gmm = gmm.fit_saem(X, n_iter= 100, params = params_saem)\n",
"params_temp_saem = {'burnin': 10, 'alpha_burnin': 0.8, \n",
" 'decreasing_rate': 0.0, 'amplitude_rate': -1, 'scaling': 1, 'delay': 1}\n",
"pred_gmm = gmm.fit_temp_saem(X, n_iter= 100, params = params_temp_saem)"
],
"execution_count": null,
"outputs": []
},
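{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small inspection cell before plotting: it prints the fitted mixture weights and means next to the generating values (the true weights are uniform, $50/150 = 1/3$ each). Component order is arbitrary, so the means are again sorted by first coordinate."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"order = np.argsort(gmm.mu[:, 0])\n",
"print(\"fitted weights:\", gmm.alpha[order])\n",
"print(\"fitted means:\\n\", gmm.mu[order])\n",
"print(\"true means:\\n\", np.array(mus)[np.argsort(np.array(mus)[:, 0])])"
],
"execution_count": null,
"outputs": []
},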
{
"cell_type": "code",
"metadata": {
"id": "eRbZvCfOlk5H"
},
"source": [
"ax = plt.axes()\n",
"plt.scatter(X[:, 0], X[:, 1], c=pred_gmm)\n",
"plt.suptitle('GMM clusters')\n",
"\n",
"for j in range(len(gmm.mu)):\n",
" confidence_ellipse(gmm.mu[j], gmm.sigma[j], ax, edgecolor=\"r\")\n",
"for j in range(len(gmm.mu)):\n",
" confidence_ellipse(mus[j], np.diag(vs[j]), ax, edgecolor=\"g\")\n",
"\n",
"plt.show()"
],
"execution_count": null,
"outputs": []
}
]
}