SymNMF-Clustering / kmeanshw1.py
kmeanshw1.py
Raw
import math
import numpy as np

EPSILON = 1e-4

#All code from HW1 which was irrelevant was removed

#calculates the distance between two points
def getDistance(point1, point2):
    """
    Compute Euclidean distance between two points.

    Args:
        point1 (list[float]): First point.
        point2 (list[float]): Second point.

    Returns:
        float: Euclidean distance.
    """
    d = len(point1)
    dist = 0
    for i in range(d):
        diff = float(point1[i])-float(point2[i])
        dist += diff*diff
    return math.sqrt(dist)

#updates the centroids after we insert all points       
def update_center(clusters):
    """
    Compute new centroids after we insert all points.

    Args:
        clusters (list[list[list[float]]]): List of clusters, each has a list of points.

    Returns:
        list[list[float]]: Updated centroids.
    """
    d = len(clusters[0][0])
    centers = []
    for cluster in clusters:
        up_center= []
        for i in range(d): 
            s = 0
            for point in cluster:
                s+=point[i]
            up_center.append(s/len(cluster))
        centers.append(up_center)
    return centers    

#returns a tuple of clusters, clusts_ind= list of indexes of the clusters each point was assigned to
#given the current centroids and the points, sorts the points
#to their closest centroid 
def sort_points(points, centroids):
    """
    Sorts points to nearest centroid.

    Args:
        points (list[list[float]]): Input points.
        centroids (list[list[float]]): Current centroids.

    Returns:
            clusters (list[list[list[float]]]): Points grouped by nearest centroid.
            clusts_ind (list[int]): List of indexes of the clusters each point was assigned to.
    """
    clusters = [[] for i in range(len(centroids))]
    clusts_ind = [] 
    for p_ind in range(len(points)): 
        point = points[p_ind] 
        minDist = math.inf
        centIdx = len(centroids)
        for i in range(len(centroids)):
            centroid = centroids[i]
            dist = getDistance(centroid, point)
            if dist<minDist:
                minDist = dist
                centIdx = i
        clusters[centIdx].append(point)
        clusts_ind.append(centIdx) 
    return clusters, clusts_ind 

#checks if the centroids converged enough
def e_convergence(prev_ctr, up_ctr):
     """
    Check if centroids converged.

    Args:
        prev_ctr (list[list[float]]): Previous centroids.
        up_ctr (list[list[float]]): Updated centroids.

    Returns:
        bool: True if converged, False otherwise.
    """
     for i in range(len(prev_ctr)):
        if(getDistance(prev_ctr[i],up_ctr[i]))>=EPSILON:
            return False
     return True

def final_clusters(points_arr, k):
    """
    Cluster the data points into k clusters until convergence.

    Args:
        points_arr (numpy.ndarray): Input points as NumPy array.
        k (int): Number of clusters.

    Returns:
        numpy.ndarray: Cluster indices each point was assigned to.
    """
    points = points_arr.tolist()
    centroids = [points[i] for i in range(k)]
    for i in range(300): #default iter is 300
            clusters, clusters_indexs = sort_points(points, centroids)
            new_cents = update_center(clusters)
            if e_convergence(centroids, new_cents):
                break
            centroids = new_cents
    final_clusters_indexs = np.array(clusters_indexs)
    return final_clusters_indexs