# MTS2Graph -- Clustering.py
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 11 11:43:54 2020
"""

from matplotlib import pyplot
from sklearn.cluster import KMeans
from scipy.interpolate import interp1d
from kneed import KneeLocator
import numpy as np
from kneed import KneeLocator
from tslearn.clustering import KShape
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as hac
from scipy.cluster.hierarchy import fcluster
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS
from joblib import Parallel, delayed
class Clustering:
    """Clustering strategies for lists of variable-length numeric sequences.

    Wraps k-means, k-shape, agglomerative (hierarchical), DBSCAN and OPTICS
    clustering, plus helpers for min-max scaling the data and for plotting
    annotated dendrograms.
    """

    def __init__(self, cluster_lists):
        # One entry per CNN layer; each entry is a list of numeric sequences.
        self.cluster_lists = cluster_lists

    def scale_data(self, cluster_lists):
        """Min-max scale every sequence in place to [0, 1] and drop empties.

        Fixes two defects of the previous version:
        * empty sequences were removed with ``pop`` while iterating, which
          skipped the element that followed each removal;
        * a constant sequence (max == min) raised ``ZeroDivisionError`` --
          such sequences are now mapped to all zeros.

        Returns the (mutated) ``cluster_lists`` for convenience.
        """
        for data in cluster_lists:
            # Filter out empty sequences first so we never mutate the list
            # we are iterating over.  Slice assignment keeps the outer list
            # object identity intact for any callers holding a reference.
            data[:] = [seq for seq in data if seq]
            for seq in data:
                min_seq = min(seq)
                span = max(seq) - min_seq
                for k in range(len(seq)):
                    # A constant sequence has no spread; define it as 0.
                    seq[k] = (seq[k] - min_seq) / span if span else 0.0
        return cluster_lists

    def k_mean_clustering(self, num_clusters, data):
        """Fit k-means on ``data`` and return the fitted cluster centers.

        The ``kmeans_kwargs`` dict the previous version built after fitting
        was never used and has been removed.
        """
        kmeans = KMeans(init="random", n_clusters=num_clusters, n_init=10,
                        max_iter=300, random_state=42)
        kmeans.fit(data)
        return kmeans.cluster_centers_

    def K_shape_clustering(self, num_clusters, data, layer_len):
        """Zero-pad the sequences to a common length, cluster with k-shape,
        persist the centers to disk, and return them.

        Side effect: writes
        ``MHAP_layer_data/cluster_center<layer_len>.npy``.
        """
        # Length of the longest sequence; every sequence is right-padded
        # with zeros to this length so they form a rectangular array.
        n = len(max(data, key=len))
        padded = [x + [0] * (n - len(x)) for x in data]
        a = np.nan_to_num(np.array(padded))
        print('K_shape_data')
        kshape = KShape(n_clusters=num_clusters, verbose=True, random_state=42)
        kshape.fit(a)
        name = 'MHAP_layer_data/cluster_center' + str(layer_len) + '.npy'
        np.save(name, kshape.cluster_centers_)
        return kshape.cluster_centers_

    def fancy_dendrogram(self, *args, **kwargs):
        """Draw a dendrogram annotated with merge distances.

        Extra keyword arguments (popped before forwarding the rest to
        ``hac.dendrogram``):
          max_d          -- draw a horizontal cut line at this distance and
                            use it as ``color_threshold`` if none is given
          annotate_above -- only annotate merges above this distance

        Returns the dict produced by ``hac.dendrogram``.
        """
        max_d = kwargs.pop('max_d', None)
        # NOTE(review): a max_d of exactly 0 is treated as "not given" by
        # this truthiness test -- preserved from the original behavior.
        if max_d and 'color_threshold' not in kwargs:
            kwargs['color_threshold'] = max_d
        annotate_above = kwargs.pop('annotate_above', 0)

        ddata = hac.dendrogram(*args, **kwargs)

        if not kwargs.get('no_plot', False):
            pyplot.title('Hierarchical Clustering Dendrogram (truncated)')
            pyplot.xlabel('sample index or (cluster size)')
            pyplot.ylabel('distance')
            for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
                x = 0.5 * sum(i[1:3])  # midpoint of the merge bracket
                y = d[1]               # height (distance) of the merge
                if y > annotate_above:
                    pyplot.plot(x, y, 'o', c=c)
                    pyplot.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                                 textcoords='offset points',
                                 va='top', ha='center')
            if max_d:
                pyplot.axhline(y=max_d, c='k')
        return ddata

    def print_clusters(self, timeSeries, Z, k, plot=False):
        """Cut linkage matrix ``Z`` into ``k`` flat clusters, print each
        cluster's size, optionally plot its members, and return the unique
        cluster labels."""
        results = fcluster(Z, k, criterion='maxclust')

        s = pd.Series(results)
        clusters = s.unique()

        for c in clusters:
            cluster_indeces = s[s == c].index
            print("Cluster %d number of entries %d" % (c, len(cluster_indeces)))
            if plot:
                timeSeries.T.iloc[:, cluster_indeces].plot()
                pyplot.show()
        return clusters

    def hierarchical_cluster(self, num_clusters, data):
        """Complete-linkage hierarchical clustering with a dendrogram plot.

        Rows containing NaN are dropped before linkage.  Returns the unique
        cluster labels produced by ``print_clusters``.
        """
        data = np.array(data)
        df = pd.DataFrame(data=data)
        data = df.dropna()  # linkage cannot handle NaN rows
        Z = hac.linkage(data, method='complete', metric='euclidean')

        pyplot.figure(figsize=(25, 10))
        pyplot.title('Hierarchical Clustering Dendrogram')
        pyplot.xlabel('sample index')
        pyplot.ylabel('distance')
        self.fancy_dendrogram(
            Z,
            truncate_mode='lastp',  # show only the last p merged clusters
            p=12,
            leaf_rotation=90.,
            leaf_font_size=12.,
            show_contracted=True,  # distribution impression in truncated branches
            annotate_above=10,  # useful in small plots so annotations don't overlap
        )
        pyplot.show()
        return self.print_clusters(data, Z, num_clusters, plot=False)

    def DBscan_cluster(self, num_clusters, data):
        """Cluster ``data`` with DBSCAN (NaNs replaced by 0) and return the
        fitted estimator.

        ``num_clusters`` is accepted for interface symmetry with the other
        methods but is unused: DBSCAN derives the number of clusters itself.
        """
        data = np.array(data)
        data[np.isnan(data)] = 0
        return DBSCAN(eps=0.8, min_samples=15).fit(data)

    def optic_cluster(self, num_clusters, data):
        """Cluster ``data`` with OPTICS, using ``num_clusters`` as
        ``min_samples`` (NaNs replaced by 0); return the fitted estimator."""
        data = np.array(data)
        data[np.isnan(data)] = 0
        return OPTICS(min_samples=num_clusters).fit(data)

    def do_cluster(self, j, cluster_lists, cluster_number, layer_len):
        """Cluster the sequences of layer ``j``; currently k-means (k-shape
        kept below as the alternative strategy)."""
        # return self.K_shape_clustering(cluster_number[j], cluster_lists[j], layer_len[j])
        return self.k_mean_clustering(cluster_number[j], cluster_lists[j])

    def cluster_sequence_data(self, cluster_number, layer_len, cluser_data_pre_list1):
        """Cluster every layer's sequences in parallel (3 worker threads)
        and return the list of per-layer cluster centers."""
        cluster_lists = cluser_data_pre_list1
        results = Parallel(n_jobs=3, backend="threading")(
            delayed(self.do_cluster)(j, cluster_lists, cluster_number, layer_len)
            for j in range(len(cluster_lists)))
        return results