import pandas as pd
from sklearn.cluster import KMeans
from kneed import KneeLocator
import matplotlib.pyplot as plt

from File_Operations.file_ops import FileOperations
from app_logs.logger import App_logger


class KMeansClustering:
    """
    This class is used to divide the data into clusters before training.

    It offers two steps: :meth:`elbow_plot` to pick a cluster count from the
    WCSS (inertia) curve, and :meth:`create_clusters` to fit the final KMeans
    model and label every row.
    """

    def __init__(self, log_file):
        """
        :param log_file: log target handed unchanged to ``App_logger.log`` on
            every log call made by this class
        """
        self.log_file = log_file
        self.logger = App_logger()

    def elbow_plot(self, data, max_clusters=10):
        """
        Fit KMeans for 1..max_clusters clusters, save the elbow plot and
        return the knee of the WCSS curve.

        :param data: A DataFrame
        :param max_clusters: largest cluster count to try (inclusive);
            defaults to 10, the value that was previously hard-coded
        :return: optimal number of clusters, i.e. the ``knee`` attribute
            reported by ``KneeLocator`` (NOTE(review): presumably ``None``
            when no knee is detected - confirm against kneed and handle in
            the caller)
        :raises Exception: any error raised while fitting, plotting or
            locating the knee is logged and re-raised unchanged
        """
        self.logger.log(self.log_file, "Entered the elbow_plot method of the KMeansClustering class")
        try:
            # One shared range keeps the fitted models, the x-axis of the plot
            # and the KneeLocator input aligned.
            cluster_range = range(1, max_clusters + 1)

            # WCSS (within-cluster sum of squares) is exposed by sklearn as
            # ``inertia_`` on the fitted estimator.
            wcss = []
            for i in cluster_range:
                kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
                kmeans.fit(data)
                wcss.append(kmeans.inertia_)

            # Create plot on a dedicated figure and always close it, so that
            # repeated calls neither draw on top of an earlier curve nor leave
            # figures open.
            fig, ax = plt.subplots()
            try:
                ax.plot(cluster_range, wcss)
                ax.set_title('Elbow Plot')
                ax.set_xlabel('Clusters')
                ax.set_ylabel('WCSS')
                fig.savefig('Data_Preprocessing/Kmeans_elbow.png')
            finally:
                plt.close(fig)

            # Get optimal number of cluster
            knee_loc = KneeLocator(cluster_range, wcss, curve='convex', direction='decreasing')
            optimal_clusters = knee_loc.knee

            msg = f"Optimal clusters: {optimal_clusters}. Elbow plot saved at directory: /Data_Preprocessing successfully !!"
            self.logger.log(self.log_file, msg)
            return optimal_clusters
        except Exception as e:
            self.logger.log(self.log_file, 'Error generating optimal clusters, error: ' + str(e))
            # Bare raise re-raises the active exception as-is.
            raise

    def create_clusters(self, n_clusters, data):
        """
        This method clusters the data and adds a new 'Cluster' column to the
        dataset. The fitted model is also handed to
        ``FileOperations.save_model`` under the name 'KMeans'.

        Note that ``data`` itself receives the new column (it is modified in
        place) and is also kept on the instance as ``self.data``; the
        requested cluster count is kept as ``self.clusters``.

        :param n_clusters: number of clusters to fit
        :param data: A DataFrame to cluster
        :return: the same DataFrame with the added 'Cluster' column
        :raises Exception: any error raised while fitting or saving is logged
            and re-raised unchanged
        """
        self.data = data
        self.clusters = n_clusters
        self.logger.log(self.log_file, "Entered the create_clusters method of the KMeansClustering class")
        try:
            kmeans = KMeans(n_clusters=self.clusters, init="k-means++", random_state=42)
            # fit_predict trains the model and returns one label per row.
            preds = kmeans.fit_predict(self.data)
            self.data['Cluster'] = preds
            self.logger.log(self.log_file, "Cluster information added to data successfully")

            # Save Model
            fileops = FileOperations(self.log_file)
            fileops.save_model(kmeans, 'KMeans')
            return self.data
        except Exception as e:
            self.logger.log(self.log_file, 'Error when creating clusters, error: ' + str(e))
            # Bare raise re-raises the active exception as-is.
            raise