# wafer-detection / Data_Preprocessing / clustering.py
import pandas as pd
from sklearn.cluster import KMeans
from kneed import KneeLocator
import matplotlib.pyplot as plt
from File_Operations.file_ops import FileOperations
from app_logs.logger import App_logger

class KMeansClustering:
    """
    This class is used to divide the data into clusters before training.

    ``elbow_plot`` estimates the optimal number of clusters with the elbow
    method and ``create_clusters`` labels the data with a fitted KMeans model.
    """

    def __init__(self, log_file):
        """
        :param log_file: log target passed unchanged as the first argument of
            every ``App_logger.log`` call made by this class
        """
        self.log_file = log_file
        self.logger = App_logger()

    def elbow_plot(self, data, max_clusters=10):
        """
        This method fits KMeans for 1..max_clusters clusters, saves the elbow plot
        (WCSS against number of clusters) and returns the optimal no. of clusters.

        :param data: A DataFrame
        :param max_clusters: largest number of clusters to try (default 10)
        :return: optimal number of clusters as located by ``KneeLocator``,
            or None when no elbow could be detected in the WCSS curve
        :raises Exception: any error raised while fitting, plotting or locating
            the elbow is logged and re-raised unchanged
        """
        self.logger.log(self.log_file, "Entered the elbow_plot method of the KMeansClustering class")
        try:
            cluster_range = range(1, max_clusters + 1)
            # Within-cluster sum of squares (inertia) for every candidate cluster count
            wcss = [
                KMeans(n_clusters=k, init='k-means++', random_state=42).fit(data).inertia_
                for k in cluster_range
            ]

            # Draw on a dedicated figure and always close it: plotting on pyplot's
            # implicit global figure would stack the curves of earlier calls into
            # this plot and keep every figure alive in memory.
            fig, ax = plt.subplots()
            try:
                ax.plot(cluster_range, wcss)
                ax.set_title('Elbow Plot')
                ax.set_xlabel('Clusters')
                ax.set_ylabel('WCSS')
                fig.savefig('Data_Preprocessing/Kmeans_elbow.png')
            finally:
                plt.close(fig)

            # Get optimal number of cluster
            knee_loc = KneeLocator(cluster_range, wcss, curve='convex', direction='decreasing')
            optimal_clusters = knee_loc.knee

            if optimal_clusters is None:
                # Do not report success when there is nothing usable to return
                msg = ("No elbow could be detected in the WCSS curve, optimal clusters: None. "
                       "Elbow plot saved at directory: /Data_Preprocessing")
            else:
                msg = f"Optimal clusters: {optimal_clusters}. Elbow plot saved at directory: /Data_Preprocessing successfully !!"
            self.logger.log(self.log_file, msg)
            return optimal_clusters

        except Exception as e:
            self.logger.log(self.log_file, 'Error generating optimal clusters, error: ' + str(e))
            # Bare raise re-raises the active exception without adding a frame
            raise

    def create_clusters(self, n_clusters, data):
        """
        This method clusters the data and adds a new 'Cluster' column to the dataset.
        The fitted model is also handed to ``FileOperations.save_model`` under the name 'KMeans'.

        Note: the 'Cluster' column is added to the passed-in ``data`` object itself
        (no copy is made) and that same object is returned.

        :param n_clusters: number of clusters to fit; must not be None
            (``elbow_plot`` returns None when it finds no elbow)
        :param data: A DataFrame holding the features to cluster
        :return: ``data`` with the additional 'Cluster' column
        :raises ValueError: if ``n_clusters`` is None
        :raises Exception: any error raised while clustering or saving the model
            is logged and re-raised unchanged
        """
        self.data = data
        self.clusters = n_clusters
        self.logger.log(self.log_file, "Entered the create_clusters method of the KMeansClustering class")
        try:
            if self.clusters is None:
                # Fail with an actionable message instead of an opaque estimator error
                raise ValueError("n_clusters must not be None; no optimal number of clusters was determined")

            kmeans = KMeans(n_clusters=self.clusters, init="k-means++", random_state=42)
            preds = kmeans.fit_predict(self.data)
            self.data['Cluster'] = preds
            self.logger.log(self.log_file, "Cluster information added to data successfully")
            # Save Model
            fileops = FileOperations(self.log_file)
            fileops.save_model(kmeans, 'KMeans')
            return self.data

        except Exception as e:
            self.logger.log(self.log_file, 'Error when creating clusters, error: ' + str(e))
            raise