# wafer-detection/trainingModel.py
import pandas as pd
from Data_Preprocessing import preprocessing
from Data_Preprocessing import clustering
from sklearn.model_selection import train_test_split
from app_logs.logger import App_logger
from best_model_finder import tuner
from File_Operations import file_ops
import time

class trainModel:
	"""
	This class implements the model training functionality.
	"""

	def __init__(self):
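		# Shared log handle; append mode ('a+') so successive runs accumulate in one file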
		self.log_file = open("Training_logs/ModelTrainingLog.txt", 'a+')
		self.logger = App_logger()
		self.input_file = 'TrainingFile_FromDB/InputFile.csv'

	def trainingModel(self):
		"""
		This method is used to preprocess, cluster the data and then
		train the models for different clusters.
		:return:
		"""
		self.logger.log(self.log_file, '*** Model Training started !! ***')
		try:
			""" Get data """
			data = pd.read_csv(self.input_file)
			self.logger.log(self.log_file, 'Data load successful !!')

			""" Preprocess Data """
			preprocessor = preprocessing.Preprocessor(self.log_file)

			# Remove the 'Wafer' identifier column, which adds no value for training
			train_data = preprocessor.remove_columns(data, ['Wafer'])

			# Split the features and labels
			X, Y = preprocessor.split_xy(train_data, 'Output')

			# Check if any null values are present
			null_present = preprocessor.is_null_present(X)

			# Impute any missing values using the KNN imputer
			if null_present:
				X = preprocessor.impute_missing(X)

			# Remove columns with zero standard deviation - these columns do not contribute to predictions
			columns_to_remove = preprocessor.cols_with_zero_standard_deviation(X)
			X = preprocessor.remove_columns(X, columns_to_remove)
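			# Persist the cleaned feature set; presumably kept so the prediction
			# pipeline can align to the same column schema (an assumption, not
			# confirmed by this file)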
			X.to_csv('Data_Preprocessing/clean_X.csv')

			""" Create Clusters"""
			kmeans = clustering.KMeansClustering(self.log_file)
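			# Design note: training one model per cluster lets each model specialise
			# on wafers with similar sensor profiles rather than fitting one global model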

			# Find optimal number of clusters
			optimal_clusters = kmeans.elbow_plot(X)
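			# elbow_plot presumably plots WCSS against the number of clusters and
			# returns the knee point (e.g. via kneed.KneeLocator) as the optimum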

			# Assign each row to a cluster; a 'Cluster' column is added to X
			X = kmeans.create_clusters(optimal_clusters, X)

			# Add output label information to data
			X['Label'] = Y

			# Get the unique cluster labels
			list_of_clusters = X['Cluster'].unique()

			""" Apply cluster approach to build models"""
			self.logger.log(self.log_file, '*** Model creation started !! ***')

			# Iterate over the clusters and build a model for each
			for i in list_of_clusters:

				# Subset the data for cluster i
				cluster_data = X[X['Cluster']==i]

				# Separate features and label data
				c_features = cluster_data.drop(['Label','Cluster'], axis=1)
				c_labels = cluster_data['Label']

				# Split the data into train and test sets
				x_train, x_test, y_train, y_test = train_test_split(c_features, c_labels, test_size=1/3,
				                                                    random_state=100)
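				# A fixed random_state keeps the train/test split reproducible across runs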

				# Get best model
				self.logger.log(self.log_file, 'Building model for cluster '+str(i))
				start_time = time.time()
				model_finder = tuner.Model_Finder(self.log_file)
				best_model_name, best_model = model_finder.get_best_model(x_train, x_test, y_train, y_test)
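				# get_best_model presumably tunes a few candidate algorithms (e.g.
				# Random Forest and XGBoost) and returns whichever scores best on
				# the held-out test split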
				msg = f'Time elapsed in finding the best model for cluster {i}: {(time.time()-start_time)/60:.2f} mins'
				self.logger.log(self.log_file, msg)
				self.logger.log(self.log_file, 'Best model for cluster '+str(i)+' is: '+str(best_model_name))

				# Save the best model for this cluster
				file_op = file_ops.FileOperations(self.log_file)
				file_op.save_model(best_model, best_model_name + str(i))
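				# Suffixing the model name with the cluster number lets the
				# prediction stage load the matching model for each cluster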

			self.logger.log(self.log_file, '*** Model training successful !! ***')
			self.log_file.close()

		except Exception as e:
			self.logger.log(self.log_file, 'Error while training model: ' + str(e))
			self.log_file.close()
			raise


if __name__ == '__main__':
	model_training = trainModel()
	model_training.trainingModel()