import os
import time

import pandas as pd

from Data_Preprocessing import preprocessing
from Predict_raw_data_validation.predictionRawValidation import prediction_data_validation
from app_logs.logger import App_logger
from File_Operations import file_ops


class prediction:
    """
    This class implements the model prediction functionality.
    """

    def __init__(self, path):
        self.log_file = open("Prediction_Logs/ModelPredictionLog.txt", 'a+')
        self.logger = App_logger()
        # Raw data validation is only set up when a batch-file path is supplied
        if path is not None:
            self.pred_data_val = prediction_data_validation(path)
        self.input_file = 'PredictionFile_FromDB/InputFile.csv'

    def predictFromModel(self):
        """
        Preprocess the input data, assign each record to a cluster, and generate
        predictions using the trained model for that cluster.
        :return: path to the prediction output file and a JSON preview of the results
        """
        self.pred_data_val.deletePredictionFile()  # delete any existing prediction csv file
        self.logger.log(self.log_file, '*** Model Prediction started !! ***')
        try:
            """ Get data """
            data = pd.read_csv(self.input_file)
            self.logger.log(self.log_file, 'Data load successful !!')

            """ Preprocess data """
            preprocessor = preprocessing.Preprocessor(self.log_file)
            # Check whether any null values are present and impute them if so
            null_present = preprocessor.is_null_present(data)
            if null_present:
                data = preprocessor.impute_missing(data)
            # Remove columns with zero standard deviation - they carry no signal for prediction
            columns_to_remove = preprocessor.cols_with_zero_standard_deviation(data)
            data = preprocessor.remove_columns(data, columns_to_remove)

            """ Assign clusters """
            # Load the trained KMeans model
            file_loader = file_ops.FileOperations(self.log_file)
            kmeans = file_loader.load_model('KMeans')
            # Predict cluster membership; drop the `Wafer` identifier column first,
            # as it carries no predictive value
            clusters = kmeans.predict(data.drop(['Wafer'], axis=1))
            # Attach cluster labels to the data
            data['Cluster'] = clusters
            # Get the unique cluster labels present in this batch
            unique_clusters = data['Cluster'].unique()

            """ Apply the cluster-wise models to generate predictions """
            self.logger.log(self.log_file, '*** Prediction started !! ***')
            # Iterate over the clusters and make predictions for each one
            for i in unique_clusters:
                # Subset the data for cluster i
                cluster_data = data[data['Cluster'] == i]
                # Keep the wafer names for the output file
                wafer_names = cluster_data['Wafer']
                # Drop the Wafer and Cluster columns before predicting
                cluster_data = cluster_data.drop(['Wafer', 'Cluster'], axis=1)
                # Load the model trained for this cluster
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                # Generate predictions
                self.logger.log(self.log_file, 'Starting prediction for cluster ' + str(i))
                start_time = time.time()
                result = list(model.predict(cluster_data))
                msg = f'Time elapsed in prediction for cluster {i}: {(time.time() - start_time) / 60:.4f} mins'
                self.logger.log(self.log_file, msg)
                # Build a dataframe of results and map the raw labels to readable ones
                result_df = pd.DataFrame(list(zip(wafer_names, result)), columns=['Wafer', 'Prediction'])
                result_df['Prediction'] = result_df['Prediction'].replace({-1: 'Bad', 1: 'Good'})
                # Append the results for this cluster to the prediction output file
                if not os.path.isdir('Prediction_Output_File/'):
                    os.makedirs('Prediction_Output_File/')
                path = os.path.join('Prediction_Output_File/', 'Predictions.csv')
                # Write the header only when the file does not exist yet, so that
                # appending one cluster after another yields a single header row
                result_df.to_csv(path, header=not os.path.isfile(path), mode='a', index=False)
                self.logger.log(self.log_file, 'Predictions for cluster ' + str(i) + ' appended to Predictions.csv')
            self.logger.log(self.log_file, '*** Data Prediction successful !! ***')
            self.log_file.close()
            # Return the output path and a JSON preview of the last cluster's results
            return path, result_df.tail(25).to_json(orient='records')
        except Exception as e:
            self.logger.log(self.log_file, 'Error when generating predictions, error: ' + str(e))
            self.log_file.close()
            raise e


### Test
# path = "Prediction_Batch_Files"
# model_pred = prediction(path)
# model_pred.predictFromModel()
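

# A minimal runnable entry point mirroring the commented test above. This is a
# sketch, not part of the original module: it assumes the batch files live under
# "Prediction_Batch_Files" and that the upstream validation/DB export has already
# produced PredictionFile_FromDB/InputFile.csv along with the saved KMeans and
# per-cluster models expected by FileOperations.
if __name__ == '__main__':
    model_pred = prediction("Prediction_Batch_Files")
    output_path, preview = model_pred.predictFromModel()
    print(f'Predictions written to {output_path}')
    print(f'Preview of last cluster results: {preview}')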