import os
import time

import pandas as pd

from Data_Preprocessing import preprocessing
from Predict_raw_data_validation.predictionRawValidation import prediction_data_validation
from app_logs.logger import App_logger
from File_Operations import file_ops


class prediction:
    """
    This class implements the model prediction functionality.
    """

    def __init__(self, path):
        self.log_file = open("Prediction_Logs/ModelPredictionLog.txt", 'a+')
        self.logger = App_logger()
        # Raw data validation is only set up when a batch-file path is supplied
        if path is not None:
            self.pred_data_val = prediction_data_validation(path)
        self.input_file = 'PredictionFile_FromDB/InputFile.csv'

    def predictFromModel(self):
        """
        Preprocess the input data, assign each record to a cluster, and generate
        predictions using the trained model for that cluster.
        :return: path to the prediction output file and a JSON preview of the results
        """
        self.pred_data_val.deletePredictionFile()  # delete any existing prediction csv file
        self.logger.log(self.log_file, '*** Model Prediction started !! ***')
        try:
            """ Get data """
            data = pd.read_csv(self.input_file)
            self.logger.log(self.log_file, 'Data load successful !!')

            """ Preprocess data """
            preprocessor = preprocessing.Preprocessor(self.log_file)
            # Check whether any null values are present and impute them if so
            null_present = preprocessor.is_null_present(data)
            if null_present:
                data = preprocessor.impute_missing(data)
            # Remove columns with zero standard deviation - they carry no signal for prediction
            columns_to_remove = preprocessor.cols_with_zero_standard_deviation(data)
            data = preprocessor.remove_columns(data, columns_to_remove)

            """ Assign clusters """
            # Load the trained KMeans model
            file_loader = file_ops.FileOperations(self.log_file)
            kmeans = file_loader.load_model('KMeans')
            # Predict cluster membership; drop the `Wafer` identifier column first,
            # as it carries no predictive value
            clusters = kmeans.predict(data.drop(['Wafer'], axis=1))
            # Attach cluster labels to the data
            data['Cluster'] = clusters
            # Get the unique cluster labels present in this batch
            unique_clusters = data['Cluster'].unique()

            """ Apply the cluster-wise models to generate predictions """
            self.logger.log(self.log_file, '*** Prediction started !! ***')
            # Iterate over the clusters and make predictions for each one
            for i in unique_clusters:
                # Subset the data for cluster i
                cluster_data = data[data['Cluster'] == i]
                # Keep the wafer names for the output file
                wafer_names = cluster_data['Wafer']
                # Drop the Wafer and Cluster columns before predicting
                cluster_data = cluster_data.drop(['Wafer', 'Cluster'], axis=1)
                # Load the model trained for this cluster
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                # Generate predictions
                self.logger.log(self.log_file, 'Starting prediction for cluster ' + str(i))
                start_time = time.time()
                result = list(model.predict(cluster_data))
                msg = f'Time elapsed in prediction for cluster {i}: {(time.time() - start_time) / 60:.4f} mins'
                self.logger.log(self.log_file, msg)
                # Build a dataframe of results and map the raw labels to readable ones
                result_df = pd.DataFrame(list(zip(wafer_names, result)), columns=['Wafer', 'Prediction'])
                result_df['Prediction'] = result_df['Prediction'].replace({-1: 'Bad', 1: 'Good'})
                # Append the results for this cluster to the prediction output file
                if not os.path.isdir('Prediction_Output_File/'):
                    os.makedirs('Prediction_Output_File/')
                path = os.path.join('Prediction_Output_File/', 'Predictions.csv')
                # Write the header only when the file does not exist yet, so that
                # appending one cluster after another yields a single header row
                result_df.to_csv(path, header=not os.path.isfile(path), mode='a', index=False)
                self.logger.log(self.log_file, 'Predictions for cluster ' + str(i) + ' appended to Predictions.csv')
            self.logger.log(self.log_file, '*** Data Prediction successful !! ***')
            self.log_file.close()
            # Return the output path and a JSON preview of the last cluster's results
            return path, result_df.tail(25).to_json(orient='records')
        except Exception as e:
            self.logger.log(self.log_file, 'Error when generating predictions, error: ' + str(e))
            self.log_file.close()
            raise e


### Test
# path = "Prediction_Batch_Files"
# model_pred = prediction(path)
# model_pred.predictFromModel()
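

# A minimal runnable entry point mirroring the commented test above. This is a
# sketch, not part of the original module: it assumes the batch files live under
# "Prediction_Batch_Files" and that the upstream validation/DB export has already
# produced PredictionFile_FromDB/InputFile.csv along with the saved KMeans and
# per-cluster models expected by FileOperations.
if __name__ == '__main__':
    model_pred = prediction("Prediction_Batch_Files")
    output_path, preview = model_pred.predictFromModel()
    print(f'Predictions written to {output_path}')
    print(f'Preview of last cluster results: {preview}')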