import os
import pickle
import time

import pandas as pd

from File_Operations.fileOps import fileOperations
from Prediction.Data_Preprocessing.preProcessing import predictionDataValidation
from Training.App_Logging.logger import appLogger


class predictModel:
    """Preprocess the prediction input, cluster it and predict with per-cluster models."""

    def __init__(self):
        # Log handle shared with the helper objects below; it is closed at the
        # end of modelPrediction(), so an instance is meant for a single run.
        self.log_file = open('Model_Prediction_Log.txt', 'a+')
        self.logger = appLogger()
        self.pred_data_val = predictionDataValidation(self.log_file)
        self.file_ops = fileOperations(self.log_file)
        self.input_file = 'Prediction/PredictionFile_FromDB/InputFile.csv'

    def modelPrediction(self):
        """
        Preprocess the input file, assign each row to a cluster and predict
        each cluster with its own model, then write and upload the results.

        Predictions are returned in the row order of the preprocessed data
        (not grouped by cluster), so row ``i`` of the output corresponds to
        row ``i`` of the data that was fed to the models.

        The log file opened in ``__init__`` is closed when this method exits,
        whether it succeeds or fails.

        :return: tuple ``(path, result_df)`` - the path of the CSV that was
                 written and the data frame with a single ``Predictions`` column
        :raises Exception: any error from the pipeline is logged and re-raised
        """
        self.logger.log(self.log_file, '*** Model Prediction started !! ***')
        try:
            prediction_start = time.time()

            # --- Delete existing prediction file ---
            self.pred_data_val.delete_existing_prediction_file()

            # --- Preprocess data ---
            preprocess_start = time.time()
            self.logger.log(self.log_file, 'Data pre-processing started ...')

            # Get the data
            df = pd.read_csv(self.input_file)
            self.logger.log(self.log_file, 'Data load successful !')

            # Replace invalid "?" values with NaN
            df = self.pred_data_val.replace_invalid_values(df)

            # Remove columns that don't add value
            df = self.pred_data_val.drop_unnecessary_columns(df)

            # Encode categorical features in data
            df = self.pred_data_val.encode_categorical(df)

            # Check and impute missing values using KNN imputer
            if self.pred_data_val.is_null_present(df):
                df = self.pred_data_val.impute_missing(df)

            # Check and remove outliers
            if self.pred_data_val.is_outlier_present(df):
                df = self.pred_data_val.remove_outliers(df)

            # Scale the data using the trained StandardScaler.
            # NOTE(review): pickle must only be used on trusted model artifacts.
            with open("Models/scaler.pickle", "rb") as f:
                scaler = pickle.load(f)
            cols = df.columns
            scaled = scaler.transform(df)
            # Rebuilding the frame gives it a fresh 0..n-1 index.
            df = pd.DataFrame(scaled, columns=cols)
            self.logger.log(self.log_file, 'Data transformed successfully using trained Standard Scaler')
            self.logger.log(self.log_file, f'Data pre-processing completed successfully in {(time.time() - preprocess_start):.2f} seconds')

            # --- Cluster data ---
            cluster_start = time.time()
            self.logger.log(self.log_file, 'Predicting clusters started ...')

            # Load KMeans model
            kmeans = self.file_ops.loadModel('KMeans')

            # Make prediction
            clusters = kmeans.predict(df)

            # Add cluster information to the data
            df['cluster'] = clusters
            self.logger.log(self.log_file, f'Clusters predicted successfully in {(time.time() - cluster_start):.2f} seconds')

            # --- Apply cluster approach to make predictions ---
            self.logger.log(self.log_file, '*** Prediction started !! ***')

            # Get unique clusters
            unique_clusters = df['cluster'].unique()

            # Load encoder pickle (used to map numeric predictions back to labels)
            with open('Models/encoder.pickle', 'rb') as f:
                encoder = pickle.load(f)

            # One Series per cluster, each keyed by the row labels it was
            # predicted from, so the original row order can be restored below.
            cluster_predictions = []
            for c in unique_clusters:
                # Subset the data for cluster c and drop the helper column
                cluster_df = df[df['cluster'] == c].drop(columns=['cluster'])

                # Load the correct model for cluster
                model_name = self.file_ops.find_correct_model_file(c)
                model = self.file_ops.loadModel(model_name)

                # Generate predictions
                self.logger.log(self.log_file, 'Starting prediction for cluster ' + str(c))
                start_time = time.time()
                model_pred = model.predict(cluster_df).astype(int)
                pred_class = encoder.inverse_transform(model_pred)
                cluster_predictions.append(
                    pd.Series(pred_class, index=cluster_df.index, name='Predictions')
                )
                msg = f'Time elapsed in prediction for cluster {c}: {(time.time() - start_time):.2f} seconds'
                self.logger.log(self.log_file, msg)

            # Create data frame for results. Sorting by index puts the
            # predictions back in input row order instead of cluster order.
            if cluster_predictions:
                result_df = pd.concat(cluster_predictions).sort_index().to_frame(name='Predictions')
            else:
                result_df = pd.DataFrame(columns=['Predictions'])

            path = "Prediction_Output_File/Predictions.csv"
            result_df.to_csv(path, header=True)
            self.file_ops.upload_prediction_results(bucketName='thyroid-detection-app', filepath=path, name='Predictions.csv')
            self.logger.log(self.log_file, f'Total time for Prediction: {(time.time() - prediction_start):.2f} seconds')
            self.logger.log(self.log_file, '*** Model Prediction successful !! ***')
            return path, result_df
        except Exception as e:
            self.logger.log(self.log_file, 'Error predicting model: ' + str(e))
            raise
        finally:
            # Always release the log handle, even if logging the error failed.
            self.log_file.close()


### Test ###
# NOTE(review): this runs the full prediction pipeline whenever the module is
# imported; consider moving it under `if __name__ == "__main__":`.
pred = predictModel()
pred.modelPrediction()