# thyroid-detection / modelPrediction.py
import pandas as pd
import time
import pickle
import os
from Prediction.Data_Preprocessing.preProcessing import predictionDataValidation
from File_Operations.fileOps import fileOperations
from Training.App_Logging.logger import appLogger


class predictModel:
	def __init__(self):
		self.log_file = open('Model_Prediction_Log.txt', 'a+')
		self.logger = appLogger()
		self.pred_data_val = predictionDataValidation(self.log_file)
		self.file_ops = fileOperations(self.log_file)
		self.input_file = 'Prediction/PredictionFile_FromDB/InputFile.csv'
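		# NOTE (assumption): the input CSV and the artifacts under Models/
		# (scaler.pickle, encoder.pickle, KMeans and per-cluster models) are
		# expected to have been produced by the training pipeline beforehand.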

	def modelPrediction(self):
		"""
		This method is used to preprocess, cluster the data and then
		make prediction the models for different clusters.
		:return: None
		"""

		self.logger.log(self.log_file, '*** Model Prediction started !! ***')

		try:
			prediction_start = time.time()

			""" Delete existing prediction file"""
			self.pred_data_val.delete_existing_prediction_file()

			""" Preprocess Data """
			preprocess_start = time.time()
			self.logger.log(self.log_file, 'Data pre-processing started ...')

			# Get the data
			df = pd.read_csv(self.input_file)
			self.logger.log(self.log_file, 'Data load successful !')

			# Replace invalid "?" values with NaN
			df = self.pred_data_val.replace_invalid_values(df)

			# Remove columns that don't add value
			df = self.pred_data_val.drop_unnecessary_columns(df)

			# Encode categorical features in data
			df = self.pred_data_val.encode_categorical(df)

			# Check and impute missing values using KNN imputer
			if self.pred_data_val.is_null_present(df):
				df = self.pred_data_val.impute_missing(df)

			# Check and remove outliers
			if self.pred_data_val.is_outlier_present(df):
				df = self.pred_data_val.remove_outliers(df)

			# Scale the data using trained StandardScaler
			with open("Models/scaler.pickle", "rb") as f:
				scaler = pickle.load(f)
				cols = df.columns
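				# scaler.transform returns a NumPy array, so rebuild the DataFrame
				# with the original column names for the downstream models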
				scaled = scaler.transform(df)
				df = pd.DataFrame(scaled, columns=cols)
				self.logger.log(self.log_file, 'Data transformed successfully using the trained StandardScaler')

			self.logger.log(self.log_file, f'Data pre-processing completed successfully in {(time.time() - preprocess_start):.2f} seconds')

			""" Cluster Data """
			cluster_start = time.time()
			self.logger.log(self.log_file, 'Predicting clusters started ...')

			# Load KMeans model
			kmeans = self.file_ops.loadModel('KMeans')

			# Assign each record to the cluster it belongs to
			clusters = kmeans.predict(df)

			# Add cluster information to the data
			df['cluster'] = clusters
			self.logger.log(self.log_file, f'Clusters predicted successfully in {(time.time() - cluster_start):.2f} seconds')

			""" Apply cluster approach to build models"""
			self.logger.log(self.log_file, '*** Prediction started !! ***')

			# Get unique clusters
			unique_clusters = df['cluster'].unique()

			# Iterate over the clusters and make predictions for each one
			results = []
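			# Predictions are accumulated cluster by cluster, so rows in the output
			# follow the cluster grouping rather than the original input row order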

			# Load encoder pickle
			with open('Models/encoder.pickle', 'rb') as f:
				encoder = pickle.load(f)
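				# Assumption: this is the label encoder fitted on the target during
				# training; it maps numeric predictions back to the original class labels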

			for c in unique_clusters:
				# Subset the data for cluster c
				cluster_df = df[df['cluster']==c]
				# Drop cluster column
				cluster_df = cluster_df.drop(columns=['cluster'])
				# Load the correct model for cluster
				model_name = self.file_ops.find_correct_model_file(c)
				model = self.file_ops.loadModel(model_name)

				# Generate Predictions
				self.logger.log(self.log_file, f'Starting prediction for cluster {c}')
				start_time = time.time()

				model_pred = model.predict(cluster_df).astype(int)
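				# inverse_transform maps the numeric predictions back to the original class labels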
				pred_class = encoder.inverse_transform(model_pred)
				results.extend(pred_class)

				msg = f'Time elapsed in prediction for cluster {c}: {(time.time() - start_time):.2f} seconds'
				self.logger.log(self.log_file, msg)

			# Create data frame for results
			result_df = pd.DataFrame(results, columns=['Predictions'])
			path = "Prediction_Output_File/Predictions.csv"
			result_df.to_csv(path, header=True)
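			# The CSV above is the local copy; the call below pushes it to the app's
			# storage bucket (assumption: upload_prediction_results handles the cloud
			# client and credentials)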

			self.file_ops.upload_prediction_results(bucketName='thyroid-detection-app',
			                                        filepath=path, name='Predictions.csv')

			self.logger.log(self.log_file, f'Total time for Prediction: {(time.time() - prediction_start):.2f} seconds')
			self.logger.log(self.log_file, '*** Model Prediction successful !! ***')
			self.log_file.close()

			return path, result_df

		except Exception as e:
			self.logger.log(self.log_file, 'Error predicting model: ' + str(e))
			self.log_file.close()
			raise e


### Test ###
if __name__ == '__main__':
	pred = predictModel()
	output_path, result_df = pred.modelPrediction()
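	# Quick sanity check: show where the predictions were written and a preview
	print(f'Predictions written to: {output_path}')
	print(result_df.head())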