# modelTraining.py · thyroid-detection

import time
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from Training.Data_Preprocessing import preProcessing
from Training.Data_Preprocessing import clustering
from File_Operations import fileOps
from Training.Best_Model_Finder import tuner
from Training.App_Logging.logger import appLogger


class trainModel:
	"""
	This class implements the model training functionality.

	A training run loads the input CSV, preprocesses it, clusters the records and
	then tunes and saves one model per cluster. Progress and errors are written
	to the training log file through ``appLogger``.
	"""

	# Log file that every training run appends to.
	_LOG_FILE_NAME = 'Model_Training_Log.txt'

	def __init__(self):
		self.log_file = open(self._LOG_FILE_NAME, 'a+')
		self.logger = appLogger()
		self.input_file = 'Training/TrainingFile_FromDB/InputFile.csv'

	def modelTraining(self):
		"""
		This method is used to preprocess, cluster the data and then
		train the models for different clusters.

		The log file is always closed when the method exits, whether the run
		succeeded or failed. Any exception is logged and then re-raised.
		:return: None
		"""
		# The handle is closed at the end of every run; reopen it so that the
		# same instance can be used for more than one training run.
		if self.log_file.closed:
			self.log_file = open(self._LOG_FILE_NAME, 'a+')

		try:
			self.logger.log(self.log_file, '*** Model Training started !! ***')
			train_start = time.time()

			# Delete existing models directory
			file_ops = fileOps.fileOperations(self.log_file)
			file_ops.delete_existing_model_dir()

			# Load and preprocess the data, then attach cluster information
			X, Y = self._prepareData()
			X = self._clusterData(X)

			# Add target column to X
			# NOTE(review): if Y is a pandas Series this assignment aligns on the
			# index - confirm X and Y share the same index at this point.
			X['Label'] = Y

			# Build, save and score one model per cluster
			cluster_models = self._buildClusterModels(X, file_ops)

			self.logger.log(self.log_file, 'Best models for each cluster and their scores are: ')
			self.logger.log(self.log_file, f'{cluster_models}')
			self.logger.log(
				self.log_file,
				f'Training completed in {(time.time() - train_start) / 60:.2f} mins.')
			self.logger.log(self.log_file, '*** End of Training !! ***')

		except Exception as e:
			self.logger.log(self.log_file, 'Error training model: ' + str(e))
			# Bare raise re-raises the active exception unchanged
			raise

		finally:
			# Single cleanup point for both the success and the failure path
			self.log_file.close()

	def _prepareData(self):
		"""
		Load the training CSV and run the preprocessing steps on it.
		:return: tuple (X, Y) - the features returned by ``scaleData`` and the
		         target returned by ``handleImbalanceData``
		"""
		# The reported preprocessing time includes loading the CSV
		preprocess_start = time.time()
		data = pd.read_csv(self.input_file)
		self.logger.log(self.log_file, 'Data load successful !!')

		self.logger.log(self.log_file, 'Data preprocessing started ...')
		preprocessor = preProcessing.preProcessor(self.log_file)

		# Remove columns that don't add value for model training
		cols_to_drop = [col for col in data.columns if '_measured' in col]
		cols_to_drop.append('TBG')
		df = preprocessor.dropUnnecessaryColumns(data, cols_to_drop)

		# Remove records with "secondary_hypothyroid" only 2 records in training
		df = df[df['Class'] != "secondary_hypothyroid"]

		# Replace invalid "?" values with NaN (as shown in EDA)
		df = preprocessor.replaceInvalidValuesWithNull(df)

		# Encode categorical features in data
		df = preprocessor.encodeCategoricalFeatures(df)

		# Encode the target feature `Class` using Label Encoder
		df = preprocessor.encodeTarget(df)

		# Check and impute missing values using KNN imputer
		if preprocessor.isNullPresent(df):
			df = preprocessor.imputeMissing(df)

		# Check and remove outliers
		if preprocessor.isOutlierPresent(df):
			df = preprocessor.removeOutliers(df)

		# Split the independent features and target
		X, Y = preprocessor.splitTarget(df, 'Class')

		# Handle imbalanced target using RandomOverSampler
		X, Y = preprocessor.handleImbalanceData(X, Y)

		# Scale the data
		X = preprocessor.scaleData(X)
		self.logger.log(
			self.log_file,
			f'Data preprocessing completed successfully in {(time.time() - preprocess_start):.2f} seconds !!')
		return X, Y

	def _clusterData(self, X):
		"""
		Find the optimal number of clusters and add cluster information to X.
		:param X: preprocessed feature data
		:return: the value returned by ``createClusters``; the caller reads its
		         'Cluster' column
		"""
		cluster_start = time.time()
		self.logger.log(self.log_file, 'Clustering started ...')
		kmeans = clustering.kMeansClustering(self.log_file)

		# Get optimal clusters
		clusters = kmeans.getOptimalClusters(X)

		# Create clusters and add cluster information to the data
		X = kmeans.createClusters(X, clusters)
		self.logger.log(
			self.log_file,
			f'Clustering completed successfully in {(time.time() - cluster_start):.2f} seconds !!')
		return X

	def _buildClusterModels(self, X, file_ops):
		"""
		Find, save and score the best model for every cluster in X.
		:param X: data holding the features plus 'Label' and 'Cluster' columns
		:param file_ops: file operations object used to save each model
		:return: list of (cluster, model name, model score) tuples, one per cluster
		"""
		self.logger.log(self.log_file, '*** Model creation started !! ***')

		cluster_models = []
		for i in X['Cluster'].unique():

			# Subset the data for cluster i
			cluster_data = X[X['Cluster'] == i]

			# Separate features and label data
			c_features = cluster_data.drop(['Label', 'Cluster'], axis=1)
			c_label = cluster_data['Label']

			# Split the data into training and test sets
			x_train, x_test, y_train, y_test = train_test_split(
				c_features, c_label, test_size=1/3, random_state=110)

			# Get best model
			self.logger.log(self.log_file, 'Building model for data in cluster ' + str(i))
			model_start = time.time()
			model_finder = tuner.modelFinder(self.log_file)
			model_name, model, model_score = model_finder.getBestModel(x_train, x_test, y_train, y_test)
			self.logger.log(self.log_file, f'Best model for cluster {i}, {model_name}, found in {(time.time() - model_start) / 60:.2f} mins.')

			# Save the best model; the cluster id is appended to the model name
			file_ops.saveModel(model, model_name + str(i))

			cluster_models.append((i, model_name, model_score))

		return cluster_models



### Test ###
# train = trainModel()
# train.modelTraining()