import time
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from Training.Data_Preprocessing import preProcessing
from Training.Data_Preprocessing import clustering
from File_Operations import fileOps
from Training.Best_Model_Finder import tuner
from Training.App_Logging.logger import appLogger


class trainModel:
    """
    This class implements the model training functionality.
    """

    def __init__(self):
        self.log_file = open('Model_Training_Log.txt', 'a+')
        self.logger = appLogger()
        self.input_file = 'Training/TrainingFile_FromDB/InputFile.csv'

    def modelTraining(self):
        """
        This method is used to preprocess, cluster the data and then train the models for different clusters.
        :return: None
        """
        self.logger.log(self.log_file, '*** Model Training started !! ***')
        try:
            train_start = time.time()

            # Delete existing models directory
            file_ops = fileOps.fileOperations(self.log_file)
            file_ops.delete_existing_model_dir()

            """ Get data """
            preprocess_start = time.time()
            data = pd.read_csv(self.input_file)
            self.logger.log(self.log_file, 'Data load successful !!')

            """ Preprocess Data """
            self.logger.log(self.log_file, 'Data preprocessing started ...')
            preprocessor = preProcessing.preProcessor(self.log_file)

            # Remove columns that don't add value for model training
            cols_to_drop = [col for col in data.columns if '_measured' in col]
            cols_to_drop.append('TBG')
            df = preprocessor.dropUnnecessaryColumns(data, cols_to_drop)

            # Remove records with "secondary_hypothyroid" - only 2 records in training data
            df = df[df['Class'] != "secondary_hypothyroid"]

            # Replace invalid "?" values with NaN (as shown in EDA)
            df = preprocessor.replaceInvalidValuesWithNull(df)

            # Encode categorical features in data
            df = preprocessor.encodeCategoricalFeatures(df)

            # Encode the target feature `Class` using Label Encoder
            df = preprocessor.encodeTarget(df)

            # Check and impute missing values using KNN imputer
            if preprocessor.isNullPresent(df):
                df = preprocessor.imputeMissing(df)

            # Check and remove outliers
            if preprocessor.isOutlierPresent(df):
                df = preprocessor.removeOutliers(df)

            # Split the independent features and target
            X, Y = preprocessor.splitTarget(df, 'Class')

            # Handle imbalanced target using RandomOverSampler
            X, Y = preprocessor.handleImbalanceData(X, Y)

            # Scale the data
            X = preprocessor.scaleData(X)
            self.logger.log(self.log_file,
                            f'Data preprocessing completed successfully in {(time.time() - preprocess_start):.2f} seconds !!')

            """ Cluster Data """
            cluster_start = time.time()
            self.logger.log(self.log_file, 'Clustering started ...')
            kmeans = clustering.kMeansClustering(self.log_file)

            # Get optimal clusters
            clusters = kmeans.getOptimalClusters(X)

            # Create clusters and add cluster information to the data
            X = kmeans.createClusters(X, clusters)
            self.logger.log(self.log_file,
                            f'Clustering completed successfully in {(time.time() - cluster_start):.2f} seconds !!')

            # Add target column to X
            X['Label'] = Y

            # Get unique clusters
            unique_clusters = X['Cluster'].unique()

            """ Apply cluster approach to build models """
            self.logger.log(self.log_file, '*** Model creation started !! ***')

            # Iterate over and build models for each cluster
            best_models = []
            scores = []
            for i in unique_clusters:
                # Subset the data for cluster i
                cluster_data = X[X['Cluster'] == i]
                # cluster_data.to_csv('test_cluster', header=True)

                # Separate features and label data
                c_features = cluster_data.drop(['Label', 'Cluster'], axis=1)
                c_label = cluster_data['Label']

                # Split the data into training and test sets
                x_train, x_test, y_train, y_test = train_test_split(c_features, c_label,
                                                                    test_size=1/3, random_state=110)

                # Get best model
                self.logger.log(self.log_file, 'Building model for data in cluster ' + str(i))
                model_start = time.time()
                model_finder = tuner.modelFinder(self.log_file)
                model_name, model, model_score = model_finder.getBestModel(x_train, x_test, y_train, y_test)
                self.logger.log(self.log_file,
                                f'Best model for cluster {i}, {model_name}, found in {(time.time() - model_start) / 60:.2f} mins.')

                # Save the best model
                file_ops.saveModel(model, model_name + str(i))

                # Add to the list of best models and scores
                best_models.append(model_name)
                scores.append(model_score)

            cluster_models = list(zip(unique_clusters, best_models, scores))
            self.logger.log(self.log_file, 'Best models for each cluster and their scores are: ')
            self.logger.log(self.log_file, f'{cluster_models}')
            self.logger.log(self.log_file, f'Training completed in {(time.time() - train_start) / 60:.2f} mins.')
            self.logger.log(self.log_file, '*** End of Training !! ***')
            self.log_file.close()

        except Exception as e:
            self.logger.log(self.log_file, 'Error training model: ' + str(e))
            self.log_file.close()
            raise e


### Test ###
# train = trainModel()
# train.modelTraining()