import time

import pandas as pd
from sklearn.model_selection import train_test_split

from Data_Preprocessing import preprocessing
from Data_Preprocessing import clustering
from app_logs.logger import App_logger
from best_model_finder import tuner
from File_Operations import file_ops


class trainModel:
    """
    Implements the model training functionality.

    The training input is read from ``TrainingFile_FromDB/InputFile.csv`` and
    progress is appended to ``Training_logs/ModelTrainingLog.txt``.
    """

    def __init__(self):
        # The log file is opened here and closed at the end of trainingModel(),
        # so an instance is meant to drive a single training run.
        self.log_file = open("Training_logs/ModelTrainingLog.txt", 'a+')
        self.logger = App_logger()
        self.input_file = 'TrainingFile_FromDB/InputFile.csv'

    def trainingModel(self):
        """
        Preprocess the data, cluster it and train one model per cluster.

        Steps:
            1. Load the input CSV.
            2. Drop the 'Wafer' column, split off the 'Output' label, impute
               missing values when any are present and drop zero-variance
               columns. The cleaned features are written to
               ``Data_Preprocessing/clean_X.csv``.
            3. Cluster the features (the number of clusters comes from the
               elbow plot helper).
            4. For every cluster, split its rows into train/test sets, pick
               the best model via ``tuner.Model_Finder`` and save it under
               the name ``<best_model_name><cluster id>``.

        The log file is always closed when this method finishes, whether it
        succeeds or fails.

        :return: None
        :raises Exception: any error raised during training is logged and
            re-raised unchanged.
        """
        self.logger.log(self.log_file, '*** Model Training started !! ***')
        try:
            # --- Get data ---
            data = pd.read_csv(self.input_file)
            self.logger.log(self.log_file, 'Data load successful !!')

            # --- Preprocess data ---
            preprocessor = preprocessing.Preprocessor(self.log_file)

            # Remove columns that don't add value for training
            train_data = preprocessor.remove_columns(data, ['Wafer'])

            # Split the features and labels
            X, Y = preprocessor.split_xy(train_data, 'Output')

            # Impute missing values only when there is something to impute
            if preprocessor.is_null_present(X):
                X = preprocessor.impute_missing(X)

            # Columns with zero standard deviation do not contribute to predictions
            columns_to_remove = preprocessor.cols_with_zero_standard_deviation(X)
            X = preprocessor.remove_columns(X, columns_to_remove)
            X.to_csv('Data_Preprocessing/clean_X.csv')

            # --- Create clusters ---
            kmeans = clustering.KMeansClustering(self.log_file)

            # Find optimal number of clusters
            optimal_clusters = kmeans.elbow_plot(X)

            # Create data with cluster information
            X = kmeans.create_clusters(optimal_clusters, X)

            # Add output label information to data
            X['Label'] = Y

            # Distinct cluster ids present in the data
            cluster_ids = X['Cluster'].unique()

            # --- Build one model per cluster ---
            self.logger.log(self.log_file, '*** Model creation started !! ***')
            for i in cluster_ids:
                # Subset the data for cluster i
                cluster_data = X[X['Cluster'] == i]

                # Separate features and label data
                c_features = cluster_data.drop(['Label', 'Cluster'], axis=1)
                c_labels = cluster_data['Label']

                # Hold out one third of the cluster for testing; the fixed
                # seed keeps the split reproducible.
                x_train, x_test, y_train, y_test = train_test_split(
                    c_features, c_labels, test_size=1/3, random_state=100
                )

                # Get best model
                self.logger.log(self.log_file, 'Building model for cluster '+str(i))
                start_time = time.time()
                model_finder = tuner.Model_Finder(self.log_file)
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, x_test, y_train, y_test
                )
                msg = f'Time elapsed in finding the best model for cluster {i}: {(time.time()-start_time)/60:.2f} mins'
                self.logger.log(self.log_file, msg)
                self.logger.log(self.log_file, 'Best model for cluster '+str(i)+' is: '+str(best_model_name))

                # Save the best model; the cluster id is appended to the name
                file_op = file_ops.FileOperations(self.log_file)
                file_op.save_model(best_model, best_model_name+str(i))

            self.logger.log(self.log_file, '*** Model training successful !! ***')
        except Exception as e:
            self.logger.log(self.log_file, 'Error when training model, error: ' + str(e))
            raise
        finally:
            # Single cleanup point: the handle is released even if the
            # logging call in the except branch itself fails.
            self.log_file.close()


###Test
# model_training = trainModel()
# model_training.trainingModel()