import os

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

from app_logs.logger import App_logger


class Preprocessor:
    """
    This class is used to clean and transform the data before training a model.

    Every public method logs its entry, its success and any failure through
    ``App_logger`` to the log file handed to the constructor. Failures are
    logged and then re-raised unchanged, so callers still see the original
    exception.
    """

    def __init__(self, log_file):
        """
        :param log_file: log target that is passed as the first argument to
            every ``App_logger.log`` call made by this class
        """
        self.log_file = log_file
        self.logger = App_logger()

    def remove_columns(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame:
        """
        This method removes given columns from the data

        :param data: a DataFrame containing data
        :param column_names: list of columns
        :return: A pandas DataFrame after removing specific columns
        :raises Exception: whatever ``DataFrame.drop`` raises (for example a
            ``KeyError`` for an unknown column), after logging it
        """
        # Retained for backward compatibility: this method has always exposed
        # the most recent input frame as ``self.df``.
        self.df = data
        self.logger.log(self.log_file, 'Entered the remove_columns method of the Preprocessor class')
        try:
            # ``columns=`` already selects the axis, so no ``axis=1`` is needed.
            clean_data = self.df.drop(columns=column_names)
            self.logger.log(self.log_file, 'Column removal successful !!')
            return clean_data
        except Exception as e:
            self.logger.log(self.log_file, f'Column removal unsuccessful, error: {e}')
            # Bare ``raise`` keeps the original traceback intact.
            raise

    def split_xy(self, data: pd.DataFrame, label_column):
        """
        This method splits the features and their labels

        :param data: a DataFrame with unwanted columns removed
        :param label_column: name of the column holding the labels
        :return: a DataFrame containing features and a Series containing labels
        :raises Exception: whatever pandas raises when the label column cannot
            be dropped or selected, after logging it
        """
        self.logger.log(self.log_file, 'Entered the split_xy method of the Preprocessor class')
        try:
            X = data.drop(columns=label_column)
            Y = data[label_column]
            self.logger.log(self.log_file, 'Label, features separation successful !!')
            return X, Y
        except Exception as e:
            self.logger.log(self.log_file, f'Label, features separation unsuccessful, error: {e}')
            raise

    def is_null_present(self, data: pd.DataFrame,
                        report_path: str = 'Data_Preprocessing/null_values.csv') -> bool:
        """
        This method checks if any null values are present in the data.
        If nulls are present, it exports a csv containing each column along
        with its number of nulls.

        :param data: DataFrame of features
        :param report_path: where the per-column null report is written when
            nulls are found; the parent directory is created if it is missing
        :return: Boolean True/False. True if null values are present else False
        :raises Exception: any error raised while counting nulls or writing
            the report, after logging it
        """
        self.logger.log(self.log_file, 'Entered the is_null_present method of the Preprocessor class')
        try:
            # Count once and reuse the result for both the check and the report.
            null_counts = data.isna().sum()
            null_present = bool((null_counts > 0).any())
            if null_present:
                null_df = pd.DataFrame()
                null_df['columns'] = data.columns
                null_df['missing_count'] = np.asarray(null_counts)

                report_dir = os.path.dirname(report_path)
                if report_dir:
                    # Do not fail the whole check just because the report
                    # directory has not been created yet.
                    os.makedirs(report_dir, exist_ok=True)
                null_df.to_csv(report_path)
                self.logger.log(
                    self.log_file,
                    f'Missing values found and written to {os.path.basename(report_path)}')
            return null_present
        except Exception as e:
            self.logger.log(self.log_file, f'Error when finding null values, error: {e}')
            raise

    def impute_missing(self, data: pd.DataFrame, n_neighbors: int = 3,
                       weights='uniform') -> pd.DataFrame:
        """
        This method imputes the missing values using KNN imputer

        :param data: DataFrame of features with missing values
        :param n_neighbors: number of neighbours passed to ``KNNImputer``
        :param weights: weighting scheme passed to ``KNNImputer``
        :return: DataFrame with imputed missing values, carrying the same
            column labels and the same index as ``data``
        :raises Exception: any error raised by the imputer or while rebuilding
            the DataFrame, after logging it
        """
        self.logger.log(self.log_file, 'Entered the impute_missing method of the Preprocessor class')
        try:
            imputer = KNNImputer(missing_values=np.nan, n_neighbors=n_neighbors, weights=weights)
            imputed_array = imputer.fit_transform(data)
            # Keep the caller's index: without it the result gets a fresh
            # RangeIndex and no longer lines up with a label Series that was
            # split off the same frame.
            # NOTE(review): KNNImputer appears to drop columns that are entirely
            # NaN at fit time, which would make the column count mismatch here
            # and raise - confirm and drop such columns upstream if needed.
            imputed_df = pd.DataFrame(data=imputed_array, columns=data.columns, index=data.index)
            self.logger.log(self.log_file, 'Data imputation successful !!')
            return imputed_df
        except Exception as e:
            self.logger.log(self.log_file, f'Error when imputing data, error: {e}')
            raise

    def cols_with_zero_standard_deviation(self, data: pd.DataFrame) -> list:
        """
        This method identifies columns in the data that have zero standard deviation

        Only numeric columns are considered; non-numeric columns have no
        standard deviation and are never reported.

        :param data: DataFrame of features
        :return: List of columns with zero standard deviation, in column order
        :raises Exception: any error raised while computing the standard
            deviations, after logging it
        """
        self.logger.log(self.log_file, 'Entered the cols_with_zero_standard_deviation method of the Preprocessor class')
        try:
            # Compute std directly on the numeric columns instead of going
            # through describe(): describe() omits non-numeric columns, so
            # looking every column up in its result fails on mixed-dtype data.
            numeric_std = data.select_dtypes(include=[np.number]).std()
            cols_to_remove = numeric_std.index[numeric_std == 0].tolist()
            self.logger.log(self.log_file, 'Columns with zero standard deviation found successfully !!')
            return cols_to_remove
        except Exception as e:
            self.logger.log(self.log_file, f'Error when finding columns with zero standard deviation, error: {e}')
            raise


### Test
# log_file = open("Training_logs/ModelTrainingLog.txt", 'a+')
# data = pd.read_csv('TrainingFile_FromDB/TestFile.csv')
# print(data.shape)
# preprocessor = Preprocessor(log_file)
# train_data = preprocessor.remove_columns(data,['Wafer'])
# print(train_data.shape)
# X, Y = preprocessor.split_xy(train_data, 'Output')
# null_present = preprocessor.is_null_present(X)
# if null_present:
#     X = preprocessor.impute_missing(X)
#
# columns_to_remove = preprocessor.cols_with_zero_standard_deviation(X)
# X = preprocessor.remove_columns(X, columns_to_remove)
# log_file.close()