Data_Preprocessing/preprocessing.py · wafer-detection

import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from app_logs.logger import App_logger


class Preprocessor:
	"""
	This class is used to clean and transform the data before training a model.

	Every method reports its progress through ``self.logger.log(self.log_file, message)``.
	When a step fails, the error is logged and the original exception is re-raised.
	"""

	def __init__(self, log_file):
		"""
		:param log_file: object passed as the first argument to every ``logger.log`` call
		"""
		self.log_file = log_file
		self.logger = App_logger()

	def remove_columns(self, data, column_names: list):
		"""
		This method removes given columns from the data
		:param data: a DataFrame containing data
		:param column_names: list of columns to drop
		:return: A new pandas DataFrame without the given columns (``data`` is not modified)
		:raises Exception: any error raised by ``DataFrame.drop`` (for example an unknown
			column name), re-raised after being logged
		"""
		# The input is still stored on the instance, as before, in case callers read ``self.df``.
		self.df = data
		self.logger.log(self.log_file, 'Entered the remove_columns method of the Preprocessor class')
		try:
			# ``columns=`` already targets the column axis, so no ``axis`` argument is needed.
			clean_data = data.drop(columns=column_names)
			self.logger.log(self.log_file, 'Column removal successful !!')
			return clean_data

		except Exception as e:
			self.logger.log(self.log_file, 'Column removal unsuccessful, error: ' + str(e))
			raise

	def split_xy(self, data, label_column):
		"""
		This method splits the features and their labels
		:param data: a DataFrame with unwanted columns removed
		:param label_column: name of the column that holds the labels
		:return: a DataFrame containing features and a Series containing labels
		:raises Exception: any error raised while selecting or dropping ``label_column``,
			re-raised after being logged
		"""
		self.logger.log(self.log_file, 'Entered the split_xy method of the Preprocessor class')
		try:
			X = data.drop(columns=label_column)
			Y = data[label_column]
			self.logger.log(self.log_file, 'Label, features separation successful !!')
			return X, Y

		except Exception as e:
			self.logger.log(self.log_file, 'Label, features separation unsuccessful, error: ' + str(e))
			raise

	def is_null_present(self, data, output_path='Data_Preprocessing/null_values.csv'):
		"""
		This method checks if any null values are present in the data.
		If nulls are present, it exports a csv listing each column along with its number of nulls.
		:param data: DataFrame of features
		:param output_path: where the csv report is written when nulls are found
		:return: Boolean True/False. True if null values are present else False
		:raises Exception: any error raised while counting nulls or writing the report,
			re-raised after being logged
		"""
		self.logger.log(self.log_file, 'Entered the is_null_present method of the Preprocessor class')
		try:
			# Count the nulls once and reuse the result for both the check and the report.
			null_counts = data.isna().sum()
			null_present = bool((null_counts > 0).any())
			if null_present:
				null_df = pd.DataFrame({
					'columns': list(data.columns),
					'missing_count': np.asarray(null_counts),
				})
				null_df.to_csv(output_path)
				# Only claim that a report was written when it actually was.
				self.logger.log(self.log_file, 'Missing values found and written to ' + str(output_path))
			else:
				self.logger.log(self.log_file, 'No missing values found')
			return null_present

		except Exception as e:
			self.logger.log(self.log_file, 'Error when finding null values, error: ' + str(e))
			raise

	def impute_missing(self, data, n_neighbors=3):
		"""
		This method imputes the missing values using KNN imputer
		:param data: DataFrame of features with missing values
		:param n_neighbors: number of neighbours the KNN imputer uses
		:return: DataFrame with imputed missing values, keeping the columns and row index of ``data``
		:raises Exception: any error raised by the imputer or while rebuilding the DataFrame,
			re-raised after being logged
		"""
		self.logger.log(self.log_file, 'Entered the impute_missing method of the Preprocessor class')
		try:
			imputer = KNNImputer(missing_values=np.nan, n_neighbors=n_neighbors, weights='uniform')
			imputed_array = imputer.fit_transform(data)
			# Keep the original row index so the result stays aligned with the label Series.
			imputed_df = pd.DataFrame(data=imputed_array, columns=data.columns, index=data.index)
			self.logger.log(self.log_file, 'Data imputation successful !!')
			return imputed_df

		except Exception as e:
			self.logger.log(self.log_file, 'Error when imputing data, error: ' + str(e))
			raise

	def cols_with_zero_standard_deviation(self, data):
		"""
		This method identifies columns in the data that have zero standard deviation
		:param data: DataFrame of features; non-numeric columns are ignored
		:return: List of numeric columns with zero standard deviation
		:raises Exception: any error raised while computing the standard deviations,
			re-raised after being logged
		"""
		self.logger.log(self.log_file, 'Entered the cols_with_zero_standard_deviation method of the Preprocessor class')
		try:
			# Restrict to numeric columns up front: a standard deviation only exists for those,
			# and looking up any other column in the statistics would fail.
			numeric_data = data.select_dtypes(include=[np.number])
			std_per_column = numeric_data.std()
			# A NaN standard deviation (e.g. a single row) is not equal to 0, so it is not reported.
			cols_to_remove = std_per_column[std_per_column == 0].index.tolist()
			self.logger.log(self.log_file, 'Columns with zero standard deviation found successfully !!')
			return cols_to_remove

		except Exception as e:
			self.logger.log(self.log_file, 'Error when finding columns with zero standard deviation, error: ' + str(e))
			raise


### Test
# log_file = open("Training_logs/ModelTrainingLog.txt", 'a+')
# data = pd.read_csv('TrainingFile_FromDB/TestFile.csv')
# print(data.shape)
# preprocessor = Preprocessor(log_file)
# train_data = preprocessor.remove_columns(data,['Wafer'])
# print(train_data.shape)
# X, Y = preprocessor.split_xy(train_data, 'Output')
# null_present = preprocessor.is_null_present(X)
# if null_present:
# 	X = preprocessor.impute_missing(X)
#
# columns_to_remove = preprocessor.cols_with_zero_standard_deviation(X)
# X = preprocessor.remove_columns(X, columns_to_remove)
# log_file.close()