# The LexicalAnalyzer method will go here (Contributor: Yassin)
import pandas as pd
from urllib.parse import urlparse
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
def DataPreparation(df):
    """Label-encode, integer-cast and standard-scale a DataFrame.

    Column groups are decided once from the incoming dtypes:

    * ``object`` columns are label-encoded with ``LabelEncoder``;
    * ``bool`` columns are cast to ``int`` (0 / 1);
    * ``int64`` and ``float64`` columns are standardised with
      ``StandardScaler`` (each group is fitted separately).

    Spaces in column names are replaced with underscores at the end.
    No missing-value imputation is performed by this function.

    Parameters:
    - df (DataFrame): The input dataset. It is not modified.

    Returns:
    - df (DataFrame): A new, processed DataFrame.
    """
    # Work on a copy so the caller's frame is never left half-processed.
    df = df.copy()

    # Identify categorical, integer, float and boolean columns up front; the
    # groups refer to the dtypes of the *incoming* data.
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    integer_cols = df.select_dtypes(include=['int64']).columns.tolist()
    float_cols = df.select_dtypes(include=['float64']).columns.tolist()
    boolean_cols = df.select_dtypes(include=['bool']).columns.tolist()

    # NOTE: imputation is currently disabled in this function; the message
    # below is kept unchanged so existing log output stays the same.
    print("Missing values handled!")

    # Diagnostic output: which Python types occur in each column.
    for col in df.columns:
        unique_types = df[col].apply(type).unique()
        print(f"Column '{col}' has data types: {unique_types}")

    # Columns known to arrive with mixed Python types. Only an object-dtype
    # column can actually be mixed, so numeric/bool versions of these columns
    # are left alone instead of being turned into strings that cannot be
    # cast back to numbers later.
    mixed_columns = ["ASN Number", "ASN Postal Code"]
    for col in mixed_columns:
        if col in categorical_cols:
            df[col] = df[col].astype(str)

    # Label-encode every categorical column. Assigning the encoded array
    # replaces the column, so it ends up with an integer dtype directly.
    if categorical_cols:
        encoder = LabelEncoder()
        for col in categorical_cols:
            df[col] = encoder.fit_transform(df[col])

    # Convert all boolean columns to integers (0 and 1).
    df = df.astype({col: int for col in boolean_cols})

    # Standardise the originally-numeric columns. Empty groups are skipped
    # because StandardScaler cannot be fitted on a selection with no columns.
    for numeric_cols in (integer_cols, float_cols):
        if numeric_cols:
            df[numeric_cols] = StandardScaler().fit_transform(df[numeric_cols])

    df.columns = df.columns.str.replace(' ', '_')
    return df
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
def DataPreparationUp(df):
    """
    Cleans and processes the dataset, handling mixed data types, missing
    values, categorical encoding, and scaling.

    Steps, in order:
    1. Column names are sanitised: every character other than a letter,
       digit or underscore becomes an underscore.
    2. Columns holding more than one Python type have their non-missing
       values converted to strings.
    3. Missing values are filled: median for ``int64`` columns, mean for
       ``float64`` columns, mode (or "Unknown" if there is no mode) for
       ``object`` and ``bool`` columns.
    4. ``object`` columns are label-encoded, ``bool`` columns cast to int.
    5. Every column is standardised with ``StandardScaler``.

    The frame is modified in place and the same object is returned.

    Parameters:
    - df (DataFrame): The input dataset.
    Returns:
    - df (DataFrame): The processed dataset.
    """
    # str() keeps non-string labels (e.g. integer column names) from breaking
    # the regex substitution.
    df.columns = [re.sub(r'[^a-zA-Z0-9_]', '_', str(col)) for col in df.columns]

    # Identify categorical, integer, float and boolean columns.
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    integer_cols = df.select_dtypes(include=['int64']).columns.tolist()
    float_cols = df.select_dtypes(include=['float64']).columns.tolist()
    boolean_cols = df.select_dtypes(include=['bool']).columns.tolist()
    print("Categorical Columns:", categorical_cols)
    print("Integer Columns:", integer_cols)
    print("Float Columns:", float_cols)
    print("Boolean Columns:", boolean_cols)

    # Handle mixed data type columns: stringify the values that are present
    # but keep NaN/None as missing. A plain astype(str) would turn them into
    # the literal strings "nan"/"None" and the imputation below would never
    # see them.
    for col in df.columns:
        unique_types = df[col].apply(type).unique()
        if len(unique_types) > 1:  # If column has mixed types
            print(f"Converting mixed-type column '{col}' to string.")
            missing = df[col].isna()
            df[col] = df[col].where(missing, df[col].astype(str))

    # Handle missing values in numeric columns (skipped for empty groups).
    if integer_cols:
        df[integer_cols] = df[integer_cols].fillna(df[integer_cols].median())
    if float_cols:
        df[float_cols] = df[float_cols].fillna(df[float_cols].mean())

    # Fill categorical and boolean columns with their mode; fall back to a
    # placeholder when the column has no mode (i.e. it is entirely missing).
    for col in categorical_cols + boolean_cols:
        if df[col].isnull().any():
            mode_val = df[col].mode()
            fill_value = mode_val.iloc[0] if not mode_val.empty else "Unknown"
            df[col] = df[col].fillna(fill_value)
    print("Missing values handled!")

    # Convert categorical columns to strings, then label-encode them.
    if categorical_cols:
        df[categorical_cols] = df[categorical_cols].astype(str)
        encoder = LabelEncoder()
        for col in categorical_cols:
            df[col] = encoder.fit_transform(df[col])

    # Convert boolean columns to integers (0 and 1).
    if boolean_cols:
        df[boolean_cols] = df[boolean_cols].astype(int)

    # Scale ALL columns (including encoded categorical and boolean features).
    # A frame with no rows or no columns is returned unscaled because
    # StandardScaler cannot be fitted on it.
    if not df.empty:
        scaler = StandardScaler()
        df[df.columns] = scaler.fit_transform(df[df.columns])

    # No separate space-to-underscore pass is needed: the sanitising step at
    # the top already replaced every space in the column names.
    return df