# SINGLE LAYER SIMPLE LSTM (WISDM).py
import os

# Suppress TensorFlow C++ log spam; this must be set before TensorFlow is imported to take effect
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
import tarfile
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger()

# Enable memory growth for the GPU
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        logger.error(f"GPU error: {e}")

logger.info("Starting data preprocessing...")

# Path to the dataset file
dataset_path = r'C:\Users\LENOVO LEGION\Desktop\ml codes\ML CODES WISDIM\WISDM_ar_latest.tar.gz'
extract_path = r'C:\Users\LENOVO LEGION\Desktop\ml codes\ML CODES WISDIM\WISDM_ar_latest'

# Extract the dataset
if not os.path.exists(extract_path):
    logger.info("Extracting dataset...")
    with tarfile.open(dataset_path, 'r:gz') as tar:
        tar.extractall(path=extract_path)

# Define the path to the main dataset file
data_file = os.path.join(extract_path, 'WISDM_ar_v1.1', 'WISDM_ar_v1.1_raw.txt')

# Load the dataset, skipping bad lines
logger.info("Loading dataset...")
column_names = ['user', 'activity', 'timestamp', 'x', 'y', 'z']
wisdm_data = pd.read_csv(data_file, header=None, names=column_names, on_bad_lines='skip')

logger.info(f"Initial dataset shape: {wisdm_data.shape}")

# Data Cleaning
# Convert all values to strings
wisdm_data['x'] = wisdm_data['x'].astype(str)
wisdm_data['y'] = wisdm_data['y'].astype(str)
wisdm_data['z'] = wisdm_data['z'].astype(str)

# Remove non-numeric characters
wisdm_data['x'] = wisdm_data['x'].str.replace(';', '', regex=False)
wisdm_data['y'] = wisdm_data['y'].str.replace(';', '', regex=False)
wisdm_data['z'] = wisdm_data['z'].str.replace(';', '', regex=False)

# Remove rows with non-numeric values (allowing a leading minus sign,
# since accelerometer readings can be negative)
def is_numeric_string(s):
    return s.lstrip('-').replace('.', '', 1).isdigit() and s.count('-') <= 1

wisdm_data = wisdm_data[wisdm_data['x'].apply(is_numeric_string)]
wisdm_data = wisdm_data[wisdm_data['y'].apply(is_numeric_string)]
wisdm_data = wisdm_data[wisdm_data['z'].apply(is_numeric_string)]

# Convert columns back to numeric
wisdm_data['x'] = pd.to_numeric(wisdm_data['x'])
wisdm_data['y'] = pd.to_numeric(wisdm_data['y'])
wisdm_data['z'] = pd.to_numeric(wisdm_data['z'])

# Handle missing values
wisdm_data = wisdm_data.dropna()

logger.info(f"Dataset shape after cleaning: {wisdm_data.shape}")

# Feature Engineering
logger.info("Performing feature engineering...")

# Calculate magnitude
wisdm_data['magnitude'] = np.sqrt(wisdm_data['x']**2 + wisdm_data['y']**2 + wisdm_data['z']**2)
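# Magnitude is orientation-invariant: it captures overall motion intensity
# regardless of how the phone was carried.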

# Calculate jerk (discrete time-derivative of acceleration). Diffs are taken
# over the full frame, so values at user/recording boundaries are approximate;
# np.divide with a `where` mask avoids divide-by-zero warnings.
for axis in ['x', 'y', 'z']:
    diff = np.diff(wisdm_data[axis].values)
    time_diff = np.diff(wisdm_data['timestamp'].values).astype(float)
    jerk = np.zeros(len(wisdm_data))
    np.divide(diff, time_diff, out=jerk[1:], where=time_diff != 0)
    wisdm_data[f'{axis}_jerk'] = jerk

# Calculate rolling mean and standard deviation
window_size = 20
for axis in ['x', 'y', 'z']:
    wisdm_data[f'{axis}_rolling_mean'] = wisdm_data.groupby('user')[axis].rolling(window=window_size).mean().reset_index(level=0, drop=True)
    wisdm_data[f'{axis}_rolling_std'] = wisdm_data.groupby('user')[axis].rolling(window=window_size).std().reset_index(level=0, drop=True)

# Handle NaN and infinite values (the first window_size - 1 rows per user have
# NaN rolling statistics; forward/backward fill covers them)
wisdm_data = wisdm_data.replace([np.inf, -np.inf], np.nan).ffill().bfill()

# Map activity labels to integers
activity_mapping = {label: idx for idx, label in enumerate(wisdm_data['activity'].unique())}
wisdm_data['activity'] = wisdm_data['activity'].map(activity_mapping)

# Reverse mapping for later use
reverse_activity_mapping = {v: k for k, v in activity_mapping.items()}
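
# With WISDM v1.1 the six activities are Walking, Jogging, Upstairs,
# Downstairs, Sitting and Standing; the integer assigned to each depends on
# its order of first appearance in the raw file.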

# Normalize features
logger.info("Normalizing features...")
scaler = StandardScaler()
features = ['x', 'y', 'z', 'magnitude', 'x_jerk', 'y_jerk', 'z_jerk', 
            'x_rolling_mean', 'y_rolling_mean', 'z_rolling_mean', 
            'x_rolling_std', 'y_rolling_std', 'z_rolling_std']
wisdm_data[features] = scaler.fit_transform(wisdm_data[features])
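
# Caveat: the scaler is fit on the full dataset before the train/test split,
# so test-set statistics leak into the normalization. A stricter pipeline
# would fit the scaler on training rows only and reuse it to transform the
# test rows.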

# Create sequences
def create_sequences(data, seq_length, step=1):
    sequences = []
    labels = []
    for start in range(0, len(data) - seq_length, step):
        sequences.append(data.iloc[start:start + seq_length][features].values)
        labels.append(data.iloc[start + seq_length - 1]['activity'])
    return np.array(sequences), np.array(labels)
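
# Note: each window is labeled with the activity of its last sample, and
# windows can straddle activity or user boundaries; segmenting per user and
# per activity before windowing would avoid mixed-activity windows.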

# Create sequences from the data (200 samples ≈ 10 s at WISDM's 20 Hz sampling rate)
sequence_length = 200
step_size = 20  # stride between window starts; a stride of 1 (fully overlapping windows) needs tens of GB of RAM
logger.info("Creating sequences...")
X, y = create_sequences(wisdm_data, sequence_length, step=step_size)
logger.info(f"Shape of X after sequence creation: {X.shape}")
logger.info(f"Shape of y after sequence creation: {y.shape}")

# Final check for any NaN or infinite values
if np.isnan(X).any() or np.isinf(X).any():
    logger.error("NaN or infinite values detected in the final dataset")
    raise ValueError("Dataset contains NaN or infinite values after preprocessing")

# Convert labels to categorical
y_categorical = to_categorical(y)

# Split the data
# Stratify on the integer labels so the imbalanced activity distribution is preserved in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42, stratify=y)
logger.info(f"Training set shape: {X_train.shape}")
logger.info(f"Testing set shape: {X_test.shape}")

# Simple LSTM Model definition
def build_simple_lstm(input_shape, num_classes):
    model = Sequential()
    model.add(Input(shape=input_shape))  # explicit Input layer, preferred over passing input_shape to LSTM in recent Keras
    model.add(LSTM(100, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    return model
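
# Shape walk-through, assuming sequence_length=200 and the 13 features above:
# (batch, 200, 13) -> LSTM(100) -> (batch, 100) -> Dropout(0.5)
# -> Dense(softmax) -> (batch, num_classes)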

def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, model_name="Simple LSTM"):
    # Compile model
    model.compile(optimizer=Adam(learning_rate=0.001),
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])
    
    # Define callbacks
    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                 factor=0.2,
                                 patience=5,
                                 min_lr=0.00001)
    early_stopping = EarlyStopping(monitor='val_loss',
                                 patience=10,
                                 restore_best_weights=True)
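    # ReduceLROnPlateau multiplies the learning rate by 0.2 (floor 1e-5) after
    # 5 epochs without val_loss improvement; EarlyStopping halts training after
    # 10 stagnant epochs and restores the best weights seen.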
    
    # Train model
    logger.info(f"\nTraining {model_name}...")
    history = model.fit(X_train, y_train,
                       epochs=20,
                       batch_size=64,
                       validation_split=0.2,
                       callbacks=[reduce_lr, early_stopping],
                       verbose=1)
    
    # Evaluate model
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    y_pred = model.predict(X_test)
    
    return model, history, accuracy, y_pred
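
# Note: model.predict returns per-class softmax probabilities; the helpers
# below take their argmax for the confusion matrix and use the raw scores for
# the ROC and precision-recall curves.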

# Visualization functions
def plot_training_history(history, model_name="Simple LSTM"):
    plt.figure(figsize=(15, 5))
    
    # Plot accuracy
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title(f'{model_name} - Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    
    # Plot loss
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title(f'{model_name} - Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    
    plt.tight_layout()
    plt.savefig(f'training_history_{model_name.lower().replace(" ", "_")}.png')
    plt.close()

def plot_confusion_matrix(y_true, y_pred, model_name="Simple LSTM"):
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true_classes = np.argmax(y_true, axis=1)
    
    cm = confusion_matrix(y_true_classes, y_pred_classes)
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} - Confusion Matrix')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
    # Set x and y tick labels to activity names
    tick_marks = np.arange(len(reverse_activity_mapping))
    plt.xticks(tick_marks + 0.5, [reverse_activity_mapping[i] for i in range(len(reverse_activity_mapping))], 
               rotation=45, ha='right')
    plt.yticks(tick_marks + 0.5, [reverse_activity_mapping[i] for i in range(len(reverse_activity_mapping))], 
               rotation=0)
    
    plt.tight_layout()
    plt.savefig(f'confusion_matrix_{model_name.lower().replace(" ", "_")}.png')
    plt.close()
    
    return cm

def plot_roc_curves(y_test, y_pred, model_name="Simple LSTM"):
    n_classes = y_test.shape[1]
    fpr = {}
    tpr = {}
    roc_auc = {}
    
    # Calculate ROC curve and ROC area for each class
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    
    # Plot ROC curves
    plt.figure(figsize=(12, 8))
    for i in range(n_classes):
        plt.plot(fpr[i], tpr[i],
                 label=f'{reverse_activity_mapping[i]} (AUC = {roc_auc[i]:.2f})')
    
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{model_name} - ROC Curves')
    plt.legend(loc="lower right")
    plt.tight_layout()
    plt.savefig(f'roc_curves_{model_name.lower().replace(" ", "_")}.png')
    plt.close()
    
    return roc_auc

def plot_precision_recall_curves(y_test, y_pred, model_name="Simple LSTM"):
    n_classes = y_test.shape[1]
    precision = {}
    recall = {}
    pr_auc = {}
    
    # Calculate precision-recall curve for each class
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], y_pred[:, i])
        pr_auc[i] = auc(recall[i], precision[i])
    
    # Plot precision-recall curves
    plt.figure(figsize=(12, 8))
    for i in range(n_classes):
        plt.plot(recall[i], precision[i],
                 label=f'{reverse_activity_mapping[i]} (AUC = {pr_auc[i]:.2f})')
    
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'{model_name} - Precision-Recall Curves')
    plt.legend(loc="lower left")
    plt.tight_layout()
    plt.savefig(f'precision_recall_curves_{model_name.lower().replace(" ", "_")}.png')
    plt.close()
    
    return pr_auc

def save_model_results(model, history, accuracy, cm, roc_auc, pr_auc, model_name="Simple LSTM"):
    with open(f'{model_name.lower().replace(" ", "_")}_results.txt', 'w') as f:
        # Model summary
        f.write(f"{model_name} Results\n")
        f.write("="*50 + "\n\n")
        model.summary(print_fn=lambda x: f.write(x + '\n'))
        f.write("\n")
        
        # Performance metrics
        f.write("Performance Metrics\n")
        f.write("-"*50 + "\n")
        f.write(f"Test Accuracy: {accuracy*100:.2f}%\n\n")
        
        # Confusion Matrix
        f.write("Confusion Matrix\n")
        f.write("-"*50 + "\n")
        np.savetxt(f, cm, fmt='%d')
        f.write("\n")
        
        # ROC AUC scores
        f.write("ROC AUC Scores\n")
        f.write("-"*50 + "\n")
        for i in range(len(roc_auc)):
            f.write(f"{reverse_activity_mapping[i]}: {roc_auc[i]:.4f}\n")
        f.write("\n")
        
        # PR AUC scores
        f.write("Precision-Recall AUC Scores\n")
        f.write("-"*50 + "\n")
        for i in range(len(pr_auc)):
            f.write(f"{reverse_activity_mapping[i]}: {pr_auc[i]:.4f}\n")
        f.write("\n")
        
        # Training history
        f.write("Training History\n")
        f.write("-"*50 + "\n")
        f.write("Epoch\tLoss\tAccuracy\tVal_Loss\tVal_Accuracy\n")
        for i in range(len(history.history['loss'])):
            f.write(f"{i+1}\t{history.history['loss'][i]:.4f}\t")
            f.write(f"{history.history['accuracy'][i]:.4f}\t")
            f.write(f"{history.history['val_loss'][i]:.4f}\t")
            f.write(f"{history.history['val_accuracy'][i]:.4f}\n")

# Main execution
if __name__ == "__main__":
    logger.info("Starting Simple LSTM model training and evaluation...")
    
    # Create and compile model
    input_shape = (X_train.shape[1], X_train.shape[2])
    num_classes = y_train.shape[1]
    model = build_simple_lstm(input_shape, num_classes)
    
    # Print model summary
    model.summary()
    
    # Train and evaluate model
    model, history, accuracy, y_pred = train_and_evaluate_model(
        model, X_train, y_train, X_test, y_test
    )
    
    # Generate visualizations and metrics
    cm = plot_confusion_matrix(y_test, y_pred)
    plot_training_history(history)
    roc_auc = plot_roc_curves(y_test, y_pred)
    pr_auc = plot_precision_recall_curves(y_test, y_pred)
    
    # Save results
    save_model_results(model, history, accuracy, cm, roc_auc, pr_auc)
    
    # Save model
    model.save('simple_lstm_wisdm.h5')
    
    logger.info(f"\nSimple LSTM Results:")
    logger.info(f"Test Accuracy: {accuracy*100:.2f}%")
    logger.info("Model and results have been saved.")