# conv_ae_anomalies_detection.py
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
from matplotlib import pyplot as plt

"""# I. Import .CSV data"""

from google.colab import files

# Upload and read the fault-free (normal) dataset
uploaded = files.upload()
dataset = pd.read_csv("nofaults_AE.csv")

# Upload and read the dataset containing sensor faults
uploaded_faults = files.upload()
dataset_faults = pd.read_csv("faults_AE.csv")

"""# Quick look at the data"""

# Keep the DO sensor reading as the single feature column and
# the fault flag as the label column.
new_dataset_nofaults = dataset.iloc[:, 1:2]
labels_nofaults = dataset.iloc[:, 2]
labels_nofaults

new_dataset_nofaults.head()

new_dataset_faults = dataset_faults.iloc[:, 1:2]
labels_faults = dataset_faults.iloc[:, 2]
labels_faults

new_dataset_faults.head()

"""# Visualize the data

1. Normal data
"""

# Exploring the DO2 sensor output (normal data)
plt.figure(figsize=(8, 6), dpi=80)
plt.plot(new_dataset_nofaults["DO_sensor"], label='DO2 historical data')
plt.xlim(0, 58466)
plt.xlabel('time')
plt.ylabel('DO')
plt.legend()
plt.show()

"""2. Data with anomalies"""

# Exploring the DO2 sensor output (data with anomalies)
plt.figure(figsize=(8, 6), dpi=100)
plt.plot(new_dataset_faults["DO_sensor"], label='DO2 historical data')
plt.xlim(0, 58466)
plt.xlabel('time')
plt.ylabel('DO')
plt.legend()
plt.show()

"""# Prepare training data

Get data values from the training timeseries data file and normalize the value data. We have a value for every 15 mins for 609 days.

BSM2: Data is evaluated at each 15 minutes interval starting from 245th day

24*60/15 = 96 timesteps per day

96 * 609 days = 58464 data points in total
"""

# Normalize the training data and save the mean and std
# for normalizing the test data later.
training_mean = new_dataset_nofaults.mean()
training_std = new_dataset_nofaults.std()
df_training_value = (new_dataset_nofaults - training_mean) / training_std
print("Number of training samples:", len(df_training_value))

"""# Create sequences"""

TIME_STEPS = 96

# Generate training sequences for use in the model: a sliding window of
# length TIME_STEPS over the values, labeled by the window's last element.
def create_sequences(values, labels, time_steps=TIME_STEPS):
    x_output = []
    y_output = []
    for i in range(len(values) - time_steps + 1):
        x_output.append(values[i : (i + time_steps)])
        y_output.append(labels[i + time_steps - 1])
    return np.stack(x_output), np.stack(y_output)


x_train, y_train = create_sequences(df_training_value.values, labels_nofaults)
print("Training input shape: ", x_train.shape)
print("Training label shape: ", y_train.shape)

"""# II. Create the model

We will build a convolutional reconstruction autoencoder model. The model takes input of shape (batch_size, sequence_length, num_features) and returns output of the same shape. In this case, sequence_length is 96 and num_features is 1.
"""

model = keras.Sequential(
    [
        layers.Input(shape=(x_train.shape[1], x_train.shape[2])),
        layers.Conv1D(
            filters=32, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Dropout(rate=0.2),
        layers.Conv1D(
            filters=16, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Conv1DTranspose(
            filters=16, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Dropout(rate=0.2),
        layers.Conv1DTranspose(
            filters=32, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Conv1DTranspose(filters=1, kernel_size=7, padding="same"),
    ]
)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss="mse")
model.summary()
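
"""The two stride-2 convolutions halve the sequence length twice (96 -> 48 -> 24) and the transposed convolutions double it back (24 -> 48 -> 96), so the output shape should match the input:"""

# Sanity check: the decoder should restore the (96, 1) input shape
print(model.output_shape)   # (None, 96, 1)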

"""# Train the model"""

history = model.fit(
    x_train,
    x_train,
    epochs=50,
    batch_size=128,
    validation_split=0.1,
    callbacks=[
        keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, mode="min")
    ],
)

"""Let's plot training and validation loss to see how the training went."""

plt.plot(history.history["loss"], label="Training Loss")
plt.plot(history.history["val_loss"], label="Validation Loss")
plt.legend()
plt.show()

"""# Save the model"""

from google.colab import drive
drive.mount('/content/drive')

# Export an architecture diagram of the trained model
keras.utils.plot_model(model, "/content/drive/MyDrive/Autoencoder/models/model_Conv-AE.png", show_shapes=True, expand_nested=True)

# Save the trained model in HDF5 format
model.save('model_conv_ae_1.h5')

import shutil

source_path = '/content/model_conv_ae_1.h5'
destination_path = '/content/drive/MyDrive/Autoencoder/models/model_conv_ae_1.h5'

shutil.move(source_path, destination_path)
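
"""Alternatively (an equivalent one-liner, assuming the Drive folder already exists), the model can be saved straight to the mounted Drive path:"""

# Save directly to Drive instead of saving locally and moving the file
model.save('/content/drive/MyDrive/Autoencoder/models/model_conv_ae_1.h5')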

"""# Load the model"""

from tensorflow.keras.models import load_model

model_path = '/content/drive/MyDrive/Autoencoder/models/model_conv_ae_1.h5'
loaded_model = load_model(model_path)

loaded_model.summary()

"""# III. Detecting anomalies

We will detect anomalies by determining how well our model can reconstruct the input data.

1. Find the MAE loss on the training samples.
2. Find the max MAE loss value. This is the worst our model has performed trying to reconstruct a sample. We will make this the threshold for anomaly detection.
3. If the reconstruction loss for a sample is greater than this threshold value, we can infer that the model is seeing a pattern it isn't familiar with. We will label this sample as an anomaly.
"""

# Get train MAE loss.
x_train_pred = loaded_model.predict(x_train)
train_mae_loss = np.mean(np.abs(x_train_pred - x_train), axis=1)

plt.hist(train_mae_loss, bins=50)
plt.xlabel("Train MAE loss")
plt.ylabel("No of samples")
plt.show()

# Get reconstruction loss threshold.
threshold = np.max(train_mae_loss)
print("Reconstruction error threshold: ", threshold)

"""**1. Compare recontruction:** check how our model has recontructed the first sample."""

# Checking how the first sequence is learnt
plt.plot(x_train[0])
plt.plot(x_train_pred[0])
plt.show()

"""



# Prepare test data


"""

df_test_value = (new_dataset_faults - training_mean) / training_std
fig, ax = plt.subplots()
df_test_value.plot(legend=False, ax=ax)
plt.show()

# Create sequences from the test values.
x_test, y_test = create_sequences(df_test_value.values, labels_faults)
print("Test input shape: ", x_test.shape)
print("Test label shape: ", y_test.shape)

# Get test MAE loss.
x_test_pred = loaded_model.predict(x_test)
test_mae_loss = np.mean(np.abs(x_test_pred - x_test), axis=1)
test_mae_loss = test_mae_loss.reshape((-1))

plt.hist(test_mae_loss, bins=50)
plt.xlabel("test MAE loss")
plt.ylabel("No of samples")
plt.show()

# Detect all the samples which are anomalies.

anomalies = test_mae_loss > threshold
print("Number of anomaly samples: ", np.sum(anomalies))
print("Indices of anomaly samples: ", np.where(anomalies))

"""# Plot anomalies"""

# Data point i is an anomaly if all the overlapping sequences that cover it,
# i.e. samples (i - TIME_STEPS + 1) through i, are flagged as anomalies.
anomalous_data_indices = []
for data_idx in range(TIME_STEPS - 1, len(df_test_value) - TIME_STEPS + 1):
    if np.all(anomalies[data_idx - TIME_STEPS + 1 : data_idx]):
        anomalous_data_indices.append(data_idx)
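
"""Requiring every overlapping window to be anomalous suppresses isolated false positives from single windows; comparing the two counts shows how much stricter the per-point decision is:"""

# Compare raw anomalous windows with the stricter per-point decision
print("Anomalous windows:    ", int(np.sum(anomalies)))
print("Anomalous data points:", len(anomalous_data_indices))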

"""overlay the anomalies on the original test data plot."""

# Subset the DataFrame to select the anomalous data
df_subset = new_dataset_faults.iloc[anomalous_data_indices]

# Plot the data with anomalies
fig, ax = plt.subplots()
new_dataset_faults.plot(legend=False, ax=ax)
df_subset.plot(legend=False, ax=ax, style='x')  # Anomalies highlighted in red
ax.set_xlim(20000, 58465)
plt.xlabel('Samples', fontsize=12)  # Adjust the font size for y-axis label
plt.ylabel('DO [mg/L]', fontsize=12)  # Adjust the font size for x-axis label
# Save the figure with the specified DPI
plt.savefig('test_dataset.png', dpi=dpi)
plt.show()

# Subset the DataFrame to select the anomalous data
df_subset = new_dataset_faults.iloc[anomalous_data_indices]

# Plot the data with anomalies
fig, ax = plt.subplots()
new_dataset_faults.plot(legend=False, ax=ax)
df_subset.plot(legend=False, ax=ax, style='x')  # Anomalies highlighted in red
plt.xlabel('Samples')  # Adjust the font size for y-axis label
plt.ylabel('DO [mg/L]')  # Adjust the font size for x-axis label
ax.set_xlim(23000, 25200)
ax.set_ylim(0, 5)
dpi = 300
# Save the figure with the specified DPI
plt.savefig('drift_anomalies.png', dpi=dpi)
plt.show()

# Subset the DataFrame to select the anomalous data
df_subset = new_dataset_faults.iloc[anomalous_data_indices]

# Plot the data with anomalies
fig, ax = plt.subplots()
new_dataset_faults.plot(legend=False, ax=ax)
df_subset.plot(legend=False, ax=ax, style='x')  # Anomalies highlighted in red
plt.xlabel('Samples')  # Adjust the font size for y-axis label
plt.ylabel('DO [mg/L]')  # Adjust the font size for x-axis label
ax.set_xlim(26000, 30000)
ax.set_ylim(0, 5)
dpi = 300
# Save the figure with the specified DPI
plt.savefig('aeration_anomalies.png', dpi=dpi)
plt.show()

# Subset the DataFrame to select the anomalous data
df_subset = new_dataset_faults.iloc[anomalous_data_indices]

# Plot the data with anomalies
fig, ax = plt.subplots()
new_dataset_faults.plot(legend=False, ax=ax)
df_subset.plot(legend=False, ax=ax, style='x')  # Anomalies highlighted in red
plt.xlabel('Samples')  # Adjust the font size for y-axis label
plt.ylabel('DO [mg/L]')  # Adjust the font size for x-axis label
ax.set_xlim(33000, 37000)
ax.set_ylim(0, 5)
dpi = 300
# Save the figure with the specified DPI
plt.savefig('bias_anomalies.png', dpi=dpi)
plt.show()

# Subset the DataFrame to select the anomalous data
df_subset = new_dataset_faults.iloc[anomalous_data_indices]

# Plot the data with anomalies
fig, ax = plt.subplots()
new_dataset_faults.plot(legend=False, ax=ax)
df_subset.plot(legend=False, ax=ax, style='x')  # Anomalies highlighted in red
plt.xlabel('Samples', fontsize=12)  # Adjust the font size for y-axis label
plt.ylabel('DO [mg/L]', fontsize=12)  # Adjust the font size for x-axis label
ax.set_xlim(37000, 44700)
ax.set_ylim(0, 8)
dpi = 300
# Save the figure with the specified DPI
plt.savefig('spike_anomalies.png', dpi=dpi)
plt.show()

# Subset the DataFrame to select the anomalous data
df_subset = new_dataset_faults.iloc[anomalous_data_indices]

# Plot the data with anomalies
fig, ax = plt.subplots()
new_dataset_faults.plot(legend=False, ax=ax)
df_subset.plot(legend=False, ax=ax, style='x')  # Anomalies highlighted in red
plt.xlabel('Samples', fontsize=12)  # Adjust the font size for y-axis label
plt.ylabel('DO [mg/L]', fontsize=12)  # Adjust the font size for x-axis label
ax.set_xlim(47000, 52000)
ax.set_ylim(-3, 6)
dpi = 300
# Save the figure with the specified DPI
plt.savefig('pd_anomalies.png', dpi=dpi)
plt.show()

from sklearn.metrics import classification_report

# Classify samples as normal (1) or anomalous (0) based on the
# reconstruction-error threshold.
def classify_samples(mae_loss, threshold):
    return (mae_loss <= threshold).astype(int)

# Predicted labels for the test sequences
predicted_labels = classify_samples(test_mae_loss, threshold)

# Print the classification report against the ground-truth labels
target_names = ['Anomalous', 'Normal']
print(classification_report(y_test, predicted_labels, target_names=target_names, digits=4))

import seaborn as sns
from sklearn.metrics import confusion_matrix

matrix_ae = confusion_matrix(y_test, predicted_labels)

plt.figure(figsize=(10, 8))
sns.set(font_scale=1.5)

# Define class labels for x and y axes
class_labels = ['Anomaly', 'Normal']

# Create a heatmap for the confusion matrix without the color bar
sns.heatmap(matrix_ae, annot=True, cmap='Blues', fmt="d", xticklabels=class_labels, yticklabels=class_labels,
            annot_kws={"color": "black", "fontsize": 14}, cbar=False)

# Set axis labels and title
plt.xlabel('Predicted Label', fontsize=14)
plt.ylabel('True Label', fontsize=14)
plt.title('Autoencoder Confusion Matrix', fontsize=16)

plt.show()

from sklearn.metrics import roc_auc_score, roc_curve

def plot_roc_curve(true_y, y_prob):
    """Plot the ROC curve from the prediction scores."""
    fpr, tpr, thresholds = roc_curve(true_y, y_prob)
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

plot_roc_curve(y_test, predicted_labels)
print(f'Conv-AE AUC score: {roc_auc_score(y_test, predicted_labels)}')
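
"""Passing hard 0/1 predictions yields a single operating point on the ROC curve. A fuller curve comes from a continuous score; a sketch, assuming label 1 means normal (so a lower reconstruction error should map to a higher score):"""

# Use the continuous reconstruction error as the score. Assumes label 1 =
# normal, so the loss is negated (lower loss -> higher "normal" score).
normal_score = -test_mae_loss
plot_roc_curve(y_test, normal_score)
print(f'AUC with continuous scores: {roc_auc_score(y_test, normal_score)}')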