import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
from tensorflow import keras
from tensorflow.keras import layers

from google.colab import files

"""# I. Import .CSV data"""

# Upload the fault-free recording through the Colab file picker, then read it
# from the working directory.
setdate = files.upload()
dataset = pd.read_csv("nofaults_AE.csv")

# Same procedure for the recording that contains sensor faults.
setdate1 = files.upload()
dataset_faults = pd.read_csv("faults_AE.csv")

"""# Quick look at the data"""

# Column 1 is kept as a single-column DataFrame (the "DO_sensor" signal plotted
# below); column 2 is taken as the per-sample label.
new_dataset_nofaults = dataset.iloc[:, 1:2]
labels_nofaults = dataset.iloc[:, 2]
labels_nofaults  # bare expressions: only displayed when run as notebook cells
new_dataset_nofaults.head()

new_dataset_faults = dataset_faults.iloc[:, 1:2]
labels_faults = dataset_faults.iloc[:, 2]
labels_faults
new_dataset_faults.head()


def _plot_do_signal(frame, dpi):
    """Plot the ``DO_sensor`` column of *frame* on its own 8x6 inch figure.

    Args:
        frame: DataFrame that contains a ``DO_sensor`` column.
        dpi: Resolution passed to the new figure.
    """
    # One freshly created, sized figure per plot. Re-selecting figure number 1
    # afterwards (as the notebook did) would, outside a notebook, draw every
    # signal onto the first figure and leave the new one empty.
    figure(figsize=(8, 6), dpi=dpi)
    plt.plot(frame["DO_sensor"], label='DO2 historical data')
    plt.xlim(0, 58466)
    plt.xlabel('time')
    plt.ylabel('DO')
    # Finish the figure here so later plt.plot calls start on a clean one.
    plt.show()


"""# Visualize the data

1. Normal data
"""

# exploring DO2 sensor output
_plot_do_signal(new_dataset_nofaults, dpi=80)

"""2. Data with anomalies"""

# exploring DO2 sensor output
_plot_do_signal(new_dataset_faults, dpi=100)

"""# Prepare training data

Get data values from the training timeseries data file and normalize the
value data. We have a value for every 15 mins for 609 days.

BSM2: Data is evaluated at each 15 minutes interval starting from 245th day

24*60/15 = 96 timesteps per day

96 * 609 days = 58464 data points in total
"""

# Normalize and save the mean and std we get,
# for normalizing test data.
# The scaling statistics come from the fault-free training signal itself (not
# from the label column of the faulty file); the same two values are reused
# further down to scale the test signal.
training_mean = new_dataset_nofaults.mean()
training_std = new_dataset_nofaults.std()
df_training_value = (new_dataset_nofaults - training_mean) / training_std
print("Number of training samples:", len(df_training_value))

"""# Create sequences"""

TIME_STEPS = 96


# Generated training sequences for use in the model.
def create_sequences(values, labels, time_steps=TIME_STEPS):
    """Cut a signal into overlapping windows, one per possible start index.

    Args:
        values: Array of samples, sliced along axis 0.
        labels: Array-like with one label per sample.
        time_steps: Window length in samples.

    Returns:
        Tuple ``(x, y)``: ``x`` stacks every slice ``values[i:i + time_steps]``
        and ``y`` holds, for each window, the label of its last sample.

    Raises:
        ValueError: If there are fewer than ``time_steps`` samples, because
            ``np.stack`` rejects an empty list.
    """
    # Positional lookup: a pandas Series would otherwise be indexed by label,
    # which only coincides with position for a default RangeIndex.
    label_array = np.asarray(labels)
    n_windows = len(values) - time_steps + 1
    x_output = [values[i : i + time_steps] for i in range(n_windows)]
    y_output = [label_array[i + time_steps - 1] for i in range(n_windows)]
    return np.stack(x_output), np.stack(y_output)


x_train, y_train = create_sequences(df_training_value.values, labels_nofaults)
print("Training input shape: ", x_train.shape)
print("Training label shape: ", y_train.shape)

"""# II. Create the model

We will build a convolutional reconstruction autoencoder model. The model will
take input of shape (batch_size, sequence_length, num_features) and return
output of the same shape. In this case, sequence_length is 96 (TIME_STEPS) and
num_features is 1.
"""

model = keras.Sequential(
    [
        layers.Input(shape=(x_train.shape[1], x_train.shape[2])),
        # Encoder: two stride-2 convolutions.
        layers.Conv1D(
            filters=32, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Dropout(rate=0.2),
        layers.Conv1D(
            filters=16, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        # Decoder: two stride-2 transposed convolutions mirror the encoder.
        layers.Conv1DTranspose(
            filters=16, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Dropout(rate=0.2),
        layers.Conv1DTranspose(
            filters=32, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Conv1DTranspose(filters=1, kernel_size=7, padding="same"),
    ]
)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss="mse")
model.summary()

"""# Train the model"""

# Input and target are both x_train: the network learns to reconstruct its input.
history = model.fit(
    x_train,
    x_train,
    epochs=50,
    batch_size=128,
    validation_split=0.1,
    callbacks=[
        keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, mode="min")
    ],
)

"""Let's plot training and validation loss to see how the training went."""

# Start from a fresh figure so the curves never land on an earlier plot.
plt.figure()
plt.plot(history.history["loss"], label="Training Loss")
plt.plot(history.history["val_loss"], label="Validation Loss")
plt.legend()
plt.show()

"""# Save the model"""

from google.colab import drive
drive.mount('/content/drive')

from tensorflow import keras
keras.utils.plot_model(
    model,
    "/content/drive/MyDrive/Autoencoder/models/model_Conv-AE.png",
    show_shapes=True,
    expand_nested=True,
)

import shutil

source_path = '/content/model_conv_ae_1.h5'
destination_path = '/content/drive/MyDrive/Autoencoder/models/model_conv_ae_1.h5'

# Write to the exact path that is moved afterwards, so the move does not
# depend on the current working directory being /content.
model.save(source_path)
shutil.move(source_path, destination_path)

"""# Load the model"""

from tensorflow.keras.models import load_model

model_path = '/content/drive/MyDrive/Autoencoder/models/model_conv_ae_1.h5'
loaded_model = load_model(model_path)
loaded_model.summary()

"""# III. Detecting anomalies

We will detect anomalies by determining how well our model can reconstruct the
input data.

1. Find MAE loss on training samples.
2. Find max MAE loss value. This is the worst our model has performed trying to
   reconstruct a sample. We will make this the threshold for anomaly detection.
3. If the reconstruction loss for a sample is greater than this threshold value
   then we can infer that the model is seeing a pattern that it isn't familiar
   with. We will label this sample as an anomaly.
"""

# Get train MAE loss (mean over axis 1 of each window).
x_train_pred = loaded_model.predict(x_train)
train_mae_loss = np.mean(np.abs(x_train_pred - x_train), axis=1)

plt.hist(train_mae_loss, bins=50)
plt.xlabel("Train MAE loss")
plt.ylabel("No of samples")
plt.show()

# Get reconstruction loss threshold.
threshold = np.max(train_mae_loss)
print("Reconstruction error threshold: ", threshold)

"""**1. Compare reconstruction:** check how our model has reconstructed the first sample."""

# Checking how the first sequence is learnt
plt.plot(x_train[0])
plt.plot(x_train_pred[0])
plt.show()

"""# Prepare test data"""

# Scale the faulty signal with the training statistics computed above.
df_test_value = (new_dataset_faults - training_mean) / training_std
fig, ax = plt.subplots()
df_test_value.plot(legend=False, ax=ax)
plt.show()

# Create sequences from test values.
x_test, y_test = create_sequences(df_test_value.values, labels_faults)
print("Test input shape: ", x_test.shape)
print("Test label shape: ", y_test.shape)

# Get test MAE loss, flattened to one value per window.
x_test_pred = loaded_model.predict(x_test)
test_mae_loss = np.mean(np.abs(x_test_pred - x_test), axis=1)
test_mae_loss = test_mae_loss.reshape((-1))

plt.hist(test_mae_loss, bins=50)
plt.xlabel("test MAE loss")
plt.ylabel("No of samples")
plt.show()

# Detect all the samples which are anomalies.
# A window is flagged when its reconstruction error exceeds the training maximum.
anomalies = test_mae_loss > threshold
print("Number of anomaly samples: ", np.sum(anomalies))
print("Indices of anomaly samples: ", np.where(anomalies))

"""# Plot anomalies"""

# data i is an anomaly if samples [(i - timesteps + 1) to (i)] are anomalies
# NOTE: the slice's upper bound is exclusive, so the flag of window `data_idx`
# itself is not consulted.
anomalous_data_indices = [
    data_idx
    for data_idx in range(TIME_STEPS - 1, len(df_test_value) - TIME_STEPS + 1)
    if np.all(anomalies[data_idx - TIME_STEPS + 1 : data_idx])
]

"""overlay the anomalies on the original test data plot."""

# Resolution for every saved figure. It has to be defined before the first
# savefig call; it used to be assigned only after the first plot.
dpi = 300

# Subset the DataFrame to select the anomalous data
df_subset = new_dataset_faults.iloc[anomalous_data_indices]


def _plot_anomalies(filename, xlim, ylim=None, fontsize=None):
    """Plot the faulty signal with the anomalous samples marked, then save it.

    Args:
        filename: Path of the image file to write.
        xlim: ``(left, right)`` limits of the x-axis.
        ylim: Optional ``(bottom, top)`` limits of the y-axis; left to
            matplotlib when ``None``.
        fontsize: Optional font size of the axis labels; matplotlib's default
            is used when ``None``.
    """
    fig, ax = plt.subplots()
    new_dataset_faults.plot(legend=False, ax=ax)
    # Anomalous samples are overlaid as 'x' markers.
    df_subset.plot(legend=False, ax=ax, style='x')
    ax.set_xlim(*xlim)
    if ylim is not None:
        ax.set_ylim(*ylim)
    # Only pass fontsize when requested so the default label size is untouched.
    label_kwargs = {} if fontsize is None else {"fontsize": fontsize}
    plt.xlabel('Samples', **label_kwargs)
    plt.ylabel('DO [mg/L]', **label_kwargs)
    # Save the figure with the specified DPI
    plt.savefig(filename, dpi=dpi)
    plt.show()


# One overview plus one zoomed view per x-range (file names as in the notebook).
_plot_anomalies('test_dataset.png', xlim=(20000, 58465), fontsize=12)
_plot_anomalies('drift_anomalies.png', xlim=(23000, 25200), ylim=(0, 5))
_plot_anomalies('aeration_anomalies.png', xlim=(26000, 30000), ylim=(0, 5))
_plot_anomalies('bias_anomalies.png', xlim=(33000, 37000), ylim=(0, 5))
_plot_anomalies('spike_anomalies.png', xlim=(37000, 44700), ylim=(0, 8), fontsize=12)
_plot_anomalies('pd_anomalies.png', xlim=(47000, 52000), ylim=(-3, 6), fontsize=12)

from sklearn.metrics import classification_report


# Define a function to classify samples as normal or anomalous based on the threshold
def classify_samples(mae_loss, threshold):
    """Return 1 where the loss is within the threshold and 0 where it exceeds it.

    Args:
        mae_loss: Array of reconstruction errors.
        threshold: Largest error that still counts as normal.

    Returns:
        Integer array of the same shape: 1 = normal, 0 = anomalous.
    """
    return (mae_loss <= threshold).astype(int)


# Classify test samples as anomalous (0) or normal (1) based on the threshold
predicted_labels = classify_samples(test_mae_loss, threshold)

# NOTE(review): despite its name this is a second copy of the predictions, not
# ground truth; the metrics below use y_test. Kept only so the name stays defined.
true_labels = classify_samples(test_mae_loss, threshold)

# Print the classification report
# The order of target_names treats class 0 as anomalous and class 1 as normal;
# this assumes y_test uses the same encoding - TODO confirm against the CSV.
target_names = ['Anomalous', 'Normal']
print(classification_report(y_test, predicted_labels, target_names=target_names, digits=4))

from sklearn.metrics import confusion_matrix
matrix_ae = confusion_matrix(y_test, predicted_labels)

import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 8))
sns.set(font_scale=1.5)

# Define class labels for x and y axes
class_labels = ['Anomaly', 'Normal']

# Create a heatmap for the confusion matrix without the color bar
sns.heatmap(
    matrix_ae,
    annot=True,
    cmap='Blues',
    fmt=".1f",
    xticklabels=class_labels,
    yticklabels=class_labels,
    annot_kws={"color": "black", "fontsize": 14},
    cbar=False,
)

# Set axis labels and title
plt.xlabel('Predicted Label', fontsize=14)
plt.ylabel('True Label', fontsize=14)
plt.title('Autoencoder Confusion Matrix', fontsize=16)
plt.show()

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve


def plot_roc_curve(true_y, y_prob):
    """Plot the ROC curve of binary labels against per-sample scores.

    Args:
        true_y: Ground-truth binary labels.
        y_prob: Score per sample; larger values must indicate the positive class.
    """
    fpr, tpr, _ = roc_curve(true_y, y_prob)
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')


# ROC/AUC need a continuous score; hard 0/1 predictions collapse the curve to a
# single operating point. The positive class here is 1 (= normal, matching
# classify_samples), so a lower reconstruction error must give a higher score.
normal_score = -test_mae_loss
plot_roc_curve(y_test, normal_score)
plt.show()
print(f'model CNN AUC score: {roc_auc_score(y_test, normal_score)}')