# test_dataset2_unsupervised_classical_algorithms.py
# Unsupervised-ML / Scenario2 / Test-Classical-Unsupervised-ML
import numpy as np
import pandas as pd
import sklearn
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from pylab import rcParams
rcParams['figure.figsize'] = 14, 8
RANDOM_SEED = 42
LABELS = ["Normal", "Anomaly"]

# upload the dataset when running in Google Colab
from google.colab import files
uploaded = files.upload()
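# Note (assumption, not part of the original workflow): files.upload() only works inside
# Google Colab. When running this script locally, this step can be skipped as long as
# faults_classical_alg.csv is placed in the working directory before the read_csv call below.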

# read the input states and parameters from the .csv file
df = pd.read_csv("faults_classical_alg.csv")
df.head()

df.info()

df.isnull().sum()
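# Optional (assumption, not in the original pipeline): if the check above reports missing
# values, one simple way to handle them before modelling would be
#     df = df.dropna().reset_index(drop=True)
# The original script assumes the dataset is complete and leaves df unchanged here.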

count_classes = df['label'].value_counts(sort=True)
count_classes.plot(kind='bar', rot=0)
plt.title("DO sensor samples distribution")
plt.xticks(range(2), LABELS)
plt.xlabel("Label")
plt.ylabel("Samples")

# exploring DO2 sensor output
plt.figure(figsize=(8, 6), dpi=100)
plt.plot(df['DO_sensor'], label='DO2 historical data')
plt.xlim(0, 58365)
plt.xlabel('time')
plt.ylabel('DO')
plt.legend()

# keep the slice of interest (rows 22000 to 52999); further slices can be concatenated below
df_new1 = df.iloc[22000:53000, :]
#df_new2 = df.iloc[35000:50000, :]
#df_new3 = df.iloc[53000:55000, :]
df_new = pd.concat([df_new1], ignore_index=True)

# exploring DO2 sensor output on the selected slice
plt.figure(figsize=(8, 6), dpi=100)
plt.plot(df_new['DO_sensor'], label='DO2 historical data')
plt.xlabel('time')
plt.ylabel('DO')
plt.legend()

## Split the dataset into anomaly and normal subsets
anomaly = df[df['label'] == 1]
normal = df[df['label'] == 0]

print(anomaly.shape, normal.shape)

# Take a 20% random sample of the data
df1 = df.sample(frac=0.2, random_state=1)
df1.shape

df.shape

# Determine the number of anomaly and valid samples in the sub-sample
Anomaly = df1[df1['label'] == 1]
Valid = df1[df1['label'] == 0]
# ratio of anomalies to valid samples, used as the contamination estimate below
outlier_fraction = len(Anomaly) / float(len(Valid))

len(Anomaly), len(Valid)

outlier_fraction
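# Optional (assumption, not in the original script): df.sample(frac=0.2) draws a purely
# random subset, so the class ratio in df1 can drift from that of the full dataset. A
# stratified alternative that preserves the normal/anomaly proportions would be
#     df1 = df.groupby('label', group_keys=False).sample(frac=0.2, random_state=1)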

# Create independent (X) and dependent (Y) features
columns = df1.columns.tolist()

# Keep every column except the label we are predicting
columns = [c for c in columns if c not in ["label"]]
# Store the variable we are predicting
target = "label"
# Define a random state
state = np.random.RandomState(RANDOM_SEED)

X = df1[columns]
Y = df1[target]

# Synthetic uniform outliers (generated here but not used by the classifiers below)
X_outliers = state.uniform(low=0, high=1, size=(X.shape[0], X.shape[1]))
print(X.shape)
print(Y.shape)
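# Optional (assumption, not part of the original pipeline): the RBF-kernel One-Class SVM
# defined below is sensitive to feature scale, so a common preprocessing step would be
# standardization, e.g.
#     from sklearn.preprocessing import StandardScaler
#     X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)
# The original script fits all three detectors on the raw features, so X is left as-is here.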

## Define the outlier detection methods

classifiers = {
    "Isolation Forest": IsolationForest(n_estimators=100, max_samples=len(X),
                                        contamination=outlier_fraction,
                                        random_state=5, verbose=0),
    "Local Outlier Factor": LocalOutlierFactor(n_neighbors=20, algorithm='auto',
                                               leaf_size=30, metric='euclidean',
                                               p=2, metric_params=None,
                                               contamination=outlier_fraction),
    "Support Vector Machine": OneClassSVM(kernel='rbf', degree=3, gamma=0.1, nu=0.05,
                                          max_iter=-1)
}

type(classifiers)
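# Background note (added for clarity): all three detectors follow scikit-learn's outlier
# convention and return +1 for inliers and -1 for outliers from predict()/fit_predict().
# The loop below remaps these values to the dataset's labels (0 = Normal, 1 = Anomaly)
# before computing the classification metrics.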

n_outliers = len(Anomaly)
for clf_name, clf in classifiers.items():
    # Fit the model and tag outliers
    if clf_name == "Local Outlier Factor":
        y_pred = clf.fit_predict(X)
        scores_prediction = clf.negative_outlier_factor_
    elif clf_name == "Support Vector Machine":
        clf.fit(X)
        y_pred = clf.predict(X)
    else:
        clf.fit(X)
        scores_prediction = clf.decision_function(X)
        y_pred = clf.predict(X)
    # Map predictions to the dataset's labels: +1 (inlier) -> 0 (Normal), -1 (outlier) -> 1 (Anomaly)
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    n_errors = (y_pred != Y).sum()
    # Run classification metrics
    print("{}: {} misclassified samples".format(clf_name, n_errors))
    print("Accuracy Score :")
    print(accuracy_score(Y, y_pred))
    print("Classification Report :")
    print(classification_report(Y, y_pred, digits=4))
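
# Optional visual check (assumption, not in the original script): seaborn is imported above
# but never used; a confusion-matrix heatmap is one way to visualise the errors counted in
# the loop. This sketch assumes Y and the y_pred of the last fitted classifier are in scope.
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(Y, y_pred)  # rows = true label, columns = predicted label
plt.figure(figsize=(6, 5))
sns.heatmap(conf_mat, annot=True, fmt="d",
            xticklabels=LABELS, yticklabels=LABELS, cmap="Blues")
plt.title("Confusion matrix (last fitted classifier)")
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.show()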