import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from pylab import rcParams

rcParams['figure.figsize'] = 14, 8
RANDOM_SEED = 42
LABELS = ["Normal", "Anomaly"]

# Upload the input states and parameters (.csv file) in Colab
from google.colab import files
setdate = files.upload()

# Read the DO sensor samples and fault labels, then inspect the data
df = pd.read_csv("faults_classical_alg.csv")
df.head()
df.info()
df.isnull().sum()

# Plot the class distribution (normal vs. anomalous samples);
# Series.value_counts replaces the removed pd.value_counts
count_classes = df['label'].value_counts(sort=True)
count_classes.plot(kind='bar', rot=0)
plt.title("DO sensor samples distribution")
plt.xticks(range(2), LABELS)
plt.xlabel("Label")
plt.ylabel("Samples")
plt.show()

# Explore the full DO2 sensor output
plt.figure(figsize=(8, 6), dpi=100)
plt.plot(df['DO_sensor'], label='DO2 historical data')
plt.xlim(0, 58365)
plt.xlabel('time')
plt.ylabel('DO')
plt.legend()
plt.show()

# Keep a contiguous slice of the series for closer inspection
df_new1 = df.iloc[22000:53000, :]
# df_new2 = df.iloc[35000:50000, :]
# df_new3 = df.iloc[53000:55000, :]
df_new = pd.concat([df_new1], ignore_index=True)

# Explore the selected slice of the DO2 sensor output
plt.figure(figsize=(8, 6), dpi=100)
plt.plot(df_new['DO_sensor'], label='DO2 historical data')
plt.xlabel('time')
plt.ylabel('DO')
plt.legend()
plt.show()

# Split the dataset into anomalous and normal samples
anomaly = df[df['label'] == 1]
normal = df[df['label'] == 0]
print(anomaly.shape, normal.shape)

# Work on a 20% random sample of the data to keep fitting fast
df1 = df.sample(frac=0.2, random_state=1)
print(df1.shape, df.shape)

# Determine the number of anomalous and valid samples in the sample.
# Note: `contamination` expects the proportion of outliers in the data
# set, so divide by the total sample size, not by the valid count.
Anomaly = df1[df1['label'] == 1]
Valid = df1[df1['label'] == 0]
outlier_fraction = len(Anomaly) / float(len(df1))
print(len(Anomaly), len(Valid), outlier_fraction)

# Create independent (X) and dependent (Y) features
columns = [c for c in df1.columns.tolist() if c != "label"]  # drop the target
target = "label"
state = np.random.RandomState(RANDOM_SEED)
X = df1[columns]
Y = df1[target]
X_outliers = state.uniform(low=0, high=1, size=(X.shape[0], X.shape[1]))
print(X.shape)
print(Y.shape)

# Define the outlier detection methods
classifiers = {
    "Isolation Forest": IsolationForest(
        n_estimators=100, max_samples=len(X),
        contamination=outlier_fraction, random_state=5, verbose=0),
    "Local Outlier Factor": LocalOutlierFactor(
        n_neighbors=20, algorithm='auto', leaf_size=30,
        metric='euclidean', p=2, metric_params=None,
        contamination=outlier_fraction),
    "Support Vector Machine": OneClassSVM(
        kernel='rbf', degree=3, gamma=0.1, nu=0.05, max_iter=-1),
}

n_outliers = len(Anomaly)

for clf_name, clf in classifiers.items():
    # Fit each model and tag outliers
    if clf_name == "Local Outlier Factor":
        # LOF in outlier-detection mode only offers fit_predict
        y_pred = clf.fit_predict(X)
        scores_prediction = clf.negative_outlier_factor_
    elif clf_name == "Support Vector Machine":
        clf.fit(X)
        y_pred = clf.predict(X)
    else:
        clf.fit(X)
        scores_prediction = clf.decision_function(X)
        y_pred = clf.predict(X)

    # Map sklearn's convention (+1 inlier, -1 outlier) onto the
    # dataset's labels (0 normal, 1 anomaly)
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    n_errors = (y_pred != Y).sum()

    # Report classification metrics
    print("{}: {} misclassified samples".format(clf_name, n_errors))
    print("Accuracy Score :")
    print(accuracy_score(Y, y_pred))
    print("Classification Report :")
    print(classification_report(Y, y_pred, digits=4))
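
# --- Optional: confusion-matrix plot ---
# A minimal sketch, not part of the original pipeline: it visualises
# y_pred from the last loop iteration against the ground truth Y,
# reusing the LABELS list defined above. To inspect a specific model,
# re-run the fit/predict steps for that classifier first. Assumes
# sklearn.metrics.confusion_matrix and seaborn (imported as sns).
from sklearn.metrics import confusion_matrix

conf_matrix = confusion_matrix(Y, y_pred)
plt.figure(figsize=(6, 5), dpi=100)
sns.heatmap(conf_matrix, annot=True, fmt='d',
            xticklabels=LABELS, yticklabels=LABELS, cmap='Blues')
plt.title("Confusion matrix (last classifier in the loop)")
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.show()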