import numpy as np
import pandas as pd
import random
import textattack  # Imported for an optional text-based attack path; not used in the numerical attacks below
from textattack.attack_recipes import TextFoolerJin2019
from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
try:
    import ace_tools as tools  # Interactive dataframe display (ChatGPT sandbox only)
except ImportError:
    tools = None  # Fall back to printing the results dataframe
def generate_adversarial_examples(X_test, model):
    """
    Generate adversarial examples for a trained model on numerical features.

    Only numerical perturbations are implemented here:
    - MLP / SVM: random-sign noise (an FGSM-style perturbation without gradients).
    - Tree ensembles and Naive Bayes: small Gaussian noise.
    A text-based attack (e.g., TextAttack's TextFooler) would require a separate path.

    Parameters:
    - X_test: The original test feature matrix (NumPy array, scaled to [0, 1])
    - model: The trained model

    Returns:
    - X_test_adv: The adversarially perturbed feature matrix
    """
    if isinstance(model, (MLPClassifier, SVC)):
        # FGSM-style perturbation for neural networks & SVMs, but using a random sign
        # instead of the loss gradient (scikit-learn does not expose input gradients)
        epsilon = 0.1  # Small perturbation magnitude
        noise = epsilon * np.sign(np.random.randn(*X_test.shape))
        X_test_adv = X_test + noise
    elif isinstance(model, (RandomForestClassifier, DecisionTreeClassifier, AdaBoostClassifier)):
        # Tree-based models are less sensitive to small perturbations; add independent
        # Gaussian noise (std 0.01) to every feature
        X_test_adv = X_test + np.random.normal(0, 0.01, size=X_test.shape)
    elif isinstance(model, GaussianNB):
        # Slightly larger Gaussian noise (std 0.05) to shift the per-feature likelihoods of Naive Bayes
        X_test_adv = X_test + np.random.normal(0, 0.05, size=X_test.shape)
    else:
        # Default: apply slight Gaussian noise (std 0.02)
        X_test_adv = X_test + np.random.normal(0, 0.02, size=X_test.shape)
    return np.clip(X_test_adv, 0, 1)  # Keep values in range; assumes features are scaled to [0, 1]
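

# --- Illustrative usage sketch (not part of the original pipeline) ---
# Exercises generate_adversarial_examples on a tiny synthetic dataset; the names
# X_demo, y_demo, and svc_demo are hypothetical, and features are assumed to be
# scaled to [0, 1] to match the np.clip call above. Call manually if desired.
def _demo_generate_adversarial_examples():
    rng = np.random.default_rng(0)
    X_demo = rng.random((20, 4))                                        # 20 samples, 4 features in [0, 1)
    y_demo = (X_demo[:, 0] > np.median(X_demo[:, 0])).astype(int)       # Balanced synthetic labels
    svc_demo = SVC().fit(X_demo, y_demo)
    X_demo_adv = generate_adversarial_examples(X_demo, svc_demo)
    print("Mean absolute perturbation:", np.abs(X_demo_adv - X_demo).mean())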
def evaluate_adversarial_attacks(trained_models, X_test, y_test):
    """
    Evaluate trained models against adversarial attacks.

    Parameters:
    - trained_models (dict): Dictionary of trained models, keyed by model name
    - X_test: Original test feature matrix
    - y_test: True labels

    Returns:
    - attack_results (dict): Accuracy before and after the attack, per model
    """
    attack_results = {}
    print("\n🔥 Evaluating Adversarial Attacks on Trained Models 🔥")
    print("{:<25} {:<12} {:<12} {:<10}".format("Model", "Acc Before", "Acc After", "Drop (%)"))
    print("-" * 60)
    for model_name, model in trained_models.items():
        # Accuracy on the clean test set
        y_pred = model.predict(X_test)
        accuracy_before = accuracy_score(y_test, y_pred)

        # Accuracy on the adversarially perturbed test set
        X_test_adv = generate_adversarial_examples(X_test, model)
        y_pred_adv = model.predict(X_test_adv)
        accuracy_after = accuracy_score(y_test, y_pred_adv)

        # Accuracy drop in percentage points
        accuracy_drop = round((accuracy_before - accuracy_after) * 100, 2)

        # Store results
        attack_results[model_name] = {
            "Accuracy Before": accuracy_before,
            "Accuracy After": accuracy_after,
            "Accuracy Drop (%)": accuracy_drop
        }
        print("{:<25} {:<12.4f} {:<12.4f} {:.2f}%".format(
            model_name, accuracy_before, accuracy_after, accuracy_drop
        ))
    # Display attack results as a dataframe (falls back to console output if ace_tools is unavailable)
    attack_df = pd.DataFrame(attack_results).T
    if tools is not None:
        tools.display_dataframe_to_user(name="Adversarial Attack Performance", dataframe=attack_df)
    else:
        print(attack_df)
    return attack_results
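

# --- Illustrative end-to-end sketch (assumptions: synthetic data from make_classification,
# hypothetical model names, features rescaled to [0, 1] so the clipping in the attack is valid) ---
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler

    X, y = make_classification(n_samples=500, n_features=10, random_state=42)
    X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.3, random_state=42)

    # Fit the scaler on the training split only, then rescale both splits to [0, 1]
    scaler = MinMaxScaler().fit(X_train)
    X_train, X_eval = scaler.transform(X_train), scaler.transform(X_eval)

    demo_models = {
        "Random Forest": RandomForestClassifier(random_state=42).fit(X_train, y_train),
        "SVM": SVC(random_state=42).fit(X_train, y_train),
        "Naive Bayes": GaussianNB().fit(X_train, y_train),
    }
    evaluate_adversarial_attacks(demo_models, X_eval, y_eval)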