import numpy as np
import random
import textattack # Adversarial attack library
from textattack.attack_recipes import TextFoolerJin2019
from textattack.models.wrappers import SklearnModelWrapper
from sklearn.metrics import accuracy_score, f1_score, precision_score
# def simple_tokenizer(text):
# return text.split() # Tokenize by splitting text into words
# def apply_adversarial_attack(model, X_test, y_test):
# """
# Apply adversarial attack on a trained NLP model and evaluate its performance.
# Parameters:
# - model: Trained machine learning model
# - X_test: Original test dataset
# - y_test: True labels
# Returns:
# - attack_results: Dictionary containing accuracy, F1-score, and precision after attack
# """
# # Wrap the model for TextAttack compatibility
# #wrapped_model = SklearnModelWrapper(model, simple_tokenizer)
# attack = TextFoolerJin2019.build(model) # Load a predefined adversarial attack
# adversarial_texts = []
# for text in X_test:
# attack_result = attack.attack(text) # Generate adversarial text
# adversarial_texts.append(attack_result.perturbed_text if attack_result.perturbed_text else text)
# # Predict using adversarial examples
# X_test_adv = np.array(adversarial_texts)
# y_pred_adv = model.predict(X_test_adv)
# # Evaluate model performance after attack
# accuracy_adv = accuracy_score(y_test, y_pred_adv)
# f1_adv = f1_score(y_test, y_pred_adv, average='weighted')
# precision_adv = precision_score(y_test, y_pred_adv, average='weighted')
# attack_results = {
# "Adversarial Accuracy": accuracy_adv,
# "Adversarial F1 Score": f1_adv,
# "Adversarial Precision": precision_adv
# }
# return attack_results
# def evaluate_adversarial_attacks(trained_models, X_test, y_test):
# """
# Evaluate all trained NLP models against adversarial attacks and compare performance.
# Parameters:
# - trained_models (dict): Dictionary of trained models
# - X_test: Test dataset
# - y_test: Test labels
# Returns:
# - attack_results (dict): Performance metrics of models before & after attack
# """
# attack_results = {}
# print("\n๐ฅ Evaluating Adversarial Attacks on Trained Models ๐ฅ")
# print("{:<30} {:<10} {:<10} {:<10}".format("Model", "Acc Before", "Acc After", "Change (%)"))
# print("-" * 70)
# for model_name, model in trained_models.items():
# # Evaluate original accuracy
# y_pred = model.predict(X_test)
# accuracy_before = accuracy_score(y_test, y_pred)
# # Apply adversarial attack
# adv_metrics = apply_adversarial_attack(model, X_test, y_test)
# accuracy_after = adv_metrics["Adversarial Accuracy"]
# # Store results
# attack_results[model_name] = {
# "Accuracy Before": accuracy_before,
# "Accuracy After": accuracy_after,
# "Accuracy Drop (%)": round((accuracy_before - accuracy_after) * 100, 2)
# }
# print("{:<30} {:.4f} {:.4f} {:.2f}%".format(
# model_name, accuracy_before, accuracy_after, attack_results[model_name]["Accuracy Drop (%)"]
# ))
# return attack_results
import numpy as np
from textattack.models.wrappers import SklearnModelWrapper
from textattack.attack_recipes import TextFoolerJin2019
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score, f1_score, precision_score
# Define a wrapper class to integrate TextAttack with scikit-learn models
class CustomTextModelWrapper(SklearnModelWrapper):
    def __init__(self, model, vectorizer):
        """
        Custom wrapper for scikit-learn models that processes raw text input before prediction.

        Parameters:
        - model: Trained scikit-learn model (e.g., ComplementNB, LogisticRegression)
        - vectorizer: The text vectorizer (e.g., CountVectorizer, TfidfVectorizer)
        """
        self.model = model
        self.vectorizer = vectorizer

    def __call__(self, text_list):
        """Process raw text input -> convert to numerical features -> return class probabilities."""
        X_transformed = self.vectorizer.transform(text_list)
        # TextAttack's classification goal functions expect class scores,
        # so return probabilities rather than hard label predictions.
        return self.model.predict_proba(X_transformed)
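
# Minimal usage sketch for the wrapper above (illustrative only: the toy texts,
# labels, and the _demo_custom_wrapper name are assumptions, not part of the
# original pipeline).
def _demo_custom_wrapper():
    toy_texts = ["great film", "boring plot", "loved the acting", "waste of time"]
    toy_labels = [1, 0, 1, 0]
    vec = CountVectorizer()
    clf = ComplementNB().fit(vec.fit_transform(toy_texts), toy_labels)
    wrapped = CustomTextModelWrapper(clf, vec)
    # The wrapper accepts raw strings and returns an (n_samples, n_classes)
    # array of class probabilities, which is the output TextAttack expects.
    return wrapped(["great acting", "boring film"])
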
def apply_adversarial_attack(model, X_test, y_test, vectorizer):
    """
    Apply an adversarial attack on a trained NLP model and evaluate its performance.

    Parameters:
    - model: Trained scikit-learn model
    - X_test: Original test dataset (raw text or vectorized features)
    - y_test: True labels
    - vectorizer: CountVectorizer or TfidfVectorizer used for text transformation

    Returns:
    - attack_results: Dictionary containing accuracy, F1-score, and precision after the attack
    """
    # Convert feature vectors back to text for the adversarial attack
    if isinstance(vectorizer, (CountVectorizer, TfidfVectorizer)):
        X_test_text = vectorizer.inverse_transform(X_test)  # Map feature vectors back to vocabulary terms
        X_test_text = [" ".join(words) for words in X_test_text]  # Join tokens into text (word order is not preserved)
    else:
        X_test_text = X_test  # Already text, keep as is

    # Wrap the model together with its vectorizer for TextAttack compatibility
    wrapped_model = CustomTextModelWrapper(model, vectorizer)

    # Load the predefined adversarial attack recipe
    attack = TextFoolerJin2019.build(wrapped_model)

    adversarial_texts = []
    for text, label in zip(X_test_text, y_test):
        # Attack.attack() needs both the input text and its ground-truth label
        attack_result = attack.attack(str(text), label)
        adversarial_texts.append(attack_result.perturbed_text())  # Falls back to the original text if the attack fails

    # Convert adversarial text back to numerical feature vectors
    X_test_adv = vectorizer.transform(adversarial_texts)

    # Predict using adversarial examples
    y_pred_adv = model.predict(X_test_adv)

    # Evaluate model performance after the attack
    accuracy_adv = accuracy_score(y_test, y_pred_adv)
    f1_adv = f1_score(y_test, y_pred_adv, average='weighted')
    precision_adv = precision_score(y_test, y_pred_adv, average='weighted')

    attack_results = {
        "Adversarial Accuracy": accuracy_adv,
        "Adversarial F1 Score": f1_adv,
        "Adversarial Precision": precision_adv
    }
    return attack_results
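
# Illustrative call of apply_adversarial_attack on a tiny corpus (a sketch only:
# the _demo_apply_attack name and the toy data are assumptions). Running it
# downloads TextFooler's resources (word embeddings, Universal Sentence Encoder)
# and can be slow even for a handful of rows.
def _demo_apply_attack():
    toy_texts = ["great film", "boring plot", "loved the acting", "waste of time"]
    toy_labels = [1, 0, 1, 0]
    vec = CountVectorizer()
    X = vec.fit_transform(toy_texts)
    clf = ComplementNB().fit(X, toy_labels)
    # Pass the vectorized test set; the function converts it back to text internally.
    return apply_adversarial_attack(clf, X, toy_labels, vec)
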
import numpy as np
import textattack
from textattack.attack_recipes import TextFoolerJin2019
from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textattack.models.wrappers import ModelWrapper
class SklearnTextModelWrapper(ModelWrapper):
    """Wrapper to make scikit-learn text classifiers compatible with TextAttack."""

    def __init__(self, model, vectorizer):
        self.model = model
        self.vectorizer = vectorizer

    def __call__(self, text_list):
        """Convert text to feature vectors and get the model's probability predictions."""
        if isinstance(text_list, str):  # If the input is a single string, wrap it in a list
            text_list = [text_list]
        vectorized_text = self.vectorizer.transform(text_list)  # Convert text to feature vectors
        probs = self.model.predict_proba(vectorized_text)  # Get probability scores
        return probs
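
# Quick sketch of how this wrapper plugs into a TextAttack recipe for a single
# example (illustrative only: the toy model and the _demo_single_attack name are
# assumptions). The recipe queries the wrapper for class scores, which
# predict_proba provides.
def _demo_single_attack():
    from sklearn.naive_bayes import ComplementNB
    toy_texts = ["great film", "boring plot", "loved the acting", "waste of time"]
    toy_labels = [1, 0, 1, 0]
    vec = CountVectorizer()
    clf = ComplementNB().fit(vec.fit_transform(toy_texts), toy_labels)
    attack = TextFoolerJin2019.build(SklearnTextModelWrapper(clf, vec))
    # attack() takes the raw text plus its ground-truth label and returns an AttackResult.
    result = attack.attack("loved the film", 1)
    return result.perturbed_text()
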
def apply_adversarial_attackUp(model, X_test, y_test, vectorizer):
    """
    Apply an adversarial attack on a trained NLP model and evaluate its performance.

    Parameters:
    - model: Trained scikit-learn model (e.g., MultinomialNB)
    - X_test: Original test dataset (raw text or transformed vectors)
    - y_test: True labels
    - vectorizer: CountVectorizer or TfidfVectorizer used for text transformation

    Returns:
    - attack_results: Dictionary containing accuracy, F1-score, and precision after the attack
    """
    # Wrap the model so TextAttack receives raw text and gets back class probabilities
    wrapped_model = SklearnTextModelWrapper(model, vectorizer)

    # Convert feature vectors back to text
    if isinstance(vectorizer, (CountVectorizer, TfidfVectorizer)):
        X_test_text = vectorizer.inverse_transform(X_test)
        X_test_text = [" ".join(words) for words in X_test_text]  # Join vocabulary terms into text
    else:
        X_test_text = X_test  # Already text, keep as is

    # Load the adversarial attack recipe
    attack = TextFoolerJin2019.build(wrapped_model)

    adversarial_texts = []
    for text, label in zip(X_test_text, y_test):
        text = str(text)  # Ensure the input is a string
        ground_truth_output = np.argmax(label) if isinstance(label, np.ndarray) else label  # Ensure a scalar label
        attack_result = attack.attack(text, ground_truth_output)  # Pass both the text and its ground-truth label
        adversarial_texts.append(attack_result.perturbed_text())  # Falls back to the original text if the attack fails

    # Convert adversarial text back to numerical feature vectors
    X_test_adv = vectorizer.transform(adversarial_texts)

    # Predict using adversarial examples
    y_pred_adv = model.predict(X_test_adv)

    # Evaluate model performance after the attack
    accuracy_adv = accuracy_score(y_test, y_pred_adv)
    f1_adv = f1_score(y_test, y_pred_adv, average='weighted')
    precision_adv = precision_score(y_test, y_pred_adv, average='weighted')

    attack_results = {
        "Adversarial Accuracy": accuracy_adv,
        "Adversarial F1 Score": f1_adv,
        "Adversarial Precision": precision_adv
    }
    return attack_results
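
# Hedged sketch of a before/after comparison across several trained models,
# mirroring the commented-out evaluate_adversarial_attacks draft near the top of
# this file. The compare_models_under_attack name and the dict-of-models calling
# convention are assumptions; adapt them to however the trained models are stored.
def compare_models_under_attack(trained_models, X_test, y_test, vectorizer):
    """
    Evaluate each trained model before and after the TextFooler attack.

    Parameters:
    - trained_models (dict): Mapping of model name -> fitted scikit-learn estimator
    - X_test: Vectorized test set (feature matrix produced by `vectorizer`)
    - y_test: True labels
    - vectorizer: The CountVectorizer/TfidfVectorizer used to build X_test

    Returns:
    - results (dict): Accuracy before/after the attack and the drop in percent
    """
    results = {}
    print("{:<30} {:<12} {:<12} {:<10}".format("Model", "Acc Before", "Acc After", "Drop (%)"))
    print("-" * 70)
    for model_name, model in trained_models.items():
        accuracy_before = accuracy_score(y_test, model.predict(X_test))
        adv_metrics = apply_adversarial_attackUp(model, X_test, y_test, vectorizer)
        accuracy_after = adv_metrics["Adversarial Accuracy"]
        results[model_name] = {
            "Accuracy Before": accuracy_before,
            "Accuracy After": accuracy_after,
            "Accuracy Drop (%)": round((accuracy_before - accuracy_after) * 100, 2),
        }
        print("{:<30} {:<12.4f} {:<12.4f} {:<10.2f}".format(
            model_name, accuracy_before, accuracy_after,
            results[model_name]["Accuracy Drop (%)"]))
    return results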