# Malicious-URL-Detection-ML / URL Datasets / AdversarialAttack.py
import numpy as np
from textattack.attack_recipes import TextFoolerJin2019  # Adversarial attack recipe
from textattack.models.wrappers import ModelWrapper, SklearnModelWrapper
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score

# def simple_tokenizer(text):
#     return text.split()  # Tokenize by splitting text into words

# def apply_adversarial_attack(model, X_test, y_test):
#     """
#     Apply adversarial attack on a trained NLP model and evaluate its performance.
    
#     Parameters:
#     - model: Trained machine learning model
#     - X_test: Original test dataset
#     - y_test: True labels
    
#     Returns:
#     - attack_results: Dictionary containing accuracy, F1-score, and precision after attack
#     """
#     # ✅ Wrap the model for TextAttack compatibility
#     #wrapped_model = SklearnModelWrapper(model, simple_tokenizer)
    
#     attack = TextFoolerJin2019.build(model)  # Load a predefined adversarial attack
#     adversarial_texts = []
    
#     for text in X_test:
#         attack_result = attack.attack(text)  # Generate adversarial text
#         adversarial_texts.append(attack_result.perturbed_text if attack_result.perturbed_text else text)
    
#     # Predict using adversarial examples
#     X_test_adv = np.array(adversarial_texts)
#     y_pred_adv = model.predict(X_test_adv)

#     # Evaluate model performance after attack
#     accuracy_adv = accuracy_score(y_test, y_pred_adv)
#     f1_adv = f1_score(y_test, y_pred_adv, average='weighted')
#     precision_adv = precision_score(y_test, y_pred_adv, average='weighted')

#     attack_results = {
#         "Adversarial Accuracy": accuracy_adv,
#         "Adversarial F1 Score": f1_adv,
#         "Adversarial Precision": precision_adv
#     }

#     return attack_results


# def evaluate_adversarial_attacks(trained_models, X_test, y_test):
#     """
#     Evaluate all trained NLP models against adversarial attacks and compare performance.
    
#     Parameters:
#     - trained_models (dict): Dictionary of trained models
#     - X_test: Test dataset
#     - y_test: Test labels
    
#     Returns:
#     - attack_results (dict): Performance metrics of models before & after attack
#     """
#     attack_results = {}

#     print("\n🔥 Evaluating Adversarial Attacks on Trained Models 🔥")
#     print("{:<30} {:<10} {:<10} {:<10}".format("Model", "Acc Before", "Acc After", "Change (%)"))
#     print("-" * 70)

#     for model_name, model in trained_models.items():
#         # Evaluate original accuracy
#         y_pred = model.predict(X_test)
#         accuracy_before = accuracy_score(y_test, y_pred)

#         # Apply adversarial attack
#         adv_metrics = apply_adversarial_attack(model, X_test, y_test)
#         accuracy_after = adv_metrics["Adversarial Accuracy"]

#         # Store results
#         attack_results[model_name] = {
#             "Accuracy Before": accuracy_before,
#             "Accuracy After": accuracy_after,
#             "Accuracy Drop (%)": round((accuracy_before - accuracy_after) * 100, 2)
#         }

#         print("{:<30} {:.4f}    {:.4f}    {:.2f}%".format(
#             model_name, accuracy_before, accuracy_after, attack_results[model_name]["Accuracy Drop (%)"]
#         ))

#     return attack_results


# ✅ Wrapper class to make a Scikit-learn model + vectorizer compatible with TextAttack
class CustomTextModelWrapper(SklearnModelWrapper):
    def __init__(self, model, vectorizer):
        """
        Custom wrapper for Sklearn models to process raw text input before prediction.

        Parameters:
        - model: Trained Scikit-learn model (e.g., ComplementNB, LogisticRegression)
        - vectorizer: The text vectorizer (e.g., CountVectorizer, TfidfVectorizer)
        """
        self.model = model
        self.vectorizer = vectorizer

    def __call__(self, text_list):
        """Process raw text input -> convert to numerical features -> return class probabilities"""
        X_transformed = self.vectorizer.transform(text_list)
        # TextAttack's classification goal functions expect per-class scores
        # (probabilities), not hard labels, so use predict_proba() here.
        return self.model.predict_proba(X_transformed)
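
# Usage sketch (hedged): assuming `nb_model` is a trained Scikit-learn classifier
# and `tfidf` a fitted TfidfVectorizer from elsewhere in the project (neither name
# is defined in this file), the wrapper can be called directly on raw URL strings
# and returns one row of class probabilities per input:
#
#     wrapped = CustomTextModelWrapper(nb_model, tfidf)
#     scores = wrapped(["http://example.com/login", "http://paypa1-verify.xyz/account"])
#     # scores has shape (2, n_classes)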

def apply_adversarial_attack(model, X_test, y_test, vectorizer):
    """
    Apply adversarial attack on a trained NLP model and evaluate its performance.

    Parameters:
    - model: Trained Scikit-learn model
    - X_test: Original test dataset (text format)
    - y_test: True labels
    - vectorizer: CountVectorizer or TF-IDF Vectorizer used for text transformation

    Returns:
    - attack_results: Dictionary containing accuracy, F1-score, and precision after attack
    """
    # ✅ Convert feature vectors back to text for adversarial attack
    if isinstance(vectorizer, (CountVectorizer, TfidfVectorizer)):
        X_test_text = vectorizer.inverse_transform(X_test)  # Convert numerical vectors back to words
        X_test_text = [" ".join(words) for words in X_test_text]  # Join tokens into text
    else:
        X_test_text = X_test  # If already text, keep as is

    # ✅ Wrap the model with vectorizer for TextAttack compatibility
    wrapped_model = CustomTextModelWrapper(model, vectorizer)

    # ✅ Load adversarial attack method
    attack = TextFoolerJin2019.build(wrapped_model)

    adversarial_texts = []

    for text, label in zip(X_test_text, y_test):
        # Attack.attack() expects both the input text and its ground-truth label
        attack_result = attack.attack(str(text), label)
        # perturbed_text() is a method; keep the original text if no perturbation was produced
        perturbed = attack_result.perturbed_text()
        adversarial_texts.append(perturbed if perturbed else text)
    
    # ✅ Convert adversarial text back to numerical feature vectors
    X_test_adv = vectorizer.transform(adversarial_texts)

    # ✅ Predict using adversarial examples
    y_pred_adv = model.predict(X_test_adv)

    # ✅ Evaluate model performance after attack
    accuracy_adv = accuracy_score(y_test, y_pred_adv)
    f1_adv = f1_score(y_test, y_pred_adv, average='weighted')
    precision_adv = precision_score(y_test, y_pred_adv, average='weighted')

    attack_results = {
        "Adversarial Accuracy": accuracy_adv,
        "Adversarial F1 Score": f1_adv,
        "Adversarial Precision": precision_adv
    }

    return attack_results
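
# Example call (sketch): `model`, `X_test`, `y_test`, and `vectorizer` below are
# placeholders for a classifier trained on vectors produced by a fitted
# CountVectorizer/TfidfVectorizer and the matching test split:
#
#     results = apply_adversarial_attack(model, X_test, y_test, vectorizer)
#     print(results["Adversarial Accuracy"])
#
# Note: inverse_transform() only recovers the bag-of-words tokens, so token order
# and out-of-vocabulary words are lost before the attack runs.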


class SklearnTextModelWrapper(ModelWrapper):
    """ Wrapper to make Scikit-learn text classifiers compatible with TextAttack. """

    def __init__(self, model, vectorizer):
        self.model = model
        self.vectorizer = vectorizer

    def __call__(self, text_list):
        """ Convert text to vectors and get model probability predictions. """
        if isinstance(text_list, str):  # If input is a string, convert it to a list
            text_list = [text_list]

        vectorized_text = self.vectorizer.transform(text_list)  # Convert text to feature vectors
        probs = self.model.predict_proba(vectorized_text)  # Get probability scores
        return probs

    
def apply_adversarial_attackUp(model, X_test, y_test, vectorizer):
    """
    Apply adversarial attack on a trained NLP model and evaluate its performance.

    Parameters:
    - model: Trained Scikit-learn model (e.g., MultinomialNB)
    - X_test: Original test dataset (text format or transformed vectors)
    - y_test: True labels
    - vectorizer: CountVectorizer or TF-IDF Vectorizer used for text transformation

    Returns:
    - attack_results: Dictionary containing accuracy, F1-score, and precision after attack
    """

    # ✅ Ensure model is wrapped correctly for TextAttack
    wrapped_model = SklearnTextModelWrapper(model, vectorizer)

    # ✅ Convert feature vectors back to text
    if isinstance(vectorizer, (CountVectorizer, TfidfVectorizer)):
        X_test_text = vectorizer.inverse_transform(X_test)
        X_test_text = [" ".join(words) for words in X_test_text]  # Convert tokens into text
    else:
        X_test_text = X_test  # If already text, keep as is

    # ✅ Load adversarial attack method
    attack = TextFoolerJin2019.build(wrapped_model)

    adversarial_texts = []
    
    for text, label in zip(X_test_text, y_test):
        text = str(text)  # ✅ Ensure the input is a plain string
        ground_truth_output = np.argmax(label) if isinstance(label, np.ndarray) else label  # Handle one-hot labels

        attack_result = attack.attack(text, ground_truth_output)  # ✅ Pass both the text and its ground-truth label
        # perturbed_text() is a method; keep the original text if no perturbation was produced
        perturbed = attack_result.perturbed_text()
        adversarial_texts.append(perturbed if perturbed else text)

    # ✅ Convert adversarial text back to numerical feature vectors
    X_test_adv = vectorizer.transform(adversarial_texts)

    # ✅ Predict using adversarial examples
    y_pred_adv = model.predict(X_test_adv)

    # ✅ Evaluate model performance after attack
    accuracy_adv = accuracy_score(y_test, y_pred_adv)
    f1_adv = f1_score(y_test, y_pred_adv, average='weighted')
    precision_adv = precision_score(y_test, y_pred_adv, average='weighted')

    attack_results = {
        "Adversarial Accuracy": accuracy_adv,
        "Adversarial F1 Score": f1_adv,
        "Adversarial Precision": precision_adv
    }

    return attack_results
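

# Minimal end-to-end sketch (not part of the original pipeline): the URLs, labels,
# and model below are toy values used only to exercise apply_adversarial_attackUp().
# Running it also requires the assets TextFoolerJin2019 downloads on first use
# (counter-fitted word embeddings and the Universal Sentence Encoder).
if __name__ == "__main__":
    from sklearn.naive_bayes import ComplementNB

    # Toy URL-like documents and labels (0 = benign, 1 = malicious) -- made up.
    urls = [
        "http google com search q weather",
        "http paypal com login secure",
        "http paypa1 verify account xyz login",
        "http free prize winner claim now",
        "http github com openai repos",
        "http bank0famerica com verify password",
    ]
    labels = [0, 0, 1, 1, 0, 1]

    # Fit a TF-IDF vectorizer and a small Naive Bayes model on the toy data.
    vectorizer = TfidfVectorizer()
    X_vec = vectorizer.fit_transform(urls)
    model = ComplementNB()
    model.fit(X_vec, labels)

    # Attack the toy model and print the post-attack metrics.
    print(apply_adversarial_attackUp(model, X_vec, labels, vectorizer))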