import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import train_test_split
import SeperateData
import Vectorizer
import TFIDFVectorizer
import PreprocessData
import MLAlgorithms
import warnings
import AdversarialAttack
import AdversarialMLAttack

# Ignore all warnings
warnings.filterwarnings("ignore")

# Specify the folder containing the CSV files (modify this path accordingly)
folder_path = "URL Datasets/MLClassification/*.csv"  # Example: "data/*.csv"


class MaliciousURLS:
    # Get a list of all CSV files in the folder
    csv_files = glob.glob(folder_path)

    # Read and combine all CSV files into a single DataFrame
    url_df = pd.concat((pd.read_csv(file, index_col=0) for file in csv_files), ignore_index=True)

    # Separate the data into training and test sets
    test_percentage = 0.2
    train_df, test_df = train_test_split(url_df, test_size=test_percentage, random_state=42)
    seperated_Traindf, seperated_Testdf = SeperateData.dataSeperation(train_df, test_df)
    # print(seperated_Traindf[0].head)

    # Convert y_train and y_test to 1D arrays
    y_train = seperated_Traindf[6].values.ravel() if isinstance(seperated_Traindf[6], pd.DataFrame) else seperated_Traindf[6]
    y_test = seperated_Testdf[6].values.ravel() if isinstance(seperated_Testdf[6], pd.DataFrame) else seperated_Testdf[6]

    # Block 1: NLP-based approach
    # Domain vectorization
    traincountvector_X, testcountvector_X = Vectorizer.countVectorizer(seperated_Traindf[0], seperated_Testdf[0])
    print("countVectorizing Completed")

    # traintfidfvector_X, testtfidfvector_X = Vectorizer.TfidVectorizer(seperated_Traindf[0], seperated_Testdf[0])
    # print("TFIDFVectorizing Completed")

    # trainword2vector_X, testword2vector_X = Vectorizer.TfidVectorizer(seperated_Traindf[0], seperated_Testdf[0])
    # print("Word2VecVectorizing Completed")

    # Train the NLP models
    # method_name = {0: "Bag of Words", 1: "TF-IDF", 2: "Word2Vec"}
    # model_accuracies, trained_models = MLAlgorithms.train_NLP(traincountvector_X, testcountvector_X, y_train, y_test, method_name[1])
    # model_accuracies = MLAlgorithms.train_NLP(traintfidfvector_X, testtfidfvector_X, y_train, y_test, method_name[1])
    # model_accuracies = MLAlgorithms.train_NLP(trainword2vector_X, testword2vector_X, y_train, y_test, method_name[2])
    # print(trained_models)

    # Evaluate adversarial attacks on the trained NLP models
    # attack_results = AdversarialAttack.apply_adversarial_attackUp(trained_models['Naive Bayes (Multinomial)'], testcountvector_X, y_test, method_name[1])

    # Block 2: Convert URLs into different feature representations
    method_name = {1: 'LexicalFeature', 2: 'LinguisticFeature', 3: 'Feature', 4: 'HostBasedFeature', 5: 'ContentBasedFeature'}

    trainlexicialvector_X = PreprocessData.DataPreparationUp(seperated_Traindf[3])
    testlexicialvector_X = PreprocessData.DataPreparationUp(seperated_Testdf[3])
    # print(seperated_Traindf[3])
    # print(trainlexicialvector_X)

    # HostBasedFeature.process_domains(input_filename, output_filename)
    # print("HostBasedFeature Completed")

    # traindictonaryvector_X, testdictonaryvector_X = DictonaryWord.dictionary_word(train_df, test_df)
    # print("Linguistic(DictonaryWord) Completed")

    # traindictonaryvector_X, testdictonaryvector_X = ContentBasedFeature.ContentBasedFeature(train_df, test_df)
    # print("Content-based Feature Vectorizing Completed")

    # Train classifiers on the lexical feature vectors
    # MLAlgorithms.LogisticReg(trainlexicialvector_X, y_train, testlexicialvector_X, y_test)
    model_accuracies, trained_models = MLAlgorithms.train_classifiers(trainlexicialvector_X, testlexicialvector_X, y_train, y_test, method_name[3])

    # Apply adversarial attacks and compare model robustness
    attack_results = AdversarialMLAttack.evaluate_adversarial_attacks(trained_models, testlexicialvector_X, y_test)
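
# ---------------------------------------------------------------------------
# Illustrative sketch only: AdversarialMLAttack is a project-local module whose
# source is not shown in this snippet, so the helper below is a hypothetical
# stand-in suggesting one plausible shape for evaluate_adversarial_attacks().
# It assumes the trained models are scikit-learn style estimators exposing
# .predict(), that the test features form a dense non-negative numeric matrix
# (as lexical feature vectors typically do), and it models the "attack" as a
# small random feature perturbation. It is not the project's actual attack
# implementation.
# ---------------------------------------------------------------------------
def _sketch_evaluate_adversarial_attacks(trained_models, X_test, y_test, epsilon=0.1):
    from sklearn.metrics import accuracy_score

    X_clean = np.asarray(X_test, dtype=float)
    # Perturb every feature by a small uniform amount, clipping at zero so that
    # count-like features remain valid for models such as multinomial Naive Bayes.
    X_adv = np.clip(X_clean + np.random.uniform(-epsilon, epsilon, size=X_clean.shape), 0, None)

    results = {}
    for name, model in trained_models.items():
        clean_acc = accuracy_score(y_test, model.predict(X_clean))
        adv_acc = accuracy_score(y_test, model.predict(X_adv))
        results[name] = {
            "clean_accuracy": clean_acc,
            "adversarial_accuracy": adv_acc,
            "accuracy_drop": clean_acc - adv_acc,
        }
    return results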