import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import train_test_split
import SeperateData
import Vectorizer
import TFIDFVectorizer
import PreprocessData
import MLAlgorithms
import warnings
import AdversarialAttack
#import AdversarialMLAttack
# Ignore all warnings
warnings.filterwarnings("ignore")
# Specify the folder containing the CSV files (modify this path accordingly)
folder_path = "URL Datasets/MLClassification/*.csv" # Example: "data/*.csv"
# MaliciousURLS classification pipeline (the code below runs at module level)
# Get a list of all CSV files in the folder
csv_files = glob.glob(folder_path)
# Read and combine all CSV files into a single DataFrame
url_df = pd.concat((pd.read_csv(file, index_col=0) for file in csv_files), ignore_index=True)
# Split the data into training and test sets
test_percentage = 0.2
train_df, test_df = train_test_split(url_df, test_size=test_percentage, random_state=42)
seperated_Traindf, seperated_Testdf = SeperateData.dataSeperation(train_df, test_df)
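# NOTE (assumption): based on how the separated frames are used below, index 0 is
# taken to hold the domain/URL text, index 3 the raw URL string, and index 6 the
# class label. The actual layout is defined in SeperateData.dataSeperation.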
# print(seperated_Traindf[0].head())
# Convert y_train and y_test to 1D array
y_train = seperated_Traindf[6].values.ravel() if isinstance(seperated_Traindf[6], pd.DataFrame) else seperated_Traindf[6]
y_test = seperated_Testdf[6].values.ravel() if isinstance(seperated_Testdf[6], pd.DataFrame) else seperated_Testdf[6]
#Block1: NLP Based Approach
# Domain Vectorization
traincountvector_X, testcountvector_X = Vectorizer.countVectorizer(seperated_Traindf[0], seperated_Testdf[0])
print("countVectorizing Completed")
# traintfidfvector_X, testtfidfvector_X = Vectorizer.TfidVectorizer(seperated_Traindf[0], seperated_Testdf[0])
# print("TFIDFVectorizing Completed")
# trainword2vector_X, testword2vector_X = Vectorizer.TfidVectorizer(seperated_Traindf[0], seperated_Testdf[0])  # NOTE: reuses TfidVectorizer; a dedicated Word2Vec helper would be needed for true Word2Vec features
# print("Word2VecVectorizing Completed")
# Train the NLP models, then evaluate adversarial attacks on the trained models
#method_name = {0: "Bag of Words", 1: "TF-IDF", 2: "Word2Vec"}
#model_accuracies, trained_models = MLAlgorithms.train_NLP(traincountvector_X, testcountvector_X, y_train, y_test, method_name[0])
#model_accuracies = MLAlgorithms.train_NLP(traintfidfvector_X, testtfidfvector_X, y_train, y_test, method_name[1])
#model_accuracies = MLAlgorithms.train_NLP(trainword2vector_X, testword2vector_X, y_train, y_test, method_name[2])
#print(trained_models)
#attack_results = AdversarialAttack.apply_adversarial_attackUp(trained_models['Naive Bayes (Multinomial)'], testcountvector_X, y_test, method_name[0])
#Block2: Convert URLs into different feature representations
method_name = {1: 'LexicalFeature', 2: 'LinguisticFeature', 3: 'Feature', 4: 'HostBasedFeature', 5: 'ContentBasedFeature'}
trainlexicialvector_X = PreprocessData.DataPreparationUp(seperated_Traindf[3])
testlexicialvector_X = PreprocessData.DataPreparationUp(seperated_Testdf[3])
# print(seperated_Traindf[3])
# print(trainlexicialvector_X)
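# Hypothetical sketch of the kind of lexical features PreprocessData.DataPreparationUp
# is assumed to derive from each URL (length, digit count, dots, hyphens, scheme).
# The real module may compute different features; this function is illustrative only
# and is never called.
def _lexical_features_sketch(urls):
    """Return a simple lexical feature frame for an iterable of URL strings."""
    feats = pd.DataFrame({"url": pd.Series(urls).astype(str)})
    feats["length"] = feats["url"].str.len()
    feats["num_digits"] = feats["url"].str.count(r"\d")
    feats["num_dots"] = feats["url"].str.count(r"\.")
    feats["num_hyphens"] = feats["url"].str.count("-")
    feats["has_https"] = feats["url"].str.startswith("https").astype(int)
    return feats.drop(columns=["url"])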
#HostBasedFeature.process_domains(input_filename, output_filename)
#print("HostBasedFeature Completed")
# traindictonaryvector_X, testdictonaryvector_X = DictonaryWord.dictionary_word(train_df, test_df)
# print("Linguistic (DictonaryWord) Completed")
# traincontentvector_X, testcontentvector_X = ContentBasedFeature.ContentBasedFeature(train_df, test_df)
# print("Content-based Feature Vectorizing Completed")
# Train the classical ML models on the lexical feature vectors
#MLAlgorithms.LogisticReg(trainlexicialvector_X, y_train, testlexicialvector_X, y_test)
model_accuracies, trained_models = MLAlgorithms.train_classifiers(trainlexicialvector_X, testlexicialvector_X, y_train, y_test, method_name[3])
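# Assumed shape of MLAlgorithms.train_classifiers: fit a few standard scikit-learn
# models, score them on the test split, and return (accuracy dict, fitted-model dict).
# This sketch is illustrative only and is not called by the script.
def _train_classifiers_sketch(X_train, X_test, y_train, y_test):
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    }
    accuracies, fitted = {}, {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        accuracies[name] = accuracy_score(y_test, model.predict(X_test))
        fitted[name] = model
    return accuracies, fitted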
# Apply adversarial attacks and compare model robustness
#attack_results = AdversarialMLAttack.evaluate_adversarial_attacks(trained_models, testlexicialvector_X, y_test)
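# The adversarial evaluation above stays commented out because the AdversarialMLAttack
# import is disabled. A minimal robustness check, assuming numeric feature matrices,
# could perturb the test features with small Gaussian noise and compare clean vs.
# perturbed accuracy; the real module likely does more. Illustrative only, never called.
def _noise_robustness_sketch(models, X_test, y_test, scale=0.1):
    from sklearn.metrics import accuracy_score
    X_noisy = np.asarray(X_test, dtype=float) + np.random.normal(0.0, scale, size=np.shape(X_test))
    return {
        name: {
            "clean": accuracy_score(y_test, model.predict(X_test)),
            "noisy": accuracy_score(y_test, model.predict(X_noisy)),
        }
        for name, model in models.items()
    }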