# Main class for the malicious-URL experiment (Contributor: Hamza).
import pandas as pd
from sklearn.model_selection import train_test_split

import CountVectorizer
import TFIDFVectorizer
import LexicialAnalyzer
import DictonaryWord
import HostBasedFeature
import ContentBasedFeature

# Dataset locations, relative to the working directory the script is run from.
input_filename = 'NewCode/Dataset/MaliciousURLs.csv'
# Only referenced by the (currently disabled) host-based feature step below.
output_filename = 'NewCode/Dataset/HostBasedFeatures.csv'


class MaliciousURLS:
    """Load the URL dataset, split it, and build feature vectors.

    NOTE: every statement below lives in the class body, so it runs once,
    when this module is imported/executed, and each result is stored as a
    class attribute (e.g. ``MaliciousURLS.train_df``). No instance is needed.
    """

    # Full dataset as read from ``input_filename``.
    url_df = pd.read_csv(input_filename)
    # url_df.tail(50)

    # Separate the training and test data. The fixed ``random_state`` keeps
    # the split reproducible between runs.
    test_percentage = 0.2
    train_df, test_df = train_test_split(
        url_df,
        test_size=test_percentage,
        random_state=42,
    )

    # The 'Class' column is used as the label for each split.
    labels = train_df['Class']
    test_labels = test_df['Class']
    # print("Seperated training and test data")

    # ------------------------------------------------------------------
    # Feature engineering
    #
    # Alternative feature extractors, currently disabled. Each one follows
    # the same call shape as the active step: it takes (train_df, test_df)
    # and its result is unpacked into a (train, test) pair.
    # ------------------------------------------------------------------
    # traincountvector_X, testcountvector_X = CountVectorizer.countVectorizer(train_df, test_df)
    # print("countVectorizing Completed")

    # traintfidfvector_X, testtfidfvector_X = TFIDFVectorizer.tfidf_vectorizer(train_df, test_df)
    # print("TFIDFVectorizing Completed")

    # trainlexicialvector_X, testlexicialvector_X = LexicialAnalyzer.lexicialAnalyzer(train_df, test_df)
    # print("LexicalVectorizing Completed")

    # HostBasedFeature.process_domains(input_filename, output_filename)
    # print("HostBasedFeature Completed")

    # traindictonaryvector_X, testdictonaryvector_X = DictonaryWord.dictionary_word(train_df, test_df)
    # print("Linguistic(DictonaryWord) Completed")

    # Active step: content-based features.
    # NOTE(review): the attribute names still say "dictonary" although they
    # now hold the ContentBasedFeature output; the names are kept so any
    # code reading these class attributes keeps working -- confirm before
    # renaming.
    traindictonaryvector_X, testdictonaryvector_X = (
        ContentBasedFeature.ContentBasedFeature(train_df, test_df)
    )
    print("Content-based Feature Vectorizing Completed")

    # Train the model (not wired up yet).
    # mnb_tfidf = MultinomialNB()
    # mnb_tfidf.fit(tfidf_X, labels)