#main class will come here (Contributor: Hamza)
import CountVectorizer
import TFIDFVectorizer
import LexicialAnalyzer
import DictonaryWord
import HostBasedFeature
import ContentBasedFeature
import pandas as pd
from sklearn.model_selection import train_test_split
# CSV that is loaded into the URL DataFrame by the pipeline below.
input_filename = 'NewCode/Dataset/MaliciousURLs.csv'
# Destination CSV for the host-based feature step (only referenced by the
# currently disabled HostBasedFeature.process_domains call).
output_filename = 'NewCode/Dataset/HostBasedFeatures.csv'
class MaliciousURLS:
    """Load the URL dataset, split it, and run content-based feature extraction.

    Every statement in this body executes once, when the class statement
    itself is evaluated; the intermediate results are kept as class
    attributes (``url_df``, ``train_df``, ``test_df``, ``labels``, ...).
    """

    # Read the dataset from the CSV path held in the module-level
    # ``input_filename``.
    url_df = pd.read_csv(input_filename)
    # url_df.tail(50)

    # Hold out 20% of the rows as the test set; the fixed random_state
    # makes the split repeatable from run to run.
    test_percentage = 0.2
    train_df, test_df = train_test_split(
        url_df,
        random_state=42,
        test_size=test_percentage,
    )

    # The 'Class' column supplies the target labels for each partition.
    labels = train_df["Class"]
    test_labels = test_df["Class"]
    # print("Seperated training and test data")

    # ------------------------------------------------------------------
    # Feature engineering.
    # The stages below are currently disabled; they are kept for reference.
    # ------------------------------------------------------------------
    # traincountvector_X, testcountvector_X = CountVectorizer.countVectorizer(train_df, test_df)
    # print("countVectorizing Completed")
    #
    # traintfidfvector_X, testtfidfvector_X = TFIDFVectorizer.tfidf_vectorizer(train_df, test_df)
    # print("TFIDFVectorizing Completed")
    #
    # trainlexicialvector_X, testlexicialvector_X = LexicialAnalyzer.lexicialAnalyzer(train_df, test_df)
    # print("LexicalVectorizing Completed")
    #
    # HostBasedFeature.process_domains(input_filename, output_filename)
    # print("HostBasedFeature Completed")
    #
    # traindictonaryvector_X, testdictonaryvector_X = DictonaryWord.dictionary_word(train_df, test_df)
    # print("Linguistic(DictonaryWord) Completed")

    # Active stage: content-based features. The call returns a pair that is
    # unpacked into the train/test names below (presumably the feature
    # matrices for each partition -- TODO confirm against ContentBasedFeature).
    traindictonaryvector_X, testdictonaryvector_X = (
        ContentBasedFeature.ContentBasedFeature(train_df, test_df)
    )
    print("Content-based Feature Vectorizing Completed")

    # Model training is not wired up yet; sketch kept for reference.
    # mnb_tfidf = MultinomialNB()
    # mnb_tfidf.fit(tfidf_X, labels)