# Malicious-URL-Detection-ML / NewCode / main.py
# The main class is defined below (Contributor: Hamza)
import CountVectorizer
import TFIDFVectorizer
import LexicialAnalyzer
import DictonaryWord
import HostBasedFeature
import ContentBasedFeature
import pandas as pd
from sklearn.model_selection import train_test_split

# Dataset locations. Both are relative paths, so they resolve against the
# current working directory at run time.
# CSV that MaliciousURLS loads with pd.read_csv.
input_filename = 'NewCode/Dataset/MaliciousURLs.csv'
# Destination path for host-based features; in this file it is only passed to
# the (currently commented-out) HostBasedFeature.process_domains call.
output_filename = 'NewCode/Dataset/HostBasedFeatures.csv'
class MaliciousURLS:
    """Load the URL dataset, split it, and build content-based features.

    Every statement in this body runs once, when the class body is
    evaluated (i.e. as soon as this module is run or imported). The
    results are stored as class attributes:

    * ``url_df`` -- the frame read from ``input_filename``.
    * ``test_percentage`` -- fraction of rows held out for testing.
    * ``train_df`` / ``test_df`` -- the two parts of the split.
    * ``labels`` / ``test_labels`` -- the ``'Class'`` column of each part.
    * ``traindictonaryvector_X`` / ``testdictonaryvector_X`` -- the pair
      returned by ``ContentBasedFeature.ContentBasedFeature``.
    """

    # --- Load -------------------------------------------------------------
    url_df = pd.read_csv(input_filename)
    # url_df.tail(50)

    # --- Separate the training and test data ------------------------------
    # 20% of the rows are held out; the fixed random_state keeps the split
    # reproducible between runs.
    test_percentage = .2
    train_df, test_df = train_test_split(
        url_df,
        test_size=test_percentage,
        random_state=42,
    )
    labels, test_labels = train_df['Class'], test_df['Class']
    # print("Seperated training and test data")

    # --- Feature engineering ----------------------------------------------
    # Alternative feature extractors, currently disabled:
    #
    # traincountvector_X, testcountvector_X = CountVectorizer.countVectorizer(train_df, test_df)
    # print("countVectorizing Completed")
    #
    # traintfidfvector_X, testtfidfvector_X = TFIDFVectorizer.tfidf_vectorizer(train_df, test_df)
    # print("TFIDFVectorizing Completed")
    #
    # trainlexicialvector_X, testlexicialvector_X = LexicialAnalyzer.lexicialAnalyzer(train_df, test_df)
    # print("LexicalVectorizing Completed")
    #
    # HostBasedFeature.process_domains(input_filename, output_filename)
    # print("HostBasedFeature Completed")
    #
    # traindictonaryvector_X, testdictonaryvector_X = DictonaryWord.dictionary_word(train_df, test_df)
    # print("Linguistic(DictonaryWord) Completed")

    # Active extractor. NOTE: the target names still say "dictonary" (they
    # match the disabled DictonaryWord call above), but the values assigned
    # here come from ContentBasedFeature.
    (
        traindictonaryvector_X,
        testdictonaryvector_X,
    ) = ContentBasedFeature.ContentBasedFeature(train_df, test_df)
    print("Content-based Feature Vectorizing Completed")

    # --- Train the model (not enabled yet) --------------------------------
    # mnb_tfidf = MultinomialNB()
    # mnb_tfidf.fit(tfidf_X, labels)