Malicious-URL-Detection-ML / NewCode / TFIDFVectorizer.py
TFIDFVectorizer.py
Raw
# TF-IDF vectorization of URL strings using a custom tokenizer (Contributor: Pavan)
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report

# Function to tokenize the URL
def tokenizer(url):
    """Split a URL into tokens for TF-IDF vectorization.

    The URL is first split on ``/`` and ``-``. Every resulting piece that
    contains a dot is kept as-is and is additionally split on ``.``; those
    dot-separated parts are appended after the primary pieces, with every
    ``"com"`` and ``"www"`` part dropped because they are too common to be
    informative.

    Args:
        url: The URL string to tokenize.

    Returns:
        A list of string tokens: the slash/dash pieces in their original
        order (empty strings produced by adjacent separators are kept),
        followed by the dot-separated parts of each dotted piece.
    """
    pieces = re.split(r'[/-]', url)

    # Collect the dot-separated parts in their own list so the list being
    # iterated is never mutated during the loop.
    dot_parts = []
    for piece in pieces:
        if "." not in piece:
            continue
        # Drop EVERY occurrence of the overly common parts; list.remove()
        # would only drop the first one (e.g. "www.a.com.b.com" would still
        # leak a "com" token).
        dot_parts.extend(
            part for part in piece.split('.') if part not in ("com", "www")
        )

    return pieces + dot_parts

def tfidf_vectorizer(train_df, test_df, *, url_column='URLs'):
    """Fit a TF-IDF vectorizer on the training URLs and vectorize both sets.

    The vectorizer is fitted on the training data only and then reused to
    transform the test data, so both outputs share the same vocabulary and
    the test set does not influence the learned IDF weights.

    Args:
        train_df: Training data, indexable by ``url_column`` to obtain the
            URL strings (presumably a pandas DataFrame; confirm with caller).
        test_df: Test data, indexable the same way as ``train_df``.
        url_column: Name of the column holding the URL strings. Defaults to
            ``'URLs'``.

    Returns:
        A ``(train_vector, test_vector)`` tuple holding the results of
        ``fit_transform`` on the training URLs and ``transform`` on the test
        URLs, respectively.
    """
    # token_pattern=None: the custom tokenizer replaces the default regex
    # tokenization, and leaving the default pattern in place makes
    # scikit-learn warn that it will not be used.
    tfidfVec = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None)

    # Learn vocabulary and IDF weights from the training inputs.
    print("Training TFIDF Vectorizer")
    train_vector = tfidfVec.fit_transform(train_df[url_column])

    # Only transform the test inputs; re-fitting here would produce a
    # different vocabulary than the one the training matrix was built with.
    print("Test TFIDF Vectorizer")
    test_vector = tfidfVec.transform(test_df[url_column])

    return train_vector, test_vector