#TFIDFvector method will come here (Contributor: Pavan)
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report
# Function to tokenize the URL
def tokenizer(url):
    """Tokenize a URL into path/dash segments plus their domain parts.

    Splits the URL on '/' and '-', then further splits any dot-containing
    segment on '.', dropping the overly common "com" and "www" parts.

    Args:
        url: URL string to tokenize.

    Returns:
        List of tokens: the slash/dash splits (dotted segments kept as-is)
        followed by the extra dot-split parts.
    """
    tokens = re.split(r'[/-]', url)
    # Iterate over a snapshot: the original code appended to `tokens` while
    # iterating it, which is fragile. Appended dot-split parts contain no
    # '.', so they were never re-split — a snapshot preserves behavior.
    for token in list(tokens):
        # Include the split extensions and subdomains
        if token.find(".") >= 0:
            dot_split = token.split('.')
            # Remove .com and www. since they're too common to be informative
            if "com" in dot_split:
                dot_split.remove("com")
            if "www" in dot_split:
                dot_split.remove("www")
            tokens += dot_split
    return tokens
def tfidf_vectorizer(train_df, test_df):
    """Fit a TF-IDF vectorizer on the training URLs and apply it to both sets.

    Args:
        train_df: DataFrame with a 'URLs' column; used to fit the vocabulary
            and IDF weights.
        test_df: DataFrame with a 'URLs' column; transformed using the
            already-fitted vocabulary (no refitting, avoiding leakage).

    Returns:
        Tuple of (train_vector, test_vector) sparse TF-IDF matrices.
    """
    # Custom tokenizer handles URL-specific splitting on / - and .
    vectorizer = TfidfVectorizer(tokenizer=tokenizer)
    print("Training TFIDF Vectorizer")
    # Learn vocabulary/IDF from the training data and vectorize it.
    train_vector = vectorizer.fit_transform(train_df['URLs'])
    print("Test TFIDF Vectorizer")
    # transform (not fit_transform) so the test set reuses the train vocabulary.
    test_vector = vectorizer.transform(test_df['URLs'])
    return train_vector, test_vector