# TFIDF vector method (Contributor: Pavan)
import re

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report


def tokenizer(url):
    """Tokenize a URL into slash/dash segments plus their dot-split pieces.

    The URL is first split on '/' and '-'. Any resulting token that
    contains a '.' is additionally split on '.', and the pieces are
    appended after the original tokens — the dotted token itself is kept,
    preserving the original training-time behavior. The first occurrence
    of 'com' and of 'www' in each dot-split is dropped, since those
    pieces are too common to carry signal.

    Parameters
    ----------
    url : str
        The URL string to tokenize.

    Returns
    -------
    list of str
        Slash/dash segments followed by the filtered dot-split pieces.
    """
    tokens = re.split(r"[/-]", url)
    extras = []
    # Collect the dot-split pieces in a separate list instead of the
    # original `tokens += dot_split` inside the loop: appending to a list
    # while iterating it is fragile. The result is identical, because
    # dot-split pieces contain no '.' and so re-scanning them (as the
    # original loop did) was always a no-op.
    for token in tokens:
        if "." in token:
            dot_split = token.split(".")
            # list.remove drops only the FIRST occurrence — intentional,
            # matching the original behavior.
            if "com" in dot_split:
                dot_split.remove("com")
            if "www" in dot_split:
                dot_split.remove("www")
            extras.extend(dot_split)
    return tokens + extras


def tfidf_vectorizer(train_df, test_df):
    """Fit a TF-IDF vectorizer on the training URLs and transform both splits.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data; must contain a 'URLs' column of strings.
    test_df : pandas.DataFrame
        Test data; must contain a 'URLs' column of strings.

    Returns
    -------
    tuple of scipy.sparse matrices
        (train_vector, test_vector) — the TF-IDF matrices. The test split
        is transformed with the vocabulary learned from the training
        split (transform, NOT fit_transform), so unseen test tokens are
        ignored rather than leaking into the vocabulary.
    """
    # token_pattern=None silences the "token_pattern will not be used"
    # UserWarning that sklearn emits when a custom tokenizer is supplied;
    # the vectorizer's behavior is unchanged.
    tfidf_vec = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None)

    # Learn the vocabulary/IDF from the training inputs only.
    print("Training TFIDF Vectorizer")
    train_vector = tfidf_vec.fit_transform(train_df['URLs'])

    # Re-use the fitted vocabulary on the test inputs.
    print("Test TFIDF Vectorizer")
    test_vector = tfidf_vec.transform(test_df['URLs'])

    return train_vector, test_vector