# LexicalAnalyzer methods will come here (Contributor: Pavan)
# Read data from a CSV file and build lexical features from URLs
import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Scikit-learn helper functions
# (scikit-learn is a free machine-learning library for Python)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Scikit-learn models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
# Scikit-learn metric functions
from sklearn.metrics import confusion_matrix, classification_report
# Pre-trained word embeddings (gensim downloader) and progress bars
import gensim.downloader as api
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")  # Suppress warnings
# Tokenize the URL.
# The purpose of the tokenizer is to separate feature words from the raw data.
def tokenizer(url):
    """Separate feature words from a raw URL.

    Keyword arguments:
    url -- the full URL

    Returns: the tokenized words as a list
    """
    # Split by slash (/) and dash (-)
    tokens = re.split('[/-]', url)
    extra_tokens = []
    for i in tokens:
        # Also split out extensions and subdomains on the dot (.)
        if i.find(".") >= 0:
            dot_split = i.split('.')
            # Remove "com" and "www" since they are too common to be informative
            if "com" in dot_split:
                dot_split.remove("com")
            if "www" in dot_split:
                dot_split.remove("www")
            extra_tokens += dot_split
    # Extend after the loop so the list is never mutated while being iterated
    tokens += extra_tokens
    return tokens
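# A quick sanity check of the tokenizer; the sample URL below is made up purely
# for illustration and is not part of the project's data:
#   tokenizer("www.example.com/login-page")
#   -> ['www.example.com', 'login', 'page', 'example']
# "com" and "www" are dropped from the dot-split, while the original dotted token is kept.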
def countVectorizer(train_df, test_df):
    """Fit a CountVectorizer on the training domains and transform both splits."""
    cVec = CountVectorizer(tokenizer=tokenizer)
    count_X = cVec.fit_transform(train_df['Domain Name'])
    test_count_X = cVec.transform(test_df['Domain Name'])
    return count_X, test_count_X
def TfidVectorizer(train_df, test_df):
    """Fit a TfidfVectorizer on the training domains and transform both splits."""
    tfidfVec = TfidfVectorizer(tokenizer=tokenizer)
    tfidf_X = tfidfVec.fit_transform(train_df['Domain Name'])
    test_tfidf_X = tfidfVec.transform(test_df['Domain Name'])
    return tfidf_X, test_tfidf_X
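# A minimal usage sketch for the two vectorizer helpers, assuming the input CSV
# has a 'Domain Name' column and a 'Label' column (the file name and the label
# column name are assumptions for illustration, not the project's actual data):
#
#   df = pd.read_csv("domains.csv")                     # hypothetical input file
#   train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
#   count_X, test_count_X = countVectorizer(train_df, test_df)
#   tfidf_X, test_tfidf_X = TfidVectorizer(train_df, test_df)
#   model = MultinomialNB().fit(count_X, train_df['Label'])
#   print(classification_report(test_df['Label'], model.predict(test_count_X)))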
def Word2Vec(train_df, test_df, vector_size=100):
    """Embed the first column of each DataFrame as averaged GloVe word vectors."""
    # Load pre-trained word vectors (GloVe, 100-dimensional); another gensim model can be substituted
    word_vectors = api.load("glove-wiki-gigaword-100")

    # Convert a piece of text into the average of its word embeddings
    def text_to_vector(text, model, vector_size=100):
        words = str(text).split()  # Ensure the text is a string
        word_vecs = [model[word] for word in words if word in model]
        if len(word_vecs) == 0:
            return np.zeros(vector_size)
        return np.mean(word_vecs, axis=0)

    # Apply the conversion to every row of the first column in both splits
    train_vectors = np.array([text_to_vector(text, word_vectors, vector_size)
                              for text in tqdm(train_df.iloc[:, 0], desc="Processing Train Data")])
    test_vectors = np.array([text_to_vector(text, word_vectors, vector_size)
                             for text in tqdm(test_df.iloc[:, 0], desc="Processing Test Data")])

    # Wrap the vectors in DataFrames with columns w2v_0, w2v_1, ..., w2v_{vector_size-1}
    column_names = [f"w2v_{i}" for i in range(vector_size)]
    train_vectors_df = pd.DataFrame(train_vectors, columns=column_names)
    test_vectors_df = pd.DataFrame(test_vectors, columns=column_names)
    return train_vectors_df, test_vectors_df
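# A minimal, hedged sketch of how the Word2Vec helper could feed a dense-feature
# model; the CSV path, the 'Domain Name' column, and the 'Label' column are
# assumptions for illustration only:
#
#   df = pd.read_csv("domains.csv")                               # hypothetical input file
#   train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
#   train_vecs, test_vecs = Word2Vec(train_df[['Domain Name']], test_df[['Domain Name']])
#   clf = LogisticRegression(max_iter=1000).fit(train_vecs, train_df['Label'])
#   preds = clf.predict(test_vecs)
#   print(confusion_matrix(test_df['Label'], preds))
#   print(classification_report(test_df['Label'], preds))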