# LexicalAnalyzer method will come here (Contributor: Pavan)
# Read data from a CSV file
import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Import Scikit-learn helper functions
# scikit-learn is a free software machine learning library for the Python programming language
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Import Scikit-learn models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# Import Scikit-learn metric functions
from sklearn.metrics import confusion_matrix, classification_report

# print("Imported all necessary packages")


# Tokenize the URL
# The purpose of a tokenizer is to separate the features from the raw data
def tokenizer(url):
    """Separate feature words from a raw URL.

    Keyword arguments:
    url -- the full URL

    Returns:
    The tokenized words as a list.
    """
    # Split by slash (/) and dash (-)
    tokens = re.split('[/-]', url)

    # Iterate over a copy so that appending to `tokens` below
    # does not affect the loop itself
    for token in tokens[:]:
        # Also split extensions and subdomains on dots
        if '.' in token:
            dot_split = token.split('.')
            # Remove "com" and "www" since they're too common
            if 'com' in dot_split:
                dot_split.remove('com')
            if 'www' in dot_split:
                dot_split.remove('www')
            tokens += dot_split
    return tokens


def countVectorizer(train_df, test_df):
    # Vectorize the training inputs.
    #
    # CountVectorizer only counts how often each token appears in a document
    # (term frequency). Unlike TfidfVectorizer, it does NOT weight tokens by
    # inverse document frequency (how rare a token is across the corpus).
    print("Training Count Vectorizer")
    # token_pattern=None suppresses the sklearn warning that token_pattern
    # is ignored when a custom tokenizer is supplied
    cVec = CountVectorizer(tokenizer=tokenizer, token_pattern=None)
    count_X = cVec.fit_transform(train_df['URLs'])

    # Vectorize the testing inputs
    print("Test Count Vectorizer")
    # Use 'transform' instead of 'fit_transform' since the vectorizer
    # has already been fitted on the training data
    test_count_X = cVec.transform(test_df['URLs'])

    return count_X, test_count_X
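

# --- Usage sketch (assumption) ---
# The header comment mentions reading data from a CSV file, and
# train_test_split / LogisticRegression / classification_report are imported
# above, so a plausible driver might look like the following. The CSV path
# "urls.csv" and the label column name 'Class' are hypothetical; only the
# 'URLs' column name is taken from countVectorizer() above.
if __name__ == "__main__":
    url_df = pd.read_csv("urls.csv")  # hypothetical CSV with 'URLs' and 'Class' columns

    # Hold out 20% of the rows for testing
    train_df, test_df = train_test_split(url_df, test_size=0.2, random_state=42)

    # Build count features for the training and test URLs
    count_X, test_count_X = countVectorizer(train_df, test_df)

    # Fit a simple baseline classifier on the count features
    lr = LogisticRegression(max_iter=1000)
    lr.fit(count_X, train_df['Class'])

    # Evaluate on the held-out test set
    predictions = lr.predict(test_count_X)
    print(confusion_matrix(test_df['Class'], predictions))
    print(classification_report(test_df['Class'], predictions))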