import re
import requests
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer


def fetch_english_words():
    """Fetch a list of English words and return them as a set.

    Returns:
        set: A set of lowercase English words (used as the dictionary for token matching).
    """
    url = "https://raw.githubusercontent.com/dwyl/english-words/master/words.txt"
    response = requests.get(url)
    english_words = set(response.text.lower().splitlines())
    return english_words


def tokenize_url(url):
    """Tokenize the given URL by removing the scheme (http, https) and 'www',
    splitting the remaining part on non-word characters, and dropping common URL parts.

    Parameters:
        url (str): The URL to tokenize.

    Returns:
        list: A list of lowercase tokens extracted from the URL.
    """
    # Remove the scheme (http, https) and 'www'
    url = re.sub(r'(https?://)?(www\.)?', '', url)

    # Split the URL into tokens based on non-word characters
    tokens = re.split(r'\W+', url)

    # Remove common parts that are not useful for dictionary matching
    common_parts = {'com', 'net', 'org', 'http', 'https'}
    tokens = [token.lower() for token in tokens if token.lower() not in common_parts]

    return tokens

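
# Illustrative example (a made-up URL, not part of the pipeline):
#   tokenize_url("https://www.example-site.com/path/page.html")
# strips the scheme and 'www.', splits on non-word characters, and drops 'com',
# returning ['example', 'site', 'path', 'page', 'html'].
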
def dictionary_word(train_df, test_df):
    """Tokenize the URLs in the training and testing DataFrames, keep only tokens found
    in the English dictionary, and convert the results into numerical vectors.

    Parameters:
        train_df (DataFrame): The training DataFrame containing URLs (column 'URLs').
        test_df (DataFrame): The testing DataFrame containing URLs (column 'URLs').

    Returns:
        tuple: Count-vector matrices for the training and testing DataFrames.
    """
    # Fetch the English words dictionary
    english_words = fetch_english_words()

    def process_urls(urls):
        """Tokenize each URL and join its dictionary tokens into a combined string.

        Parameters:
            urls (iterable): The URLs to process.

        Returns:
            list: A list of tokenized URLs as combined strings.
        """
        tokenized_urls = []
        for url in urls:
            tokens = tokenize_url(url)
            valid_tokens = [token for token in tokens if token in english_words]
            tokenized_urls.append(" ".join(valid_tokens))
        return tokenized_urls

    # Tokenize and filter URLs in both the training and testing DataFrames
    train_urls = process_urls(train_df['URLs'])
    test_urls = process_urls(test_df['URLs'])

    # Use CountVectorizer to convert the tokenized URLs into numerical vectors
    vectorizer = CountVectorizer()
    train_vectors = vectorizer.fit_transform(train_urls)
    test_vectors = vectorizer.transform(test_urls)

    return train_vectors, test_vectors
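

# A minimal usage sketch, assuming DataFrames with a 'URLs' column (as dictionary_word
# expects). The sample URLs below are invented for illustration, and running this
# requires network access to download the word list.
if __name__ == "__main__":
    train_df = pd.DataFrame({
        "URLs": [
            "https://www.example.com/free-gift-card",
            "http://secure-login.net/account/update",
        ]
    })
    test_df = pd.DataFrame({"URLs": ["https://www.example.org/news/today"]})

    # Quick look at the tokenizer on a single URL
    print("Tokenized URL:", tokenize_url(train_df["URLs"].iloc[0]))

    # Build count-vector features for both splits
    train_vectors, test_vectors = dictionary_word(train_df, test_df)
    print("Train feature matrix shape:", train_vectors.shape)
    print("Test feature matrix shape:", test_vectors.shape)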