import re
import requests
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer


def fetch_english_words():
    """Fetch a list of English words and return them as a set.

    Returns:
        set: A set of lowercase English words (used as the dictionary for token matching).
    """
    url = "https://raw.githubusercontent.com/dwyl/english-words/master/words.txt"
    response = requests.get(url)
    english_words = set(response.text.lower().splitlines())
    return english_words


def tokenize_url(url):
    """Tokenize the given URL by removing the scheme (http, https) and 'www',
    splitting the remaining part on non-word characters, and dropping common URL parts.

    Parameters:
        url (str): The URL to tokenize.

    Returns:
        list: A list of lowercase tokens extracted from the URL.
    """
    # Remove the scheme (http, https) and 'www'
    url = re.sub(r'(https?://)?(www\.)?', '', url)

    # Split the URL into tokens based on non-word characters
    tokens = re.split(r'\W+', url)

    # Remove common parts that are not useful for dictionary matching
    common_parts = {'com', 'net', 'org', 'http', 'https'}
    tokens = [token.lower() for token in tokens if token.lower() not in common_parts]

    return tokens

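
# Illustrative example (a made-up URL, not part of the pipeline):
#   tokenize_url("https://www.example-site.com/path/page.html")
# strips the scheme and 'www.', splits on non-word characters, and drops 'com',
# returning ['example', 'site', 'path', 'page', 'html'].
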
def dictionary_word(train_df, test_df):
    """Tokenize the URLs in the training and testing DataFrames, keep only tokens found
    in the English dictionary, and convert the results into numerical vectors.

    Parameters:
        train_df (DataFrame): The training DataFrame containing URLs (column 'URLs').
        test_df (DataFrame): The testing DataFrame containing URLs (column 'URLs').

    Returns:
        tuple: Count-vector matrices for the training and testing DataFrames.
    """
    # Fetch the English words dictionary
    english_words = fetch_english_words()

    def process_urls(urls):
        """Tokenize each URL and join its dictionary tokens into a combined string.

        Parameters:
            urls (iterable): The URLs to process.

        Returns:
            list: A list of tokenized URLs as combined strings.
        """
        tokenized_urls = []
        for url in urls:
            tokens = tokenize_url(url)
            valid_tokens = [token for token in tokens if token in english_words]
            tokenized_urls.append(" ".join(valid_tokens))
        return tokenized_urls

    # Tokenize and filter URLs in both the training and testing DataFrames
    train_urls = process_urls(train_df['URLs'])
    test_urls = process_urls(test_df['URLs'])

    # Use CountVectorizer to convert the tokenized URLs into numerical vectors
    vectorizer = CountVectorizer()
    train_vectors = vectorizer.fit_transform(train_urls)
    test_vectors = vectorizer.transform(test_urls)

    return train_vectors, test_vectors
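

# A minimal usage sketch, assuming DataFrames with a 'URLs' column (as dictionary_word
# expects). The sample URLs below are invented for illustration, and running this
# requires network access to download the word list.
if __name__ == "__main__":
    train_df = pd.DataFrame({
        "URLs": [
            "https://www.example.com/free-gift-card",
            "http://secure-login.net/account/update",
        ]
    })
    test_df = pd.DataFrame({"URLs": ["https://www.example.org/news/today"]})

    # Quick look at the tokenizer on a single URL
    print("Tokenized URL:", tokenize_url(train_df["URLs"].iloc[0]))

    # Build count-vector features for both splits
    train_vectors, test_vectors = dictionary_word(train_df, test_df)
    print("Train feature matrix shape:", train_vectors.shape)
    print("Test feature matrix shape:", test_vectors.shape)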