# Malicious-URL-Detection-ML / NewCode / LexicialAnalyzer.py
# Lexical analyzer: extracts lexical features from URLs (Contributor: Yassin)
import pandas as pd
from urllib.parse import urlparse
def lexicialAnalyzer(train_df, test_df):
    """Build lexical feature tables for the URLs of the train and test sets.

    Args:
        train_df: DataFrame with a ``'URLs'`` column holding the training URLs.
        test_df: DataFrame with a ``'URLs'`` column holding the test URLs.

    Returns:
        A ``(train_vector, test_vector)`` tuple of DataFrames. Each has one
        row per input URL (the input index is preserved) and one integer
        column per lexical feature, in the order produced by
        ``extract_features``.

    Raises:
        KeyError: If either input frame has no ``'URLs'`` column.
    """
    def extract_features(url):
        """Return the dict of lexical features for a single URL."""
        # Missing values (None/NaN) are treated as an empty URL so that one
        # bad row does not abort feature extraction for the whole frame.
        if not isinstance(url, str):
            url = '' if pd.isna(url) else str(url)

        try:
            parsed_url = urlparse(url)
            scheme, params = parsed_url.scheme, parsed_url.params
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. unbalanced IPv6
            # brackets). Fall back to a plain-text scheme guess; the purely
            # character-based features below do not need a parsed URL.
            scheme = url.partition(':')[0].lower() if ':' in url else ''
            params = ''

        return {
            'URL_length': len(url),
            'Has_http': 1 if scheme == 'http' else 0,
            'Has_https': 1 if scheme == 'https' else 0,
            'Count_dots': url.count('.'),
            'Count_dashes': url.count('-'),
            'Count_underscores': url.count('_'),
            'Count_slashes': url.count('/'),
            'Count_ques': url.count('?'),
            'Count_non_alphanumeric': sum(1 for c in url if not c.isalnum()),
            'Count_digits': sum(1 for c in url if c.isdigit()),
            'Count_letters': sum(1 for c in url if c.isalpha()),
            # ''.split(';') yields [''], so an empty params string must be
            # counted explicitly as zero parameters.
            'Count_params': len(params.split(';')) if params else 0,
            'Has_php': 1 if 'php' in url else 0,
            'Has_html': 1 if 'html' in url else 0,
        }

    # Fixed column order, so that even an empty input yields the full schema.
    feature_names = list(extract_features(''))

    def build_vector(df):
        """Return the feature DataFrame for the 'URLs' column of *df*."""
        urls = df['URLs']
        # One DataFrame from a list of dicts avoids creating a Series per row.
        return pd.DataFrame(
            [extract_features(url) for url in urls],
            index=urls.index,
            columns=feature_names,
        )

    train_vector = build_vector(train_df)
    test_vector = build_vector(test_df)

    return train_vector, test_vector