#Lexical analyzer method will come here (Contributor: Yassin)
import pandas as pd
from urllib.parse import urlparse
def lexicialAnalyzer(train_df, test_df):
    """Extract lexical (string-based) features from the 'URLs' column.

    Parameters
    ----------
    train_df, test_df : pd.DataFrame
        DataFrames that each contain a 'URLs' column of URL strings.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Feature matrices for the train and test sets, one row per URL,
        one column per lexical feature.
    """
    def extract_features(url):
        # Parse once so scheme / params checks share the same result.
        parsed_url = urlparse(url)
        lexical_features = {
            'URL_length': len(url),
            'Has_http': 1 if parsed_url.scheme == 'http' else 0,
            'Has_https': 1 if parsed_url.scheme == 'https' else 0,
            'Count_dots': url.count('.'),
            'Count_dashes': url.count('-'),
            'Count_underscores': url.count('_'),
            'Count_slashes': url.count('/'),
            'Count_ques': url.count('?'),
            'Count_non_alphanumeric': sum(1 for c in url if not c.isalnum()),
            'Count_digits': sum(1 for c in url if c.isdigit()),
            'Count_letters': sum(1 for c in url if c.isalpha()),
            # ''.split(';') returns [''], which would wrongly count one
            # parameter for URLs with no path params — guard the empty case.
            'Count_params': len(parsed_url.params.split(';')) if parsed_url.params else 0,
            'Has_php': 1 if 'php' in url else 0,
            'Has_html': 1 if 'html' in url else 0,
        }
        return lexical_features

    # Apply the feature extraction to each URL; pd.Series expands each
    # returned dict into one feature column per key.
    train_vector = train_df['URLs'].apply(extract_features).apply(pd.Series)
    test_vector = test_df['URLs'].apply(extract_features).apply(pd.Series)
    return train_vector, test_vector