# Malicious-URL-Detection-ML / OldCode / 4-LexicalFeatures.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import re

# Input dataset and feature-dump locations. Forward slashes are used so the
# same relative paths resolve on both Windows and POSIX; a backslash inside a
# normal string literal is an escape introducer and is not a path separator
# outside Windows.
DATASET_PATH = 'Dataset/Malicious URLs.csv'
FEATURES_OUTPUT_PATH = 'Output/LexicalFeature.csv'

# Define lexical features (column names, in the order they are fed to the model)
lexical_features = ['url_length', 'num_digits', 'num_special_chars', 'num_keywords', 'has_pattern',
                    'has_http', 'has_https', 'has_domain_com', 'has_domain_org', 'has_domain_net',
                    'has_domain_info', 'has_ip', 'has_redirect', 'has_script', 'has_iframe']

# Words counted (case-insensitively) by the 'num_keywords' feature.
SUSPICIOUS_KEYWORDS = ('phishing', 'malware', 'attack', 'fraud', 'hacked')

# Four dot-separated groups of 1-3 digits. This is a purely lexical check: it
# does not verify that each group is <= 255.
IPV4_PATTERN = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')


def extract_lexical_features(url):
    """Compute the lexical features of a single URL string.

    Args:
        url: The URL as a ``str``.

    Returns:
        A dict with exactly the keys listed in ``lexical_features`` (same
        order). Counts are ints; every ``has_*`` entry is 0 or 1.

    Notes:
        * ``has_http`` is a plain substring test, so it is also 1 for
          ``https`` URLs.
        * ``has_pattern`` is 1 whenever ``//`` or ``..`` occurs anywhere,
          which includes the ``//`` of a ``scheme://`` prefix.
        * The ``has_domain_*`` tests are case-sensitive substring tests.
    """
    lowered = url.lower()
    return {
        'url_length': len(url),
        'num_digits': sum(c.isdigit() for c in url),
        'num_special_chars': sum(not c.isalnum() for c in url),
        'num_keywords': sum(word in lowered for word in SUSPICIOUS_KEYWORDS),
        'has_pattern': int('//' in url or '..' in url),
        'has_http': int('http' in url),
        'has_https': int('https' in url),
        'has_domain_com': int('.com' in url),
        'has_domain_org': int('.org' in url),
        'has_domain_net': int('.net' in url),
        'has_domain_info': int('.info' in url),
        # search(), not match(): match() only succeeds at position 0, which
        # would miss an IP that follows a scheme such as 'http://'.
        'has_ip': int(IPV4_PATTERN.search(url) is not None),
        'has_redirect': int('redirect' in lowered),
        'has_script': int('script' in lowered),
        'has_iframe': int('iframe' in lowered),
    }


def add_lexical_features(df, url_column='URLs'):
    """Add one column per entry of ``lexical_features`` to *df*, in place.

    Args:
        df: DataFrame containing the URL strings.
        url_column: Name of the column holding the URL strings.

    Returns:
        The same DataFrame object, for convenience.
    """
    rows = [extract_lexical_features(url) for url in df[url_column]]
    for name in lexical_features:
        df[name] = [row[name] for row in rows]
    return df


def main():
    """Build lexical features, save them, then train and evaluate the model."""
    import os  # local import: only needed to create the output directory

    # Load the dataset
    url_df = pd.read_csv(DATASET_PATH)

    # Extract lexical features from URLs
    add_lexical_features(url_df)

    # Define features and target variable
    X = url_df[lexical_features]
    y = url_df['Class']

    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs(os.path.dirname(FEATURES_OUTPUT_PATH), exist_ok=True)
    url_df.to_csv(FEATURES_OUTPUT_PATH, index=False)

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a Logistic Regression classifier
    logistic_regression = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)
    logistic_regression.fit(X_train, y_train)

    # Predict the labels for the test data
    y_pred = logistic_regression.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("\nConfusion Matrix:")
    print(conf_matrix)
    print("\nClassification Report:")
    print(class_report)


if __name__ == "__main__":
    main()