NewCode/ContentBasedFeature.py · Malicious-URL-Detection-ML

# Content-based feature extraction (HTML/JavaScript features of each URL's page)
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Fetch and parse the HTML content of a URL
def get_html_content(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address passed unchanged to ``requests.get``.

    Returns:
        ``response.text`` when the server answers with status 200.
        An empty string when the status is anything else, or when
        ``requests`` raises a ``RequestException``; in both failure cases
        a diagnostic line is printed first.
    """
    try:
        # timeout=5 keeps an unresponsive host from blocking the caller.
        reply = requests.get(url, timeout=5)
        if reply.status_code != 200:
            print(f"Error: Received status code {reply.status_code}")
            return ""
        return reply.text
    except requests.exceptions.RequestException as err:
        print(f"Request failed: {err}")
        return ""

# Extract HTML and JavaScript features
def extract_features(html_content):
    """Compute HTML/JavaScript indicator features from raw page markup.

    Args:
        html_content: Page source as a string.

    Returns:
        A dict with these keys, in this order:

        * ``presence_iFrame`` -- True if the parsed document has an
          ``<iframe>`` tag.
        * ``count_eval``, ``count_escape``, ``count_unescape``,
          ``count_find``, ``count_exec``, ``count_search``, ``count_link``
          -- number of times ``name(`` appears in the raw text as a
          standalone identifier.
        * ``presence_windows_open`` -- True if ``window.open(`` occurs in
          the raw text.
        * ``lines_count`` -- number of newline characters in the raw text.
        * ``count_all_functions`` -- sum of the seven ``count_*`` values.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    js_functions = ('eval', 'escape', 'unescape', 'find', 'exec', 'search', 'link')

    def count_calls(name):
        # A plain substring count treats every 'unescape(' as an 'escape('
        # too (and 'unlink(' as 'link(', etc.), inflating the counts.  The
        # lookbehind rejects matches preceded by an identifier character,
        # while still accepting member calls such as 'window.eval('.
        pattern = r'(?<![\w$])' + re.escape(name) + r'\('
        return len(re.findall(pattern, html_content))

    features = {'presence_iFrame': bool(soup.find('iframe'))}
    for name in js_functions:
        features[f'count_{name}'] = count_calls(name)
    features['presence_windows_open'] = 'window.open(' in html_content
    features['lines_count'] = html_content.count('\n')
    features['count_all_functions'] = sum(
        features[f'count_{name}'] for name in js_functions
    )
    return features

def ContentBasedFeature(train_df, test_df):
    """Fetch every URL in ``test_df['URLs']`` and print its content features.

    For each entry the page is downloaded with ``get_html_content`` and, if
    any content came back, the result of ``extract_features`` is printed one
    ``key: value`` pair per line.  Otherwise a failure message is printed.

    Args:
        train_df: Currently unused.
        test_df: Object whose ``'URLs'`` item yields URL strings; entries
            without an ``http://`` / ``https://`` scheme get ``https://``
            prepended.

    Returns:
        None.  Feature vectors are not built or returned yet (see the TODO
        at the end of the body).
    """
    for domain in test_df['URLs']:
        # Only add a scheme when one is missing; blindly prefixing an entry
        # that already has one yields "https://http://..." which can never
        # be fetched.
        if not domain.lower().startswith(('http://', 'https://')):
            domain = "https://" + domain
        print(domain)
        html_content = get_html_content(domain)
        if not html_content:
            print("Failed to retrieve or parse URL content.")
            continue
        features = extract_features(html_content)
        print("\nExtracted Features:")
        for key, value in features.items():
            print(f"{key}: {value}")
    # TODO: return train_vector, test_vector