# Content-based feature extraction will be implemented here.
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Fetch and parse the HTML content of a URL
def get_html_content(url):
    """Fetch the raw HTML body of *url*.

    Returns the response text on HTTP 200; returns an empty string on
    any other status code or on a request failure (so callers can use
    simple truthiness to detect failure).
    """
    try:
        # Timeout keeps a single unresponsive host from stalling the whole crawl.
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            return response.text
        print(f"Error: Received status code {response.status_code}")
        return ""
    except requests.exceptions.RequestException as e:
        # Covers DNS failures, timeouts, connection errors, etc.
        print(f"Request failed: {e}")
        return ""
# Extract HTML and JavaScript features
def extract_features(html_content):
    """Extract simple content-based features from an HTML document.

    Counts occurrences of JavaScript function-call substrings and a few
    structural markers (iframe presence, ``window.open(``, line count)
    in the raw HTML text. Returns a dict of feature name -> value.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    features = {
        'presence_iFrame': bool(soup.find('iframe')),
        'count_eval': html_content.count('eval('),
        # 'escape(' is a substring of 'unescape(', so subtract the
        # 'unescape(' hits to avoid double-counting them here.
        'count_escape': html_content.count('escape(') - html_content.count('unescape('),
        'count_unescape': html_content.count('unescape('),
        'count_find': html_content.count('find('),
        'count_exec': html_content.count('exec('),
        'count_search': html_content.count('search('),
        'count_link': html_content.count('link('),
        'presence_windows_open': 'window.open(' in html_content,
        'lines_count': html_content.count('\n')
    }
    # Aggregate of all individual function-call counts.
    features['count_all_functions'] = sum(
        features[f'count_{func}']
        for func in ['eval', 'escape', 'unescape', 'find', 'exec', 'search', 'link']
    )
    return features
def ContentBasedFeature(train_df, test_df):
    """Fetch each URL in ``test_df['URLs']`` and print its content features.

    NOTE(review): ``train_df`` is currently unused and the function
    returns nothing — the intended ``return train_vector, test_vector``
    is still a stub (see the TODO at the bottom).
    """
    for domain in test_df['URLs']:
        # URLs in the dataframe are bare domains; prepend a scheme.
        url = "https://" + domain
        print(url)
        html_content = get_html_content(url)
        if html_content:
            features = extract_features(html_content)
            print("\nExtracted Features:")
            for key, value in features.items():
                print(f"{key}: {value}")
        else:
            print("Failed to retrieve or parse URL content.")
    # TODO: build and return the feature vectors:
    # return train_vector, test_vector