# Content-based feature extraction: fetch page HTML and derive simple
# HTML/JavaScript indicator features (used for URL classification).
import requests
from bs4 import BeautifulSoup
import pandas as pd

# JavaScript-ish function names counted individually in extract_features().
_JS_FUNCTIONS = ('eval', 'escape', 'unescape', 'find', 'exec', 'search', 'link')


def get_html_content(url, timeout=5):
    """Fetch the HTML body of *url*.

    Args:
        url: Full URL (including scheme) to fetch.
        timeout: Seconds before the request is abandoned (default 5).

    Returns:
        The response text on HTTP 200; an empty string on any other
        status code or on a network error. Never raises: failures are
        printed and swallowed (best-effort fetch, matching callers that
        test the result for truthiness).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return ""
    if response.status_code == 200:
        return response.text
    print(f"Error: Received status code {response.status_code}")
    return ""


def extract_features(html_content):
    """Extract content-based features from raw HTML text.

    Args:
        html_content: The page source as a string.

    Returns:
        Dict with: presence of an <iframe>, presence of 'window.open(',
        newline count, a 'count_<func>' entry per name in _JS_FUNCTIONS,
        and 'count_all_functions' (sum of the per-function counts).
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    features = {
        'presence_iFrame': bool(soup.find('iframe')),
        'presence_windows_open': 'window.open(' in html_content,
        'lines_count': html_content.count('\n'),
    }
    for func in _JS_FUNCTIONS:
        features[f'count_{func}'] = html_content.count(f'{func}(')
    # Bug fix: 'escape(' is a substring of 'unescape(', so the raw count
    # included every unescape occurrence as well. Subtract the overlap so
    # count_escape reflects only true escape( calls.
    features['count_escape'] -= features['count_unescape']
    features['count_all_functions'] = sum(
        features[f'count_{func}'] for func in _JS_FUNCTIONS
    )
    return features


def ContentBasedFeature(train_df, test_df):
    """Fetch each test URL and extract its content-based features.

    Args:
        train_df: Currently unused; kept for interface compatibility
                  (the original signature accepted it).
        test_df: DataFrame with a 'URLs' column of bare domains
                 (scheme is prepended here).

    Returns:
        A pandas DataFrame with one row per successfully fetched URL
        (feature columns plus a 'URL' column with the original domain).
        URLs that could not be fetched are reported and skipped.
        Previously this function returned None; returning the collected
        features implements the commented-out `return train_vector,
        test_vector` intent without breaking callers that ignored the
        old None result.
    """
    rows = []
    for domain in test_df['URLs']:
        url = "https://" + domain
        print(url)
        html_content = get_html_content(url)
        if html_content:
            features = extract_features(html_content)
            print("\nExtracted Features:")
            for key, value in features.items():
                print(f"{key}: {value}")
            features['URL'] = domain
            rows.append(features)
        else:
            print("Failed to retrieve or parse URL content.")
    return pd.DataFrame(rows)