Malicious-URL-Detection-ML / URL Datasets / URLAnalysis.ipynb
URLAnalysis.ipynb
Raw
import pandas as pd

# Load one merged dataset file; the CSV's first column is used as the row index.
df = pd.read_csv(
    'URL Datasets/MLClassification/Copy of Merged Dataset 1.csv',
    index_col=0,
)

# Preview the first rows to sanity-check the load.
print(df.head())
           Domain Name  URL Length  Is IP as Host name  Is .exe present  \
Sr. No.                                                                   
1           google.com          22               False            False   
2          youtube.com          23               False            False   
3         facebook.com          24               False            False   
4        wikipedia.org          25               False            False   
5            yahoo.com          21               False            False   

         Is www present  FTP used  .js used  Files in URL  css used  \
Sr. No.                                                               
1                 False     False     False         False     False   
2                 False     False     False         False     False   
3                 False     False     False         False     False   
4                 False     False     False         False     False   
5                 False     False     False         False     False   

         Digit to alphabet ratio  ...  mailto: present  Frame tag present  \
Sr. No.                           ...                                       
1                            0.0  ...            False              False   
2                            0.0  ...            False               True   
3                            0.0  ...            False               True   
4                            0.0  ...            False              False   
5                            0.0  ...            False               True   

         Is title tag empty  Eval() function  Escape() function  \
Sr. No.                                                           
1                     False                0                  0   
2                     False                0                  0   
3                     False                0                  0   
4                     False                0                  0   
5                     False                0                  0   

         Exec() Function  Search() function  Is image only in webpage  \
Sr. No.                                                                 
1                      0                  3                     False   
2                      0                  0                     False   
3                      0                  0                     False   
4                      0                  0                     False   
5                      3                  0                     False   

         Domain Age in Days  Label  
Sr. No.                             
1                      3431      0  
2                      3431      0  
3                      2346      0  
4                         0      0  
5                         0      0  

[5 rows x 91 columns]
import pandas as pd
import glob

# Glob pattern that matches every CSV file in the dataset folder
# (modify this path accordingly).
folder_path = "URL Datasets/MLClassification/*.csv"  # Example: "data/*.csv"

# glob.glob() returns its matches in arbitrary, OS-dependent order. Sort them so
# that the row order of the combined frame (and the fresh 0..n-1 index created
# by ignore_index=True below) is the same on every run and every machine.
csv_files = sorted(glob.glob(folder_path))

# Fail with a message that names the pattern instead of pandas' generic
# "No objects to concatenate" ValueError when nothing matched.
if not csv_files:
    raise ValueError("No CSV files matched pattern: {!r}".format(folder_path))

# Read each file (its first column is treated as the index and then discarded by
# ignore_index=True) and stack all rows into a single DataFrame.
df_combined = pd.concat((pd.read_csv(file, index_col=0) for file in csv_files), ignore_index=True)
df_combined.head()

Domain Name URL Length Is IP as Host name Is .exe present Is www present FTP used .js used Files in URL css used Digit to alphabet ratio ... mailto: present Frame tag present Is title tag empty Eval() function Escape() function Exec() Function Search() function Is image only in webpage Domain Age in Days Label
0 google.com 22 False False False False False False False 0.0 ... False False False 0 0 0 3 False 3431 0
1 youtube.com 23 False False False False False False False 0.0 ... False True False 0 0 0 0 False 3431 0
2 facebook.com 24 False False False False False False False 0.0 ... False True False 0 0 0 0 False 2346 0
3 wikipedia.org 25 False False False False False False False 0.0 ... False False False 0 0 0 0 False 0 0
4 yahoo.com 21 False False False False False False False 0.0 ... False True False 0 0 3 0 False 0 0

5 rows × 91 columns

# Column labels of the single-file DataFrame, as a plain Python list.
# Note that this is every column of df, so the count printed below is the
# total number of columns rather than model inputs only.
feature_list = list(df.columns)

# Number of columns (second entry of the frame's shape).
num_features = df.shape[1]

# Report the count followed by the full list of names.
print("Number of features:", num_features)
print("Feature list:", feature_list)
Number of features: 91
Feature list: ['Domain Name', 'URL Length', 'Is IP as Host name', 'Is .exe present', 'Is www present', 'FTP used', '.js used', 'Files in URL', 'css used', 'Digit to alphabet ratio', 'Special Char to Alphabet Ratio', 'Uppercase to LowercaseRatio', 'Domain to URL Ratio', 'Numeric Character', 'English Letters', 'Special Characters', 'Dots', 'Semicolon', 'Underscore', 'Question Mark', 'Hash Character', 'Equals', 'Percentage Character', 'Ampersand', 'Dash', 'Delimiters', 'At Character', 'Tilde', 'Double Slash', 'Is Hashed', 'TLD', 'Digit to alphabet distance', 'Https in URL', 'File Extention', 'TLD in Subdomain', 'TLD in path', 'https in host name', 'Host name length', 'Path length', 'Query length', 'Word based distribution', 'Is English word', 'Is Meaningful', 'Is Pronounceable', 'Is random', 'Unigram', 'Bigram', 'Trigram', 'Sensitive Words', 'Is domain suspicious', 'Levenshtein Distance', 'Entropy', 'Hyphenstring', 'Homoglyph', 'Vowel string', 'Bitsquatting', 'Insertion string', 'Omission', 'Repeatition', 'Replacement', 'Subdomain', 'Transposition', 'Addition string', 'Google Search Feature', 'IP Address', 'ASN Number', 'ASN Country Code', 'ASN CIDR', 'ASN Postal Code', 'ASN creation date', 'ASN updation date', 'Total images in webpage', 'Total links', 'Number of parameter', 'Number of fragments', 'Body tags in source', 'Meta tag in source', 'Div tag in source', 'Fake link in status bar', 'Right click disable', 'Popup window', 'mailto: present', 'Frame tag present', 'Is title tag empty', 'Eval() function', 'Escape() function', 'Exec() Function', 'Search() function', 'Is image only in webpage', 'Domain Age in Days', 'Label']
# Separate the column names into feature groups.

# Column holding the raw domain string.
# NOTE(review): this list shares its name with scikit-learn's CountVectorizer
# class; if that class is imported elsewhere in the notebook one will shadow
# the other - confirm.
CountVectorizer = ['Domain Name']

# Features derived from the URL string itself.
LexicalFeature = [
    'URL Length',
    'Is IP as Host name',
    'Is .exe present',
    'Is www present',
    'FTP used',
    '.js used',
    'Files in URL',
    'css used',
    'Digit to alphabet ratio',
    'Special Char to Alphabet Ratio',
    'Uppercase to LowercaseRatio',
    'Domain to URL Ratio',
    'Numeric Character',
    'English Letters',
    'Special Characters',
    'Dots',
    'Semicolon',
    'Underscore',
    'Question Mark',
    'Hash Character',
    'Equals',
    'Percentage Character',
    'Ampersand',
    'Dash',
    'Delimiters',
    'At Character',
    'Tilde',
    'Double Slash',
    'Is Hashed',
    'TLD',
    'Digit to alphabet distance',
    'Https in URL',
    'File Extention',
    'TLD in Subdomain',
    'TLD in path',
    'https in host name',
    'Host name length',
    'Path length',
    'Query length',
    'Word based distribution',
]

# Word / n-gram related features.
LinguisticFeature = [
    'Is English word',
    'Is Meaningful',
    'Is Pronounceable',
    'Is random',
    'Unigram',
    'Bigram',
    'Trigram',
    'Sensitive Words',
]

# Domain-similarity / look-alike features (the group has no more specific name
# in the notebook).
Feature = [
    'Is domain suspicious',
    'Levenshtein Distance',
    'Entropy',
    'Hyphenstring',
    'Homoglyph',
    'Vowel string',
    'Bitsquatting',
    'Insertion string',
    'Omission',
    'Repeatition',
    'Replacement',
    'Subdomain',
    'Transposition',
    'Addition string',
    'Google Search Feature',
]

# IP / ASN features of the hosting side.
HostBasedFeature = [
    'IP Address',
    'ASN Number',
    'ASN Country Code',
    'ASN CIDR',
    'ASN Postal Code',
    'ASN creation date',
    'ASN updation date',
]

# Features taken from the page content, plus the domain age.
ContentBasedFeature = [
    'Total images in webpage',
    'Total links',
    'Number of parameter',
    'Number of fragments',
    'Body tags in source',
    'Meta tag in source',
    'Div tag in source',
    'Fake link in status bar',
    'Right click disable',
    'Popup window',
    'mailto: present',
    'Frame tag present',
    'Is title tag empty',
    'Eval() function',
    'Escape() function',
    'Exec() Function',
    'Search() function',
    'Is image only in webpage',
    'Domain Age in Days',
]

# Target column.
classLable = ['Label']

# Sanity check: total number of names across the six feature groups.
# classLable is deliberately not part of this sum.
print(
    "Total length:",
    sum(
        map(
            len,
            (
                CountVectorizer,
                LexicalFeature,
                LinguisticFeature,
                Feature,
                HostBasedFeature,
                ContentBasedFeature,
            ),
        )
    ),
)
Total length: 90
def _columns_in(frame, wanted):
    """Return the columns of *frame* whose labels appear in *wanted*.

    Labels listed in *wanted* but absent from *frame* are ignored rather than
    raising, and the selected columns keep *frame*'s own column order.
    """
    keep = frame.columns.isin(wanted)
    return frame.loc[:, keep]


# Split df into one sub-frame per feature group, keeping only the columns that
# actually exist in df.
df_CountVectorizer = _columns_in(df, CountVectorizer)
df_LexicalFeature = _columns_in(df, LexicalFeature)
df_LinguisticFeature = _columns_in(df, LinguisticFeature)
df_Feature = _columns_in(df, Feature)
df_HostBasedFeature = _columns_in(df, HostBasedFeature)
df_ContentBasedFeature = _columns_in(df, ContentBasedFeature)
df_Label = _columns_in(df, classLable)