URL Datasets/URLAnalysis.ipynb · Malicious-URL-Detection-ML

import pandas as pd

# Load the first merged dataset; the CSV's first column is used as the row index.
csv_path = 'URL Datasets/MLClassification/Copy of Merged Dataset 1.csv'
df = pd.read_csv(csv_path, index_col=0)

# Preview the top rows of the loaded frame.
print(df.head())

           Domain Name  URL Length  Is IP as Host name  Is .exe present  \
Sr. No.                                                                   
1           google.com          22               False            False   
2          youtube.com          23               False            False   
3         facebook.com          24               False            False   
4        wikipedia.org          25               False            False   
5            yahoo.com          21               False            False   

         Is www present  FTP used  .js used  Files in URL  css used  \
Sr. No.                                                               
1                 False     False     False         False     False   
2                 False     False     False         False     False   
3                 False     False     False         False     False   
4                 False     False     False         False     False   
5                 False     False     False         False     False   

         Digit to alphabet ratio  ...  mailto: present  Frame tag present  \
Sr. No.                           ...                                       
1                            0.0  ...            False              False   
2                            0.0  ...            False               True   
3                            0.0  ...            False               True   
4                            0.0  ...            False              False   
5                            0.0  ...            False               True   

         Is title tag empty  Eval() function  Escape() function  \
Sr. No.                                                           
1                     False                0                  0   
2                     False                0                  0   
3                     False                0                  0   
4                     False                0                  0   
5                     False                0                  0   

         Exec() Function  Search() function  Is image only in webpage  \
Sr. No.                                                                 
1                      0                  3                     False   
2                      0                  0                     False   
3                      0                  0                     False   
4                      0                  0                     False   
5                      3                  0                     False   

         Domain Age in Days  Label  
Sr. No.                             
1                      3431      0  
2                      3431      0  
3                      2346      0  
4                         0      0  
5                         0      0  

[5 rows x 91 columns]

import pandas as pd
import glob

# Glob pattern matching every CSV file in the dataset folder (modify this path accordingly).
folder_path = "URL Datasets/MLClassification/*.csv"  # Example: "data/*.csv"

# glob.glob returns matches in arbitrary, filesystem-dependent order; sort them so the
# row order of the combined DataFrame is reproducible across runs and machines.
csv_files = sorted(glob.glob(folder_path))

# Fail early with an actionable message instead of letting pd.concat raise its
# generic "No objects to concatenate" error when the pattern matches nothing.
if not csv_files:
    raise FileNotFoundError(f"No CSV files matched the pattern: {folder_path!r}")

# Read every file (first column used as its index) and stack the rows;
# ignore_index=True discards the per-file indices in favour of a fresh 0..n-1 index.
df_combined = pd.concat(
    (pd.read_csv(file, index_col=0) for file in csv_files),
    ignore_index=True,
)
df_combined.head()

	Domain Name	URL Length	Is IP as Host name	Is .exe present	Is www present	FTP used	.js used	Files in URL	css used	...	mailto: present	Frame tag present	Is title tag empty	Exec() Function	Search() function	Is image only in webpage	Domain Age in Days
0	google.com	22	False	False	False	False	False	False	False	...	False	False	False	0	3	False	3431
1	youtube.com	23	False	False	False	False	False	False	False	...	False	True	False	0	0	False	3431
2	facebook.com	24	False	False	False	False	False	False	False	...	False	True	False	0	0	False	2346
3	wikipedia.org	25	False	False	False	False	False	False	False	...	False	False	False	0	0	False	0
4	yahoo.com	21	False	False	False	False	False	False	False	...	False	True	False	3	0	False	0

5 rows × 91 columns

# Collect the column names of the loaded frame and count them.
feature_list = df.columns.tolist()
num_features = len(feature_list)

# Report the column count followed by the full list of names.
print(f"Number of features: {num_features}")
print(f"Feature list: {feature_list}")

Number of features: 91
Feature list: ['Domain Name', 'URL Length', 'Is IP as Host name', 'Is .exe present', 'Is www present', 'FTP used', '.js used', 'Files in URL', 'css used', 'Digit to alphabet ratio', 'Special Char to Alphabet Ratio', 'Uppercase to LowercaseRatio', 'Domain to URL Ratio', 'Numeric Character', 'English Letters', 'Special Characters', 'Dots', 'Semicolon', 'Underscore', 'Question Mark', 'Hash Character', 'Equals', 'Percentage Character', 'Ampersand', 'Dash', 'Delimiters', 'At Character', 'Tilde', 'Double Slash', 'Is Hashed', 'TLD', 'Digit to alphabet distance', 'Https in URL', 'File Extention', 'TLD in Subdomain', 'TLD in path', 'https in host name', 'Host name length', 'Path length', 'Query length', 'Word based distribution', 'Is English word', 'Is Meaningful', 'Is Pronounceable', 'Is random', 'Unigram', 'Bigram', 'Trigram', 'Sensitive Words', 'Is domain suspicious', 'Levenshtein Distance', 'Entropy', 'Hyphenstring', 'Homoglyph', 'Vowel string', 'Bitsquatting', 'Insertion string', 'Omission', 'Repeatition', 'Replacement', 'Subdomain', 'Transposition', 'Addition string', 'Google Search Feature', 'IP Address', 'ASN Number', 'ASN Country Code', 'ASN CIDR', 'ASN Postal Code', 'ASN creation date', 'ASN updation date', 'Total images in webpage', 'Total links', 'Number of parameter', 'Number of fragments', 'Body tags in source', 'Meta tag in source', 'Div tag in source', 'Fake link in status bar', 'Right click disable', 'Popup window', 'mailto: present', 'Frame tag present', 'Is title tag empty', 'Eval() function', 'Escape() function', 'Exec() Function', 'Search() function', 'Is image only in webpage', 'Domain Age in Days', 'Label']

# Separate the column names into feature groups.
# Each list holds the column names belonging to one group.
CountVectorizer = ['Domain Name']

LexicalFeature = [
    'URL Length',
    'Is IP as Host name',
    'Is .exe present',
    'Is www present',
    'FTP used',
    '.js used',
    'Files in URL',
    'css used',
    'Digit to alphabet ratio',
    'Special Char to Alphabet Ratio',
    'Uppercase to LowercaseRatio',
    'Domain to URL Ratio',
    'Numeric Character',
    'English Letters',
    'Special Characters',
    'Dots',
    'Semicolon',
    'Underscore',
    'Question Mark',
    'Hash Character',
    'Equals',
    'Percentage Character',
    'Ampersand',
    'Dash',
    'Delimiters',
    'At Character',
    'Tilde',
    'Double Slash',
    'Is Hashed',
    'TLD',
    'Digit to alphabet distance',
    'Https in URL',
    'File Extention',
    'TLD in Subdomain',
    'TLD in path',
    'https in host name',
    'Host name length',
    'Path length',
    'Query length',
    'Word based distribution',
]

LinguisticFeature = [
    'Is English word',
    'Is Meaningful',
    'Is Pronounceable',
    'Is random',
    'Unigram',
    'Bigram',
    'Trigram',
    'Sensitive Words',
]

Feature = [
    'Is domain suspicious',
    'Levenshtein Distance',
    'Entropy',
    'Hyphenstring',
    'Homoglyph',
    'Vowel string',
    'Bitsquatting',
    'Insertion string',
    'Omission',
    'Repeatition',
    'Replacement',
    'Subdomain',
    'Transposition',
    'Addition string',
    'Google Search Feature',
]

HostBasedFeature = [
    'IP Address',
    'ASN Number',
    'ASN Country Code',
    'ASN CIDR',
    'ASN Postal Code',
    'ASN creation date',
    'ASN updation date',
]

ContentBasedFeature = [
    'Total images in webpage',
    'Total links',
    'Number of parameter',
    'Number of fragments',
    'Body tags in source',
    'Meta tag in source',
    'Div tag in source',
    'Fake link in status bar',
    'Right click disable',
    'Popup window',
    'mailto: present',
    'Frame tag present',
    'Is title tag empty',
    'Eval() function',
    'Escape() function',
    'Exec() Function',
    'Search() function',
    'Is image only in webpage',
    'Domain Age in Days',
]

classLable = ['Label']

# Combined size of the feature groups; the class-label group is not counted.
print(
    "Total length:",
    sum(
        len(group)
        for group in (
            CountVectorizer,
            LexicalFeature,
            LinguisticFeature,
            Feature,
            HostBasedFeature,
            ContentBasedFeature,
        )
    ),
)

Total length: 90

def _select_present(frame, wanted):
    """Return the columns of *frame* whose names appear in *wanted*.

    Names in *wanted* that are not columns of *frame* are ignored rather
    than raising, because selection uses a boolean mask over the columns.
    """
    present = frame.columns.isin(wanted)
    return frame.loc[:, present]


# Build one sub-frame per feature group, keeping only the columns that exist in df.
df_CountVectorizer = _select_present(df, CountVectorizer)
df_LexicalFeature = _select_present(df, LexicalFeature)
df_LinguisticFeature = _select_present(df, LinguisticFeature)
df_Feature = _select_present(df, Feature)
df_HostBasedFeature = _select_present(df, HostBasedFeature)
df_ContentBasedFeature = _select_present(df, ContentBasedFeature)
df_Label = _select_present(df, classLable)