import pandas as pd
# Load the merged URL dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    'URL Datasets/MLClassification/Copy of Merged Dataset 1.csv',
    index_col=0,
)
# Preview the first five rows of the dataframe.
print(df.head(5))
Domain Name URL Length Is IP as Host name Is .exe present \
Sr. No.
1 google.com 22 False False
2 youtube.com 23 False False
3 facebook.com 24 False False
4 wikipedia.org 25 False False
5 yahoo.com 21 False False
Is www present FTP used .js used Files in URL css used \
Sr. No.
1 False False False False False
2 False False False False False
3 False False False False False
4 False False False False False
5 False False False False False
Digit to alphabet ratio ... mailto: present Frame tag present \
Sr. No. ...
1 0.0 ... False False
2 0.0 ... False True
3 0.0 ... False True
4 0.0 ... False False
5 0.0 ... False True
Is title tag empty Eval() function Escape() function \
Sr. No.
1 False 0 0
2 False 0 0
3 False 0 0
4 False 0 0
5 False 0 0
Exec() Function Search() function Is image only in webpage \
Sr. No.
1 0 3 False
2 0 0 False
3 0 0 False
4 0 0 False
5 3 0 False
Domain Age in Days Label
Sr. No.
1 3431 0
2 3431 0
3 2346 0
4 0 0
5 0 0
[5 rows x 91 columns]
import pandas as pd
import glob
# Glob pattern matching every CSV file in the dataset folder
# (modify this path to suit your layout).
folder_path = "URL Datasets/MLClassification/*.csv" # Example: "data/*.csv"
# glob.glob() returns matches in arbitrary, filesystem-dependent order, so the
# list is sorted to keep the row order of the combined DataFrame reproducible.
csv_files = sorted(glob.glob(folder_path))
# Read each file (first column as index) and stack them row-wise.
# ignore_index=True discards the per-file indices and relabels rows 0..n-1.
df_combined = pd.concat((pd.read_csv(file, index_col=0) for file in csv_files), ignore_index=True)
df_combined.head()
Domain Name | URL Length | Is IP as Host name | Is .exe present | Is www present | FTP used | .js used | Files in URL | css used | Digit to alphabet ratio | ... | mailto: present | Frame tag present | Is title tag empty | Eval() function | Escape() function | Exec() Function | Search() function | Is image only in webpage | Domain Age in Days | Label | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | google.com | 22 | False | False | False | False | False | False | False | 0.0 | ... | False | False | False | 0 | 0 | 0 | 3 | False | 3431 | 0 |
1 | youtube.com | 23 | False | False | False | False | False | False | False | 0.0 | ... | False | True | False | 0 | 0 | 0 | 0 | False | 3431 | 0 |
2 | facebook.com | 24 | False | False | False | False | False | False | False | 0.0 | ... | False | True | False | 0 | 0 | 0 | 0 | False | 2346 | 0 |
3 | wikipedia.org | 25 | False | False | False | False | False | False | False | 0.0 | ... | False | False | False | 0 | 0 | 0 | 0 | False | 0 | 0 |
4 | yahoo.com | 21 | False | False | False | False | False | False | False | 0.0 | ... | False | True | False | 0 | 0 | 3 | 0 | False | 0 | 0 |
5 rows × 91 columns
# Every column label of df, in column order, as a plain Python list.
feature_list = df.columns.to_list()
# Column count, taken from the frame's shape (rows, columns).
num_features = df.shape[1]
# Report the count followed by the names.
print("Number of features:", num_features)
print("Feature list:", feature_list)
Number of features: 91
Feature list: ['Domain Name', 'URL Length', 'Is IP as Host name', 'Is .exe present', 'Is www present', 'FTP used', '.js used', 'Files in URL', 'css used', 'Digit to alphabet ratio', 'Special Char to Alphabet Ratio', 'Uppercase to LowercaseRatio', 'Domain to URL Ratio', 'Numeric Character', 'English Letters', 'Special Characters', 'Dots', 'Semicolon', 'Underscore', 'Question Mark', 'Hash Character', 'Equals', 'Percentage Character', 'Ampersand', 'Dash', 'Delimiters', 'At Character', 'Tilde', 'Double Slash', 'Is Hashed', 'TLD', 'Digit to alphabet distance', 'Https in URL', 'File Extention', 'TLD in Subdomain', 'TLD in path', 'https in host name', 'Host name length', 'Path length', 'Query length', 'Word based distribution', 'Is English word', 'Is Meaningful', 'Is Pronounceable', 'Is random', 'Unigram', 'Bigram', 'Trigram', 'Sensitive Words', 'Is domain suspicious', 'Levenshtein Distance', 'Entropy', 'Hyphenstring', 'Homoglyph', 'Vowel string', 'Bitsquatting', 'Insertion string', 'Omission', 'Repeatition', 'Replacement', 'Subdomain', 'Transposition', 'Addition string', 'Google Search Feature', 'IP Address', 'ASN Number', 'ASN Country Code', 'ASN CIDR', 'ASN Postal Code', 'ASN creation date', 'ASN updation date', 'Total images in webpage', 'Total links', 'Number of parameter', 'Number of fragments', 'Body tags in source', 'Meta tag in source', 'Div tag in source', 'Fake link in status bar', 'Right click disable', 'Popup window', 'mailto: present', 'Frame tag present', 'Is title tag empty', 'Eval() function', 'Escape() function', 'Exec() Function', 'Search() function', 'Is image only in webpage', 'Domain Age in Days', 'Label']
# Separate the column names into feature groups.
# Column holding the raw domain text.
CountVectorizer = [
    'Domain Name',
]
# Lexical features.
LexicalFeature = [
    'URL Length',
    'Is IP as Host name',
    'Is .exe present',
    'Is www present',
    'FTP used',
    '.js used',
    'Files in URL',
    'css used',
    'Digit to alphabet ratio',
    'Special Char to Alphabet Ratio',
    'Uppercase to LowercaseRatio',
    'Domain to URL Ratio',
    'Numeric Character',
    'English Letters',
    'Special Characters',
    'Dots',
    'Semicolon',
    'Underscore',
    'Question Mark',
    'Hash Character',
    'Equals',
    'Percentage Character',
    'Ampersand',
    'Dash',
    'Delimiters',
    'At Character',
    'Tilde',
    'Double Slash',
    'Is Hashed',
    'TLD',
    'Digit to alphabet distance',
    'Https in URL',
    'File Extention',
    'TLD in Subdomain',
    'TLD in path',
    'https in host name',
    'Host name length',
    'Path length',
    'Query length',
    'Word based distribution',
]
# Linguistic features.
LinguisticFeature = [
    'Is English word',
    'Is Meaningful',
    'Is Pronounceable',
    'Is random',
    'Unigram',
    'Bigram',
    'Trigram',
    'Sensitive Words',
]
# Remaining named group (kept under the original generic name `Feature`).
Feature = [
    'Is domain suspicious',
    'Levenshtein Distance',
    'Entropy',
    'Hyphenstring',
    'Homoglyph',
    'Vowel string',
    'Bitsquatting',
    'Insertion string',
    'Omission',
    'Repeatition',
    'Replacement',
    'Subdomain',
    'Transposition',
    'Addition string',
    'Google Search Feature',
]
# Host-based features.
HostBasedFeature = [
    'IP Address',
    'ASN Number',
    'ASN Country Code',
    'ASN CIDR',
    'ASN Postal Code',
    'ASN creation date',
    'ASN updation date',
]
# Content-based features.
ContentBasedFeature = [
    'Total images in webpage',
    'Total links',
    'Number of parameter',
    'Number of fragments',
    'Body tags in source',
    'Meta tag in source',
    'Div tag in source',
    'Fake link in status bar',
    'Right click disable',
    'Popup window',
    'mailto: present',
    'Frame tag present',
    'Is title tag empty',
    'Eval() function',
    'Escape() function',
    'Exec() Function',
    'Search() function',
    'Is image only in webpage',
    'Domain Age in Days',
]
# Target column.
classLable = [
    'Label',
]
# Sanity check: combined size of all groups above except the class label.
print(
    "Total length:",
    sum(
        len(group)
        for group in (
            CountVectorizer,
            LexicalFeature,
            LinguisticFeature,
            Feature,
            HostBasedFeature,
            ContentBasedFeature,
        )
    ),
)
Total length: 90
def _columns_in(frame, names):
    """Return all rows of *frame*, keeping only columns whose label is in *names*.

    Labels in *names* that are not present in *frame* are ignored rather than
    raising, because selection is done with a boolean membership mask.
    """
    keep = frame.columns.isin(names)
    return frame.loc[:, keep]


# Split df into one sub-frame per feature group, keeping only the listed
# columns that actually exist in df.
df_CountVectorizer = _columns_in(df, CountVectorizer)
df_LexicalFeature = _columns_in(df, LexicalFeature)
df_LinguisticFeature = _columns_in(df, LinguisticFeature)
df_Feature = _columns_in(df, Feature)
df_HostBasedFeature = _columns_in(df, HostBasedFeature)
df_ContentBasedFeature = _columns_in(df, ContentBasedFeature)
df_Label = _columns_in(df, classLable)