# Malicious-URL-Detection-ML / URL Datasets / SeperateData.py

def dataSeperation(train_df,test_df):
    """Split the train and test DataFrames into per-category feature groups.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames that may contain any subset of the known feature columns.
        Columns not listed in any group are dropped; listed columns that
        are absent from the frame are silently skipped (``columns.isin``
        ignores missing names).

    Returns
    -------
    tuple[list[pandas.DataFrame], list[pandas.DataFrame]]
        Two 7-element lists ``(train_parts, test_parts)``, each ordered as:
        count-vectorizer input, lexical, linguistic, domain/obfuscation,
        host-based, content-based, class label.
    """
    # Ordered column groups; the output lists follow this order exactly.
    feature_groups = [
        # Raw domain name, intended for a CountVectorizer downstream.
        ['Domain Name'],
        # Lexical features derived from the URL string itself.
        ['URL Length', 'Is IP as Host name', 'Is .exe present', 'Is www present', 'FTP used', '.js used', 'Files in URL', 'css used', 'Digit to alphabet ratio', 'Special Char to Alphabet Ratio', 'Uppercase to LowercaseRatio', 'Domain to URL Ratio', 'Numeric Character', 'English Letters', 'Special Characters', 'Dots', 'Semicolon', 'Underscore', 'Question Mark', 'Hash Character', 'Equals', 'Percentage Character', 'Ampersand', 'Dash', 'Delimiters', 'At Character', 'Tilde', 'Double Slash', 'Is Hashed', 'TLD', 'Digit to alphabet distance', 'Https in URL', 'File Extention', 'TLD in Subdomain', 'TLD in path', 'https in host name', 'Host name length', 'Path length', 'Query length', 'Word based distribution'],
        # Linguistic / n-gram features of the URL tokens.
        ['Is English word', 'Is Meaningful', 'Is Pronounceable', 'Is random', 'Unigram', 'Bigram', 'Trigram', 'Sensitive Words'],
        # Domain-similarity / obfuscation features.
        ['Is domain suspicious', 'Levenshtein Distance', 'Entropy', 'Hyphenstring', 'Homoglyph', 'Vowel string', 'Bitsquatting', 'Insertion string', 'Omission', 'Repeatition', 'Replacement', 'Subdomain', 'Transposition', 'Addition string', 'Google Search Feature'],
        # Host-based (WHOIS / ASN) features.
        ['IP Address', 'ASN Number', 'ASN Country Code', 'ASN CIDR', 'ASN Postal Code', 'ASN creation date', 'ASN updation date'],
        # Content-based features extracted from the fetched page.
        ['Total images in webpage', 'Total links', 'Number of parameter', 'Number of fragments', 'Body tags in source', 'Meta tag in source', 'Div tag in source', 'Fake link in status bar', 'Right click disable', 'Popup window', 'mailto: present', 'Frame tag present', 'Is title tag empty', 'Eval() function', 'Escape() function', 'Exec() Function', 'Search() function', 'Is image only in webpage', 'Domain Age in Days'],
        # Class label.
        ['Label'],
    ]

    def _split(df):
        # Select each group's columns; order within a sub-frame follows
        # df's own column order, as with the original .loc/isin selection.
        return [df.loc[:, df.columns.isin(group)] for group in feature_groups]

    return _split(train_df), _split(test_df)