# feature_selection.py (past-data-projects / life_time_value / 200-Feature_Selection / utils)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from . import woe

def calculate_iv(df, feature, qnt_num, v_type, target_var):
    """Fit a ``woe.WoE`` transformer on one feature and return its ``iv``.

    Args:
        df: DataFrame holding both ``feature`` and ``target_var`` columns.
        feature: Name of the column to evaluate.
        qnt_num: Number of quantile bins requested for continuous features.
        v_type: ``'c'`` for a continuous feature (quantile-binned with
            ``pd.qcut`` first) or ``'d'`` for a discrete feature.
        target_var: Name of the target column.

    Returns:
        The ``iv`` attribute of the fitted WoE object, or ``np.nan`` when the
        quantile bin edges for a continuous feature cannot be computed.

    Raises:
        ValueError: If ``v_type`` is neither ``'c'`` nor ``'d'``.
    """
    print(feature)
    if v_type == 'c':
        try:
            _, bins_man = pd.qcut(df[feature], qnt_num, retbins=True, duplicates='drop')
        except Exception:
            # Best-effort: a feature that cannot be quantile-binned is
            # reported as NaN instead of aborting the caller's loop.
            # ``Exception`` (not a bare except) so Ctrl-C still propagates.
            return np.nan
        wo_c = woe.WoE(qnt_num=qnt_num, v_type=v_type, bins=bins_man)
    elif v_type == 'd':
        wo_c = woe.WoE(v_type=v_type)
    else:
        raise ValueError(
            "v_type must be 'c' (continuous) or 'd' (discrete), got %r" % (v_type,)
        )

    wo_fitted = wo_c.fit(df[feature], df[target_var])
    return wo_fitted.iv


def combine_feature_ivs(df, feature_dict, qnt_num, target_var):
    """Compute the IV of every feature and return them ranked in one table.

    Args:
        df: DataFrame holding the feature columns and ``target_var``.
        feature_dict: Mapping of feature name -> variable type, passed to
            ``calculate_iv`` as ``v_type``.
        qnt_num: Number of quantile bins, forwarded to ``calculate_iv``.
        target_var: Name of the target column.

    Returns:
        DataFrame with columns ``['feature', 'iv']`` sorted by ``iv`` in
        descending order with a fresh 0..n-1 index. An empty
        ``feature_dict`` yields an empty frame with the same two columns.
    """
    rows = [
        [name, calculate_iv(df, name, qnt_num, v_type, target_var)]
        for name, v_type in feature_dict.items()
    ]
    # Naming the columns at construction keeps the empty case valid;
    # assigning two names to a zero-column frame raises a length mismatch.
    iv_df = pd.DataFrame(rows, columns=['feature', 'iv'])
    return iv_df.sort_values('iv', ascending=False).reset_index(drop=True)


def _aggregate_target(frame, group_col, target_var):
    """Return count / sum / mean of ``target_var`` for each ``group_col`` value."""
    # observed=False is explicit so that empty categorical bins are always
    # kept, independent of the pandas version's default for ``observed``.
    grouped = frame.groupby(group_col, observed=False)[target_var]
    return grouped.agg(['count', 'sum', 'mean'])


def _assign_bins(frame, feature, edges, duplicates):
    """Add a 'bin' column to ``frame`` by cutting ``feature`` on ``edges``."""
    frame['bin'] = pd.cut(frame[feature], bins=edges, duplicates=duplicates)
    if frame[feature].isna().any():
        # Only when the feature itself has NaNs is a 'missing' bin created;
        # every row without a bin at that point is moved into it.
        frame['bin'] = frame['bin'].cat.add_categories('missing').fillna('missing')
    return frame


def feat_bins_agg_comp(train_df, val_df, feature, data_type, bins, target_var, duplicates='drop'):
    """Aggregate the target per feature group/bin for a train and a val set.

    Args:
        train_df: Training DataFrame containing ``feature`` and ``target_var``.
        val_df: Validation DataFrame containing the same two columns.
        feature: Name of the feature column.
        data_type: ``'d'`` groups by the raw feature values (NaN replaced by
            the string ``'missing'``); ``'c'`` groups by quantile bins whose
            edges are computed on the training set only and reused for val.
        bins: Number of quantiles passed to ``pd.qcut`` (``'c'`` only).
        target_var: Name of the target column.
        duplicates: Forwarded to ``pd.qcut`` / ``pd.cut`` for duplicate edges.

    Returns:
        Tuple ``(train_agg, val_agg)``; each is a DataFrame indexed by
        group/bin with columns ``['count', 'sum', 'mean']`` of the target.

    Raises:
        ValueError: If ``data_type`` is neither ``'c'`` nor ``'d'``.
    """
    if data_type not in ('c', 'd'):
        raise ValueError(
            "data_type must be 'c' (continuous) or 'd' (discrete), got %r" % (data_type,)
        )

    train_viz = train_df[[feature, target_var]].reset_index(drop=True)
    val_viz = val_df[[feature, target_var]].reset_index(drop=True)

    if data_type == 'd':
        train_viz[feature] = train_viz[feature].fillna('missing')
        val_viz[feature] = val_viz[feature].fillna('missing')
        return (
            _aggregate_target(train_viz, feature, target_var),
            _aggregate_target(val_viz, feature, target_var),
        )

    # Continuous: bin edges come from the training distribution only.
    _, train_bins = pd.qcut(train_viz[feature], bins, retbins=True, duplicates=duplicates)
    # pd.cut intervals are open on the left, so nudge the lowest edge down
    # to keep the training minimum inside the first bin.
    train_bins[0] = train_bins[0] - 0.001

    # NOTE: validation values outside the training range get no bin; they
    # are dropped by the groupby unless the val feature also contains NaNs,
    # in which case they land in 'missing'.
    train_viz = _assign_bins(train_viz, feature, train_bins, duplicates)
    val_viz = _assign_bins(val_viz, feature, train_bins, duplicates)

    return (
        _aggregate_target(train_viz, 'bin', target_var),
        _aggregate_target(val_viz, 'bin', target_var),
    )


def eval_feature(train_df, val_df, feature_dict, bins, target_var):
    """Show per-bin target statistics and a train-vs-val bar chart per feature.

    For every feature the aggregated train and val tables from
    ``feat_bins_agg_comp`` are displayed, followed by a bar plot comparing
    the mean target per group/bin in both sets.

    Args:
        train_df: Training DataFrame.
        val_df: Validation DataFrame.
        feature_dict: Mapping of feature name -> data type (``'c'``/``'d'``).
        bins: Number of quantile bins for continuous features.
        target_var: Name of the target column.

    Returns:
        None. Output is printed/displayed and plotted.
    """
    # `display` is not imported in this module; it only resolves when the
    # interpreter provides it as a builtin (as IPython/Jupyter sessions do).
    # Fall back to print so the function also works in a plain interpreter.
    try:
        show = display
    except NameError:
        show = print

    for feature, data_type in feature_dict.items():
        print(feature)
        agg_df_train, agg_df_val = feat_bins_agg_comp(
            train_df, val_df, feature, data_type, bins, target_var
        )
        print('train set')
        show(agg_df_train)
        print('val set')
        show(agg_df_val)

        # Visualization: align the two mean series by group label rather
        # than by position, so a group present in only one set (e.g. a
        # 'missing' bin) yields a gap instead of a shape error or a bar
        # attributed to the wrong group.
        train_mean = agg_df_train['mean'].rename('train')
        val_mean = agg_df_val['mean'].rename('val')
        train_mean.index = train_mean.index.astype(str)
        val_mean.index = val_mean.index.astype(str)

        viz = pd.concat([train_mean, val_mean], axis=1)
        viz.plot.bar()
        plt.show()