"""Feature screening helpers.

Ranks features by the ``iv`` attribute of a fitted ``woe.WoE`` object and
compares per-level / per-bin target statistics between a training and a
validation frame.
"""

import builtins

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from . import woe


def calculate_iv(df, feature, qnt_num, v_type, target_var):
    """Fit ``woe.WoE`` on one feature and return its ``iv`` attribute.

    Args:
        df: DataFrame holding both ``feature`` and ``target_var`` columns.
        feature: Name of the feature column.
        qnt_num: Number of quantiles used to bin a continuous feature.
        v_type: ``'c'`` for a continuous feature (quantile-binned first) or
            ``'d'`` for a discrete one.
        target_var: Name of the target column.

    Returns:
        The ``iv`` attribute of the object returned by ``WoE.fit`` (presumably
        the information value -- confirm against the ``woe`` module), or
        ``np.nan`` when quantile binning of a continuous feature fails.

    Raises:
        ValueError: If ``v_type`` is neither ``'c'`` nor ``'d'``.
    """
    print(feature)
    if v_type == 'c':
        try:
            _, bin_edges = pd.qcut(
                df[feature], qnt_num, retbins=True, duplicates='drop')
        except Exception:
            # Deliberate best effort: a feature that cannot be binned gets a
            # NaN score instead of aborting the scan. ``Exception`` (rather
            # than a bare ``except``) lets KeyboardInterrupt/SystemExit
            # propagate.
            return np.nan
        encoder = woe.WoE(qnt_num=qnt_num, v_type=v_type, bins=bin_edges)
    elif v_type == 'd':
        encoder = woe.WoE(v_type=v_type)
    else:
        # Previously this path fell through to an UnboundLocalError.
        raise ValueError(
            "v_type must be 'c' or 'd', got {!r}".format(v_type))

    fitted = encoder.fit(df[feature], df[target_var])
    return fitted.iv


def combine_feature_ivs(df, feature_dict, qnt_num, target_var):
    """Score every feature with ``calculate_iv`` and rank the results.

    Args:
        df: DataFrame holding all features and ``target_var``.
        feature_dict: Mapping of feature name -> variable type (``'c'`` or
            ``'d'``), passed to ``calculate_iv`` as ``v_type``.
        qnt_num: Number of quantiles for continuous features.
        target_var: Name of the target column.

    Returns:
        DataFrame with columns ``feature`` and ``iv``, sorted by ``iv`` in
        descending order with a fresh 0..n-1 index. An empty ``feature_dict``
        yields an empty frame that still has both columns.
    """
    rows = [
        [feature, calculate_iv(df, feature, qnt_num, v_type, target_var)]
        for feature, v_type in feature_dict.items()
    ]
    # Naming the columns in the constructor keeps the empty case valid;
    # assigning two names to a zero-column frame raises ValueError.
    iv_df = pd.DataFrame(rows, columns=['feature', 'iv'])
    return iv_df.sort_values('iv', ascending=False).reset_index(drop=True)


def _aggregate_target(frame, key, target_var):
    """Return count / sum / mean of ``target_var`` for each group of ``key``."""
    # observed=False keeps categorical bins that contain no rows, so the train
    # and validation tables built from the same bin edges list the same bins.
    grouped = frame.groupby(key, observed=False)[target_var]
    return grouped.agg(['count', 'sum', 'mean'])


def _fill_missing_bin(binned, source):
    """Add a ``'missing'`` bin when ``source`` contains NaN values.

    NOTE: once the category is added, *every* unbinned row is filled, which
    also covers values lying outside the bin edges, not only NaN inputs.
    """
    if source.isna().any():
        binned = binned.cat.add_categories('missing').fillna('missing')
    return binned


def feat_bins_agg_comp(train_df, val_df, feature, data_type, bins, target_var,
                       duplicates='drop'):
    """Aggregate the target per feature level (or bin) for train and val.

    Args:
        train_df: Training DataFrame.
        val_df: Validation DataFrame.
        feature: Name of the feature column.
        data_type: ``'d'`` groups by the raw feature values; ``'c'`` groups by
            quantile bins whose edges are computed on the training data only.
        bins: Number of quantiles (or quantile list) passed to ``pd.qcut``;
            used only when ``data_type`` is ``'c'``.
        target_var: Name of the target column.
        duplicates: Forwarded to ``pd.qcut`` / ``pd.cut`` to control handling
            of non-unique bin edges.

    Returns:
        Tuple ``(train_agg, val_agg)`` of DataFrames with columns ``count``,
        ``sum`` and ``mean`` of the target, indexed by feature level or bin.

    Raises:
        ValueError: If ``data_type`` is neither ``'c'`` nor ``'d'``.
    """
    # Column selection + reset_index yields new frames, so the caller's data
    # is never modified by the assignments below.
    train_viz = train_df[[feature, target_var]].reset_index(drop=True)
    val_viz = val_df[[feature, target_var]].reset_index(drop=True)

    if data_type == 'd':
        for frame in (train_viz, val_viz):
            frame[feature] = frame[feature].fillna('missing')
        group_key = feature
    elif data_type == 'c':
        _, train_bins = pd.qcut(
            train_viz[feature], bins, retbins=True, duplicates=duplicates)
        # pd.cut intervals are open on the left, so nudge the lowest edge down
        # to keep the training minimum inside the first bin.
        train_bins[0] = train_bins[0] - 0.001
        for frame in (train_viz, val_viz):
            # Validation rows are cut with the *training* edges; values
            # outside those edges come back from pd.cut as NaN.
            binned = pd.cut(
                frame[feature], bins=train_bins, duplicates=duplicates)
            frame['bin'] = _fill_missing_bin(binned, frame[feature])
        group_key = 'bin'
    else:
        # Previously the raw, un-aggregated frames were returned here.
        raise ValueError(
            "data_type must be 'c' or 'd', got {!r}".format(data_type))

    return (
        _aggregate_target(train_viz, group_key, target_var),
        _aggregate_target(val_viz, group_key, target_var),
    )


def _align_means(train_agg, val_agg):
    """Build a ``train`` / ``val`` frame of target means aligned by level.

    Levels keep the training order; levels seen only in validation are
    appended. A level absent from one side gets NaN there.
    """
    train_mean = dict(zip(train_agg.index, train_agg['mean']))
    val_mean = dict(zip(val_agg.index, val_agg['mean']))

    levels = list(train_agg.index)
    known = set(levels)
    levels.extend(level for level in val_agg.index if level not in known)

    return pd.DataFrame(
        {
            'train': [train_mean.get(level, np.nan) for level in levels],
            'val': [val_mean.get(level, np.nan) for level in levels],
        },
        index=[str(level) for level in levels],
        columns=['train', 'val'],
    )


def eval_feature(train_df, val_df, feature_dict, bins, target_var):
    """Print and plot train vs. validation target statistics per feature.

    For every feature the aggregated tables from ``feat_bins_agg_comp`` are
    shown, followed by a bar chart comparing the mean target per level / bin.

    Args:
        train_df: Training DataFrame.
        val_df: Validation DataFrame.
        feature_dict: Mapping of feature name -> data type (``'c'`` or
            ``'d'``).
        bins: Number of quantile bins for continuous features.
        target_var: Name of the target column.

    Returns:
        None. Output goes to stdout / the rich display and to matplotlib.
    """
    # ``display`` is never imported in this module: it only resolves when the
    # interactive shell has installed it as a builtin (IPython/Jupyter does).
    # Fall back to ``print`` so the function also runs in a plain interpreter.
    show = getattr(builtins, 'display', print)

    for feature, data_type in feature_dict.items():
        print(feature)
        agg_df_train, agg_df_val = feat_bins_agg_comp(
            train_df, val_df, feature, data_type, bins, target_var)
        print('train set')
        show(agg_df_train)
        print('val set')
        show(agg_df_val)

        # Visualization: align by level instead of by position, so differing
        # level sets neither crash nor put a bar under the wrong label.
        viz = _align_means(agg_df_train, agg_df_val)
        viz.plot.bar()
        plt.show()