{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2019-04-10T08:28:53.974518Z",
"start_time": "2019-04-10T08:28:53.189985Z"
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import pickle\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import warnings\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Original Data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2019-04-10T08:28:54.090753Z",
"start_time": "2019-04-10T08:28:54.079572Z"
}
},
"outputs": [],
"source": [
"base_df = pd.read_parquet('data/interim/pl_jumbo_mini_junjulaugsep_27012020.parquet')\\\n",
" .rename(columns={'loan_amount':'amount'})\n",
"pl_feats = pd.read_parquet('data/interim/pl_model_v3_feats_27012020.parquet')\n",
"\n",
"# REMOVE OLD util_non_pl\n",
"pl_feats = pl_feats.drop(columns=['util_non_pl'])\n",
"# ATTACH NEW util_non_pl\n",
"fixed_util_non_pl = pd.read_parquet('data/interim/fixed_util_non_pl.parquet')\n",
"pl_feats = pl_feats.merge(fixed_util_non_pl, how='left', on=['user_id', 'trx_id'])\n",
"\n",
"pf_feats = pd.read_parquet('data/interim/pl_model_v3_pefindo_feats_27012020.parquet')\\\n",
" .drop(columns=['transaction_date'])\n",
"new_feats = pd.read_parquet('data/interim/pl_model_v3_new_feats_27012020.parquet')\n",
"dgtl_trx_feats = pd.read_parquet('data/interim/pl_model_v3_dgtl_trx_feats_18022020.parquet')\n",
"new_trx_feats = pd.read_parquet('data/interim/pl_model_v3_new_trx_feats_27012020.parquet')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(150672, 18)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(150672, 28)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(150672, 9)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(150672, 85)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(150672, 56)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(150672, 30)"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(base_df.shape)\n",
"display(pl_feats.shape)\n",
"display(pf_feats.shape)\n",
"display(new_feats.shape)\n",
"display(dgtl_trx_feats.shape)\n",
"display(new_trx_feats.shape)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"raw_df = base_df.merge(pl_feats, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(pf_feats, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(new_feats, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(dgtl_trx_feats, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(new_trx_feats, how='left', on=['user_id', 'trx_id'])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"rename_feats = {\n",
" 'denied_trx_co_3mo':'trx_denied_co_90',\n",
" 'delin_max_dpd_3mo':'delin_max_dpd_90',\n",
" 'pl_trx_co_jumbomini_4_90':'pl_trx_suc_co_90',\n",
" 'settle_to_due_last_pl_180':'pl_settle_to_due_last_180',\n",
" 'settled_trx_sum_1mo':'trx_sett_sum_30',\n",
" 'td_date':'date_of_month',\n",
" 'time_approve_to_pl_hour':'time_appr_to_pl_hour',\n",
" 'time_from_prev_pl_settle':'time_from_last_sett_pl_hour'\n",
"}\n",
"\n",
"raw_df = raw_df.rename(columns=rename_feats)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(150672, 216)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_df.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Feature Transformations"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"raw_df['payment_type'] = np.where(raw_df['payment_type'] == '3_months', 0, 1)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# NaN values in time_approve_to_pl_hour are caused by missing approval time in DB\n",
"# fill with 0. (There are only 4 cases)\n",
"raw_df['time_appr_to_pl_hour'] = raw_df['time_appr_to_pl_hour'].fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Negative values caused by error in loan table. User's previous pl\n",
"# settlement date is after the current pl transaction date.\n",
"# Replace with median values. (There are only 2 cases)\n",
"neg_index = raw_df[raw_df['time_from_last_sett_pl_hour'] < 0].index\n",
"med_val = raw_df['time_from_last_sett_pl_hour'].median()\n",
"raw_df.loc[neg_index, 'time_from_last_sett_pl_hour'] = med_val"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### flag_bad Definition"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"raw_df['flag_bad'] = np.where(raw_df['max_dpd'] > 60, 1, 0)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2019-04-10T08:28:54.096002Z",
"start_time": "2019-04-10T08:28:54.094053Z"
}
},
"outputs": [],
"source": [
"# condlist = [raw_df['max_dpd']<8, raw_df['max_dpd']>=61]\n",
"# choicelist = [0, 1]\n",
"# raw_df['flag_bad'] = np.select(condlist, choicelist, default=2)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 144230\n",
"1 6442\n",
"Name: flag_bad, dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_df['flag_bad'].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Rejected Users Data"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"rej_base = pd.read_parquet('data/pl_rejected_users_junjulaugsep_04022020.parquet')\\\n",
" .rename(columns={'transaction_id':'trx_id'})\n",
"rej_feats = pd.read_parquet('data/pl_rejected_users_feats_04022020.parquet')\n",
"# REMOVE util_non_pl & nondgtl_nonpl_trx_co_180\n",
"rej_feats = rej_feats.drop(columns=['util_non_pl', 'nondgtl_nonpl_trx_co_180'])\n",
"# ATTACH fixed feats\n",
"fixed_rej_util = pd.read_parquet('data/fixed_pl_rejected_util_non_pl.parquet')\n",
"fixed_rej_nondgtl = pd.read_parquet('data/fixed_pl_rejected_dgtl_trx_feats.parquet')\n",
"rej_feats = rej_feats.merge(fixed_rej_util[['user_id', 'trx_id', 'util_non_pl']], how='left', \n",
" on=['user_id', 'trx_id'])\\\n",
" .merge(fixed_rej_nondgtl[['user_id', 'trx_id', 'nondgtl_nonpl_trx_co_180']], how='left', \n",
" on=['user_id', 'trx_id'])\n",
"\n",
"rej_pef_feats = pd.read_parquet('data/pl_rejected_users_pefindo_feats_04022020.parquet')\\\n",
" .rename(columns={'transaction_id':'trx_id'})"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"cols_to_drop = [\n",
" 'rule_name', 'result', 'remarks',\n",
" 'transaction_date', 'amount', 'performance_window', 'max_dpd'\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"rej_feats = rej_feats.drop(columns=cols_to_drop)\n",
"rej_pef_feats = rej_pef_feats.drop(columns=['transaction_date'])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['trx_id', 'user_id', 'rule_name', 'result', 'remarks',\n",
" 'transaction_date', 'amount', 'payment_type', 'performance_window',\n",
" 'max_dpd'],\n",
" dtype='object')"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rej_base.columns"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 10)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(16190, 191)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(16190, 9)"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(rej_base.shape)\n",
"display(rej_feats.shape)\n",
"display(rej_pef_feats.shape)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"rej_raw = rej_base.merge(rej_feats, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(rej_pef_feats, how='left', on=['user_id', 'trx_id'])"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"rename_feats = {\n",
" 'denied_trx_co_3mo':'trx_denied_co_90',\n",
" 'delin_max_dpd_3mo':'delin_max_dpd_90',\n",
" 'pl_trx_co_jumbomini_4_90':'pl_trx_suc_co_90',\n",
" 'settle_to_due_last_pl_180':'pl_settle_to_due_last_180',\n",
" 'settled_trx_sum_1mo':'trx_sett_sum_30',\n",
" 'td_date':'date_of_month',\n",
" 'time_approve_to_pl_hour':'time_appr_to_pl_hour',\n",
" 'time_from_prev_pl_settle':'time_from_last_sett_pl_hour'\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"rej_raw = rej_raw.rename(columns=rename_feats)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 206)"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rej_raw.shape"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"15307"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rej_raw['user_id'].nunique()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Feature Transformations"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"rej_raw['payment_type'] = np.where(rej_raw['payment_type'] == '3_months', 0, 1)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# NaN values in time_approve_to_pl_hour are caused by missing approval time in DB\n",
"# fill with 0. (There are only 4 cases)\n",
"rej_raw['time_appr_to_pl_hour'] = rej_raw['time_appr_to_pl_hour'].fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Negative values caused by error in loan table. User's previous pl\n",
"# settlement date is after the current pl transaction date.\n",
"# Replace with median values. (There are only 2 cases)\n",
"neg_index = rej_raw[rej_raw['time_from_last_sett_pl_hour'] < 0].index\n",
"med_val = rej_raw['time_from_last_sett_pl_hour'].median()\n",
"rej_raw.loc[neg_index, 'time_from_last_sett_pl_hour'] = med_val"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### flag_bad Definition"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"rej_raw['flag_non_trx'] = rej_raw['max_dpd'].isna()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"677"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rej_raw['max_dpd'].isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.04181593576281655"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rej_raw['max_dpd'].isna().sum()/rej_raw.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# users without any outstanding trx (max_dpd is null)\n",
"# assign as a good user\n",
"rej_raw['flag_bad'] = np.where(rej_raw['max_dpd'] > 60, 1, 0)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 14755\n",
"1 1435\n",
"Name: flag_bad, dtype: int64"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rej_raw['flag_bad'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# condlist = [rej_raw['max_dpd']<=60, rej_raw['max_dpd']>=61, rej_raw['max_dpd'].isna()]\n",
"# choicelist = [0, 1, np.nan]\n",
"# rej_raw['flag_bad'] = np.select(condlist, choicelist, default=2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Concat Original Data with Rejected Users Data"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"features = ['ap_co_plo',\n",
" 'current_dpd',\n",
" 'date_of_month',\n",
" 'delin_max_dpd_90',\n",
" 'nondgtl_nonpl_trx_co_180',\n",
" 'oth_last_rep_days',\n",
" 'oth_last_rep_dpd',\n",
" 'payment_type',\n",
" 'pf_delin_max_dpd_12mo',\n",
" 'pl_settle_to_due_last_180',\n",
" 'pl_trx_suc_co_90',\n",
" 'time_appr_to_pl_hour',\n",
" 'time_from_last_sett_pl_hour',\n",
" 'trx_denied_co_90',\n",
" 'trx_sett_sum_30',\n",
" 'util_non_pl',\n",
" 'util_pl'\n",
" ]\n",
"\n",
"features = sorted(features)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"context_cols = [\n",
" 'user_id', 'trx_id', 'transaction_date', 'max_dpd', 'flag_bad', 'performance_window', 'amount'\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(150672, 24)"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_df[context_cols + features].shape"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 24)"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rej_raw[context_cols + features].shape"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"calib_df = pd.concat([raw_df[context_cols + features], rej_raw[context_cols + features]])\\\n",
" .reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(166862, 24)"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calib_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"calib_df['flag_no_trx'] = calib_df['max_dpd'].isna()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False 166171\n",
"True 691\n",
"Name: flag_no_trx, dtype: int64"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calib_df['flag_no_trx'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"calib_df['flag_rejected'] = calib_df['trx_id'].isin(rej_raw['trx_id'])"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False 150672\n",
"True 16190\n",
"Name: flag_rejected, dtype: int64"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calib_df['flag_rejected'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 158985\n",
"1 7877\n",
"Name: flag_bad, dtype: int64"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calib_df['flag_bad'].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Simple Analysis"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.04275512371243496"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calib_df[calib_df['flag_rejected'] == False]['flag_bad'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3773292.317086121"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calib_df[calib_df['flag_rejected'] == False]['amount'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.08863495985176034"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calib_df[calib_df['flag_rejected'] == True]['flag_bad'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5864175.416924027"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calib_df[calib_df['flag_rejected'] == True]['amount'].mean()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Input Model Proba Score and Calibrate"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"mod_lgb = pickle.load(open('lgbm_pl_score_v3.p', 'rb'))"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1079"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mod_lgb.best_iteration_"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"calib_df['proba_v3'] = mod_lgb.predict_proba(calib_df[features], \n",
" num_iteration=mod_lgb.best_iteration_\n",
" )[:,1]"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"calib_df['proba_v3'] = calib_df['proba_v3'].apply(lambda x: 0.0045+0.622*x)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get V2 Score"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"import pymysql\n",
"import configparser\n",
"config = configparser.ConfigParser()\n",
"config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
"\n",
"host = config['MYSQL-ROOT']['HOST']\n",
"user = config['MYSQL-ROOT']['USER']\n",
"password = config['MYSQL-ROOT']['PASSWORD']\n",
"\n",
"def connect_sql():\n",
" cnx = pymysql.connect(host=host,\n",
" user=user,\n",
" password=password,\n",
" cursorclass=pymysql.cursors.DictCursor)\n",
" return cnx"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"tids = tuple(calib_df['trx_id'])\n",
"query = f'''\n",
"SELECT trx_id, user_id, score, feature, calculation_date\n",
"FROM ds.b_score_pl_log\n",
"WHERE trx_id IN {tids}\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"pl_v2 = pd.read_sql(query, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(167533, 5)"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_v2.shape"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(166862, 27)"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calib_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"# drop duplicates\n",
"pl_v2 = pl_v2.sort_values(by=['trx_id', 'user_id', 'calculation_date'])\\\n",
" .drop_duplicates(subset=['trx_id'], keep='last')"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(165532, 5)"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_v2.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Update Dataset with Recalculated PL Scores\n",
"There were random cases where users PL score were not calculated (caused by a bug). These users were granted a PL regardless they passed the PL score or not. The PLV2 score for these users must be recalculated to get a full picture. These users are denoted by \"score = -1\""
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"recalc = pl_v2[pl_v2['score'] == -1][['trx_id', 'user_id', 'score', 'feature']].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"nan = np.nan\n",
"recalc['feature'] = recalc.apply(lambda x: eval(x['feature']), axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"recalc_feats = pd.DataFrame(recalc['feature'].tolist(), index=recalc['trx_id']).reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"feats_used = ['ap_co_plo',\n",
" 'calibrated_final',\n",
" 'current_dpd',\n",
" 'delin_max_dpd_3mo',\n",
" 'delin_max_dpd_amt_3mo',\n",
" 'flag_bad_pefindo',\n",
" 'flag_good_pefindo',\n",
" 'os_amount',\n",
" 'oth_first_trx_amount',\n",
" 'oth_last_rep_channel',\n",
" 'oth_last_rep_days',\n",
" 'oth_last_rep_dpd',\n",
" 'oth_last_trx_amount',\n",
" 'pf_con_open_count_12mo',\n",
" 'pf_delin_dist_contractcode_30_dpd_3mo',\n",
" 'pf_delin_max_dpd_12mo',\n",
" 'pf_util_creditcard_avg_12mo',\n",
" 'time_approve_to_pl_hour',\n",
" 'util_non_pl',\n",
" 'util_pl']"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"mod_xgb = pickle.load(open('xgb_pl_model_v2.p', 'rb'))"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"recalc_feats['recalc_score'] = mod_xgb.predict_proba(recalc_feats[feats_used],\n",
" ntree_limit=mod_xgb.best_iteration)[:,1]"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"comb_temp = pl_v2.set_index('trx_id')"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"comb_temp.loc[comb_temp['score'] == -1, 'score'] = recalc_feats.set_index('trx_id')['recalc_score']"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"comb_temp = comb_temp.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(165532, 5)"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"comb_temp.shape"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"pl_v2 = comb_temp.copy()"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(165532, 5)"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_v2.shape"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"calib_df = calib_df.merge(pl_v2[['trx_id', 'user_id', 'score']], how='left', on=['trx_id', 'user_id'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Simulate Rejection\n",
"Simulate Rejection on September dataset only."
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"calib_df['bad_amount'] = calib_df['flag_bad']*calib_df['amount']"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9 48047\n",
"8 45172\n",
"7 42207\n",
"6 31436\n",
"Name: transaction_date, dtype: int64"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calib_df['transaction_date'].dt.month.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"calib_sep = calib_df[calib_df['transaction_date'].dt.month == 9].reset_index(drop=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Sanity Check"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"calib_sep['check_bins'] = pd.qcut(calib_sep['proba_v3'], q=10)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe thead tr:last-of-type th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th></th>\n",
" <th colspan=\"3\" halign=\"left\">flag_bad</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>sum</th>\n",
" <th>mean</th>\n",
" </tr>\n",
" <tr>\n",
" <th>flag_rejected</th>\n",
" <th>check_bins</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"10\" valign=\"top\">False</th>\n",
" <th>(0.00455, 0.00859]</th>\n",
" <td>4805.0</td>\n",
" <td>10.0</td>\n",
" <td>0.002081</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.00859, 0.0111]</th>\n",
" <td>4805.0</td>\n",
" <td>23.0</td>\n",
" <td>0.004787</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0111, 0.0144]</th>\n",
" <td>4804.0</td>\n",
" <td>51.0</td>\n",
" <td>0.010616</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0144, 0.019]</th>\n",
" <td>4803.0</td>\n",
" <td>80.0</td>\n",
" <td>0.016656</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.019, 0.0251]</th>\n",
" <td>4805.0</td>\n",
" <td>116.0</td>\n",
" <td>0.024142</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0251, 0.0341]</th>\n",
" <td>4790.0</td>\n",
" <td>185.0</td>\n",
" <td>0.038622</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0341, 0.047]</th>\n",
" <td>4766.0</td>\n",
" <td>248.0</td>\n",
" <td>0.052035</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.047, 0.0702]</th>\n",
" <td>4670.0</td>\n",
" <td>340.0</td>\n",
" <td>0.072805</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0702, 0.117]</th>\n",
" <td>4272.0</td>\n",
" <td>460.0</td>\n",
" <td>0.107678</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.117, 0.493]</th>\n",
" <td>3108.0</td>\n",
" <td>529.0</td>\n",
" <td>0.170206</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"10\" valign=\"top\">True</th>\n",
" <th>(0.00455, 0.00859]</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.00859, 0.0111]</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0111, 0.0144]</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0144, 0.019]</th>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.019, 0.0251]</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0251, 0.0341]</th>\n",
" <td>14.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0341, 0.047]</th>\n",
" <td>39.0</td>\n",
" <td>2.0</td>\n",
" <td>0.051282</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.047, 0.0702]</th>\n",
" <td>134.0</td>\n",
" <td>8.0</td>\n",
" <td>0.059701</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0702, 0.117]</th>\n",
" <td>533.0</td>\n",
" <td>47.0</td>\n",
" <td>0.088180</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.117, 0.493]</th>\n",
" <td>1697.0</td>\n",
" <td>242.0</td>\n",
" <td>0.142605</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" flag_bad \n",
" count sum mean\n",
"flag_rejected check_bins \n",
"False (0.00455, 0.00859] 4805.0 10.0 0.002081\n",
" (0.00859, 0.0111] 4805.0 23.0 0.004787\n",
" (0.0111, 0.0144] 4804.0 51.0 0.010616\n",
" (0.0144, 0.019] 4803.0 80.0 0.016656\n",
" (0.019, 0.0251] 4805.0 116.0 0.024142\n",
" (0.0251, 0.0341] 4790.0 185.0 0.038622\n",
" (0.0341, 0.047] 4766.0 248.0 0.052035\n",
" (0.047, 0.0702] 4670.0 340.0 0.072805\n",
" (0.0702, 0.117] 4272.0 460.0 0.107678\n",
" (0.117, 0.493] 3108.0 529.0 0.170206\n",
"True (0.00455, 0.00859] NaN NaN NaN\n",
" (0.00859, 0.0111] NaN NaN NaN\n",
" (0.0111, 0.0144] NaN NaN NaN\n",
" (0.0144, 0.019] 2.0 0.0 0.000000\n",
" (0.019, 0.0251] NaN NaN NaN\n",
" (0.0251, 0.0341] 14.0 0.0 0.000000\n",
" (0.0341, 0.047] 39.0 2.0 0.051282\n",
" (0.047, 0.0702] 134.0 8.0 0.059701\n",
" (0.0702, 0.117] 533.0 47.0 0.088180\n",
" (0.117, 0.493] 1697.0 242.0 0.142605"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calib_sep.groupby(['flag_rejected', 'check_bins']).agg({'flag_bad':['count', 'sum', 'mean']})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Same Amount Rejection"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"23277500000.0"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# the new threshold should reject the same amount\n",
"# as shown here\n",
"calib_sep[calib_sep['score'] > 0.25]['amount'].sum()"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.11400 -> 23,326,000,000.0\n",
"0.11401 -> 23,326,000,000.0\n",
"0.11402 -> 23,325,000,000.0\n",
"0.11403 -> 23,325,000,000.0\n",
"0.11404 -> 23,312,000,000.0\n",
"0.11405 -> 23,312,000,000.0\n",
"0.11406 -> 23,312,000,000.0\n",
"0.11407 -> 23,312,000,000.0\n",
"0.11408 -> 23,312,000,000.0\n",
"0.11409 -> 23,311,000,000.0\n",
"0.11410 -> 23,311,000,000.0\n",
"0.11411 -> 23,311,000,000.0\n",
"0.11412 -> 23,311,000,000.0\n",
"0.11413 -> 23,310,000,000.0\n",
"0.11414 -> 23,310,000,000.0\n",
"0.11415 -> 23,310,000,000.0\n",
"0.11416 -> 23,310,000,000.0\n",
"0.11417 -> 23,310,000,000.0\n",
"0.11418 -> 23,310,000,000.0\n",
"0.11419 -> 23,300,500,000.0\n",
"0.11420 -> 23,296,500,000.0\n",
"0.11421 -> 23,295,000,000.0\n",
"0.11422 -> 23,284,500,000.0\n",
"0.11423 -> 23,284,500,000.0\n",
"0.11424 -> 23,284,500,000.0\n",
"0.11425 -> 23,269,500,000.0\n",
"0.11426 -> 23,265,000,000.0\n",
"0.11427 -> 23,251,000,000.0\n",
"0.11428 -> 23,251,000,000.0\n",
"0.11429 -> 23,233,500,000.0\n",
"0.11430 -> 23,224,500,000.0\n",
"0.11431 -> 23,221,000,000.0\n",
"0.11432 -> 23,221,000,000.0\n",
"0.11433 -> 23,217,000,000.0\n",
"0.11434 -> 23,215,500,000.0\n",
"0.11435 -> 23,213,000,000.0\n",
"0.11436 -> 23,213,000,000.0\n",
"0.11437 -> 23,198,000,000.0\n",
"0.11438 -> 23,190,000,000.0\n",
"0.11439 -> 23,187,000,000.0\n",
"0.11440 -> 23,182,500,000.0\n",
"0.11441 -> 23,182,500,000.0\n",
"0.11442 -> 23,182,500,000.0\n",
"0.11443 -> 23,181,500,000.0\n",
"0.11444 -> 23,181,500,000.0\n",
"0.11445 -> 23,181,500,000.0\n",
"0.11446 -> 23,181,500,000.0\n",
"0.11447 -> 23,181,500,000.0\n",
"0.11448 -> 23,181,500,000.0\n",
"0.11449 -> 23,175,500,000.0\n",
"0.11450 -> 23,175,500,000.0\n"
]
}
],
"source": [
"candidates = [x for x in np.arange(.114, .1145, .00001)]\n",
"\n",
"for candidate in candidates:\n",
" print(f\"{candidate:.5f} -> {calib_sep[calib_sep['proba_v3'] > candidate]['amount'].sum():,}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- 0.11424 -> 23,284,500,000.0\n",
"- 0.11425 -> 23,269,500,000.0\n",
"- 0.11426 -> 23,265,000,000.0\n",
"- 0.11427 -> 23,251,000,000.0"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Same Amount Rejection Comparisons New Vs Old"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Old Rejected Amount: 23277500000.0\n",
"New Rejected Amount: 23269500000.0\n",
"\n",
"Old Rejected Amount Perc: 0.12608774547906842\n",
"New Rejected Amount Perc: 0.12604441170336947\n",
"\n",
"Old Rejected Bad Amount: 3139500000.0\n",
"New Rejected Bad Amount: 3733500000.0\n",
"\n",
"Old percent of trx rejected: 0.09538576810206673\n",
"New percent of trx rejected: 0.10439777717651466\n",
"\n",
"=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n",
"\n",
"Count Baseline ODR: 0.04872312527317002\n",
"\n",
"Old Count ODR: 0.03948912668277529\n",
"New Count ODR: 0.03576491366689131\n",
"\n",
"Old Rejected Count ODR: 0.13637355444032292\n",
"New Rejected Count ODR: 0.1598883572567783\n",
"\n",
"=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n",
"\n",
"Amount Baseline ODR: 0.05904497775081454\n",
"\n",
"Old Bad Amount ODR: 0.04203917914995382\n",
"New Bad Amount ODR: 0.038821646304306026\n",
"\n",
"Old Rejected Bad Amount ODR: 0.017005798600860718\n",
"New Rejected Bad Amount ODR: 0.020223331446508515\n",
"\n"
]
}
],
"source": [
"thresh_old = 0.25\n",
"thresh_new = 0.11425\n",
"\n",
"# compare amount\n",
"print('Old Rejected Amount:',calib_sep[calib_sep['score'] > thresh_old]['amount'].sum())\n",
"print('New Rejected Amount:',calib_sep[calib_sep['proba_v3'] > thresh_new]['amount'].sum())\n",
"print()\n",
"\n",
"# # compare Amount ODR\n",
"# print('Old Amount ODR:',calib_sep[calib_sep['score'] <= thresh_old]['amount'].sum() / calib_sep['amount'].sum())\n",
"# print('New Amount ODR:',calib_sep[calib_sep['proba_v3'] <= thresh_new]['amount'].sum() / calib_sep['amount'].sum())\n",
"# print()\n",
"\n",
"# compare Rejected Amount ODR\n",
"print('Old Rejected Amount Perc:',calib_sep[calib_sep['score'] > thresh_old]['amount'].sum() / calib_sep['amount'].sum())\n",
"print('New Rejected Amount Perc:',calib_sep[calib_sep['proba_v3'] > thresh_new]['amount'].sum() / calib_sep['amount'].sum())\n",
"print()\n",
"\n",
"# compare bad amount\n",
"print('Old Rejected Bad Amount:',calib_sep[calib_sep['score'] > thresh_old]['bad_amount'].sum())\n",
"print('New Rejected Bad Amount:',calib_sep[calib_sep['proba_v3'] > thresh_new]['bad_amount'].sum())\n",
"print()\n",
"\n",
"# compare percent of trx rejected\n",
"print('Old percent of trx rejected:', calib_sep[calib_sep['score'] > thresh_old].shape[0] / calib_sep.shape[0])\n",
"print('New percent of trx rejected:', calib_sep[calib_sep['proba_v3'] > thresh_new].shape[0] / calib_sep.shape[0])\n",
"print()\n",
"\n",
"print('=-'*55)\n",
"print()\n",
"\n",
"# Count Baseline ODR\n",
"print('Count Baseline ODR:',calib_sep['flag_bad'].mean())\n",
"print()\n",
"\n",
"# compare Count ODR\n",
"print('Old Count ODR:',calib_sep[calib_sep['score'] <= thresh_old]['flag_bad'].mean())\n",
"print('New Count ODR:',calib_sep[calib_sep['proba_v3'] <= thresh_new]['flag_bad'].mean())\n",
"print()\n",
"\n",
"# compare Rejected Count ODR\n",
"print('Old Rejected Count ODR:',calib_sep[calib_sep['score'] > thresh_old]['flag_bad'].mean())\n",
"print('New Rejected Count ODR:',calib_sep[calib_sep['proba_v3'] > thresh_new]['flag_bad'].mean())\n",
"print()\n",
"\n",
"print('=-'*55)\n",
"print()\n",
"\n",
"# Amount Baseline ODR\n",
"print('Amount Baseline ODR:',calib_sep['bad_amount'].sum() / calib_sep['amount'].sum())\n",
"print()\n",
"\n",
"# compare Bad Amount ODR\n",
"print('Old Bad Amount ODR:',calib_sep[calib_sep['score'] <= thresh_old]['bad_amount'].sum() / calib_sep['amount'].sum())\n",
"print('New Bad Amount ODR:',calib_sep[calib_sep['proba_v3'] <= thresh_new]['bad_amount'].sum() / calib_sep['amount'].sum())\n",
"print()\n",
"\n",
"# compare Rejected Bad Amount ODR\n",
"print('Old Rejected Bad Amount ODR:',calib_sep[calib_sep['score'] > thresh_old]['bad_amount'].sum() / calib_sep['amount'].sum())\n",
"print('New Rejected Bad Amount ODR:',calib_sep[calib_sep['proba_v3'] > thresh_new]['bad_amount'].sum() / calib_sep['amount'].sum())\n",
"print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Make sure the numbers increase per quantiles**"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"calib_sep['bins_v3'] = pd.qcut(calib_sep['proba_v3'], q=10)"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe thead tr:last-of-type th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th colspan=\"3\" halign=\"left\">flag_bad</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>sum</th>\n",
" <th>mean</th>\n",
" </tr>\n",
" <tr>\n",
" <th>bins_v3</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>(0.00455, 0.00859]</th>\n",
" <td>4805</td>\n",
" <td>10</td>\n",
" <td>0.002081</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.00859, 0.0111]</th>\n",
" <td>4805</td>\n",
" <td>23</td>\n",
" <td>0.004787</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0111, 0.0144]</th>\n",
" <td>4804</td>\n",
" <td>51</td>\n",
" <td>0.010616</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0144, 0.019]</th>\n",
" <td>4805</td>\n",
" <td>80</td>\n",
" <td>0.016649</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.019, 0.0251]</th>\n",
" <td>4805</td>\n",
" <td>116</td>\n",
" <td>0.024142</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0251, 0.0341]</th>\n",
" <td>4804</td>\n",
" <td>185</td>\n",
" <td>0.038510</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0341, 0.047]</th>\n",
" <td>4805</td>\n",
" <td>250</td>\n",
" <td>0.052029</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.047, 0.0702]</th>\n",
" <td>4804</td>\n",
" <td>348</td>\n",
" <td>0.072440</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0702, 0.117]</th>\n",
" <td>4805</td>\n",
" <td>507</td>\n",
" <td>0.105515</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.117, 0.493]</th>\n",
" <td>4805</td>\n",
" <td>771</td>\n",
" <td>0.160458</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" flag_bad \n",
" count sum mean\n",
"bins_v3 \n",
"(0.00455, 0.00859] 4805 10 0.002081\n",
"(0.00859, 0.0111] 4805 23 0.004787\n",
"(0.0111, 0.0144] 4804 51 0.010616\n",
"(0.0144, 0.019] 4805 80 0.016649\n",
"(0.019, 0.0251] 4805 116 0.024142\n",
"(0.0251, 0.0341] 4804 185 0.038510\n",
"(0.0341, 0.047] 4805 250 0.052029\n",
"(0.047, 0.0702] 4804 348 0.072440\n",
"(0.0702, 0.117] 4805 507 0.105515\n",
"(0.117, 0.493] 4805 771 0.160458"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calib_sep.groupby('bins_v3').agg({'flag_bad':['count', 'sum', 'mean']})"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe thead tr:last-of-type th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th colspan=\"2\" halign=\"left\">amount</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>sum</th>\n",
" </tr>\n",
" <tr>\n",
" <th>bins_v3</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>(0.00455, 0.00859]</th>\n",
" <td>4805</td>\n",
" <td>1.091850e+10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.00859, 0.0111]</th>\n",
" <td>4805</td>\n",
" <td>1.334000e+10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0111, 0.0144]</th>\n",
" <td>4804</td>\n",
" <td>1.508650e+10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0144, 0.019]</th>\n",
" <td>4805</td>\n",
" <td>1.718950e+10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.019, 0.0251]</th>\n",
" <td>4805</td>\n",
" <td>1.819200e+10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0251, 0.0341]</th>\n",
" <td>4804</td>\n",
" <td>2.007700e+10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0341, 0.047]</th>\n",
" <td>4805</td>\n",
" <td>2.147550e+10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.047, 0.0702]</th>\n",
" <td>4804</td>\n",
" <td>2.253350e+10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0702, 0.117]</th>\n",
" <td>4805</td>\n",
" <td>2.363200e+10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.117, 0.493]</th>\n",
" <td>4805</td>\n",
" <td>2.216900e+10</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" amount \n",
" count sum\n",
"bins_v3 \n",
"(0.00455, 0.00859] 4805 1.091850e+10\n",
"(0.00859, 0.0111] 4805 1.334000e+10\n",
"(0.0111, 0.0144] 4804 1.508650e+10\n",
"(0.0144, 0.019] 4805 1.718950e+10\n",
"(0.019, 0.0251] 4805 1.819200e+10\n",
"(0.0251, 0.0341] 4804 2.007700e+10\n",
"(0.0341, 0.047] 4805 2.147550e+10\n",
"(0.047, 0.0702] 4804 2.253350e+10\n",
"(0.0702, 0.117] 4805 2.363200e+10\n",
"(0.117, 0.493] 4805 2.216900e+10"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calib_sep.groupby('bins_v3').agg({'amount':['count', 'sum']})"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe thead tr:last-of-type th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th colspan=\"2\" halign=\"left\">bad_amount</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>sum</th>\n",
" </tr>\n",
" <tr>\n",
" <th>bins_v3</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>(0.00455, 0.00859]</th>\n",
" <td>4805</td>\n",
" <td>5.500000e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.00859, 0.0111]</th>\n",
" <td>4805</td>\n",
" <td>6.400000e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0111, 0.0144]</th>\n",
" <td>4804</td>\n",
" <td>1.710000e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0144, 0.019]</th>\n",
" <td>4805</td>\n",
" <td>2.985000e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.019, 0.0251]</th>\n",
" <td>4805</td>\n",
" <td>4.735000e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0251, 0.0341]</th>\n",
" <td>4804</td>\n",
" <td>7.970000e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0341, 0.047]</th>\n",
" <td>4805</td>\n",
" <td>1.287000e+09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.047, 0.0702]</th>\n",
" <td>4804</td>\n",
" <td>1.694000e+09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.0702, 0.117]</th>\n",
" <td>4805</td>\n",
" <td>2.496000e+09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(0.117, 0.493]</th>\n",
" <td>4805</td>\n",
" <td>3.564500e+09</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" bad_amount \n",
" count sum\n",
"bins_v3 \n",
"(0.00455, 0.00859] 4805 5.500000e+07\n",
"(0.00859, 0.0111] 4805 6.400000e+07\n",
"(0.0111, 0.0144] 4804 1.710000e+08\n",
"(0.0144, 0.019] 4805 2.985000e+08\n",
"(0.019, 0.0251] 4805 4.735000e+08\n",
"(0.0251, 0.0341] 4804 7.970000e+08\n",
"(0.0341, 0.047] 4805 1.287000e+09\n",
"(0.047, 0.0702] 4804 1.694000e+09\n",
"(0.0702, 0.117] 4805 2.496000e+09\n",
"(0.117, 0.493] 4805 3.564500e+09"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calib_sep.groupby('bins_v3').agg({'bad_amount':['count', 'sum']})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}