personal_loan_credit_risk/600-Threshold_Analysis/600 - Threshold Analysis on Sep2019 Data.ipynb · past-data-projects

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-04-10T08:28:53.974518Z",
     "start_time": "2019-04-10T08:28:53.189985Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pickle\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Original Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-04-10T08:28:54.090753Z",
     "start_time": "2019-04-10T08:28:54.079572Z"
    }
   },
   "outputs": [],
   "source": [
    "base_df = pd.read_parquet('data/interim/pl_jumbo_mini_junjulaugsep_27012020.parquet')\\\n",
    "            .rename(columns={'loan_amount':'amount'})\n",
    "pl_feats = pd.read_parquet('data/interim/pl_model_v3_feats_27012020.parquet')\n",
    "\n",
    "# REMOVE OLD util_non_pl\n",
    "pl_feats = pl_feats.drop(columns=['util_non_pl'])\n",
    "# ATTACH NEW util_non_pl\n",
    "fixed_util_non_pl = pd.read_parquet('data/interim/fixed_util_non_pl.parquet')\n",
    "pl_feats = pl_feats.merge(fixed_util_non_pl, how='left', on=['user_id', 'trx_id'])\n",
    "\n",
    "pf_feats = pd.read_parquet('data/interim/pl_model_v3_pefindo_feats_27012020.parquet')\\\n",
    "             .drop(columns=['transaction_date'])\n",
    "new_feats = pd.read_parquet('data/interim/pl_model_v3_new_feats_27012020.parquet')\n",
    "dgtl_trx_feats = pd.read_parquet('data/interim/pl_model_v3_dgtl_trx_feats_18022020.parquet')\n",
    "new_trx_feats = pd.read_parquet('data/interim/pl_model_v3_new_trx_feats_27012020.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 18)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(150672, 28)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(150672, 9)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(150672, 85)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(150672, 56)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(150672, 30)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(base_df.shape)\n",
    "display(pl_feats.shape)\n",
    "display(pf_feats.shape)\n",
    "display(new_feats.shape)\n",
    "display(dgtl_trx_feats.shape)\n",
    "display(new_trx_feats.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_df = base_df.merge(pl_feats, how='left', on=['user_id', 'trx_id'])\\\n",
    "                .merge(pf_feats, how='left', on=['user_id', 'trx_id'])\\\n",
    "                .merge(new_feats, how='left', on=['user_id', 'trx_id'])\\\n",
    "                .merge(dgtl_trx_feats, how='left', on=['user_id', 'trx_id'])\\\n",
    "                .merge(new_trx_feats, how='left', on=['user_id', 'trx_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "rename_feats = {\n",
    "    'denied_trx_co_3mo':'trx_denied_co_90',\n",
    "    'delin_max_dpd_3mo':'delin_max_dpd_90',\n",
    "    'pl_trx_co_jumbomini_4_90':'pl_trx_suc_co_90',\n",
    "    'settle_to_due_last_pl_180':'pl_settle_to_due_last_180',\n",
    "    'settled_trx_sum_1mo':'trx_sett_sum_30',\n",
    "    'td_date':'date_of_month',\n",
    "    'time_approve_to_pl_hour':'time_appr_to_pl_hour',\n",
    "    'time_from_prev_pl_settle':'time_from_last_sett_pl_hour'\n",
    "}\n",
    "\n",
    "raw_df = raw_df.rename(columns=rename_feats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 216)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Feature Transformations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_df['payment_type'] = np.where(raw_df['payment_type'] == '3_months', 0, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NaN values in time_approve_to_pl_hour are caused by missing approval time in DB\n",
    "# fill with 0. (There are only 4 cases)\n",
    "raw_df['time_appr_to_pl_hour'] = raw_df['time_appr_to_pl_hour'].fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Negative values caused by error in loan table. User's previous pl\n",
    "# settlement date is after the current pl transaction date.\n",
    "# Replace with median values. (There are only 2 cases)\n",
    "neg_index = raw_df[raw_df['time_from_last_sett_pl_hour'] < 0].index\n",
    "med_val = raw_df['time_from_last_sett_pl_hour'].median()\n",
    "raw_df.loc[neg_index, 'time_from_last_sett_pl_hour'] = med_val"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### flag_bad Definition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_df['flag_bad'] = np.where(raw_df['max_dpd'] > 60, 1, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-04-10T08:28:54.096002Z",
     "start_time": "2019-04-10T08:28:54.094053Z"
    }
   },
   "outputs": [],
   "source": [
    "# condlist = [raw_df['max_dpd']<8, raw_df['max_dpd']>=61]\n",
    "# choicelist = [0, 1]\n",
    "# raw_df['flag_bad'] = np.select(condlist, choicelist, default=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    144230\n",
       "1      6442\n",
       "Name: flag_bad, dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_df['flag_bad'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Rejected Users Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "rej_base = pd.read_parquet('data/pl_rejected_users_junjulaugsep_04022020.parquet')\\\n",
    "             .rename(columns={'transaction_id':'trx_id'})\n",
    "rej_feats = pd.read_parquet('data/pl_rejected_users_feats_04022020.parquet')\n",
    "# REMOVE util_non_pl & nondgtl_nonpl_trx_co_180\n",
    "rej_feats = rej_feats.drop(columns=['util_non_pl', 'nondgtl_nonpl_trx_co_180'])\n",
    "# ATTACH fixed feats\n",
    "fixed_rej_util = pd.read_parquet('data/fixed_pl_rejected_util_non_pl.parquet')\n",
    "fixed_rej_nondgtl = pd.read_parquet('data/fixed_pl_rejected_dgtl_trx_feats.parquet')\n",
    "rej_feats = rej_feats.merge(fixed_rej_util[['user_id', 'trx_id', 'util_non_pl']], how='left', \n",
    "                            on=['user_id', 'trx_id'])\\\n",
    "                     .merge(fixed_rej_nondgtl[['user_id', 'trx_id', 'nondgtl_nonpl_trx_co_180']], how='left', \n",
    "                            on=['user_id', 'trx_id'])\n",
    "\n",
    "rej_pef_feats = pd.read_parquet('data/pl_rejected_users_pefindo_feats_04022020.parquet')\\\n",
    "                  .rename(columns={'transaction_id':'trx_id'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "cols_to_drop = [\n",
    "    'rule_name', 'result', 'remarks',\n",
    "    'transaction_date', 'amount', 'performance_window', 'max_dpd'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "rej_feats = rej_feats.drop(columns=cols_to_drop)\n",
    "rej_pef_feats = rej_pef_feats.drop(columns=['transaction_date'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['trx_id', 'user_id', 'rule_name', 'result', 'remarks',\n",
       "       'transaction_date', 'amount', 'payment_type', 'performance_window',\n",
       "       'max_dpd'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rej_base.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(16190, 10)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(16190, 191)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(16190, 9)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(rej_base.shape)\n",
    "display(rej_feats.shape)\n",
    "display(rej_pef_feats.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "rej_raw = rej_base.merge(rej_feats, how='left', on=['user_id', 'trx_id'])\\\n",
    "                  .merge(rej_pef_feats, how='left', on=['user_id', 'trx_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "rename_feats = {\n",
    "    'denied_trx_co_3mo':'trx_denied_co_90',\n",
    "    'delin_max_dpd_3mo':'delin_max_dpd_90',\n",
    "    'pl_trx_co_jumbomini_4_90':'pl_trx_suc_co_90',\n",
    "    'settle_to_due_last_pl_180':'pl_settle_to_due_last_180',\n",
    "    'settled_trx_sum_1mo':'trx_sett_sum_30',\n",
    "    'td_date':'date_of_month',\n",
    "    'time_approve_to_pl_hour':'time_appr_to_pl_hour',\n",
    "    'time_from_prev_pl_settle':'time_from_last_sett_pl_hour'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "rej_raw = rej_raw.rename(columns=rename_feats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(16190, 206)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rej_raw.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "15307"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rej_raw['user_id'].nunique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Feature Transformations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "rej_raw['payment_type'] = np.where(rej_raw['payment_type'] == '3_months', 0, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NaN values in time_approve_to_pl_hour are caused by missing approval time in DB\n",
    "# fill with 0. (There are only 4 cases)\n",
    "rej_raw['time_appr_to_pl_hour'] = rej_raw['time_appr_to_pl_hour'].fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Negative values caused by error in loan table. User's previous pl\n",
    "# settlement date is after the current pl transaction date.\n",
    "# Replace with median values. (There are only 2 cases)\n",
    "neg_index = rej_raw[rej_raw['time_from_last_sett_pl_hour'] < 0].index\n",
    "med_val = rej_raw['time_from_last_sett_pl_hour'].median()\n",
    "rej_raw.loc[neg_index, 'time_from_last_sett_pl_hour'] = med_val"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### flag_bad Definition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "rej_raw['flag_non_trx'] = rej_raw['max_dpd'].isna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "677"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rej_raw['max_dpd'].isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.04181593576281655"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rej_raw['max_dpd'].isna().sum()/rej_raw.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# users without any outstanding trx (max_dpd is null)\n",
    "# assign as a good user\n",
    "rej_raw['flag_bad'] = np.where(rej_raw['max_dpd'] > 60, 1, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    14755\n",
       "1     1435\n",
       "Name: flag_bad, dtype: int64"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rej_raw['flag_bad'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# condlist = [rej_raw['max_dpd']<=60, rej_raw['max_dpd']>=61, rej_raw['max_dpd'].isna()]\n",
    "# choicelist = [0, 1, np.nan]\n",
    "# rej_raw['flag_bad'] = np.select(condlist, choicelist, default=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Concat Original Data with Rejected Users Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = ['ap_co_plo',\n",
    "            'current_dpd',\n",
    "            'date_of_month',\n",
    "            'delin_max_dpd_90',\n",
    "            'nondgtl_nonpl_trx_co_180',\n",
    "            'oth_last_rep_days',\n",
    "            'oth_last_rep_dpd',\n",
    "            'payment_type',\n",
    "            'pf_delin_max_dpd_12mo',\n",
    "            'pl_settle_to_due_last_180',\n",
    "            'pl_trx_suc_co_90',\n",
    "            'time_appr_to_pl_hour',\n",
    "            'time_from_last_sett_pl_hour',\n",
    "            'trx_denied_co_90',\n",
    "            'trx_sett_sum_30',\n",
    "            'util_non_pl',\n",
    "            'util_pl'\n",
    "           ]\n",
    "\n",
    "features = sorted(features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "context_cols = [\n",
    "    'user_id', 'trx_id', 'transaction_date', 'max_dpd', 'flag_bad', 'performance_window', 'amount'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 24)"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_df[context_cols + features].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(16190, 24)"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rej_raw[context_cols + features].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "calib_df = pd.concat([raw_df[context_cols + features], rej_raw[context_cols + features]])\\\n",
    "             .reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(166862, 24)"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "calib_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "calib_df['flag_no_trx'] = calib_df['max_dpd'].isna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False    166171\n",
       "True        691\n",
       "Name: flag_no_trx, dtype: int64"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "calib_df['flag_no_trx'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "calib_df['flag_rejected'] = calib_df['trx_id'].isin(rej_raw['trx_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False    150672\n",
       "True      16190\n",
       "Name: flag_rejected, dtype: int64"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "calib_df['flag_rejected'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    158985\n",
       "1      7877\n",
       "Name: flag_bad, dtype: int64"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "calib_df['flag_bad'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Simple Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.04275512371243496"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "calib_df[calib_df['flag_rejected'] == False]['flag_bad'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3773292.317086121"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "calib_df[calib_df['flag_rejected'] == False]['amount'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.08863495985176034"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "calib_df[calib_df['flag_rejected'] == True]['flag_bad'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5864175.416924027"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "calib_df[calib_df['flag_rejected'] == True]['amount'].mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Input Model Proba Score and Calibrate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "mod_lgb = pickle.load(open('lgbm_pl_score_v3.p', 'rb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1079"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mod_lgb.best_iteration_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "calib_df['proba_v3'] = mod_lgb.predict_proba(calib_df[features], \n",
    "                                             num_iteration=mod_lgb.best_iteration_\n",
    "                                             )[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "calib_df['proba_v3'] = calib_df['proba_v3'].apply(lambda x: 0.0045+0.622*x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get V2 Score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pymysql\n",
    "import configparser\n",
    "config = configparser.ConfigParser()\n",
    "config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
    "\n",
    "host = config['MYSQL-ROOT']['HOST']\n",
    "user = config['MYSQL-ROOT']['USER']\n",
    "password = config['MYSQL-ROOT']['PASSWORD']\n",
    "\n",
    "def connect_sql():\n",
    "    cnx = pymysql.connect(host=host,\n",
    "                          user=user,\n",
    "                          password=password,\n",
    "                          cursorclass=pymysql.cursors.DictCursor)\n",
    "    return cnx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "tids = tuple(calib_df['trx_id'])\n",
    "query = f'''\n",
    "SELECT trx_id, user_id, score, feature, calculation_date\n",
    "FROM ds.b_score_pl_log\n",
    "WHERE trx_id IN {tids}\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "pl_v2 = pd.read_sql(query, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(167533, 5)"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pl_v2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(166862, 27)"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "calib_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop duplicates\n",
    "pl_v2 = pl_v2.sort_values(by=['trx_id', 'user_id', 'calculation_date'])\\\n",
    "             .drop_duplicates(subset=['trx_id'], keep='last')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(165532, 5)"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pl_v2.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Update Dataset with Recalculated PL Scores\n",
    "There were random cases where users PL score were not calculated (caused by a bug). These users were granted a PL regardless they passed the PL score or not. The PLV2 score for these users must be recalculated to get a full picture. These users are denoted by \"score = -1\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "recalc = pl_v2[pl_v2['score'] == -1][['trx_id', 'user_id', 'score', 'feature']].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "nan = np.nan\n",
    "recalc['feature'] = recalc.apply(lambda x: eval(x['feature']), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "recalc_feats = pd.DataFrame(recalc['feature'].tolist(), index=recalc['trx_id']).reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "feats_used = ['ap_co_plo',\n",
    "                'calibrated_final',\n",
    "                'current_dpd',\n",
    "                'delin_max_dpd_3mo',\n",
    "                'delin_max_dpd_amt_3mo',\n",
    "                'flag_bad_pefindo',\n",
    "                'flag_good_pefindo',\n",
    "                'os_amount',\n",
    "                'oth_first_trx_amount',\n",
    "                'oth_last_rep_channel',\n",
    "                'oth_last_rep_days',\n",
    "                'oth_last_rep_dpd',\n",
    "                'oth_last_trx_amount',\n",
    "                'pf_con_open_count_12mo',\n",
    "                'pf_delin_dist_contractcode_30_dpd_3mo',\n",
    "                'pf_delin_max_dpd_12mo',\n",
    "                'pf_util_creditcard_avg_12mo',\n",
    "                'time_approve_to_pl_hour',\n",
    "                'util_non_pl',\n",
    "                'util_pl']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "mod_xgb = pickle.load(open('xgb_pl_model_v2.p', 'rb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "recalc_feats['recalc_score'] = mod_xgb.predict_proba(recalc_feats[feats_used],\n",
    "                                                     ntree_limit=mod_xgb.best_iteration)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "comb_temp = pl_v2.set_index('trx_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "comb_temp.loc[comb_temp['score'] == -1, 'score'] = recalc_feats.set_index('trx_id')['recalc_score']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "comb_temp = comb_temp.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(165532, 5)"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "comb_temp.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "pl_v2 = comb_temp.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(165532, 5)"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pl_v2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "calib_df = calib_df.merge(pl_v2[['trx_id', 'user_id', 'score']], how='left', on=['trx_id', 'user_id'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Simulate Rejection\n",
    "Simulate Rejection on September dataset only."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "calib_df['bad_amount'] = calib_df['flag_bad']*calib_df['amount']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9    48047\n",
       "8    45172\n",
       "7    42207\n",
       "6    31436\n",
       "Name: transaction_date, dtype: int64"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "calib_df['transaction_date'].dt.month.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "calib_sep = calib_df[calib_df['transaction_date'].dt.month == 9].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sanity Check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "calib_sep['check_bins'] = pd.qcut(calib_sep['proba_v3'], q=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th colspan=\"3\" halign=\"left\">flag_bad</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>sum</th>\n",
       "      <th>mean</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>flag_rejected</th>\n",
       "      <th>check_bins</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"10\" valign=\"top\">False</th>\n",
       "      <th>(0.00455, 0.00859]</th>\n",
       "      <td>4805.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>0.002081</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.00859, 0.0111]</th>\n",
       "      <td>4805.0</td>\n",
       "      <td>23.0</td>\n",
       "      <td>0.004787</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0111, 0.0144]</th>\n",
       "      <td>4804.0</td>\n",
       "      <td>51.0</td>\n",
       "      <td>0.010616</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0144, 0.019]</th>\n",
       "      <td>4803.0</td>\n",
       "      <td>80.0</td>\n",
       "      <td>0.016656</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.019, 0.0251]</th>\n",
       "      <td>4805.0</td>\n",
       "      <td>116.0</td>\n",
       "      <td>0.024142</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0251, 0.0341]</th>\n",
       "      <td>4790.0</td>\n",
       "      <td>185.0</td>\n",
       "      <td>0.038622</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0341, 0.047]</th>\n",
       "      <td>4766.0</td>\n",
       "      <td>248.0</td>\n",
       "      <td>0.052035</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.047, 0.0702]</th>\n",
       "      <td>4670.0</td>\n",
       "      <td>340.0</td>\n",
       "      <td>0.072805</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0702, 0.117]</th>\n",
       "      <td>4272.0</td>\n",
       "      <td>460.0</td>\n",
       "      <td>0.107678</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.117, 0.493]</th>\n",
       "      <td>3108.0</td>\n",
       "      <td>529.0</td>\n",
       "      <td>0.170206</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"10\" valign=\"top\">True</th>\n",
       "      <th>(0.00455, 0.00859]</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.00859, 0.0111]</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0111, 0.0144]</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0144, 0.019]</th>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.019, 0.0251]</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0251, 0.0341]</th>\n",
       "      <td>14.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0341, 0.047]</th>\n",
       "      <td>39.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.051282</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.047, 0.0702]</th>\n",
       "      <td>134.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>0.059701</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0702, 0.117]</th>\n",
       "      <td>533.0</td>\n",
       "      <td>47.0</td>\n",
       "      <td>0.088180</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.117, 0.493]</th>\n",
       "      <td>1697.0</td>\n",
       "      <td>242.0</td>\n",
       "      <td>0.142605</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 flag_bad                 \n",
       "                                    count    sum      mean\n",
       "flag_rejected check_bins                                  \n",
       "False         (0.00455, 0.00859]   4805.0   10.0  0.002081\n",
       "              (0.00859, 0.0111]    4805.0   23.0  0.004787\n",
       "              (0.0111, 0.0144]     4804.0   51.0  0.010616\n",
       "              (0.0144, 0.019]      4803.0   80.0  0.016656\n",
       "              (0.019, 0.0251]      4805.0  116.0  0.024142\n",
       "              (0.0251, 0.0341]     4790.0  185.0  0.038622\n",
       "              (0.0341, 0.047]      4766.0  248.0  0.052035\n",
       "              (0.047, 0.0702]      4670.0  340.0  0.072805\n",
       "              (0.0702, 0.117]      4272.0  460.0  0.107678\n",
       "              (0.117, 0.493]       3108.0  529.0  0.170206\n",
       "True          (0.00455, 0.00859]      NaN    NaN       NaN\n",
       "              (0.00859, 0.0111]       NaN    NaN       NaN\n",
       "              (0.0111, 0.0144]        NaN    NaN       NaN\n",
       "              (0.0144, 0.019]         2.0    0.0  0.000000\n",
       "              (0.019, 0.0251]         NaN    NaN       NaN\n",
       "              (0.0251, 0.0341]       14.0    0.0  0.000000\n",
       "              (0.0341, 0.047]        39.0    2.0  0.051282\n",
       "              (0.047, 0.0702]       134.0    8.0  0.059701\n",
       "              (0.0702, 0.117]       533.0   47.0  0.088180\n",
       "              (0.117, 0.493]       1697.0  242.0  0.142605"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "calib_sep.groupby(['flag_rejected', 'check_bins']).agg({'flag_bad':['count', 'sum', 'mean']})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Same Amount Rejection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "23277500000.0"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the new threshold should reject the same amount\n",
    "# as shown here\n",
    "calib_sep[calib_sep['score'] > 0.25]['amount'].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.11400 -> 23,326,000,000.0\n",
      "0.11401 -> 23,326,000,000.0\n",
      "0.11402 -> 23,325,000,000.0\n",
      "0.11403 -> 23,325,000,000.0\n",
      "0.11404 -> 23,312,000,000.0\n",
      "0.11405 -> 23,312,000,000.0\n",
      "0.11406 -> 23,312,000,000.0\n",
      "0.11407 -> 23,312,000,000.0\n",
      "0.11408 -> 23,312,000,000.0\n",
      "0.11409 -> 23,311,000,000.0\n",
      "0.11410 -> 23,311,000,000.0\n",
      "0.11411 -> 23,311,000,000.0\n",
      "0.11412 -> 23,311,000,000.0\n",
      "0.11413 -> 23,310,000,000.0\n",
      "0.11414 -> 23,310,000,000.0\n",
      "0.11415 -> 23,310,000,000.0\n",
      "0.11416 -> 23,310,000,000.0\n",
      "0.11417 -> 23,310,000,000.0\n",
      "0.11418 -> 23,310,000,000.0\n",
      "0.11419 -> 23,300,500,000.0\n",
      "0.11420 -> 23,296,500,000.0\n",
      "0.11421 -> 23,295,000,000.0\n",
      "0.11422 -> 23,284,500,000.0\n",
      "0.11423 -> 23,284,500,000.0\n",
      "0.11424 -> 23,284,500,000.0\n",
      "0.11425 -> 23,269,500,000.0\n",
      "0.11426 -> 23,265,000,000.0\n",
      "0.11427 -> 23,251,000,000.0\n",
      "0.11428 -> 23,251,000,000.0\n",
      "0.11429 -> 23,233,500,000.0\n",
      "0.11430 -> 23,224,500,000.0\n",
      "0.11431 -> 23,221,000,000.0\n",
      "0.11432 -> 23,221,000,000.0\n",
      "0.11433 -> 23,217,000,000.0\n",
      "0.11434 -> 23,215,500,000.0\n",
      "0.11435 -> 23,213,000,000.0\n",
      "0.11436 -> 23,213,000,000.0\n",
      "0.11437 -> 23,198,000,000.0\n",
      "0.11438 -> 23,190,000,000.0\n",
      "0.11439 -> 23,187,000,000.0\n",
      "0.11440 -> 23,182,500,000.0\n",
      "0.11441 -> 23,182,500,000.0\n",
      "0.11442 -> 23,182,500,000.0\n",
      "0.11443 -> 23,181,500,000.0\n",
      "0.11444 -> 23,181,500,000.0\n",
      "0.11445 -> 23,181,500,000.0\n",
      "0.11446 -> 23,181,500,000.0\n",
      "0.11447 -> 23,181,500,000.0\n",
      "0.11448 -> 23,181,500,000.0\n",
      "0.11449 -> 23,175,500,000.0\n",
      "0.11450 -> 23,175,500,000.0\n"
     ]
    }
   ],
   "source": [
    "candidates = [x for x in np.arange(.114, .1145, .00001)]\n",
    "\n",
    "for candidate in candidates:\n",
    "    print(f\"{candidate:.5f} -> {calib_sep[calib_sep['proba_v3'] > candidate]['amount'].sum():,}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 0.11424 -> 23,284,500,000.0\n",
    "- 0.11425 -> 23,269,500,000.0\n",
    "- 0.11426 -> 23,265,000,000.0\n",
    "- 0.11427 -> 23,251,000,000.0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Same Amount Rejection Comparisons New Vs Old"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Old Rejected Amount: 23277500000.0\n",
      "New Rejected Amount: 23269500000.0\n",
      "\n",
      "Old Rejected Amount Perc: 0.12608774547906842\n",
      "New Rejected Amount Perc: 0.12604441170336947\n",
      "\n",
      "Old Rejected Bad Amount: 3139500000.0\n",
      "New Rejected Bad Amount: 3733500000.0\n",
      "\n",
      "Old percent of trx rejected: 0.09538576810206673\n",
      "New percent of trx rejected: 0.10439777717651466\n",
      "\n",
      "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n",
      "\n",
      "Count Baseline ODR: 0.04872312527317002\n",
      "\n",
      "Old Count ODR: 0.03948912668277529\n",
      "New Count ODR: 0.03576491366689131\n",
      "\n",
      "Old Rejected Count ODR: 0.13637355444032292\n",
      "New Rejected Count ODR: 0.1598883572567783\n",
      "\n",
      "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n",
      "\n",
      "Amount Baseline ODR: 0.05904497775081454\n",
      "\n",
      "Old Bad Amount ODR: 0.04203917914995382\n",
      "New Bad Amount ODR: 0.038821646304306026\n",
      "\n",
      "Old Rejected Bad Amount ODR: 0.017005798600860718\n",
      "New Rejected Bad Amount ODR: 0.020223331446508515\n",
      "\n"
     ]
    }
   ],
   "source": [
    "thresh_old = 0.25\n",
    "thresh_new = 0.11425\n",
    "\n",
    "# compare amount\n",
    "print('Old Rejected Amount:',calib_sep[calib_sep['score'] > thresh_old]['amount'].sum())\n",
    "print('New Rejected Amount:',calib_sep[calib_sep['proba_v3'] > thresh_new]['amount'].sum())\n",
    "print()\n",
    "\n",
    "# # compare Amount ODR\n",
    "# print('Old Amount ODR:',calib_sep[calib_sep['score'] <= thresh_old]['amount'].sum() / calib_sep['amount'].sum())\n",
    "# print('New Amount ODR:',calib_sep[calib_sep['proba_v3'] <= thresh_new]['amount'].sum() / calib_sep['amount'].sum())\n",
    "# print()\n",
    "\n",
    "# compare Rejected Amount ODR\n",
    "print('Old Rejected Amount Perc:',calib_sep[calib_sep['score'] > thresh_old]['amount'].sum() / calib_sep['amount'].sum())\n",
    "print('New Rejected Amount Perc:',calib_sep[calib_sep['proba_v3'] > thresh_new]['amount'].sum() / calib_sep['amount'].sum())\n",
    "print()\n",
    "\n",
    "# compare bad amount\n",
    "print('Old Rejected Bad Amount:',calib_sep[calib_sep['score'] > thresh_old]['bad_amount'].sum())\n",
    "print('New Rejected Bad Amount:',calib_sep[calib_sep['proba_v3'] > thresh_new]['bad_amount'].sum())\n",
    "print()\n",
    "\n",
    "# compare percent of trx rejected\n",
    "print('Old percent of trx rejected:', calib_sep[calib_sep['score'] > thresh_old].shape[0] / calib_sep.shape[0])\n",
    "print('New percent of trx rejected:', calib_sep[calib_sep['proba_v3'] > thresh_new].shape[0] / calib_sep.shape[0])\n",
    "print()\n",
    "\n",
    "print('=-'*55)\n",
    "print()\n",
    "\n",
    "# Count Baseline ODR\n",
    "print('Count Baseline ODR:',calib_sep['flag_bad'].mean())\n",
    "print()\n",
    "\n",
    "# compare Count ODR\n",
    "print('Old Count ODR:',calib_sep[calib_sep['score'] <= thresh_old]['flag_bad'].mean())\n",
    "print('New Count ODR:',calib_sep[calib_sep['proba_v3'] <= thresh_new]['flag_bad'].mean())\n",
    "print()\n",
    "\n",
    "# compare Rejected Count ODR\n",
    "print('Old Rejected Count ODR:',calib_sep[calib_sep['score'] > thresh_old]['flag_bad'].mean())\n",
    "print('New Rejected Count ODR:',calib_sep[calib_sep['proba_v3'] > thresh_new]['flag_bad'].mean())\n",
    "print()\n",
    "\n",
    "print('=-'*55)\n",
    "print()\n",
    "\n",
    "# Amount Baseline ODR\n",
    "print('Amount Baseline ODR:',calib_sep['bad_amount'].sum() / calib_sep['amount'].sum())\n",
    "print()\n",
    "\n",
    "# compare Bad Amount ODR\n",
    "print('Old Bad Amount ODR:',calib_sep[calib_sep['score'] <= thresh_old]['bad_amount'].sum() / calib_sep['amount'].sum())\n",
    "print('New Bad Amount ODR:',calib_sep[calib_sep['proba_v3'] <= thresh_new]['bad_amount'].sum() / calib_sep['amount'].sum())\n",
    "print()\n",
    "\n",
    "# compare Rejected Bad Amount ODR\n",
    "print('Old Rejected Bad Amount ODR:',calib_sep[calib_sep['score'] > thresh_old]['bad_amount'].sum() / calib_sep['amount'].sum())\n",
    "print('New Rejected Bad Amount ODR:',calib_sep[calib_sep['proba_v3'] > thresh_new]['bad_amount'].sum() / calib_sep['amount'].sum())\n",
    "print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Make sure the numbers increase per quantiles**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "calib_sep['bins_v3'] = pd.qcut(calib_sep['proba_v3'], q=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"3\" halign=\"left\">flag_bad</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>sum</th>\n",
       "      <th>mean</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bins_v3</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>(0.00455, 0.00859]</th>\n",
       "      <td>4805</td>\n",
       "      <td>10</td>\n",
       "      <td>0.002081</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.00859, 0.0111]</th>\n",
       "      <td>4805</td>\n",
       "      <td>23</td>\n",
       "      <td>0.004787</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0111, 0.0144]</th>\n",
       "      <td>4804</td>\n",
       "      <td>51</td>\n",
       "      <td>0.010616</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0144, 0.019]</th>\n",
       "      <td>4805</td>\n",
       "      <td>80</td>\n",
       "      <td>0.016649</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.019, 0.0251]</th>\n",
       "      <td>4805</td>\n",
       "      <td>116</td>\n",
       "      <td>0.024142</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0251, 0.0341]</th>\n",
       "      <td>4804</td>\n",
       "      <td>185</td>\n",
       "      <td>0.038510</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0341, 0.047]</th>\n",
       "      <td>4805</td>\n",
       "      <td>250</td>\n",
       "      <td>0.052029</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.047, 0.0702]</th>\n",
       "      <td>4804</td>\n",
       "      <td>348</td>\n",
       "      <td>0.072440</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0702, 0.117]</th>\n",
       "      <td>4805</td>\n",
       "      <td>507</td>\n",
       "      <td>0.105515</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.117, 0.493]</th>\n",
       "      <td>4805</td>\n",
       "      <td>771</td>\n",
       "      <td>0.160458</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   flag_bad               \n",
       "                      count  sum      mean\n",
       "bins_v3                                   \n",
       "(0.00455, 0.00859]     4805   10  0.002081\n",
       "(0.00859, 0.0111]      4805   23  0.004787\n",
       "(0.0111, 0.0144]       4804   51  0.010616\n",
       "(0.0144, 0.019]        4805   80  0.016649\n",
       "(0.019, 0.0251]        4805  116  0.024142\n",
       "(0.0251, 0.0341]       4804  185  0.038510\n",
       "(0.0341, 0.047]        4805  250  0.052029\n",
       "(0.047, 0.0702]        4804  348  0.072440\n",
       "(0.0702, 0.117]        4805  507  0.105515\n",
       "(0.117, 0.493]         4805  771  0.160458"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "calib_sep.groupby('bins_v3').agg({'flag_bad':['count', 'sum', 'mean']})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"2\" halign=\"left\">amount</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>sum</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bins_v3</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>(0.00455, 0.00859]</th>\n",
       "      <td>4805</td>\n",
       "      <td>1.091850e+10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.00859, 0.0111]</th>\n",
       "      <td>4805</td>\n",
       "      <td>1.334000e+10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0111, 0.0144]</th>\n",
       "      <td>4804</td>\n",
       "      <td>1.508650e+10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0144, 0.019]</th>\n",
       "      <td>4805</td>\n",
       "      <td>1.718950e+10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.019, 0.0251]</th>\n",
       "      <td>4805</td>\n",
       "      <td>1.819200e+10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0251, 0.0341]</th>\n",
       "      <td>4804</td>\n",
       "      <td>2.007700e+10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0341, 0.047]</th>\n",
       "      <td>4805</td>\n",
       "      <td>2.147550e+10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.047, 0.0702]</th>\n",
       "      <td>4804</td>\n",
       "      <td>2.253350e+10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0702, 0.117]</th>\n",
       "      <td>4805</td>\n",
       "      <td>2.363200e+10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.117, 0.493]</th>\n",
       "      <td>4805</td>\n",
       "      <td>2.216900e+10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   amount              \n",
       "                    count           sum\n",
       "bins_v3                                \n",
       "(0.00455, 0.00859]   4805  1.091850e+10\n",
       "(0.00859, 0.0111]    4805  1.334000e+10\n",
       "(0.0111, 0.0144]     4804  1.508650e+10\n",
       "(0.0144, 0.019]      4805  1.718950e+10\n",
       "(0.019, 0.0251]      4805  1.819200e+10\n",
       "(0.0251, 0.0341]     4804  2.007700e+10\n",
       "(0.0341, 0.047]      4805  2.147550e+10\n",
       "(0.047, 0.0702]      4804  2.253350e+10\n",
       "(0.0702, 0.117]      4805  2.363200e+10\n",
       "(0.117, 0.493]       4805  2.216900e+10"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "calib_sep.groupby('bins_v3').agg({'amount':['count', 'sum']})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"2\" halign=\"left\">bad_amount</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>sum</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bins_v3</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>(0.00455, 0.00859]</th>\n",
       "      <td>4805</td>\n",
       "      <td>5.500000e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.00859, 0.0111]</th>\n",
       "      <td>4805</td>\n",
       "      <td>6.400000e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0111, 0.0144]</th>\n",
       "      <td>4804</td>\n",
       "      <td>1.710000e+08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0144, 0.019]</th>\n",
       "      <td>4805</td>\n",
       "      <td>2.985000e+08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.019, 0.0251]</th>\n",
       "      <td>4805</td>\n",
       "      <td>4.735000e+08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0251, 0.0341]</th>\n",
       "      <td>4804</td>\n",
       "      <td>7.970000e+08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0341, 0.047]</th>\n",
       "      <td>4805</td>\n",
       "      <td>1.287000e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.047, 0.0702]</th>\n",
       "      <td>4804</td>\n",
       "      <td>1.694000e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.0702, 0.117]</th>\n",
       "      <td>4805</td>\n",
       "      <td>2.496000e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(0.117, 0.493]</th>\n",
       "      <td>4805</td>\n",
       "      <td>3.564500e+09</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   bad_amount              \n",
       "                        count           sum\n",
       "bins_v3                                    \n",
       "(0.00455, 0.00859]       4805  5.500000e+07\n",
       "(0.00859, 0.0111]        4805  6.400000e+07\n",
       "(0.0111, 0.0144]         4804  1.710000e+08\n",
       "(0.0144, 0.019]          4805  2.985000e+08\n",
       "(0.019, 0.0251]          4805  4.735000e+08\n",
       "(0.0251, 0.0341]         4804  7.970000e+08\n",
       "(0.0341, 0.047]          4805  1.287000e+09\n",
       "(0.047, 0.0702]          4804  1.694000e+09\n",
       "(0.0702, 0.117]          4805  2.496000e+09\n",
       "(0.117, 0.493]           4805  3.564500e+09"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "calib_sep.groupby('bins_v3').agg({'bad_amount':['count', 'sum']})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}