{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import lightgbm\n",
"import pymysql\n",
"import pickle\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import configparser\n",
"config = configparser.ConfigParser()\n",
"config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
"\n",
"host = config['MYSQL-ROOT']['HOST']\n",
"user = config['MYSQL-ROOT']['USER']\n",
"password = config['MYSQL-ROOT']['PASSWORD']\n",
"\n",
"def connect_sql():\n",
" cnx = pymysql.connect(host=host,\n",
" user=user,\n",
" password=password,\n",
" cursorclass=pymysql.cursors.DictCursor)\n",
" return cnx"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"plv3_q = '''\n",
"SELECT *\n",
"FROM ds.b_score_pl_log\n",
"WHERE calculation_date > '2020-02-21'\n",
"AND engine_name = 'personal loan v3'\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"plv3_df = pd.read_sql(plv3_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(30341, 8)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"plv3_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"plv3_df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timestamp('2020-02-21 00:01:08')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"plv3_df['calculation_date'].min()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"plv2_q = '''\n",
"SELECT *\n",
"FROM ds.b_score_pl_log\n",
"WHERE calculation_date > '2020-02-21'\n",
"AND engine_name = 'personal loan v2'\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"plv2_df = pd.read_sql(plv2_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(30324, 8)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"plv2_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"plv2_df"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"30076"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"plv2_df['trx_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timestamp('2020-02-21 00:01:08')"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"plv2_df['calculation_date'].min()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"pl_df = plv3_df[['trx_id', 'user_id', \n",
" 'calculation_date', 'score']].merge(plv2_df[['trx_id', 'user_id', \n",
" 'calculation_date', 'score']], \n",
" how='inner', on=['trx_id', 'user_id'])\\\n",
" .sort_values(by=['trx_id', 'user_id',\n",
" 'score_x', 'score_y'],\n",
" ascending=False)\\\n",
" .drop_duplicates(subset = ['trx_id', 'user_id'], keep='first')\n",
"pl_df = pl_df.reset_index(drop=True)\n",
"pl_df.columns = ['trx_id', 'user_id', 'calculation_date_v3', 'score_v3', 'calculation_date_v2', 'score_v2']"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"pl_df"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(30075, 6)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"30075"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_df['trx_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"15703"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_df['user_id'].nunique()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Identify Users who are rejected because of cooldown period & Get transaction status"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"tids = tuple(pl_df['trx_id'])"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"filter_q = f'''\n",
"SELECT tar.*, tr.status\n",
"FROM\n",
"\t(SELECT transaction_id AS trx_id,\n",
"\t\t MAX(CASE WHEN `result` = 'REJECT' AND remarks LIKE '%Last PL rejection < 1 month%' THEN 1 ELSE 0 END) AS flag_reject_cooldown\n",
"\tFROM l2alpha.transaction_approval_rules\n",
"\tWHERE transaction_id IN {tids}\n",
"\tGROUP BY transaction_id\n",
"\t) AS tar\n",
"\tLEFT JOIN l2alpha.`transaction` AS tr\n",
"\tON tar.trx_id = tr.id\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"filter_df = pd.read_sql(filter_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(30075, 3)"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filter_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"15531"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filter_df['flag_reject_cooldown'].sum()"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.5164089775561097"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filter_df['flag_reject_cooldown'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"774"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filter_df[filter_df['status'].isin([2,6,11])].shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.025735660847880298"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filter_df[filter_df['status'].isin([2,6,11])].shape[0]/filter_df.shape[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Merge cooldown period & transaction status"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"pl_df = pl_df.merge(filter_df, how='left', on='trx_id')"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"pl_df_clean = pl_df[(pl_df['flag_reject_cooldown'] == 0) & (pl_df['status'].isin([4,5]))]"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(13771, 8)"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_df_clean.shape"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 13771\n",
"Name: flag_reject_cooldown, dtype: int64"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_df_clean['flag_reject_cooldown'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4 11864\n",
"5 1907\n",
"Name: status, dtype: int64"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_df_clean['status'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"13771"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_df_clean['trx_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"13729"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_df_clean['user_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"13108383 3\n",
"13002858 3\n",
"5037939 2\n",
"2271614 2\n",
"11751669 2\n",
" ..\n",
"2606578 1\n",
"12365295 1\n",
"12271085 1\n",
"2690539 1\n",
"5122053 1\n",
"Name: user_id, Length: 13729, dtype: int64"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_df_clean['user_id'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>trx_id</th>\n",
" <th>user_id</th>\n",
" <th>calculation_date_v3</th>\n",
" <th>score_v3</th>\n",
" <th>calculation_date_v2</th>\n",
" <th>score_v2</th>\n",
" <th>flag_reject_cooldown</th>\n",
" <th>status</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [trx_id, user_id, calculation_date_v3, score_v3, calculation_date_v2, score_v2, flag_reject_cooldown, status]\n",
"Index: []"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_df_clean[pl_df_clean['score_v2'] < 0]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"pl_df_clean"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ec2-user/anaconda3/envs/env_personalloan/lib/python3.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
"/home/ec2-user/anaconda3/envs/env_personalloan/lib/python3.7/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
]
}
],
"source": [
"thresh_v3 = 0.11425\n",
"thresh_v2 = 0.25\n",
"\n",
"pl_df_clean['rejected_v3'] = np.where(pl_df_clean['score_v3'] > thresh_v3, 1, 0)\n",
"pl_df_clean['rejected_v2'] = np.where(pl_df_clean['score_v2'] > thresh_v2, 1, 0)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.13833418052429017"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_df_clean['rejected_v2'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.1392055769370416"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_df_clean['rejected_v3'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>rejected_v2</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" </tr>\n",
" <tr>\n",
" <th>rejected_v3</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>11190</td>\n",
" <td>664</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>676</td>\n",
" <td>1241</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"rejected_v2 0 1\n",
"rejected_v3 \n",
"0 11190 664\n",
"1 676 1241"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.crosstab(pl_df_clean['rejected_v3'], pl_df_clean['rejected_v2'])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}