{
"cells": [
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import pymysql\n",
"import psycopg2\n",
"import pickle\n",
"from datetime import datetime, timedelta"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import configparser\n",
"config = configparser.ConfigParser()\n",
"config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
"\n",
"host = config['MYSQL-ROOT']['HOST']\n",
"user = config['MYSQL-ROOT']['USER']\n",
"password = config['MYSQL-ROOT']['PASSWORD']\n",
"\n",
"def connect_sql():\n",
" cnx = pymysql.connect(host=host,\n",
" user=user,\n",
" password=password,\n",
" cursorclass=pymysql.cursors.DictCursor)\n",
" return cnx"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Get Pass PL Trx"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"pass_query = '''\n",
"SELECT\n",
"\tbase.*,\n",
"\tCOUNT(*) AS performance_window,\n",
"\tMAX(ula.current_dpd) AS max_dpd\n",
"FROM\n",
"\t(\n",
"\tSELECT\n",
"\t\tuser_id,\n",
"\t\ttransaction_date,\n",
"\t\tpayment_type,\n",
"\t\tid AS trx_id,\n",
"\t\tamount,\n",
"\t\tdatediff(CURDATE(), transaction_date) AS dob\n",
"\tFROM\n",
"\t\t(\n",
"\t\tSELECT\n",
"\t\t\tid,\n",
"\t\t\tuser_id,\n",
"\t\t\tstatus,\n",
"\t\t\tamount,\n",
"\t\t\ttransaction_type,\n",
"\t\t\ttransaction_date,\n",
"\t\t\tpayment_type\n",
"\t\tFROM\n",
"\t\t\tl2alpha.`transaction`\n",
"\t\tWHERE\n",
"\t\t\tstatus = 4\n",
"\t\t\tAND transaction_type = 2\n",
"\t\t\tAND payment_type <> '30_days'\n",
"\t\t\tAND transaction_date >= '2019-12-01'\n",
"\t\t\tAND transaction_date < '2020-02-01' ) AS tr\n",
"\tJOIN (\n",
"\t\tSELECT\n",
"\t\t\tstart_date,\n",
"\t\t\ttransaction_id\n",
"\t\tFROM\n",
"\t\t\tl2alpha.loan\n",
"\t\tWHERE\n",
"\t\t\tloan_amount > 0\n",
"\t\t\tAND is_active = 1 ) AS lo ON\n",
"\t\tlo.transaction_id = tr.id ) AS base\n",
"LEFT JOIN (\n",
"\tSELECT\n",
"\t\tuser_id,\n",
"\t\tsnapshot_date,\n",
"\t\tcurrent_dpd\n",
"\tFROM\n",
"\t\tds.user_level_archive ) AS ula ON\n",
"\tbase.user_id = ula.user_id\n",
"WHERE\n",
"\t(snapshot_date IS NULL)\n",
"\tOR (snapshot_date > transaction_date\n",
"\tAND datediff(ula.snapshot_date, base.transaction_date) <= 121)\n",
"GROUP BY\n",
"\tbase.user_id,\n",
"\tbase.transaction_date,\n",
"\tbase.trx_id\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"pl_trx_pass = pd.read_sql(pass_query, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(138501, 8)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_trx_pass.shape"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"user_id counts: 132873\n",
"trx_id counts: 138501\n",
"earliest trx date: 2019-12-01 00:00:53\n",
"latest trx date: 2020-01-31 23:59:01\n"
]
},
{
"data": {
"text/plain": [
"1 71084\n",
"12 67417\n",
"Name: transaction_date, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"print('user_id counts:', pl_trx_pass['user_id'].nunique())\n",
"print('trx_id counts:', pl_trx_pass['trx_id'].nunique())\n",
"print('earliest trx date:', pl_trx_pass['transaction_date'].min())\n",
"print('latest trx date:', pl_trx_pass['transaction_date'].max())\n",
"display(pl_trx_pass['transaction_date'].dt.month.value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>transaction_date</th>\n",
" <th>payment_type</th>\n",
" <th>trx_id</th>\n",
" <th>amount</th>\n",
" <th>dob</th>\n",
" <th>performance_window</th>\n",
" <th>max_dpd</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1511</td>\n",
" <td>2019-12-02 19:56:22</td>\n",
" <td>3_months</td>\n",
" <td>48077443</td>\n",
" <td>3500000.0</td>\n",
" <td>71</td>\n",
" <td>71</td>\n",
" <td>-5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1590</td>\n",
" <td>2020-01-01 14:30:32</td>\n",
" <td>3_months</td>\n",
" <td>51947725</td>\n",
" <td>10500000.0</td>\n",
" <td>41</td>\n",
" <td>41</td>\n",
" <td>-7.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1860</td>\n",
" <td>2020-01-27 15:19:15</td>\n",
" <td>6_months</td>\n",
" <td>55263919</td>\n",
" <td>15000000.0</td>\n",
" <td>15</td>\n",
" <td>15</td>\n",
" <td>-15.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1988</td>\n",
" <td>2020-01-08 13:51:13</td>\n",
" <td>6_months</td>\n",
" <td>52887383</td>\n",
" <td>13500000.0</td>\n",
" <td>34</td>\n",
" <td>34</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2711</td>\n",
" <td>2020-01-16 12:46:24</td>\n",
" <td>6_months</td>\n",
" <td>53892762</td>\n",
" <td>4000000.0</td>\n",
" <td>26</td>\n",
" <td>26</td>\n",
" <td>-4.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user_id transaction_date payment_type trx_id amount dob \\\n",
"0 1511 2019-12-02 19:56:22 3_months 48077443 3500000.0 71 \n",
"1 1590 2020-01-01 14:30:32 3_months 51947725 10500000.0 41 \n",
"2 1860 2020-01-27 15:19:15 6_months 55263919 15000000.0 15 \n",
"3 1988 2020-01-08 13:51:13 6_months 52887383 13500000.0 34 \n",
"4 2711 2020-01-16 12:46:24 6_months 53892762 4000000.0 26 \n",
"\n",
" performance_window max_dpd \n",
"0 71 -5.0 \n",
"1 41 -7.0 \n",
"2 15 -15.0 \n",
"3 34 4.0 \n",
"4 26 -4.0 "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_trx_pass.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Get Reject PL Trx"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"reject_query = '''\n",
"SELECT\n",
"\tbase.*,\n",
"\tCOUNT(*) AS performance_window,\n",
"\tMAX(ula.current_dpd) AS max_dpd\n",
"FROM\n",
"\t(\n",
"\tSELECT\n",
"\t\ttar.*,\n",
"\t\ttr.user_id,\n",
"\t\ttr.transaction_date,\n",
"\t\ttr.amount,\n",
" tr.payment_type,\n",
" datediff(CURDATE(), tr.transaction_date) AS dob\n",
"\tFROM\n",
"\t\t(\n",
"\t\tSELECT\n",
"\t\t\t*\n",
"\t\tFROM\n",
"\t\t\tl2alpha.transaction_approval_rules\n",
"\t\tWHERE\n",
"\t\t\trule_name = 'pl_score'\n",
"\t\t\tAND `result` = 'REJECT'\n",
"\t\t\tAND remarks LIKE '%Personal Loan Score%' ) AS tar\n",
"\tLEFT JOIN l2alpha.`transaction` AS tr ON\n",
"\t\ttar.transaction_id = tr.id\n",
"\tWHERE\n",
"\t\tpayment_type <> '30_days'\n",
"\t\tAND transaction_date >= '2019-12-01'\n",
"\t\tAND transaction_date < '2020-02-01'\n",
"\t\t) AS base\n",
"LEFT JOIN (\n",
"\tSELECT\n",
"\t\tuser_id,\n",
"\t\tsnapshot_date,\n",
"\t\tcurrent_dpd\n",
"\tFROM\n",
"\t\tds.user_level_archive ) AS ula ON\n",
"\tbase.user_id = ula.user_id\n",
"WHERE\n",
"\t(snapshot_date IS NULL)\n",
"\tOR (snapshot_date > transaction_date\n",
"\tAND datediff(ula.snapshot_date,\n",
"\tbase.transaction_date) <= 121)\n",
"GROUP BY\n",
"\tbase.user_id,\n",
"\tbase.transaction_date,\n",
"\tbase.transaction_id\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"pl_trx_reject = pd.read_sql(reject_query, connect_sql())\n",
"pl_trx_reject = pl_trx_reject.rename(columns={'transaction_id':'trx_id'})"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(16254, 15)"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_trx_reject.shape"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"user_id counts: 15488\n",
"trx_id counts: 16254\n",
"earliest trx date: 2019-12-01 00:00:57\n",
"latest trx date: 2020-01-31 23:59:10\n"
]
},
{
"data": {
"text/plain": [
"1 8769\n",
"12 7485\n",
"Name: transaction_date, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"print('user_id counts:', pl_trx_reject['user_id'].nunique())\n",
"print('trx_id counts:', pl_trx_reject['trx_id'].nunique())\n",
"print('earliest trx date:', pl_trx_reject['transaction_date'].min())\n",
"print('latest trx date:', pl_trx_reject['transaction_date'].max())\n",
"display(pl_trx_reject['transaction_date'].dt.month.value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"pl_trx_reject.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Combine Pass and Reject Trx"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [],
"source": [
"pl_trx = pd.concat([pl_trx_pass, pl_trx_reject[pl_trx_pass.columns]]).reset_index(drop=True)\n",
"pl_trx['flag_rejected'] = pl_trx['trx_id'].isin(pl_trx_reject['trx_id'])"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"pl_trx"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"user_id counts: 145943\n",
"trx_id counts: 154755\n",
"earliest trx date: 2019-12-01 00:00:53\n",
"latest trx date: 2020-01-31 23:59:10\n"
]
},
{
"data": {
"text/plain": [
"1 79853\n",
"12 74902\n",
"Name: transaction_date, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Percent of rejected by PD users: 0.10503053213143355\n"
]
}
],
"source": [
"print('user_id counts:', pl_trx['user_id'].nunique())\n",
"print('trx_id counts:', pl_trx['trx_id'].nunique())\n",
"print('earliest trx date:', pl_trx['transaction_date'].min())\n",
"print('latest trx date:', pl_trx['transaction_date'].max())\n",
"display(pl_trx['transaction_date'].dt.month.value_counts())\n",
"print()\n",
"print('Percent of rejected by PD users:', pl_trx['flag_rejected'].mean())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Get PL V2 Score"
]
},
{
"cell_type": "code",
"execution_count": 153,
"metadata": {},
"outputs": [],
"source": [
"tids = tuple(pl_trx['trx_id'])\n",
"plv2_query = f'''\n",
"SELECT trx_id, user_id, score, feature, calculation_date\n",
"FROM ds.b_score_pl_log\n",
"WHERE trx_id IN {tids}\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {},
"outputs": [],
"source": [
"pl_v2 = pd.read_sql(plv2_query, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OG dataset size: 154755\n",
"PLV2 Score dataset size: 156609\n",
"user_id counts: 145942\n",
"trx_id counts: 154753\n"
]
}
],
"source": [
"print('OG dataset size:', pl_trx.shape[0])\n",
"print('PLV2 Score dataset size:', pl_v2.shape[0])\n",
"print('user_id counts:', pl_v2['user_id'].nunique())\n",
"print('trx_id counts:', pl_v2['trx_id'].nunique())"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {},
"outputs": [],
"source": [
"# drop duplicates\n",
"pl_v2 = pl_v2.sort_values(by=['trx_id', 'user_id', 'calculation_date'])\\\n",
" .drop_duplicates(subset=['trx_id'], keep='last')"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(154753, 5)"
]
},
"execution_count": 159,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_v2.shape"
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Trx where score == -1: 618\n"
]
}
],
"source": [
"print('Trx where score == -1:',pl_v2[pl_v2['score'] < 0].shape[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Update Dataset with Recalculated PL Scores\n",
"There were random cases where users PL score were not calculated (caused by a bug). These users were granted a PL regardless they passed the PL score or not. The PLV2 score for these users must be recalculated to get a full picture. These users are denoted by \"score = -1\""
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [],
"source": [
"recalc = pl_v2[pl_v2['score'] == -1][['trx_id', 'user_id', 'score', 'feature']].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {},
"outputs": [],
"source": [
"nan = np.nan\n",
"recalc['feature'] = recalc.apply(lambda x: eval(x['feature']), axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [],
"source": [
"recalc_feats = pd.DataFrame(recalc['feature'].tolist(), index=recalc['trx_id']).reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {},
"outputs": [],
"source": [
"feats_used = ['ap_co_plo',\n",
" 'calibrated_final',\n",
" 'current_dpd',\n",
" 'delin_max_dpd_3mo',\n",
" 'delin_max_dpd_amt_3mo',\n",
" 'flag_bad_pefindo',\n",
" 'flag_good_pefindo',\n",
" 'os_amount',\n",
" 'oth_first_trx_amount',\n",
" 'oth_last_rep_channel',\n",
" 'oth_last_rep_days',\n",
" 'oth_last_rep_dpd',\n",
" 'oth_last_trx_amount',\n",
" 'pf_con_open_count_12mo',\n",
" 'pf_delin_dist_contractcode_30_dpd_3mo',\n",
" 'pf_delin_max_dpd_12mo',\n",
" 'pf_util_creditcard_avg_12mo',\n",
" 'time_approve_to_pl_hour',\n",
" 'util_non_pl',\n",
" 'util_pl']"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {},
"outputs": [],
"source": [
"mod_xgb = pickle.load(open('xgb_pl_model_v2.p', 'rb'))"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [],
"source": [
"recalc_feats['recalc_score'] = mod_xgb.predict_proba(recalc_feats[feats_used],\n",
" ntree_limit=mod_xgb.best_iteration)[:,1]"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(154753, 5)"
]
},
"execution_count": 167,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"comb_temp = pl_v2.set_index('trx_id')\n",
"comb_temp.loc[comb_temp['score'] == -1, 'score'] = recalc_feats.set_index('trx_id')['recalc_score']\n",
"comb_temp = comb_temp.reset_index()\n",
"comb_temp.shape"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(154753, 5)"
]
},
"execution_count": 168,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_v2 = comb_temp.copy()\n",
"pl_v2.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Merge trx and plv2 score"
]
},
{
"cell_type": "code",
"execution_count": 169,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(154755, 9)"
]
},
"execution_count": 169,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_trx.shape"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(154753, 5)"
]
},
"execution_count": 170,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_v2.shape"
]
},
{
"cell_type": "code",
"execution_count": 172,
"metadata": {},
"outputs": [],
"source": [
"comb_df = pl_trx.merge(pl_v2[['trx_id', 'user_id', 'score']], how='left', on=['user_id', 'trx_id'])"
]
},
{
"cell_type": "code",
"execution_count": 173,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>transaction_date</th>\n",
" <th>payment_type</th>\n",
" <th>trx_id</th>\n",
" <th>amount</th>\n",
" <th>dob</th>\n",
" <th>performance_window</th>\n",
" <th>max_dpd</th>\n",
" <th>flag_rejected</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1511</td>\n",
" <td>2019-12-02 19:56:22</td>\n",
" <td>3_months</td>\n",
" <td>48077443</td>\n",
" <td>3500000.0</td>\n",
" <td>71</td>\n",
" <td>71</td>\n",
" <td>-5.0</td>\n",
" <td>False</td>\n",
" <td>0.009816</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1590</td>\n",
" <td>2020-01-01 14:30:32</td>\n",
" <td>3_months</td>\n",
" <td>51947725</td>\n",
" <td>10500000.0</td>\n",
" <td>41</td>\n",
" <td>41</td>\n",
" <td>-7.0</td>\n",
" <td>False</td>\n",
" <td>0.132705</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1860</td>\n",
" <td>2020-01-27 15:19:15</td>\n",
" <td>6_months</td>\n",
" <td>55263919</td>\n",
" <td>15000000.0</td>\n",
" <td>15</td>\n",
" <td>15</td>\n",
" <td>-15.0</td>\n",
" <td>False</td>\n",
" <td>0.015513</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1988</td>\n",
" <td>2020-01-08 13:51:13</td>\n",
" <td>6_months</td>\n",
" <td>52887383</td>\n",
" <td>13500000.0</td>\n",
" <td>34</td>\n",
" <td>34</td>\n",
" <td>4.0</td>\n",
" <td>False</td>\n",
" <td>0.104434</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2711</td>\n",
" <td>2020-01-16 12:46:24</td>\n",
" <td>6_months</td>\n",
" <td>53892762</td>\n",
" <td>4000000.0</td>\n",
" <td>26</td>\n",
" <td>26</td>\n",
" <td>-4.0</td>\n",
" <td>False</td>\n",
" <td>0.003002</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user_id transaction_date payment_type trx_id amount dob \\\n",
"0 1511 2019-12-02 19:56:22 3_months 48077443 3500000.0 71 \n",
"1 1590 2020-01-01 14:30:32 3_months 51947725 10500000.0 41 \n",
"2 1860 2020-01-27 15:19:15 6_months 55263919 15000000.0 15 \n",
"3 1988 2020-01-08 13:51:13 6_months 52887383 13500000.0 34 \n",
"4 2711 2020-01-16 12:46:24 6_months 53892762 4000000.0 26 \n",
"\n",
" performance_window max_dpd flag_rejected score \n",
"0 71 -5.0 False 0.009816 \n",
"1 41 -7.0 False 0.132705 \n",
"2 15 -15.0 False 0.015513 \n",
"3 34 4.0 False 0.104434 \n",
"4 26 -4.0 False 0.003002 "
]
},
"execution_count": 173,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"comb_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 174,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"11"
]
},
"execution_count": 174,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"comb_df['dob'].min()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get January Data Only"
]
},
{
"cell_type": "code",
"execution_count": 176,
"metadata": {},
"outputs": [],
"source": [
"jan_df = comb_df[comb_df['transaction_date'].dt.month == 1]"
]
},
{
"cell_type": "code",
"execution_count": 178,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"user_id counts: 78465\n",
"trx_id counts: 79853\n",
"earliest trx date: 2020-01-01 00:04:40\n",
"latest trx date: 2020-01-31 23:59:10\n",
"Percent of rejected users: 0.10981428374638398\n"
]
}
],
"source": [
"print('user_id counts:', jan_df['user_id'].nunique())\n",
"print('trx_id counts:', jan_df['trx_id'].nunique())\n",
"print('earliest trx date:', jan_df['transaction_date'].min())\n",
"print('latest trx date:', jan_df['transaction_date'].max())\n",
"print('Percent of rejected users:', jan_df['flag_rejected'].mean())"
]
},
{
"cell_type": "code",
"execution_count": 180,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 10)"
]
},
"execution_count": 180,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"jan_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {},
"outputs": [],
"source": [
"jan_df.to_parquet('data/pl_threshold_jan_11022020.parquet', compression='gzip')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}