{
"cells": [
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:36.988168Z",
"start_time": "2019-11-07T03:55:36.706164Z"
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import pymysql\n",
"from datetime import datetime"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:36.994804Z",
"start_time": "2019-11-07T03:55:36.989531Z"
}
},
"outputs": [],
"source": [
"import configparser\n",
"config = configparser.ConfigParser()\n",
"config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
"\n",
"host = config['MYSQL-ROOT']['HOST']\n",
"user = config['MYSQL-ROOT']['USER']\n",
"password = config['MYSQL-ROOT']['PASSWORD']\n",
"\n",
"def connect_sql():\n",
" cnx = pymysql.connect(host=host,\n",
" user=user,\n",
" password=password,\n",
" cursorclass=pymysql.cursors.DictCursor)\n",
" return cnx"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:37.026188Z",
"start_time": "2019-11-07T03:55:36.997299Z"
}
},
"outputs": [],
"source": [
"base_df = pd.read_parquet('data/pl_rejected_users_junjulaugsep_04022020.parquet')\n",
"base_df['transaction_date'] = base_df['transaction_date'].astype('datetime64[ms]')\n",
"base_df = base_df.rename(columns={'transaction_id':'trx_id'})"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:37.034477Z",
"start_time": "2019-11-07T03:55:37.027603Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 10)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"base_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:37.043587Z",
"start_time": "2019-11-07T03:55:37.036542Z"
}
},
"outputs": [],
"source": [
"uids = tuple(base_df['user_id'].unique().tolist())\n",
"tids = tuple(base_df['trx_id'].unique().tolist())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"15307"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(uids)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"16190"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(tids)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Snapshot"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:37.078945Z",
"start_time": "2019-11-07T03:55:37.074323Z"
}
},
"outputs": [],
"source": [
"snapshot_q = f\"\"\"\n",
"SELECT \n",
" base.user_id, \n",
" base.id AS trx_id,\n",
" os_amount,\n",
" current_dpd,\n",
" COALESCE(os_amount,0) + monthly_installment AS os_amount_inc\n",
"FROM \n",
"\t\t(SELECT user_id,id,transaction_date \n",
"\t\tFROM l2alpha.`transaction` WHERE id IN {tids}) AS base\n",
"\tLEFT JOIN \n",
"\t\tds.user_level_archive AS ula \n",
"\tON base.user_id = ula.user_id AND DATE(transaction_date) = snapshot_date \n",
"\tLEFT JOIN\n",
"\t\t(SELECT user_id,monthly_installment, transaction_id\n",
"\t\tFROM l2alpha.loan \n",
"\t\tWHERE transaction_id IN {tids}) AS lo \n",
"\tON lo.transaction_id = base.id\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:08.943657Z",
"start_time": "2019-11-07T03:55:37.081391Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"snapshot = pd.read_sql(snapshot_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:08.949079Z",
"start_time": "2019-11-07T03:58:08.945547Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 5)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"snapshot.shape"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:08.959459Z",
"start_time": "2019-11-07T03:58:08.950846Z"
}
},
"outputs": [],
"source": [
"snapshot.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Delinquency"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:08.985180Z",
"start_time": "2019-11-07T03:58:08.960895Z"
}
},
"outputs": [],
"source": [
"delin_q = f'''\n",
"SELECT \n",
" base.user_id,\n",
" base.id AS trx_id,\n",
" MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 7 AND\n",
" datediff(DATE(transaction_date),snapshot_date) >=0 THEN current_dpd ELSE NULL END) AS delin_max_dpd_7d,\n",
" MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 15 AND\n",
" datediff(DATE(transaction_date),snapshot_date) >=0 THEN current_dpd ELSE NULL END) AS delin_max_dpd_15d,\n",
" MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 30 AND\n",
" datediff(DATE(transaction_date),snapshot_date) >=0 THEN current_dpd ELSE NULL END) AS delin_max_dpd_1mo,\n",
" MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 60 AND\n",
" datediff(DATE(transaction_date),snapshot_date) >=0 THEN current_dpd ELSE NULL END) AS delin_max_dpd_2mo,\n",
" MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 90 AND\n",
" datediff(DATE(transaction_date),snapshot_date) >=0 THEN current_dpd ELSE NULL END) AS delin_max_dpd_3mo,\n",
" \n",
" MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 7 AND\n",
" datediff(DATE(transaction_date),snapshot_date) >=0 THEN dpd_amount ELSE NULL END) AS delin_max_dpd_amt_7d,\n",
" MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 15 AND\n",
" datediff(DATE(transaction_date),snapshot_date) >=0 THEN dpd_amount ELSE NULL END) AS delin_max_dpd_amt_15d,\n",
" MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 30 AND\n",
" datediff(DATE(transaction_date),snapshot_date) >=0 THEN dpd_amount ELSE NULL END) AS delin_max_dpd_amt_1mo,\n",
" MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 60 AND\n",
" datediff(DATE(transaction_date),snapshot_date) >=0 THEN dpd_amount ELSE NULL END) AS delin_max_dpd_amt_2mo,\n",
" MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 90 AND\n",
" datediff(DATE(transaction_date),snapshot_date) >=0 THEN dpd_amount ELSE NULL END) AS delin_max_dpd_amt_3mo\n",
"FROM\n",
"\t\t(SELECT user_id,id,transaction_date \n",
"\t\tFROM l2alpha.`transaction` \n",
"\t\tWHERE id IN {tids}) AS base\n",
"\tLEFT JOIN \n",
"\t (SELECT user_id,\n",
"\t \tos_amount,\n",
"\t \tcurrent_dpd,\n",
"\t \tdpd_amount,\n",
"\t \tsnapshot_date\n",
"\t FROM ds.user_level_archive\n",
"\t ) AS delin ON base.user_id = delin.user_id AND snapshot_date <= DATE(transaction_date)\n",
"GROUP BY base.id\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:37.439306Z",
"start_time": "2019-11-07T03:58:08.992173Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ec2-user/anaconda3/envs/env_personalloan/lib/python3.7/site-packages/pymysql/cursors.py:329: Warning: (3170, \"Memory capacity of 8388608 bytes for 'range_optimizer_max_mem_size' exceeded. Range optimization was not done for this query.\")\n",
" self._do_get_result()\n"
]
}
],
"source": [
"delin = pd.read_sql(delin_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:37.443941Z",
"start_time": "2019-11-07T03:58:37.440532Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 12)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"delin.shape"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:37.458294Z",
"start_time": "2019-11-07T03:58:37.445289Z"
}
},
"outputs": [],
"source": [
"delin.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Other"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## time_approve_to_pl_hour"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:37.472597Z",
"start_time": "2019-11-07T03:58:37.459708Z"
}
},
"outputs": [],
"source": [
"time_pl_q = f'''\n",
"SELECT \n",
" base.user_id,\n",
" base.id AS trx_id,\n",
" TIMESTAMPDIFF(HOUR, installment_ts, transaction_date) AS time_approve_to_pl_hour\n",
"FROM\n",
"\t\t(SELECT user_id,id,transaction_date \n",
"\t\tFROM l2alpha.`transaction` WHERE id IN {tids}) AS base\n",
"\tLEFT JOIN \n",
"\t (SELECT user_id, MAX(is_approved_timestamp) AS installment_ts \n",
"\t FROM l2alpha.web_userinstallmentapplicationdetails \n",
"\t WHERE user_id IN {uids}\n",
"\t GROUP BY user_id\n",
"\t ) AS wuiad \n",
"\tON base.user_id = wuiad.user_id\n",
"GROUP BY base.id\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:40.087947Z",
"start_time": "2019-11-07T03:58:37.473685Z"
}
},
"outputs": [],
"source": [
"time_pl = pd.read_sql(time_pl_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:40.092390Z",
"start_time": "2019-11-07T03:58:40.089336Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 3)"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"time_pl.shape"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:40.104548Z",
"start_time": "2019-11-07T03:58:40.093767Z"
}
},
"outputs": [],
"source": [
"time_pl.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## oth_last_trx_amount"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:40.123372Z",
"start_time": "2019-11-07T03:58:40.116350Z"
}
},
"outputs": [],
"source": [
"oth_last_trx_q = f'''\n",
"SELECT main.user_id,\n",
"\t main.id AS trx_id,\n",
" amount AS oth_last_trx_amount\n",
"FROM\n",
"\t(SELECT \n",
"\t base.user_id, \n",
"\t base.id,\n",
"\t MAX(trx.id) AS last_id\n",
"\tFROM\n",
"\t\t(SELECT user_id, \n",
"\t\t\t\tid, \n",
"\t\t\t\ttransaction_date \n",
"\t\tFROM l2alpha.`transaction`\n",
"\t\tWHERE id IN {tids}) AS base\n",
"\tLEFT JOIN \n",
"\t (SELECT user_id,\n",
"\t \tid,\n",
"\t \ttransaction_date AS td,\n",
"\t \tamount,\n",
"\t \tstatus\n",
"\t FROM l2alpha.`transaction`\n",
"\t WHERE status = 4\n",
"\t ) AS trx \n",
"\tON base.user_id = trx.user_id AND td < transaction_date\n",
"\tGROUP BY base.id\n",
"\t) AS main\n",
"LEFT JOIN \n",
"\t(SELECT id, amount\n",
"\tFROM l2alpha.`transaction`\n",
"\t) AS tr \n",
"ON tr.id = main.last_id\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:08.704375Z",
"start_time": "2019-11-07T03:58:40.124523Z"
}
},
"outputs": [],
"source": [
"oth_last_trx = pd.read_sql(oth_last_trx_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:08.708913Z",
"start_time": "2019-11-07T04:00:08.705517Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 3)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"oth_last_trx.shape"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:08.744539Z",
"start_time": "2019-11-07T04:00:08.713156Z"
}
},
"outputs": [],
"source": [
"oth_last_trx.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## oth_first_trx_amount"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:08.753397Z",
"start_time": "2019-11-07T04:00:08.748442Z"
}
},
"outputs": [],
"source": [
"oth_first_trx_q = f'''\n",
"SELECT main.user_id,\n",
"\t main.id AS trx_id,\n",
" amount AS oth_first_trx_amount\n",
"FROM\n",
"\t(SELECT \n",
"\t base.user_id, \n",
"\t base.id,\n",
"\t MIN(trx.id) AS first_id\n",
"\tFROM\n",
"\t\t(SELECT user_id,\n",
"\t\t\t\tid,\n",
"\t\t\t\ttransaction_date \n",
"\t\tFROM l2alpha.`transaction` \n",
"\t\tWHERE id IN {tids}) AS base\n",
"\tLEFT JOIN \n",
"\t (SELECT user_id,\n",
"\t \tid,\n",
"\t \ttransaction_date AS td,\n",
"\t \tamount,\n",
"\t \tstatus\n",
"\t FROM l2alpha.`transaction` \n",
"\t WHERE status = 4\n",
"\t ) AS trx \n",
"\tON base.user_id = trx.user_id AND td < transaction_date\n",
"\tGROUP BY base.id\n",
"\t) AS main\n",
"LEFT JOIN \n",
"\t(SELECT id, amount\n",
"\tFROM l2alpha.`transaction`\n",
"\t) AS tr \n",
"ON tr.id = main.first_id\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:11.643352Z",
"start_time": "2019-11-07T04:00:08.756040Z"
}
},
"outputs": [],
"source": [
"oth_first_trx = pd.read_sql(oth_first_trx_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:11.648365Z",
"start_time": "2019-11-07T04:00:11.644573Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 3)"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"oth_first_trx.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:11.658313Z",
"start_time": "2019-11-07T04:00:11.650662Z"
}
},
"outputs": [],
"source": [
"oth_first_trx.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## oth_last_rep_xxx:\n",
"- dpd\n",
"- days\n",
"- channel\n",
"- amount"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:11.673428Z",
"start_time": "2019-11-07T04:00:11.659983Z"
}
},
"outputs": [],
"source": [
"oth_last_rep_q = f'''\n",
"SELECT main.user_id,\n",
"\t main.id AS trx_id,\n",
" MAX(dpd) AS oth_last_rep_dpd,\n",
" datediff(main.transaction_date,main.last_pd) AS oth_last_rep_days,\n",
" payment_channel AS oth_last_rep_channel,\n",
" SUM(amount) AS oth_last_rep_amount\n",
"FROM\n",
"\t(SELECT base.user_id,\n",
"\t\t\tbase.id,\n",
"\t \tbase.transaction_date,\n",
"\t \tMAX(up.pd) AS last_pd\n",
"\tFROM\n",
"\t\t(SELECT user_id,\n",
" id,\n",
" transaction_date,\n",
" status\n",
"\t\tFROM l2alpha.`transaction`\n",
"\t\tWHERE id IN {tids}\n",
" ) AS base\n",
"\tLEFT JOIN \n",
"\t (SELECT user_id,\n",
"\t \tid,\n",
"\t \tpayment_type,\n",
"\t \tpayment_channel,\n",
"\t \tpayment_date AS pd,\n",
"\t \tdpd,\n",
"\t \tstatus\n",
"\t FROM l2alpha.user_payment \n",
"\t WHERE status = 2 AND payment_type = 'INSTALLMENT' AND payment_channel <> 'CANCELLATION'\n",
"\t ) AS up \n",
"\tON base.user_id = up.user_id AND pd < transaction_date\n",
"\tGROUP BY base.id\n",
"\t) AS main\n",
"LEFT JOIN \n",
"\t(SELECT user_id, \n",
"\t\t\tid, \n",
"\t\t\tdpd, \n",
"\t\t\tpayment_date,\n",
" payment_channel,\n",
" amount\n",
"\tFROM l2alpha.user_payment) AS up2\n",
"ON up2.payment_date = main.last_pd AND main.user_id = up2.user_id\n",
"GROUP BY main.id\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:45.110051Z",
"start_time": "2019-11-07T04:00:11.674575Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"oth_last_rep = pd.read_sql(oth_last_rep_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:45.115239Z",
"start_time": "2019-11-07T04:00:45.111412Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 6)"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"oth_last_rep.shape"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:45.140803Z",
"start_time": "2019-11-07T04:00:45.116464Z"
}
},
"outputs": [],
"source": [
"last_rep_channel_encoding = {'BUKALAPAK': 1, 'TOKOPEDIA': 1,\n",
" 'BCA_VA': 2, 'MANDIRI_OPEN_VA': 2, 'MANDIRI_VA': 2, 'PERMATA_VA': 2,\n",
" 'ALFAMART_VA': 3, 'INDOMARET_VA': 3,\n",
" 'KREDIVO': 4, 'MANUAL_ALFAMART VA': 4, 'MANUAL_INDOMARET_VA': 4, \n",
" 'MANUAL_BCA VA': 4, 'MANUAL_BCA_VA': 4,\n",
" 'MANUAL_MANDIRI': 4, 'MANUAL_MANDIRI_OPEN_': 4, 'MANUAL_MANDIRI_VA': 4, \n",
" 'MANUAL_PERMATA VA': 4, 'MANUAL_PERMATA_VA': 4,\n",
" 'MANUAL_PC': 4, 'MANUAL_EXCESS': 4, 'MANUAL_WAIVER': 4, 'MANUAL_WAIVER_CLCT': 4}\n",
"\n",
"default_value = 3"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:45.147923Z",
"start_time": "2019-11-07T04:00:45.142356Z"
}
},
"outputs": [],
"source": [
"oth_last_rep['oth_last_rep_channel_raw'] = oth_last_rep['oth_last_rep_channel']\n",
"oth_last_rep['oth_last_rep_channel'] = oth_last_rep['oth_last_rep_channel'].map(last_rep_channel_encoding)\\\n",
" .fillna(default_value).astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:45.176873Z",
"start_time": "2019-11-07T04:00:45.149190Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"3 10194\n",
"2 5529\n",
"1 460\n",
"4 7\n",
"Name: oth_last_rep_channel, dtype: int64"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"oth_last_rep['oth_last_rep_channel'].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 4. Utilization"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## util_non_pl"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:45.181482Z",
"start_time": "2019-11-07T04:00:45.178221Z"
}
},
"outputs": [],
"source": [
"util_non_pl_q = f'''\n",
"SELECT main.user_id,\n",
"\t main.trx_id,\n",
"\t COALESCE(SUM(loan_amount/transaction_limit), 0) AS util_non_pl\n",
"FROM\n",
"\t(SELECT user_id, \n",
"\t\t trx_id, \n",
"\t\t loan_id,\n",
"\t\t loan_amount,\n",
"\t\t tenure,\n",
"\t\t MAX(creation_time) AS latest_limit\n",
"\tFROM\t\n",
"\t\t(SELECT tr.user_id,\n",
"\t\t tr.id AS trx_id,\n",
"\t\t lo.loan_id,\n",
"\t\t loan_amount,\n",
"\t\t creation_time,\n",
"\t\t tenure,\n",
"\t\t COALESCE(TIMESTAMPDIFF(DAY, creation_time, transaction_date), 0) AS time_diff\n",
"\t\tFROM \n",
"\t\t\t\t(SELECT user_id,\n",
"\t\t\t\t\t\ttransaction_date,\n",
"\t\t\t\t \tid\n",
"\t\t\t\tFROM l2alpha.`transaction` \n",
"\t\t\t\tWHERE id IN {tids}) AS tr\n",
"\t\t\tLEFT JOIN \n",
"\t\t\t\t(SELECT id AS loan_id,\n",
"\t\t\t\t\t\tuser_id,\n",
"\t\t\t\t \tstart_date,\n",
"\t\t\t\t \tsettlement_date,\n",
"\t\t\t\t \tloan_amount,\n",
"\t\t\t\t \ttenure,\n",
"\t\t\t\t \ttransaction_id \n",
"\t\t\t\tFROM l2alpha.loan\n",
"\t\t\t\tWHERE is_active = 1 AND loan_amount > 0) AS lo\n",
"\t\t\tON lo.user_id = tr.user_id AND start_date < transaction_date AND (settlement_date IS NULL OR settlement_date > transaction_date)\n",
"\t\t\t\n",
"\t\t\tLEFT JOIN \n",
"\t\t\t\t(SELECT user_id,\n",
"\t\t\t\t\t\tCASE WHEN installment_type = '30_days' THEN 30\n",
"\t\t\t\t\t\tWHEN installment_type = '3_months' THEN 90 \n",
"\t\t\t\t\t\tWHEN installment_type = '6_months' THEN 180 \n",
"\t\t\t\t\t\tWHEN installment_type = '12_months' THEN 360 \n",
"\t\t\t\t\t\tELSE NULL END AS installment_type,\n",
"\t\t\t\t\t\tcreation_time\n",
"\t\t\t\tFROM l2alpha.transaction_limit_history\n",
"\t\t\t\t) AS tlh\n",
"\t\t\tON tlh.user_id = lo.user_id AND tlh.installment_type = lo.tenure\n",
"\t\t) AS base\n",
"\tWHERE time_diff >= 0\n",
"\tGROUP BY trx_id, loan_id\n",
"\t) AS main\n",
"\t\n",
"\tLEFT JOIN \n",
"\t(SELECT user_id,\n",
"\t\t\ttransaction_limit,\n",
"\t\t\tCASE WHEN installment_type = '30_days' THEN 30\n",
"\t\t\tWHEN installment_type = '3_months' THEN 90 \n",
"\t\t\tWHEN installment_type = '6_months' THEN 180 \n",
"\t\t\tWHEN installment_type = '12_months' THEN 360 \n",
"\t\t\tELSE NULL END AS installment_type,\n",
"\t\t\tcreation_time\n",
"\tFROM l2alpha.transaction_limit_history\n",
"\tWHERE user_id IN {uids}\n",
"\tGROUP BY user_id, transaction_limit, installment_type, creation_time\n",
"\t) AS lim\n",
"\tON lim.creation_time = main.latest_limit AND lim.user_id = main.user_id\n",
"\tAND lim.installment_type = main.tenure\n",
"GROUP BY main.trx_id\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:51.464862Z",
"start_time": "2019-11-07T04:00:45.182833Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ec2-user/anaconda3/envs/env_personalloan/lib/python3.7/site-packages/pymysql/cursors.py:329: Warning: (1052, \"Column 'installment_type' in group statement is ambiguous\")\n",
" self._do_get_result()\n"
]
}
],
"source": [
"util_non_pl = pd.read_sql(util_non_pl_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:51.469712Z",
"start_time": "2019-11-07T04:00:51.466148Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 3)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"util_non_pl.shape"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:51.515275Z",
"start_time": "2019-11-07T04:00:51.471223Z"
}
},
"outputs": [],
"source": [
"util_non_pl.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## util_pl"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:51.534178Z",
"start_time": "2019-11-07T04:00:51.523536Z"
}
},
"outputs": [],
"source": [
"util_pl_q = f'''\n",
"SELECT main.user_id, \n",
"\t trx_id,\n",
"\t COALESCE((amount/transaction_limit), 0) AS util_pl\n",
"FROM\n",
"\t(SELECT user_id, trx_id, amount, payment_type, transaction_date,\n",
"\t\t MAX(creation_time) AS latest_limit\n",
"\tFROM\n",
"\t\t(SELECT tr.user_id, tr.id AS trx_id, tr.amount, tr.payment_type, tr.transaction_date,\n",
"\t\t\t\ttlh.creation_time,\n",
"\t\t\t TIMESTAMPDIFF(DAY, creation_time, transaction_date) AS time_diff\n",
"\t\tFROM\n",
"\t\t\t(SELECT *\n",
"\t\t\tFROM l2alpha.`transaction` \n",
"\t\t\tWHERE id IN {tids}\n",
"\t\t\t) AS tr\n",
"\t\t\tLEFT JOIN l2alpha.transaction_limit_history AS tlh\n",
"\t\t\tON tr.user_id = tlh.user_id AND tr.payment_type = tlh.installment_type\n",
"\t\t) AS base\n",
"\tWHERE time_diff >= 0\n",
"\tGROUP BY base.trx_id\n",
"\t) AS main\n",
"\t\n",
"\tLEFT JOIN \n",
"\t(SELECT user_id,\n",
"\t\t\ttransaction_limit,\n",
"\t\t\tinstallment_type,\n",
"\t\t\tcreation_time\n",
"\tFROM l2alpha.transaction_limit_history\n",
"\tWHERE user_id IN {uids}\n",
"\tGROUP BY user_id, transaction_limit, installment_type, creation_time\n",
"\t) AS lim\n",
"\tON lim.creation_time = main.latest_limit AND lim.user_id = main.user_id \n",
" AND lim.installment_type = main.payment_type\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:54.715476Z",
"start_time": "2019-11-07T04:00:51.536994Z"
}
},
"outputs": [],
"source": [
"util_pl = pd.read_sql(util_pl_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:54.720364Z",
"start_time": "2019-11-07T04:00:54.716934Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 3)"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"util_pl.shape"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:54.733988Z",
"start_time": "2019-11-07T04:00:54.722455Z"
}
},
"outputs": [],
"source": [
"util_pl.head()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"util_pl = util_pl[['user_id', 'trx_id', 'util_pl']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 5. A - Score Feature"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:54.753981Z",
"start_time": "2019-11-07T04:00:54.735531Z"
}
},
"outputs": [],
"source": [
"ascore_q = f'''\n",
"SELECT asc2.user_id, calibrated_final, engine\n",
"FROM ds.a_score_css_2 AS asc2\n",
"JOIN\n",
"(SELECT user_id, max(id) AS latest_id\n",
"FROM ds.a_score_css_2 \n",
"WHERE user_id IN {uids}\n",
"AND engine NOT LIKE '%pd3%'\n",
"AND engine NOT LIKE '%web%'\n",
"GROUP BY user_id\n",
") AS asc1\n",
"ON asc2.id = asc1.latest_id\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:01:06.104428Z",
"start_time": "2019-11-07T04:00:54.755309Z"
}
},
"outputs": [],
"source": [
"a_score = pd.read_sql(ascore_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:01:06.110083Z",
"start_time": "2019-11-07T04:01:06.105797Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(14942, 3)"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a_score.shape"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:01:06.120607Z",
"start_time": "2019-11-07T04:01:06.113854Z"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"14942"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a_score['user_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:01:06.141338Z",
"start_time": "2019-11-07T04:01:06.123192Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"a_score.head(20)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"ios_model_used = []\n",
"engines_used = a_score.engine.unique().tolist()\n",
"for engine in engines_used:\n",
" if 'ios_' in engine:\n",
" ios_model_used.append(engine)\n",
"engines_used = [eng for eng in engines_used if 'ios' not in eng]\n",
"\n",
"x = pd.DataFrame(engines_used)\n",
"x['eng'] = x[0].apply(lambda x: None if pd.isna(x) else x.split()[0])\n",
"x['vers'] = x[0].apply(lambda x: None if pd.isna(x) else x.split()[1].split('.')[-1])\n",
"x['vers'] = np.where(x['vers']=='h2o',None,x['vers'])\n",
"\n",
"x.dropna(inplace=True)\n",
"x['vers'] = x['vers'].astype(int)\n",
"\n",
"model_used = x.loc[x[['eng','vers']].groupby('eng').idxmax()['vers'].tolist()][0].tolist()\n",
"model_used = model_used + ios_model_used\n",
"model_used.remove('cicilan_fresh_normal 0.0.2')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"model_used"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"a_score['calibrated_final'] = np.where(a_score['engine'].isin(model_used),\n",
" a_score['calibrated_final'], np.nan)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"a_score = a_score[['user_id', 'calibrated_final', 'engine']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 6. Installed Payday Loan App Count"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:01:06.262833Z",
"start_time": "2019-11-07T04:01:06.257660Z"
}
},
"outputs": [],
"source": [
"ins_app_q = f'''\n",
"SELECT user_id,\n",
" installed_applications\n",
"FROM l2alpha.web_userdevicedetail\n",
"WHERE user_id IN {uids}\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:40.041309Z",
"start_time": "2019-11-07T04:01:06.264471Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"ins_app = pd.read_sql(ins_app_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:40.048118Z",
"start_time": "2019-11-07T04:02:40.042945Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(15307, 2)"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ins_app.shape"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"plo_app_list = pd.read_csv('data/payday_700_list.csv', header=None)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:40.084424Z",
"start_time": "2019-11-07T04:02:40.066336Z"
}
},
"outputs": [],
"source": [
"payday_loan_apps = plo_app_list[0].unique().tolist()\n",
"payday_loan_apps = [x.lower() for x in payday_loan_apps]"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:40.109404Z",
"start_time": "2019-11-07T04:02:40.097156Z"
}
},
"outputs": [],
"source": [
"def create_feats_apps(raw_df,label='installed_applications'):\n",
"\n",
" df = raw_df.copy()\n",
" df[label] = df[label].fillna('[]')\n",
" df[label] = df[label].apply(lambda x: eval(x))\n",
"\n",
" def count_apps(x,list_apps):\n",
" return len([ft for ft in x if ft.lower() in list_apps])\n",
"\n",
" df['ap_co_plo'] = df[label].apply(lambda x:count_apps(x,payday_loan_apps))\n",
" \n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:44.459722Z",
"start_time": "2019-11-07T04:02:40.111155Z"
}
},
"outputs": [],
"source": [
"plo_apps = create_feats_apps(ins_app)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:44.507926Z",
"start_time": "2019-11-07T04:02:44.461220Z"
}
},
"outputs": [],
"source": [
"plo_apps = plo_apps.drop(columns=['installed_applications'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Series nth PL transaction"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"def get_pl_history(base_df):\n",
" uids = tuple(base_df['user_id'])\n",
" max_trx_date = base_df['transaction_date'].max()\n",
"\n",
" query = f\"\"\"\n",
" SELECT user_id,\n",
" id AS trx_id,\n",
" status,\n",
" payment_type,\n",
" amount,\n",
" transaction_date\n",
" FROM l2alpha.transaction\n",
" WHERE user_id IN {uids}\n",
" AND transaction_type = 2\n",
" AND transaction_date <= '{max_trx_date}'\n",
" \"\"\"\n",
"\n",
" df = pd.read_sql(query, connect_sql())\n",
" df['gen_payment_type'] = df['payment_type'].apply(lambda x: 'mini' if x=='30_days' else 'jumbo')\n",
" \n",
" out_df = base_df[['user_id', 'trx_id', 'transaction_date']].merge(df, how='left', on='user_id')\\\n",
" .rename(columns={'transaction_date_x':'trx_dt_current',\n",
" 'transaction_date_y':'trx_dt_past',\n",
" 'trx_id_x':'trx_id',\n",
" 'trx_id_y':'trx_id_ref'})\n",
" out_df['time_diff'] = (out_df['trx_dt_current'] - out_df['trx_dt_past']).dt.total_seconds()\n",
" return out_df"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"def create_pl_series(df, base_df, pl_type='all'):\n",
" if pl_type != 'all':\n",
" df = df[df['gen_payment_type'] == pl_type]\n",
" \n",
" df = df[(df['time_diff'] >= 0) &\n",
" (df['status'] == 4)]\n",
" \n",
" series_df = df.groupby(['user_id', 'trx_id', 'gen_payment_type']).size()\\\n",
" .unstack(level=[2]).fillna(0)\n",
" series_df.columns = [f'pl_series_{col}' for col in series_df.columns.values]\n",
" \n",
" series_df['pl_series_jumbomini'] = series_df['pl_series_jumbo'] + series_df['pl_series_mini']\n",
" series_df = series_df.reset_index()\n",
" return series_df"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"pl_hist = get_pl_history(base_df)"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"series_feats = create_pl_series(pl_hist, base_df)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(6030, 5)"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"series_feats.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Number of Previous PL Transactions"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"def create_prev_pl_feats(df, base_df, window='all'):\n",
" if isinstance(window, int):\n",
" df = df[(df['time_diff'] <= window*(24*60*60)) & (df['time_diff'] > 0)]\n",
" name = str(window)\n",
" elif window == 'all':\n",
" df = df[df['time_diff'] > 0]\n",
" name = 'ever'\n",
" \n",
" prev_df = df.groupby(['user_id', 'trx_id', 'status', 'gen_payment_type']).size()\\\n",
" .unstack(level=[3,2]).fillna(0)\n",
" prev_df.columns = [f'pl_trx_co_{col[0]}_{col[1]}_{name}' for col in prev_df.columns.values]\n",
" \n",
" keywords = [2, 4, 5, 6, 7, 11, 'jumbo', 'mini']\n",
" stat_cols_dict = {}\n",
" for word in keywords:\n",
" str_word = str(word) if isinstance(word, int) else word\n",
" stat_cols_dict[word] = [col for col in prev_df if str_word in col]\n",
" \n",
" name_status = 'pl_trx_co_jumbomini_{}_{}'\n",
" name_pl_type = 'pl_trx_co_{}_all_{}'\n",
" for k, v in stat_cols_dict.items():\n",
" if isinstance(k, int):\n",
" prev_df[name_status.format(str(k), name)] = prev_df[v].sum(axis=1)\n",
" elif isinstance(k, str):\n",
" prev_df[name_pl_type.format(k, name)] = prev_df[v].sum(axis=1)\n",
" \n",
" prev_df[f'pl_trx_co_jumbomini_all_{name}'] = prev_df[f'pl_trx_co_jumbo_all_{name}'] + \\\n",
" prev_df[f'pl_trx_co_mini_all_{name}']\n",
" \n",
" prev_df = base_df[['user_id', 'trx_id']].merge(prev_df, how='left', on=['user_id', 'trx_id']).fillna(0)\n",
" \n",
" return prev_df"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"prev_obs_feats_all = create_prev_pl_feats(pl_hist, base_df, window='all')\n",
"prev_obs_feats_180 = create_prev_pl_feats(pl_hist, base_df, window=180)\n",
"prev_obs_feats_90 = create_prev_pl_feats(pl_hist, base_df, window=90)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 23)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(16190, 21)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(16190, 19)"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(prev_obs_feats_all.shape)\n",
"display(prev_obs_feats_180.shape)\n",
"display(prev_obs_feats_90.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Early Settlement"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"def get_trx_settle_data(base_df):\n",
" uids = tuple(base_df['user_id'])\n",
" max_trx_date = base_df['transaction_date'].max()\n",
"\n",
" early_settle_q = f'''\n",
" SELECT *,\n",
" DATE(transaction_date) + INTERVAL payment_type_int DAY AS due_date\n",
" FROM\n",
" (SELECT tr.user_id, tr.trx_id, transaction_date, payment_type, transaction_type, status,\n",
" CASE WHEN payment_type = '30_days' THEN '30'\n",
" WHEN payment_type = '3_months' THEN '90'\n",
" WHEN payment_type = '6_months' THEN '180' \n",
" WHEN payment_type = '12_months' THEN '360' ELSE NULL END AS payment_type_int,\n",
"\n",
" settlement_date, loan_amount, is_active\n",
" FROM\n",
" (SELECT user_id,\n",
" id AS trx_id,\n",
" transaction_date,\n",
" payment_type,\n",
" transaction_type,\n",
" status\n",
" FROM l2alpha.`transaction`\n",
" WHERE user_id IN {uids}\n",
" AND transaction_type = 2\n",
" AND status = 4\n",
" AND transaction_date <= '{max_trx_date}'\n",
" ) AS tr\n",
"\n",
" LEFT JOIN\n",
"\n",
" (SELECT transaction_id AS trx_id,\n",
" settlement_date,\n",
" loan_amount,\n",
" is_active\n",
" FROM l2alpha.loan\n",
" ) AS lo\n",
"\n",
" ON tr.trx_id = lo.trx_id\n",
" ) AS base\n",
" WHERE loan_amount > 0 AND is_active = 1\n",
" '''\n",
" query_df = pd.read_sql(early_settle_q, connect_sql())\n",
" query_df['settle_to_due'] = (query_df['due_date'] - query_df['settlement_date'].dt.date).dt.days\n",
" df = base_df[['user_id', 'trx_id', 'transaction_date']].merge(query_df, how='left', on='user_id')\\\n",
" .rename(columns={'transaction_date_x':'trx_dt_current',\n",
" 'transaction_date_y':'trx_dt_past',\n",
" 'trx_id_x':'trx_id',\n",
" 'trx_id_y':'trx_id_ref'})\n",
" df['time_diff'] = (df['trx_dt_current'] - df['trx_dt_past']).dt.total_seconds()\n",
" df = df.sort_values(['user_id', 'trx_dt_past']).reset_index(drop=True)\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"def create_early_settle_feats(df, base_df, window='all'):\n",
" if isinstance(window, int):\n",
" df = df[(df['time_diff'] <= window*(24*60*60)) & (df['time_diff'] > 0)]\n",
" name = str(window)\n",
" elif window == 'all':\n",
" df = df[df['time_diff'] > 0]\n",
" name = 'ever'\n",
" df = df.sort_values(['user_id', 'trx_id', 'trx_dt_past'])\n",
" \n",
" # Calculate how many times the user has settled early in the last XX days\n",
" co_df = df.copy()\n",
" co_df['flag_early_settle'] = np.where(co_df['settle_to_due'] >= 30, 1, 0)\n",
" co_df['flag_early_settle'] = np.where(co_df['settle_to_due'].isna(), np.nan, co_df['flag_early_settle'])\n",
" co_df = co_df.groupby('trx_id').agg({'flag_early_settle':'sum'}).reset_index()\n",
" co_df.columns = ['trx_id', f'early_settle_co_{name}']\n",
" \n",
" # Calculate the min, max, average, and median days the user has settled early in the last XX days\n",
" # Also get the last time the user settled early (how many days early).\n",
" es_df = df.groupby('trx_id').agg({'settle_to_due':[max, min, 'mean', 'median', 'last']})\n",
" col_names = ['settle_to_due_max_{}', 'settle_to_due_min_{}', 'settle_to_due_avg_{}', \n",
" 'settle_to_due_med_{}', 'settle_to_due_last_pl_{}']\n",
" es_df.columns = [col.format(name) for col in col_names]\n",
" es_df = es_df.reset_index()\n",
" out_df = base_df[['user_id', 'trx_id']].merge(es_df, how='left', on='trx_id')\\\n",
" .merge(co_df, how='left', on='trx_id')\n",
" return out_df"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"trx_settle = get_trx_settle_data(base_df)"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"early_settle_feats_ever = create_early_settle_feats(trx_settle, base_df, window='all')\n",
"early_settle_feats_180 = create_early_settle_feats(trx_settle, base_df, window=180)\n",
"early_settle_feats_90 = create_early_settle_feats(trx_settle, base_df, window=90)"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 8)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(16190, 8)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(16190, 8)"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(early_settle_feats_ever.shape)\n",
"display(early_settle_feats_180.shape)\n",
"display(early_settle_feats_90.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Time from last PL Settlement"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"def create_time_from_last_pl_feat(df):\n",
" time_from_last_pl = df[df['time_diff'] >= 0]\n",
" time_from_last_pl.loc[(time_from_last_pl['time_diff'] == 0) & \n",
" (time_from_last_pl['settlement_date'].notna()), 'settlement_date'] = np.nan\n",
" time_from_last_pl = time_from_last_pl.sort_values(['user_id', 'trx_id', 'trx_dt_past'])\n",
"\n",
" time_from_last_pl_grp = time_from_last_pl.groupby('trx_id').tail(2)\n",
" time_from_last_pl_grp['settle_shift'] = time_from_last_pl_grp['settlement_date'].shift()\n",
" time_from_last_pl_grp['time_from_prev_pl_settle'] = (time_from_last_pl_grp['trx_dt_current'] - \\\n",
" time_from_last_pl_grp['settle_shift']).dt.total_seconds()/3600\n",
" \n",
" time_from_last_pl_final = time_from_last_pl_grp.groupby('trx_id').tail(1)\n",
" time_from_last_pl_final = time_from_last_pl_final[['user_id', 'trx_id', \n",
" 'time_from_prev_pl_settle']].reset_index(drop=True)\n",
" return time_from_last_pl_final"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"time_from_prev_pl_settle = create_time_from_last_pl_feat(trx_settle)"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(6030, 3)"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(time_from_prev_pl_settle.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Date and day of week of transaction date"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"date_feats = base_df[['user_id', 'trx_id', 'transaction_date']].copy()\n",
"date_feats['td_date'] = date_feats['transaction_date'].dt.day\n",
"date_feats['td_day_of_week'] = date_feats['transaction_date'].dt.dayofweek\n",
"date_feats = date_feats[['user_id', 'trx_id', 'td_date', 'td_day_of_week']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Digital and Non-Digital Transactions"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"def get_trx_history(base_df):\n",
" uids = tuple(base_df['user_id'])\n",
" max_trx_date = base_df['transaction_date'].max()\n",
" \n",
" # Remove all promotional and service related fees\n",
" excluded_product = ('biaya layanan'\n",
" ,'shipping fee'\n",
" ,'admin fee' \n",
" ,'biaya kirim yang dibebankan' \n",
" ,'biaya tambahan' \n",
" ,'discount' \n",
" ,'adminfee' \n",
" ,'shippingfee'\n",
" ,'additionalfee' \n",
" ,'diskon tambahan' \n",
" ,'donasi'\n",
" ,'donation' \n",
" ,'charge kredivo 2.3%' \n",
" ,'lainnya' \n",
" ,'user credit'\n",
" ,'deals') \n",
" \n",
" query = f\"\"\"\n",
" SELECT\n",
" t.user_id,\n",
" t.transaction_id AS trx_id,\n",
" t.transaction_date,\n",
" LOWER(p.name) AS product_name,\n",
" LOWER(p.category) AS category,\n",
" l.loan_amount,\n",
" t.merchant_id\n",
" FROM\n",
" (\n",
" SELECT\n",
" user_id,\n",
" id AS transaction_id,\n",
" transaction_date,\n",
" merchant_id\n",
" FROM\n",
" l2alpha.`transaction`\n",
" WHERE\n",
" user_id IN {uids}\n",
" AND status = 4\n",
" AND transaction_type <> 2\n",
" AND transaction_date <= '{max_trx_date}') AS t\n",
" JOIN (\n",
" SELECT\n",
" name,\n",
" id AS product_id,\n",
" transaction_id,\n",
" category,\n",
" sku\n",
" FROM\n",
" l2alpha.product\n",
" WHERE\n",
" sku_type = 0\n",
" AND name NOT IN {excluded_product}\n",
" AND sku NOT IN {excluded_product}) AS p ON\n",
" t.transaction_id = p.transaction_id\n",
" JOIN (\n",
" SELECT\n",
" transaction_id,\n",
" loan_amount\n",
" FROM\n",
" l2alpha.loan\n",
" WHERE\n",
" is_active = 1\n",
" AND loan_amount > 0) AS l ON\n",
" p.transaction_id = l.transaction_id\n",
" JOIN (\n",
" SELECT\n",
" id,\n",
" name\n",
" FROM\n",
" l2alpha.merchant) AS m ON\n",
" t.merchant_id = m.id\n",
" \"\"\"\n",
" \n",
" df = pd.read_sql(query, connect_sql())\n",
" out_df = base_df[['user_id', 'trx_id', 'transaction_date']].merge(df, how='left', on='user_id')\\\n",
" .rename(columns={'transaction_date_x':'trx_dt_curr',\n",
" 'transaction_date_y':'trx_dt_past',\n",
" 'trx_id_x':'trx_id',\n",
" 'trx_id_y':'trx_id_ref'})\n",
" out_df['time_diff'] = (out_df['trx_dt_curr'] - out_df['trx_dt_past']).dt.total_seconds()\n",
" return out_df"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"def get_ticket(x):\n",
" if int(x['merchant_id']) in [154,428,48,87,432]:\n",
" if 'hotel' not in x['product_name'] and 'hotel' not in x['category']:\n",
" return True\n",
" elif x['merchant_id'] in [270]:\n",
" if 'flight' in x['product_name'] or 'flight' in x['category']:\n",
" return True\n",
"\n",
" return False\n",
"\n",
"def get_utilities(x):\n",
" for w in ['pdam','pln','tv kabel', 'electricity','electricity_postpaid',\n",
" 'listrik pln', 'pln','pdam','internet & tv kabel','air pdam', \n",
" '75061609','75062176', 'listrik prabayar', 'listrik lainnya','pulsa listrik']:\n",
" if w in x['product_name'] or w in x['category']: \n",
" return True\n",
"\n",
" if re.search(r'(tagihan).*(listrik|telkom)|(voucher|token).*(listrik|pln)|pln.*(prepaid|postpaid|token)|\\bpdam\\b', x['product_name']):\n",
" return True\n",
"\n",
" return False\n",
"\n",
"def get_game(x):\n",
" if int(x['merchant_id']) in [426,197]:\n",
" return True\n",
" elif re.search(r'(voucher|data).*game|game.*(voucher|data)|\\bgamesmax\\b', x['product_name']):\n",
" return True\n",
" elif re.search('voucher', x['category']):\n",
" if re.search('game',x['product_name']):\n",
" return True\n",
" else:\n",
" return False\n",
"\n",
"def get_pulsa(x):\n",
" if int(x['merchant_id']) == 8:\n",
" for n in ['pln','listrik','pdam','game']:\n",
" if n in x['product_name']:\n",
" return False\n",
" if 'rp' in x['product_name']:\n",
" return True\n",
" elif x['merchant_id'] != 8:\n",
" for n in ['pln','listrik','pdam','game', 'flight','mobiles & tablets', 'komputer & aksesoris']:\n",
" if n in x['product_name'] or n in x['category']:\n",
" return False\n",
" for w in ['mobile', 'digital products','pulsa', \n",
" 'virtual','75061635','paket data','digital utilities','pasca bayar']:\n",
" if w in x['category']:\n",
" return True\n",
"\n",
" if re.search(r'\\bpulsa\\b', x['product_name']):\n",
" return True\n",
"\n",
" for w in ['paket data', 'kuota', 'xl xtra', 'indosat' , 'telkomsel', 'voucher 3',\n",
" 'smartfren', 'voucher tri', 'xl xtra combo', 'axis', 'paket internet']:\n",
" if w in x['product_name']:\n",
" return True\n",
"\n",
" else:\n",
" return False\n",
" \n",
"def get_digital_product_status(df):\n",
" df['trx_cat_ticket'] = df.apply(lambda z: get_ticket(z), axis=1)\n",
" df['trx_cat_game'] = df.apply(lambda z: get_game(z), axis=1)\n",
" df['trx_cat_pulsa'] = df.apply(lambda z: get_pulsa(z), axis=1)\n",
" df['trx_cat_utilities'] = df.apply(lambda z: get_utilities(z), axis=1)\n",
" \n",
" df['trx_cat_digital'] = df[['trx_cat_game', 'trx_cat_pulsa', 'trx_cat_utilities']].any(1)\n",
" \n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"def create_prev_digital_trx_per_type(df, keyword, window='all'):\n",
" if keyword not in ['ticket', 'game', 'pulsa', 'utilities', 'digital']:\n",
" raise ValueError(f'{keyword} not available. Try ticket, game, pulsa, utilities, digital.')\n",
" \n",
" if isinstance(window, int):\n",
" df = df[(df['time_diff'] <= window*(24*60*60)) & (df['time_diff'] > 0)]\n",
" name = str(window)\n",
" elif window == 'all':\n",
" df = df[df['time_diff'] > 0]\n",
" name = 'ever'\n",
" elif isinstance(window, int) == False or window != 'all':\n",
" raise ValueError(f'The input \"{window}\" is wrong. Window value must be an integer or \"all\".')\n",
" \n",
" key = 'trx_cat_'+ keyword\n",
" digital = df.groupby(['user_id', 'trx_id', key]).agg({key:'count',\n",
" 'loan_amount':['sum', 'mean']})\\\n",
" .unstack().fillna(0)\n",
" \n",
" columns = []\n",
" for groups in digital.columns.values:\n",
" col_name = []\n",
" for item in groups:\n",
" col_name.append(str(item))\n",
" new_name = '-'.join(col_name)\n",
" columns.append(new_name)\n",
" digital.columns = columns\n",
" \n",
" if keyword == 'digital':\n",
" final = digital.reset_index()\n",
" final.columns = ['user_id', 'trx_id',\n",
" f'nondgtl_nonpl_trx_co_{name}',\n",
" f'dgtl_trx_co_all_{name}',\n",
" f'nondgtl_nonpl_trx_sum_amt_{name}',\n",
" f'dgtl_trx_sum_amt_all_{name}',\n",
" f'nondgtl_nonpl_trx_avg_amt_{name}',\n",
" f'dgtl_trx_avg_amt_all_{name}']\n",
" \n",
" else:\n",
" relevant_columns = [col for col in digital.columns if 'True' in col]\n",
" final = digital[relevant_columns].reset_index()\n",
" final.columns = ['user_id', 'trx_id', \n",
" f'dgtl_trx_co_{keyword}_{name}', \n",
" f'dgtl_trx_sum_amt_{keyword}_{name}',\n",
" f'dgtl_trx_avg_amt_{keyword}_{name}']\n",
" \n",
" return final"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"def create_digital_trx_feats(df_in, begin, finish):\n",
" # Load Raw Data\n",
" st_tm_1 = datetime.now()\n",
" print('Querying Transaction Data')\n",
" \n",
" df = df_in.iloc[begin:finish].reset_index(drop=True).copy()\n",
" display(df.shape)\n",
" trx_hist = get_trx_history(df)\n",
" trx_hist = trx_hist.dropna(how='any')\n",
" display(trx_hist.shape)\n",
" \n",
" print('query time:', datetime.now()-st_tm_1)\n",
" \n",
" #Create Digital Product Flags\n",
" st_tm_2 = datetime.now()\n",
" print('Create Digital Product Flags')\n",
" \n",
" trx_hist = get_digital_product_status(trx_hist)\n",
" display(trx_hist.shape)\n",
" \n",
" print('Create Digital Product Flags time:', datetime.now()-st_tm_2)\n",
" \n",
" #Create Features\n",
" st_tm_3 = datetime.now()\n",
" print('Create Features')\n",
" \n",
" ticket_all = create_prev_digital_trx_per_type(trx_hist, 'ticket', window='all')\n",
" ticket_180 = create_prev_digital_trx_per_type(trx_hist, 'ticket', window=180)\n",
" ticket_90 = create_prev_digital_trx_per_type(trx_hist, 'ticket', window=90)\n",
" \n",
" game_all = create_prev_digital_trx_per_type(trx_hist, 'game', window='all')\n",
" game_180 = create_prev_digital_trx_per_type(trx_hist, 'game', window=180)\n",
" game_90 = create_prev_digital_trx_per_type(trx_hist, 'game', window=90)\n",
" \n",
" pulsa_all = create_prev_digital_trx_per_type(trx_hist, 'pulsa', window='all')\n",
" pulsa_180 = create_prev_digital_trx_per_type(trx_hist, 'pulsa', window=180)\n",
" pulsa_90 = create_prev_digital_trx_per_type(trx_hist, 'pulsa', window=90)\n",
" \n",
" utilities_all = create_prev_digital_trx_per_type(trx_hist, 'utilities', window='all')\n",
" utilities_180 = create_prev_digital_trx_per_type(trx_hist, 'utilities', window=180)\n",
" utilities_90 = create_prev_digital_trx_per_type(trx_hist, 'utilities', window=90)\n",
" \n",
" digital_all = create_prev_digital_trx_per_type(trx_hist, 'digital', window='all')\n",
" digital_180 = create_prev_digital_trx_per_type(trx_hist, 'digital', window=180)\n",
" digital_90 = create_prev_digital_trx_per_type(trx_hist, 'digital', window=90)\n",
" \n",
" print('Create Features time:', datetime.now()-st_tm_3)\n",
" \n",
" #Combine DataFrames\n",
" st_tm_4 = datetime.now()\n",
" print('Combine DataFrames')\n",
" \n",
" df_out = df[['user_id', 'trx_id']].merge(ticket_all, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(ticket_180, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(ticket_90, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(game_all, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(game_180, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(game_90, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(pulsa_all, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(pulsa_180, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(pulsa_90, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(utilities_all, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(utilities_180, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(utilities_90, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(digital_all, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(digital_180, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(digital_90, how='left', on=['user_id', 'trx_id'])\n",
" \n",
" print('combine dataframe time:', datetime.now()-st_tm_4)\n",
" print('total processing time:', datetime.now()-st_tm_1)\n",
" \n",
" return df_out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0025009d62ad4ebf8d91fd5f1dc30a4d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"IntProgress(value=1, max=3, min=1)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"process data from 0 to 5000.\n",
"Querying Transaction Data\n"
]
},
{
"data": {
"text/plain": [
"(5000, 10)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(584705, 15)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Create Digital Product Flags time: 0:06:38.272084\n",
"Create Features\n",
"Create Features time: 0:00:01.189097\n",
"Combine DataFrames\n",
"combine dataframe time: 0:00:00.071482\n",
"total processing time: 0:10:41.888767\n",
"batch 1 done!\n",
"================================================================================================================\n",
"process data from 5000 to 10000.\n",
"Querying Transaction Data\n"
]
},
{
"data": {
"text/plain": [
"(5000, 10)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(175423, 10)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"query time: 0:01:21.783504\n",
"Create Digital Product Flags\n"
]
},
{
"data": {
"text/plain": [
"(175423, 15)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Create Digital Product Flags time: 0:02:03.173538\n",
"Create Features\n",
"Create Features time: 0:00:00.643540\n",
"Combine DataFrames\n",
"combine dataframe time: 0:00:00.071640\n",
"total processing time: 0:03:25.672570\n",
"batch 2 done!\n",
"================================================================================================================\n",
"process data from 10000 to 15000.\n",
"Querying Transaction Data\n"
]
},
{
"data": {
"text/plain": [
"(5000, 10)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(72388, 10)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"query time: 0:00:58.083601\n",
"Create Digital Product Flags\n"
]
},
{
"data": {
"text/plain": [
"(72388, 15)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Create Digital Product Flags time: 0:00:51.773500\n",
"Create Features\n",
"Create Features time: 0:00:00.259759\n",
"Combine DataFrames\n",
"combine dataframe time: 0:00:00.067609\n",
"total processing time: 0:01:50.184895\n",
"batch 3 done!\n",
"================================================================================================================\n",
"process data from 15000 to 20000.\n",
"Querying Transaction Data\n"
]
},
{
"data": {
"text/plain": [
"(1190, 10)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(7930, 10)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"query time: 0:00:13.325071\n",
"Create Digital Product Flags\n"
]
},
{
"data": {
"text/plain": [
"(7930, 15)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Create Digital Product Flags time: 0:00:05.521818\n",
"Create Features\n",
"Create Features time: 0:00:00.225180\n",
"Combine DataFrames\n",
"combine dataframe time: 0:00:00.056082\n",
"total processing time: 0:00:19.128461\n",
"batch 4 done!\n",
"================================================================================================================\n",
"Total Processing Time: 0:16:17.223499\n"
]
}
],
"source": [
"from ipywidgets import IntProgress\n",
"from IPython.display import display\n",
"\n",
"FOLDER_LOC = 'data/rej_dgtl_trx_feats'\n",
"batch = 1\n",
"batch_size = 5000\n",
"\n",
"max_count = round(base_df.shape[0]/batch_size)\n",
"f = IntProgress(min=batch, max=max_count) # instantiate the bar\n",
"display(f) # display the bar\n",
"\n",
"st_tm = datetime.now()\n",
"\n",
"while (batch*batch_size) - base_df.shape[0] < batch_size:\n",
" f.value += 1\n",
" start = (batch-1)*batch_size\n",
" end = batch*batch_size\n",
" print(f'process data from {start} to {end}.')\n",
" dgtl_trx_feats = create_digital_trx_feats(base_df, start, end)\n",
" dgtl_trx_feats.to_parquet(FOLDER_LOC + '/dgtl_trx_feats_batch'+str(batch)+'.parquet', compression='gzip')\n",
" print(f'batch {batch} done!')\n",
" print('================================================================================================================')\n",
" batch += 1\n",
" \n",
"print('Total Processing Time:', datetime.now() - st_tm)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"from os import listdir\n",
"file_batches = [file for file in listdir(FOLDER_LOC) if file.endswith('.parquet')]"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['dgtl_trx_feats_batch1.parquet',\n",
" 'dgtl_trx_feats_batch2.parquet',\n",
" 'dgtl_trx_feats_batch4.parquet',\n",
" 'dgtl_trx_feats_batch3.parquet']"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"file_batches"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"comb = []\n",
"for part in file_batches:\n",
" raw = pd.read_parquet(FOLDER_LOC+'/'+part)\n",
" comb.append(raw)\n",
" \n",
"dgtl_trx_feats = pd.concat(comb)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 56)"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dgtl_trx_feats.shape"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"dgtl_trx_feats.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Denied Transactions"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"denied_trx_q = f'''\n",
"SELECT base.user_id,\n",
"base.pl_trx_id AS trx_id,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 7 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
" THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_7d,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 15 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
" THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_15d,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 30 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
" THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_1mo,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 60 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
" THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_2mo,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 90 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
" THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_3mo,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 120 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
" THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_4mo,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 180 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
" THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_6mo,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 7 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
" THEN base.amount ELSE 0 END) AS denied_trx_sum_7d,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 15 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
" THEN base.amount ELSE 0 END) AS denied_trx_sum_15d,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 30 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
" THEN base.amount ELSE 0 END) AS denied_trx_sum_1mo,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 60 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
" THEN base.amount ELSE 0 END) AS denied_trx_sum_2mo,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 90 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
" THEN base.amount ELSE 0 END) AS denied_trx_sum_3mo,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 120 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
" THEN base.amount ELSE 0 END) AS denied_trx_sum_4mo,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 180 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
" THEN base.amount ELSE 0 END) AS denied_trx_sum_6mo \n",
"FROM\n",
" (SELECT tr1.user_id,\n",
" tr1.id AS pl_trx_id,\n",
" tr1.pl_transaction_date,\n",
" tr2.id AS transaction_id,\n",
" tr2.transaction_date,\n",
" tr2.amount\n",
" FROM\n",
" (SELECT user_id, \n",
" id, \n",
" transaction_date AS pl_transaction_date\n",
" FROM l2alpha.`transaction` WHERE id IN {tids}\n",
" ) AS tr1\n",
" LEFT JOIN\n",
" (SELECT * \n",
" FROM l2alpha.`transaction`\n",
" WHERE status = 5\n",
" ) AS tr2 \n",
" ON tr1.user_id = tr2.user_id \n",
" AND tr2.transaction_date <= tr1.pl_transaction_date\n",
" ) AS base\n",
"GROUP BY base.user_id, base.pl_transaction_date\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"denied_trx = pd.read_sql(denied_trx_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 16)"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"denied_trx.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Settled Transactions"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"paid_trx_q = f'''\n",
"SELECT base.user_id,\n",
"base.pl_trx_id AS trx_id,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 7 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.transaction_id ELSE NULL END)) AS settled_trx_co_7d,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 15 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.transaction_id ELSE NULL END)) AS settled_trx_co_15d,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 30 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.transaction_id ELSE NULL END)) AS settled_trx_co_1mo,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 60 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.transaction_id ELSE NULL END)) AS settled_trx_co_2mo,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 90 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.transaction_id ELSE NULL END)) AS settled_trx_co_3mo,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 180 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.transaction_id ELSE NULL END)) AS settled_trx_co_6mo,\n",
"COUNT(DISTINCT(base.transaction_id)) AS settled_trx_co_ever,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 7 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.loan_amount ELSE 0 END) AS settled_trx_sum_7d,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 15 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.loan_amount ELSE 0 END) AS settled_trx_sum_15d,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 30 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.loan_amount ELSE 0 END) AS settled_trx_sum_1mo,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 60 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.loan_amount ELSE 0 END) AS settled_trx_sum_2mo,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 90 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.loan_amount ELSE 0 END) AS settled_trx_sum_3mo,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 180 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.loan_amount ELSE 0 END) AS settled_trx_sum_6mo,\n",
"SUM(COALESCE(base.loan_amount,0)) AS settled_trx_sum_ever\n",
"FROM\n",
" (SELECT tr.user_id,\n",
" tr.id AS pl_trx_id,\n",
" tr.pl_transaction_date, \n",
" l.transaction_id,\n",
" l.start_date,\n",
" l.loan_amount,\n",
" l.tenure\n",
"\n",
"\tFROM \n",
"\t (SELECT user_id, \n",
"\t id, \n",
"\t transaction_date AS pl_transaction_date\n",
"\t FROM l2alpha.`transaction` WHERE id IN {tids}\n",
"\t ) AS tr\n",
"\tINNER JOIN \n",
"\t (SELECT user_id,\n",
"\t start_date, \n",
"\t settlement_date,\n",
"\t loan_amount,\n",
"\t tenure,\n",
"\t transaction_id \n",
"\t FROM l2alpha.loan\n",
"\t WHERE is_active = 1 AND loan_amount > 0\n",
"\t ) AS l\n",
"\tON l.user_id = tr.user_id AND settlement_date <= pl_transaction_date\n",
"\t) AS base\n",
"GROUP BY base.user_id, base.pl_transaction_date\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"paid_trx = pd.read_sql(paid_trx_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7282, 16)"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paid_trx.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Combine all dataframes"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:44.586494Z",
"start_time": "2019-11-07T04:02:44.510100Z"
}
},
"outputs": [],
"source": [
"final_df = base_df.merge(snapshot, on=['trx_id', 'user_id'], how='left')\\\n",
" .merge(delin, on=['trx_id', 'user_id'], how='left')\\\n",
" .merge(time_pl, on=['trx_id', 'user_id'], how='left')\\\n",
" .merge(oth_last_trx, on=['trx_id', 'user_id'], how='left')\\\n",
" .merge(oth_first_trx, on=['trx_id', 'user_id'], how='left')\\\n",
" .merge(oth_last_rep, on=['trx_id', 'user_id'], how='left')\\\n",
" .merge(util_non_pl, on=['trx_id', 'user_id'], how='left')\\\n",
" .merge(util_pl, on=['trx_id', 'user_id'], how='left')\\\n",
" .merge(a_score, on='user_id', how='left')\\\n",
" .merge(plo_apps, on='user_id', how='left')\\\n",
" .merge(series_feats, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(prev_obs_feats_all, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(prev_obs_feats_180, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(prev_obs_feats_90, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(early_settle_feats_ever, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(early_settle_feats_180, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(early_settle_feats_90, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(time_from_prev_pl_settle, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(date_feats, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(dgtl_trx_feats, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(denied_trx, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(paid_trx, how='left', on=['user_id', 'trx_id'])"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:44.591815Z",
"start_time": "2019-11-07T04:02:44.587913Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 198)"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 9)"
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"base_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:44.617944Z",
"start_time": "2019-11-07T04:02:44.593760Z"
}
},
"outputs": [],
"source": [
"final_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:44.798454Z",
"start_time": "2019-11-07T04:02:44.619626Z"
}
},
"outputs": [],
"source": [
"final_df.to_parquet('data/pl_rejected_users_feats_04022020.parquet', compression='gzip')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}