{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:36.988168Z",
"start_time": "2019-11-07T03:55:36.706164Z"
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import pymysql\n",
"from datetime import datetime"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:36.994804Z",
"start_time": "2019-11-07T03:55:36.989531Z"
}
},
"outputs": [],
"source": [
"import configparser\n",
"config = configparser.ConfigParser()\n",
"config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
"\n",
"host = config['MYSQL-ROOT']['HOST']\n",
"user = config['MYSQL-ROOT']['USER']\n",
"password = config['MYSQL-ROOT']['PASSWORD']\n",
"\n",
"def connect_sql():\n",
" cnx = pymysql.connect(host=host,\n",
" user=user,\n",
" password=password,\n",
" cursorclass=pymysql.cursors.DictCursor)\n",
" return cnx"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:37.026188Z",
"start_time": "2019-11-07T03:55:36.997299Z"
}
},
"outputs": [],
"source": [
"base_df = pd.read_parquet('data/pl_threshold_jan_11022020.parquet')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:37.034477Z",
"start_time": "2019-11-07T03:55:37.027603Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 10)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"base_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:37.043587Z",
"start_time": "2019-11-07T03:55:37.036542Z"
}
},
"outputs": [],
"source": [
"uids = tuple(base_df['user_id'].unique().tolist())\n",
"tids = tuple(base_df['trx_id'].unique().tolist())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"78465"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(uids)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"79853"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(tids)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Snapshot"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:37.078945Z",
"start_time": "2019-11-07T03:55:37.074323Z"
}
},
"outputs": [],
"source": [
"snapshot_q = f\"\"\"\n",
"SELECT \n",
" base.user_id, \n",
" base.id AS trx_id,\n",
" current_dpd\n",
"FROM \n",
"\t\t(SELECT user_id,id,transaction_date \n",
"\t\tFROM l2alpha.`transaction` WHERE id IN {tids}) AS base\n",
"\tLEFT JOIN \n",
"\t\tds.user_level_archive AS ula \n",
"\tON base.user_id = ula.user_id AND DATE(transaction_date) = snapshot_date \n",
"\tLEFT JOIN\n",
"\t\t(SELECT user_id,monthly_installment, transaction_id\n",
"\t\tFROM l2alpha.loan \n",
"\t\tWHERE transaction_id IN {tids}) AS lo \n",
"\tON lo.transaction_id = base.id\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:08.943657Z",
"start_time": "2019-11-07T03:55:37.081391Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ec2-user/anaconda3/envs/env_personalloan/lib/python3.7/site-packages/pymysql/cursors.py:329: Warning: (3170, \"Memory capacity of 8388608 bytes for 'range_optimizer_max_mem_size' exceeded. Range optimization was not done for this query.\")\n",
" self._do_get_result()\n"
]
}
],
"source": [
"snapshot = pd.read_sql(snapshot_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:08.949079Z",
"start_time": "2019-11-07T03:58:08.945547Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 3)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"snapshot.shape"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:08.959459Z",
"start_time": "2019-11-07T03:58:08.950846Z"
}
},
"outputs": [],
"source": [
"snapshot.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Delinquency"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:08.985180Z",
"start_time": "2019-11-07T03:58:08.960895Z"
}
},
"outputs": [],
"source": [
"delin_q = f'''\n",
"SELECT \n",
" base.user_id,\n",
" base.id AS trx_id,\n",
" MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 90 AND\n",
" datediff(DATE(transaction_date),snapshot_date) >=0 THEN current_dpd ELSE NULL END) AS delin_max_dpd_90\n",
"FROM\n",
"\t\t(SELECT user_id,id,transaction_date \n",
"\t\tFROM l2alpha.`transaction` \n",
"\t\tWHERE id IN {tids}) AS base\n",
"\tLEFT JOIN \n",
"\t (SELECT user_id,\n",
"\t \tos_amount,\n",
"\t \tcurrent_dpd,\n",
"\t \tdpd_amount,\n",
"\t \tsnapshot_date\n",
"\t FROM ds.user_level_archive\n",
"\t ) AS delin ON base.user_id = delin.user_id AND snapshot_date <= DATE(transaction_date)\n",
"GROUP BY base.id\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:37.439306Z",
"start_time": "2019-11-07T03:58:08.992173Z"
}
},
"outputs": [],
"source": [
"delin = pd.read_sql(delin_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:37.443941Z",
"start_time": "2019-11-07T03:58:37.440532Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 3)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"delin.shape"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:37.458294Z",
"start_time": "2019-11-07T03:58:37.445289Z"
}
},
"outputs": [],
"source": [
"delin.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Other"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## time_approve_to_pl_hour"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:37.472597Z",
"start_time": "2019-11-07T03:58:37.459708Z"
}
},
"outputs": [],
"source": [
"time_pl_q = f'''\n",
"SELECT \n",
" base.user_id,\n",
" base.id AS trx_id,\n",
" TIMESTAMPDIFF(HOUR, installment_ts, transaction_date) AS time_appr_to_pl_hour\n",
"FROM\n",
"\t\t(SELECT user_id,id,transaction_date \n",
"\t\tFROM l2alpha.`transaction` WHERE id IN {tids}) AS base\n",
"\tLEFT JOIN \n",
"\t (SELECT user_id, MAX(is_approved_timestamp) AS installment_ts \n",
"\t FROM l2alpha.web_userinstallmentapplicationdetails \n",
"\t WHERE user_id IN {uids}\n",
"\t GROUP BY user_id\n",
"\t ) AS wuiad \n",
"\tON base.user_id = wuiad.user_id\n",
"GROUP BY base.id\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:40.087947Z",
"start_time": "2019-11-07T03:58:37.473685Z"
}
},
"outputs": [],
"source": [
"time_pl = pd.read_sql(time_pl_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:40.092390Z",
"start_time": "2019-11-07T03:58:40.089336Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 3)"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"time_pl.shape"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:58:40.104548Z",
"start_time": "2019-11-07T03:58:40.093767Z"
}
},
"outputs": [],
"source": [
"time_pl.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## oth_last_rep_xxx:\n",
"- dpd\n",
"- days"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:11.673428Z",
"start_time": "2019-11-07T04:00:11.659983Z"
}
},
"outputs": [],
"source": [
"oth_last_rep_q = f'''\n",
"SELECT main.user_id,\n",
"\t main.id AS trx_id,\n",
" MAX(dpd) AS oth_last_rep_dpd,\n",
" datediff(main.transaction_date,main.last_pd) AS oth_last_rep_days\n",
"FROM\n",
"\t(SELECT base.user_id,\n",
"\t\t\tbase.id,\n",
"\t \tbase.transaction_date,\n",
"\t \tMAX(up.pd) AS last_pd\n",
"\tFROM\n",
"\t\t(SELECT user_id,\n",
" id,\n",
" transaction_date,\n",
" status\n",
"\t\tFROM l2alpha.`transaction`\n",
"\t\tWHERE id IN {tids}\n",
" ) AS base\n",
"\tLEFT JOIN \n",
"\t (SELECT user_id,\n",
"\t \tid,\n",
"\t \tpayment_type,\n",
"\t \tpayment_channel,\n",
"\t \tpayment_date AS pd,\n",
"\t \tdpd,\n",
"\t \tstatus\n",
"\t FROM l2alpha.user_payment \n",
"\t WHERE status = 2 AND payment_type = 'INSTALLMENT' AND payment_channel <> 'CANCELLATION'\n",
"\t ) AS up \n",
"\tON base.user_id = up.user_id AND pd < transaction_date\n",
"\tGROUP BY base.id\n",
"\t) AS main\n",
"LEFT JOIN \n",
"\t(SELECT user_id, \n",
"\t\t\tid, \n",
"\t\t\tdpd, \n",
"\t\t\tpayment_date,\n",
" payment_channel,\n",
" amount\n",
"\tFROM l2alpha.user_payment) AS up2\n",
"ON up2.payment_date = main.last_pd AND main.user_id = up2.user_id\n",
"GROUP BY main.id\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:45.110051Z",
"start_time": "2019-11-07T04:00:11.674575Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"oth_last_rep = pd.read_sql(oth_last_rep_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:45.115239Z",
"start_time": "2019-11-07T04:00:45.111412Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 4)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"oth_last_rep.shape"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"oth_last_rep.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 4. Utilization"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## util_non_pl"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:45.181482Z",
"start_time": "2019-11-07T04:00:45.178221Z"
}
},
"outputs": [],
"source": [
"util_non_pl_q = f'''\n",
"SELECT main.user_id,\n",
"\t main.trx_id,\n",
"\t COALESCE(SUM(loan_amount/transaction_limit), 0) AS util_non_pl\n",
"FROM\n",
"\t(SELECT user_id, \n",
"\t\t trx_id, \n",
"\t\t loan_id,\n",
"\t\t loan_amount,\n",
"\t\t tenure,\n",
"\t\t MAX(creation_time) AS latest_limit\n",
"\tFROM\t\n",
"\t\t(SELECT tr.user_id,\n",
"\t\t tr.id AS trx_id,\n",
"\t\t lo.loan_id,\n",
"\t\t loan_amount,\n",
"\t\t creation_time,\n",
"\t\t tenure,\n",
"\t\t COALESCE(TIMESTAMPDIFF(DAY, creation_time, transaction_date), 0) AS time_diff\n",
"\t\tFROM \n",
"\t\t\t\t(SELECT user_id,\n",
"\t\t\t\t\t\ttransaction_date,\n",
"\t\t\t\t \tid\n",
"\t\t\t\tFROM l2alpha.`transaction` \n",
"\t\t\t\tWHERE id IN {tids}) AS tr\n",
"\t\t\tLEFT JOIN \n",
"\t\t\t\t(SELECT id AS loan_id,\n",
"\t\t\t\t\t\tuser_id,\n",
"\t\t\t\t \tstart_date,\n",
"\t\t\t\t \tsettlement_date,\n",
"\t\t\t\t \tloan_amount,\n",
"\t\t\t\t \ttenure,\n",
"\t\t\t\t \ttransaction_id \n",
"\t\t\t\tFROM l2alpha.loan\n",
"\t\t\t\tWHERE is_active = 1 AND loan_amount > 0) AS lo\n",
"\t\t\tON lo.user_id = tr.user_id AND start_date < transaction_date AND (settlement_date IS NULL OR settlement_date > transaction_date)\n",
"\t\t\t\n",
"\t\t\tLEFT JOIN \n",
"\t\t\t\t(SELECT user_id,\n",
"\t\t\t\t\t\tCASE WHEN installment_type = '30_days' THEN 30\n",
"\t\t\t\t\t\tWHEN installment_type = '3_months' THEN 90 \n",
"\t\t\t\t\t\tWHEN installment_type = '6_months' THEN 180 \n",
"\t\t\t\t\t\tWHEN installment_type = '12_months' THEN 360 \n",
"\t\t\t\t\t\tELSE NULL END AS installment_type,\n",
"\t\t\t\t\t\tcreation_time\n",
"\t\t\t\tFROM l2alpha.transaction_limit_history\n",
"\t\t\t\t) AS tlh\n",
"\t\t\tON tlh.user_id = lo.user_id AND tlh.installment_type = lo.tenure\n",
"\t\t) AS base\n",
"\tWHERE time_diff >= 0\n",
"\tGROUP BY trx_id, loan_id\n",
"\t) AS main\n",
"\t\n",
"\tLEFT JOIN \n",
"\t(SELECT user_id,\n",
"\t\t\ttransaction_limit,\n",
"\t\t\tCASE WHEN installment_type = '30_days' THEN 30\n",
"\t\t\tWHEN installment_type = '3_months' THEN 90 \n",
"\t\t\tWHEN installment_type = '6_months' THEN 180 \n",
"\t\t\tWHEN installment_type = '12_months' THEN 360 \n",
"\t\t\tELSE NULL END AS installment_type,\n",
"\t\t\tcreation_time\n",
"\tFROM l2alpha.transaction_limit_history\n",
"\tWHERE user_id IN {uids}\n",
"\tGROUP BY user_id, transaction_limit, installment_type, creation_time\n",
"\t) AS lim\n",
"\tON lim.creation_time = main.latest_limit AND lim.user_id = main.user_id\n",
"\tAND lim.installment_type = main.tenure\n",
"GROUP BY main.trx_id\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:51.464862Z",
"start_time": "2019-11-07T04:00:45.182833Z"
}
},
"outputs": [],
"source": [
"util_non_pl = pd.read_sql(util_non_pl_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:51.469712Z",
"start_time": "2019-11-07T04:00:51.466148Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 3)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"util_non_pl.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:51.515275Z",
"start_time": "2019-11-07T04:00:51.471223Z"
}
},
"outputs": [],
"source": [
"util_non_pl.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## util_pl"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:51.534178Z",
"start_time": "2019-11-07T04:00:51.523536Z"
}
},
"outputs": [],
"source": [
"util_pl_q = f'''\n",
"SELECT main.user_id, \n",
"\t trx_id,\n",
"\t COALESCE((amount/transaction_limit), 0) AS util_pl\n",
"FROM\n",
"\t(SELECT user_id, trx_id, amount, payment_type, transaction_date,\n",
"\t\t MAX(creation_time) AS latest_limit\n",
"\tFROM\n",
"\t\t(SELECT tr.user_id, tr.id AS trx_id, tr.amount, tr.payment_type, tr.transaction_date,\n",
"\t\t\t\ttlh.creation_time,\n",
"\t\t\t TIMESTAMPDIFF(DAY, creation_time, transaction_date) AS time_diff\n",
"\t\tFROM\n",
"\t\t\t(SELECT *\n",
"\t\t\tFROM l2alpha.`transaction` \n",
"\t\t\tWHERE id IN {tids}\n",
"\t\t\t) AS tr\n",
"\t\t\tLEFT JOIN l2alpha.transaction_limit_history AS tlh\n",
"\t\t\tON tr.user_id = tlh.user_id AND tr.payment_type = tlh.installment_type\n",
"\t\t) AS base\n",
"\tWHERE time_diff >= 0\n",
"\tGROUP BY base.trx_id\n",
"\t) AS main\n",
"\t\n",
"\tLEFT JOIN \n",
"\t(SELECT user_id,\n",
"\t\t\ttransaction_limit,\n",
"\t\t\tinstallment_type,\n",
"\t\t\tcreation_time\n",
"\tFROM l2alpha.transaction_limit_history\n",
"\tWHERE user_id IN {uids}\n",
"\tGROUP BY user_id, transaction_limit, installment_type, creation_time\n",
"\t) AS lim\n",
"\tON lim.creation_time = main.latest_limit AND lim.user_id = main.user_id \n",
" AND lim.installment_type = main.payment_type\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:54.715476Z",
"start_time": "2019-11-07T04:00:51.536994Z"
}
},
"outputs": [],
"source": [
"util_pl = pd.read_sql(util_pl_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:54.720364Z",
"start_time": "2019-11-07T04:00:54.716934Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 3)"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"util_pl.shape"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:00:54.733988Z",
"start_time": "2019-11-07T04:00:54.722455Z"
}
},
"outputs": [],
"source": [
"util_pl.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 6. Installed Payday Loan App Count"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:01:06.262833Z",
"start_time": "2019-11-07T04:01:06.257660Z"
}
},
"outputs": [],
"source": [
"ins_app_q = f'''\n",
"SELECT user_id,\n",
" installed_applications\n",
"FROM l2alpha.web_userdevicedetail\n",
"WHERE user_id IN {uids}\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:40.041309Z",
"start_time": "2019-11-07T04:01:06.264471Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"ins_app = pd.read_sql(ins_app_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:40.048118Z",
"start_time": "2019-11-07T04:02:40.042945Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(78465, 2)"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ins_app.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plo_app_list = pd.read_csv('data/payday_700_list.csv', header=None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:40.084424Z",
"start_time": "2019-11-07T04:02:40.066336Z"
}
},
"outputs": [],
"source": [
"payday_loan_apps = plo_app_list[0].unique().tolist()\n",
"payday_loan_apps = [x.lower() for x in payday_loan_apps]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:40.109404Z",
"start_time": "2019-11-07T04:02:40.097156Z"
}
},
"outputs": [],
"source": [
"def create_feats_apps(raw_df,label='installed_applications'):\n",
"\n",
" df = raw_df.copy()\n",
" df[label] = df[label].fillna('[]')\n",
" df[label] = df[label].apply(lambda x: eval(x))\n",
"\n",
" def count_apps(x,list_apps):\n",
" return len([ft for ft in x if ft.lower() in list_apps])\n",
"\n",
" df['ap_co_plo'] = df[label].apply(lambda x:count_apps(x,payday_loan_apps))\n",
" \n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:44.459722Z",
"start_time": "2019-11-07T04:02:40.111155Z"
}
},
"outputs": [],
"source": [
"plo_apps = create_feats_apps(ins_app)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:44.507926Z",
"start_time": "2019-11-07T04:02:44.461220Z"
}
},
"outputs": [],
"source": [
"plo_apps = plo_apps.drop(columns=['installed_applications'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Combine all dataframes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:44.586494Z",
"start_time": "2019-11-07T04:02:44.510100Z"
}
},
"outputs": [],
"source": [
"final_df = base_df[['trx_id', 'user_id']]\\\n",
" .merge(snapshot, on=['trx_id', 'user_id'], how='left')\\\n",
" .merge(delin, on=['trx_id', 'user_id'], how='left')\\\n",
" .merge(time_pl, on=['trx_id', 'user_id'], how='left')\\\n",
" .merge(oth_last_rep, on=['trx_id', 'user_id'], how='left')\\\n",
" .merge(util_non_pl, on=['trx_id', 'user_id'], how='left')\\\n",
" .merge(util_pl, on=['trx_id', 'user_id'], how='left')\\\n",
" .merge(plo_apps, on='user_id', how='left')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:44.591815Z",
"start_time": "2019-11-07T04:02:44.587913Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 10)"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 10)"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"base_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:44.617944Z",
"start_time": "2019-11-07T04:02:44.593760Z"
}
},
"outputs": [],
"source": [
"final_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:44.798454Z",
"start_time": "2019-11-07T04:02:44.619626Z"
}
},
"outputs": [],
"source": [
"final_df.to_parquet('data/pl_threshold_feats_11022020_part1.parquet', compression='gzip')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}