{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:36.988168Z",
"start_time": "2019-11-07T03:55:36.706164Z"
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import pymysql\n",
"from datetime import datetime"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:36.994804Z",
"start_time": "2019-11-07T03:55:36.989531Z"
}
},
"outputs": [],
"source": [
"import configparser\n",
"config = configparser.ConfigParser()\n",
"config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
"\n",
"host = config['MYSQL-ROOT']['HOST']\n",
"user = config['MYSQL-ROOT']['USER']\n",
"password = config['MYSQL-ROOT']['PASSWORD']\n",
"\n",
"def connect_sql():\n",
" cnx = pymysql.connect(host=host,\n",
" user=user,\n",
" password=password,\n",
" cursorclass=pymysql.cursors.DictCursor)\n",
" return cnx"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:37.026188Z",
"start_time": "2019-11-07T03:55:36.997299Z"
}
},
"outputs": [],
"source": [
"base_df = pd.read_parquet('data/pl_threshold_jan_11022020.parquet')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:37.034477Z",
"start_time": "2019-11-07T03:55:37.027603Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 10)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"base_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:37.043587Z",
"start_time": "2019-11-07T03:55:37.036542Z"
}
},
"outputs": [],
"source": [
"uids = tuple(base_df['user_id'].unique().tolist())\n",
"tids = tuple(base_df['trx_id'].unique().tolist())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"78465"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(uids)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"79853"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(tids)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Number of Previous PL Transactions"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def get_pl_history(base_df):\n",
" uids = tuple(base_df['user_id'])\n",
" max_trx_date = base_df['transaction_date'].max()\n",
"\n",
" query = f\"\"\"\n",
" SELECT user_id,\n",
" id AS trx_id,\n",
" status,\n",
" payment_type,\n",
" amount,\n",
" transaction_date\n",
" FROM l2alpha.transaction\n",
" WHERE user_id IN {uids}\n",
" AND transaction_type = 2\n",
" AND transaction_date <= '{max_trx_date}'\n",
" \"\"\"\n",
"\n",
" df = pd.read_sql(query, connect_sql())\n",
" df['gen_payment_type'] = df['payment_type'].apply(lambda x: 'mini' if x=='30_days' else 'jumbo')\n",
" \n",
" out_df = base_df[['user_id', 'trx_id', 'transaction_date']].merge(df, how='left', on='user_id')\\\n",
" .rename(columns={'transaction_date_x':'trx_dt_current',\n",
" 'transaction_date_y':'trx_dt_past',\n",
" 'trx_id_x':'trx_id',\n",
" 'trx_id_y':'trx_id_ref'})\n",
" out_df['time_diff'] = (out_df['trx_dt_current'] - out_df['trx_dt_past']).dt.total_seconds()\n",
" return out_df"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def create_prev_pl_feats(df, base_df, window='all'):\n",
" if isinstance(window, int):\n",
" df = df[(df['time_diff'] <= window*(24*60*60)) & (df['time_diff'] > 0)]\n",
" name = str(window)\n",
" elif window == 'all':\n",
" df = df[df['time_diff'] > 0]\n",
" name = 'ever'\n",
" \n",
" prev_df = df.groupby(['user_id', 'trx_id', 'status', 'gen_payment_type']).size()\\\n",
" .unstack(level=[3,2]).fillna(0)\n",
" prev_df.columns = [f'pl_trx_co_{col[0]}_{col[1]}_{name}' for col in prev_df.columns.values]\n",
" \n",
" keywords = [2, 4, 5, 6, 7, 11, 'jumbo', 'mini']\n",
" stat_cols_dict = {}\n",
" for word in keywords:\n",
" str_word = str(word) if isinstance(word, int) else word\n",
" stat_cols_dict[word] = [col for col in prev_df if str_word in col]\n",
" \n",
" name_status = 'pl_trx_co_jumbomini_{}_{}'\n",
" name_pl_type = 'pl_trx_co_{}_all_{}'\n",
" for k, v in stat_cols_dict.items():\n",
" if isinstance(k, int):\n",
" prev_df[name_status.format(str(k), name)] = prev_df[v].sum(axis=1)\n",
" elif isinstance(k, str):\n",
" prev_df[name_pl_type.format(k, name)] = prev_df[v].sum(axis=1)\n",
" \n",
" prev_df[f'pl_trx_co_jumbomini_all_{name}'] = prev_df[f'pl_trx_co_jumbo_all_{name}'] + \\\n",
" prev_df[f'pl_trx_co_mini_all_{name}']\n",
" \n",
" prev_df = base_df[['user_id', 'trx_id']].merge(prev_df, how='left', on=['user_id', 'trx_id']).fillna(0)\n",
" \n",
" return prev_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pl_hist = get_pl_history(base_df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prev_obs_feats_90 = create_prev_pl_feats(pl_hist, base_df, window=90)\n",
"prev_obs_feats_90 = prev_obs_feats_90.rename(columns={'pl_trx_co_jumbomini_4_90':'pl_trx_suc_co_90'})\n",
"prev_obs_feats_90 = prev_obs_feats_90[['user_id', 'trx_id', 'pl_trx_suc_co_90']]"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 3)"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"prev_obs_feats_90.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Early Settlement"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_trx_settle_data(base_df):\n",
" uids = tuple(base_df['user_id'])\n",
" max_trx_date = base_df['transaction_date'].max()\n",
"\n",
" early_settle_q = f'''\n",
" SELECT *,\n",
" DATE(transaction_date) + INTERVAL payment_type_int DAY AS due_date\n",
" FROM\n",
" (SELECT tr.user_id, tr.trx_id, transaction_date, payment_type, transaction_type, status,\n",
" CASE WHEN payment_type = '30_days' THEN '30'\n",
" WHEN payment_type = '3_months' THEN '90'\n",
" WHEN payment_type = '6_months' THEN '180' \n",
" WHEN payment_type = '12_months' THEN '360' ELSE NULL END AS payment_type_int,\n",
"\n",
" settlement_date, loan_amount, is_active\n",
" FROM\n",
" (SELECT user_id,\n",
" id AS trx_id,\n",
" transaction_date,\n",
" payment_type,\n",
" transaction_type,\n",
" status\n",
" FROM l2alpha.`transaction`\n",
" WHERE user_id IN {uids}\n",
" AND transaction_type = 2\n",
" AND status = 4\n",
" AND transaction_date <= '{max_trx_date}'\n",
" ) AS tr\n",
"\n",
" LEFT JOIN\n",
"\n",
" (SELECT transaction_id AS trx_id,\n",
" settlement_date,\n",
" loan_amount,\n",
" is_active\n",
" FROM l2alpha.loan\n",
" ) AS lo\n",
"\n",
" ON tr.trx_id = lo.trx_id\n",
" ) AS base\n",
" WHERE loan_amount > 0 AND is_active = 1\n",
" '''\n",
" query_df = pd.read_sql(early_settle_q, connect_sql())\n",
" query_df['settle_to_due'] = (query_df['due_date'] - query_df['settlement_date'].dt.date).dt.days\n",
" df = base_df[['user_id', 'trx_id', 'transaction_date']].merge(query_df, how='left', on='user_id')\\\n",
" .rename(columns={'transaction_date_x':'trx_dt_current',\n",
" 'transaction_date_y':'trx_dt_past',\n",
" 'trx_id_x':'trx_id',\n",
" 'trx_id_y':'trx_id_ref'})\n",
" df['time_diff'] = (df['trx_dt_current'] - df['trx_dt_past']).dt.total_seconds()\n",
" df = df.sort_values(['user_id', 'trx_dt_past']).reset_index(drop=True)\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def create_early_settle_feats(df, base_df, window='all'):\n",
" if isinstance(window, int):\n",
" df = df[(df['time_diff'] <= window*(24*60*60)) & (df['time_diff'] > 0)]\n",
" name = str(window)\n",
" elif window == 'all':\n",
" df = df[df['time_diff'] > 0]\n",
" name = 'ever'\n",
" df = df.sort_values(['user_id', 'trx_id', 'trx_dt_past'])\n",
" \n",
" # Calculate how many times the user has settled early in the last XX days\n",
" co_df = df.copy()\n",
" co_df['flag_early_settle'] = np.where(co_df['settle_to_due'] >= 30, 1, 0)\n",
" co_df['flag_early_settle'] = np.where(co_df['settle_to_due'].isna(), np.nan, co_df['flag_early_settle'])\n",
" co_df = co_df.groupby('trx_id').agg({'flag_early_settle':'sum'}).reset_index()\n",
" co_df.columns = ['trx_id', f'early_settle_co_{name}']\n",
" \n",
" # Calculate the min, max, average, and median days the user has settled early in the last XX days\n",
" # Also get the last time the user settled early (how many days early).\n",
" es_df = df.groupby('trx_id').agg({'settle_to_due':[max, min, 'mean', 'median', 'last']})\n",
" col_names = ['settle_to_due_max_{}', 'settle_to_due_min_{}', 'settle_to_due_avg_{}', \n",
" 'settle_to_due_med_{}', 'settle_to_due_last_pl_{}']\n",
" es_df.columns = [col.format(name) for col in col_names]\n",
" es_df = es_df.reset_index()\n",
" out_df = base_df[['user_id', 'trx_id']].merge(es_df, how='left', on='trx_id')\\\n",
" .merge(co_df, how='left', on='trx_id')\n",
" return out_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"trx_settle = get_trx_settle_data(base_df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"early_settle_feats_180 = create_early_settle_feats(trx_settle, base_df, window=180)\n",
"early_settle_feats_180 = early_settle_feats_180.rename(columns={'settle_to_due_last_pl_180':'pl_settle_to_due_last_180'})\n",
"early_settle_feats_180 = early_settle_feats_180[['user_id', 'trx_id', 'pl_settle_to_due_last_180']]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 3)"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(early_settle_feats_180.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Time from last PL Settlement"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def create_time_from_last_pl_feat(df):\n",
" time_from_last_pl = df[df['time_diff'] >= 0]\n",
" time_from_last_pl.loc[(time_from_last_pl['time_diff'] == 0) & \n",
" (time_from_last_pl['settlement_date'].notna()), 'settlement_date'] = np.nan\n",
" time_from_last_pl = time_from_last_pl.sort_values(['user_id', 'trx_id', 'trx_dt_past'])\n",
"\n",
" time_from_last_pl_grp = time_from_last_pl.groupby('trx_id').tail(2)\n",
" time_from_last_pl_grp['settle_shift'] = time_from_last_pl_grp['settlement_date'].shift()\n",
" time_from_last_pl_grp['time_from_prev_pl_settle'] = (time_from_last_pl_grp['trx_dt_current'] - \\\n",
" time_from_last_pl_grp['settle_shift']).dt.total_seconds()/3600\n",
" \n",
" time_from_last_pl_final = time_from_last_pl_grp.groupby('trx_id').tail(1)\n",
" time_from_last_pl_final = time_from_last_pl_final[['user_id', 'trx_id', \n",
" 'time_from_prev_pl_settle']].reset_index(drop=True)\n",
" return time_from_last_pl_final"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"time_from_prev_pl_settle = create_time_from_last_pl_feat(trx_settle)\n",
"time_from_prev_pl_settle = time_from_prev_pl_settle.rename(columns={'time_from_prev_pl_settle':'time_from_last_sett_pl_hour'})"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(75884, 3)"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(time_from_prev_pl_settle.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Date and day of week of transaction date"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"date_feats = base_df[['user_id', 'trx_id', 'transaction_date']].copy()\n",
"date_feats['date_of_month'] = date_feats['transaction_date'].dt.day\n",
"date_feats = date_feats[['user_id', 'trx_id', 'date_of_month']]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 3)"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"date_feats.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Denied Transactions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"denied_trx_q = f'''\n",
"SELECT base.user_id,\n",
"base.pl_trx_id AS trx_id,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 90 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
" THEN base.transaction_id ELSE NULL END)) AS trx_denied_co_90 \n",
"FROM\n",
" (SELECT tr1.user_id,\n",
" tr1.id AS pl_trx_id,\n",
" tr1.pl_transaction_date,\n",
" tr2.id AS transaction_id,\n",
" tr2.transaction_date,\n",
" tr2.amount\n",
" FROM\n",
" (SELECT user_id, \n",
" id, \n",
" transaction_date AS pl_transaction_date\n",
" FROM l2alpha.`transaction` WHERE id IN {tids}\n",
" ) AS tr1\n",
" LEFT JOIN\n",
" (SELECT * \n",
" FROM l2alpha.`transaction`\n",
" WHERE status = 5\n",
" ) AS tr2 \n",
" ON tr1.user_id = tr2.user_id \n",
" AND tr2.transaction_date <= tr1.pl_transaction_date\n",
" ) AS base\n",
"GROUP BY base.user_id, base.pl_transaction_date\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"denied_trx = pd.read_sql(denied_trx_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 3)"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"denied_trx.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"denied_trx.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Settled Transactions"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"paid_trx_q = f'''\n",
"SELECT base.user_id,\n",
"base.pl_trx_id AS trx_id,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 30 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.loan_amount ELSE 0 END) AS trx_sett_sum_30\n",
"FROM\n",
" (SELECT tr.user_id,\n",
" tr.id AS pl_trx_id,\n",
" tr.pl_transaction_date, \n",
" l.transaction_id,\n",
" l.start_date,\n",
" l.loan_amount,\n",
" l.tenure\n",
"\n",
"\tFROM \n",
"\t (SELECT user_id, \n",
"\t id, \n",
"\t transaction_date AS pl_transaction_date\n",
"\t FROM l2alpha.`transaction` WHERE id IN {tids}\n",
"\t ) AS tr\n",
"\tINNER JOIN \n",
"\t (SELECT user_id,\n",
"\t start_date, \n",
"\t settlement_date,\n",
"\t loan_amount,\n",
"\t tenure,\n",
"\t transaction_id \n",
"\t FROM l2alpha.loan\n",
"\t WHERE is_active = 1 AND loan_amount > 0\n",
"\t ) AS l\n",
"\tON l.user_id = tr.user_id AND settlement_date <= pl_transaction_date\n",
"\t) AS base\n",
"GROUP BY base.user_id, base.pl_transaction_date\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"paid_trx = pd.read_sql(paid_trx_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(64718, 3)"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paid_trx.shape"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"paid_trx.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Combine all dataframes"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:44.586494Z",
"start_time": "2019-11-07T04:02:44.510100Z"
}
},
"outputs": [],
"source": [
"final_df = base_df[['trx_id', 'user_id']]\\\n",
" .merge(prev_obs_feats_90, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(early_settle_feats_180, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(time_from_prev_pl_settle, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(date_feats, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(denied_trx, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(paid_trx, how='left', on=['user_id', 'trx_id'])"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:44.591815Z",
"start_time": "2019-11-07T04:02:44.587913Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 8)"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 10)"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"base_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:44.617944Z",
"start_time": "2019-11-07T04:02:44.593760Z"
}
},
"outputs": [],
"source": [
"final_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"trx_id 0.000000\n",
"user_id 0.000000\n",
"pl_trx_suc_co_90 0.000000\n",
"pl_settle_to_due_last_180 0.443352\n",
"time_from_last_sett_pl_hour 0.351145\n",
"date_of_month 0.000000\n",
"trx_denied_co_90 0.000000\n",
"trx_sett_sum_30 0.189536\n",
"dtype: float64"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_df.isna().mean()"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:44.798454Z",
"start_time": "2019-11-07T04:02:44.619626Z"
}
},
"outputs": [],
"source": [
"final_df.to_parquet('data/pl_threshold_feats_11022020_part2.parquet', compression='gzip')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}