{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import pymysql\n",
"import re\n",
"from datetime import datetime\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import warnings\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import configparser\n",
"config = configparser.ConfigParser()\n",
"config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
"\n",
"host = config['MYSQL-ROOT']['HOST']\n",
"user = config['MYSQL-ROOT']['USER']\n",
"password = config['MYSQL-ROOT']['PASSWORD']\n",
"\n",
"def connect_sql():\n",
" cnx = pymysql.connect(host=host,\n",
" user=user,\n",
" password=password,\n",
" cursorclass=pymysql.cursors.DictCursor)\n",
" return cnx"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"raw_df = pd.read_parquet('data/interim/pl_jumbo_mini_junjulaugsep_27012020.parquet')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(150672, 17)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"135508"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_df['user_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"150672"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_df['trx_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timestamp('2019-09-28 23:49:58')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_df['transaction_date'].max()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"tids = tuple(raw_df['trx_id'].unique())\n",
"uids = tuple(raw_df['user_id'].unique())"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"150672"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(tids)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"135508"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(uids)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Denied Transactions"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"denied_trx_q = f'''\n",
"SELECT base.user_id,\n",
"base.pl_trx_id AS trx_id,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 7 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
" THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_7d,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 15 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
" THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_15d,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 30 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
" THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_1mo,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 60 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
" THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_2mo,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 90 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
" THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_3mo,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 120 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
" THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_4mo,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 180 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
" THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_6mo,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 7 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
" THEN base.amount ELSE 0 END) AS denied_trx_sum_7d,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 15 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
" THEN base.amount ELSE 0 END) AS denied_trx_sum_15d,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 30 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
" THEN base.amount ELSE 0 END) AS denied_trx_sum_1mo,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 60 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
" THEN base.amount ELSE 0 END) AS denied_trx_sum_2mo,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 90 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
" THEN base.amount ELSE 0 END) AS denied_trx_sum_3mo,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 120 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
" THEN base.amount ELSE 0 END) AS denied_trx_sum_4mo,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 180 AND \n",
" datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
" THEN base.amount ELSE 0 END) AS denied_trx_sum_6mo \n",
"FROM\n",
" (SELECT tr1.user_id,\n",
" tr1.id AS pl_trx_id,\n",
" tr1.pl_transaction_date,\n",
" tr2.id AS transaction_id,\n",
" tr2.transaction_date,\n",
" tr2.amount\n",
" FROM\n",
" (SELECT user_id, \n",
" id, \n",
" transaction_date AS pl_transaction_date\n",
" FROM l2alpha.`transaction` WHERE id IN {tids}\n",
" ) AS tr1\n",
" LEFT JOIN\n",
" (SELECT * \n",
" FROM l2alpha.`transaction`\n",
" WHERE status = 5\n",
" ) AS tr2 \n",
" ON tr1.user_id = tr2.user_id \n",
" AND tr2.transaction_date <= tr1.pl_transaction_date\n",
" ) AS base\n",
"GROUP BY base.user_id, base.pl_transaction_date\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"denied_trx = pd.read_sql(denied_trx_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(150672, 16)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"denied_trx.shape"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"135508"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"denied_trx['user_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"150672"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"denied_trx['trx_id'].nunique()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Settled Transactions"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"paid_trx_q = f'''\n",
"SELECT base.user_id,\n",
"base.pl_trx_id AS trx_id,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 7 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.transaction_id ELSE NULL END)) AS settled_trx_co_7d,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 15 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.transaction_id ELSE NULL END)) AS settled_trx_co_15d,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 30 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.transaction_id ELSE NULL END)) AS settled_trx_co_1mo,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 60 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.transaction_id ELSE NULL END)) AS settled_trx_co_2mo,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 90 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.transaction_id ELSE NULL END)) AS settled_trx_co_3mo,\n",
"COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 180 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.transaction_id ELSE NULL END)) AS settled_trx_co_6mo,\n",
"COUNT(DISTINCT(base.transaction_id)) AS settled_trx_co_ever,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 7 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.loan_amount ELSE 0 END) AS settled_trx_sum_7d,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 15 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.loan_amount ELSE 0 END) AS settled_trx_sum_15d,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 30 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.loan_amount ELSE 0 END) AS settled_trx_sum_1mo,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 60 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.loan_amount ELSE 0 END) AS settled_trx_sum_2mo,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 90 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.loan_amount ELSE 0 END) AS settled_trx_sum_3mo,\n",
"SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 180 AND \n",
" datediff(DATE(pl_transaction_date),start_date) >=0 \n",
" THEN base.loan_amount ELSE 0 END) AS settled_trx_sum_6mo,\n",
"SUM(COALESCE(base.loan_amount,0)) AS settled_trx_sum_ever\n",
"FROM\n",
" (SELECT tr.user_id,\n",
" tr.id AS pl_trx_id,\n",
" tr.pl_transaction_date, \n",
" l.transaction_id,\n",
" l.start_date,\n",
" l.loan_amount,\n",
" l.tenure\n",
"\n",
"\tFROM \n",
"\t (SELECT user_id, \n",
"\t id, \n",
"\t transaction_date AS pl_transaction_date\n",
"\t FROM l2alpha.`transaction` WHERE id IN {tids}\n",
"\t ) AS tr\n",
"\tINNER JOIN \n",
"\t (SELECT user_id,\n",
"\t start_date, \n",
"\t settlement_date,\n",
"\t loan_amount,\n",
"\t tenure,\n",
"\t transaction_id \n",
"\t FROM l2alpha.loan\n",
"\t WHERE is_active = 1 AND loan_amount > 0\n",
"\t ) AS l\n",
"\tON l.user_id = tr.user_id AND settlement_date <= pl_transaction_date\n",
"\t) AS base\n",
"GROUP BY base.user_id, base.pl_transaction_date\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"paid_trx = pd.read_sql(paid_trx_q, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(124816, 16)"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paid_trx.shape"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"112438"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paid_trx['user_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"124816"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paid_trx['trx_id'].nunique()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Combine"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"trx_feats = raw_df[['user_id', 'trx_id']].merge(denied_trx, how='left', on=['user_id', 'trx_id'])\\\n",
" .merge(paid_trx, how='left', on=['user_id', 'trx_id'])"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(150672, 30)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trx_feats.shape"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"135508"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trx_feats['user_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"150672"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trx_feats['trx_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"trx_feats.to_parquet('data/interim/pl_model_v3_new_trx_feats_27012020.parquet', compression='gzip')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}