{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load Libraries and Query Utility Functions"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/psycopg2/__init__.py:144: UserWarning: The psycopg2 wheel package will be renamed from release 2.8; in order to keep installing from binary please use \"pip install psycopg2-binary\" instead. For details see: <http://initd.org/psycopg/docs/install.html#binary-install-from-pypi>.\n",
" \"\"\")\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import pymysql\n",
"import psycopg2\n",
"\n",
"from datetime import datetime, timedelta"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import configparser\n",
"config = configparser.ConfigParser()\n",
"config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
"\n",
"host = config['MYSQL']['HOST']\n",
"user = config['MYSQL']['USER']\n",
"password = config['MYSQL']['PASSWORD']\n",
"\n",
"def connect_sql():\n",
" cnx = pymysql.connect(host=host,\n",
" user=user,\n",
" password=password,\n",
" cursorclass=pymysql.cursors.DictCursor)\n",
" return cnx"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"host = config['PERISCOPE']['HOST']\n",
"user = config['PERISCOPE']['USER']\n",
"password = config['PERISCOPE']['PASSWORD']\n",
"port = config['PERISCOPE']['PORT']\n",
"dbname = config['PERISCOPE']['DB']\n",
"\n",
"def connect_periscope():\n",
" cnx = psycopg2.connect(dbname=dbname, \n",
" host=host, \n",
" port=port,\n",
" user=user,\n",
" password=password)\n",
" return cnx"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get Personal Loan transactions\n",
"from 2019-06 to 2019-09"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"query = '''\n",
"SELECT main.*\n",
"FROM(\n",
" SELECT \n",
" user_id,\n",
" transaction_date, \n",
" payment_type,\n",
" id AS trx_id,\n",
" loan_amount\n",
" FROM \n",
" (\n",
" SELECT id, user_id, status, transaction_type, transaction_date, payment_type\n",
" FROM l2alpha.`transaction`\n",
" WHERE status = 4 AND transaction_type = 2 AND payment_type <> '30_days'\n",
" AND transaction_date >= '2019-06-01' AND transaction_date < '2019-10-01'\n",
" ) AS tr\n",
" JOIN\n",
" (\n",
" SELECT start_date, transaction_id, loan_amount\n",
" FROM l2alpha.loan\n",
" WHERE loan_amount > 0 AND is_active = 1\n",
" ) AS lo ON lo.transaction_id = tr.id\n",
" ) AS main\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"raw_df = pd.read_sql(query,connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(154256, 5)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"raw_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timestamp('2019-09-30 23:58:30')"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_df['transaction_date'].max()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get Flag Bad User\n",
"from 2019-06 to 2019-09"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"get_performance = '''\n",
"SELECT user_base_w_first_61.*, dlv.*\n",
"FROM\n",
" (SELECT user_base.*, MIN(snapshot_date) AS first_61_after_trx\n",
" FROM\n",
" (SELECT \n",
" base.*, COUNT(*) as performance_window, MAX(ula.current_dpd) as max_dpd\n",
" FROM\n",
" (SELECT \n",
" user_id, transaction_date, payment_type, id AS trx_id, loan_amount\n",
" FROM \n",
" (SELECT \n",
" id, user_id, status, transaction_type, transaction_date, payment_type\n",
" FROM\n",
" db_11466_public.transaction\n",
" WHERE \n",
" status = 4 AND transaction_type = 2 AND transaction_date >= '2019-06-01' AND transaction_date < '2019-10-01' AND payment_type != '30_days'\n",
" ) AS tr\n",
" JOIN\n",
" (SELECT \n",
" start_date, transaction_id, loan_amount\n",
" FROM \n",
" db_11466_public.loan\n",
" WHERE \n",
" loan_amount > 0 AND is_active = 1\n",
" ) AS lo ON lo.transaction_id = tr.id\n",
" ) AS base\n",
" LEFT JOIN\n",
" (SELECT\n",
" user_id, snapshot_date, current_dpd\n",
" FROM \n",
" db_13080_public.user_level_archive\n",
" ) AS ula ON base.user_id = ula.user_id\n",
" WHERE \n",
" (snapshot_date is NULL) OR \n",
" (snapshot_date > transaction_date AND datediff('day', base.transaction_date, ula.snapshot_date) <= 121)\n",
" GROUP BY \n",
" base.user_id, base.transaction_date, base.payment_type, base.trx_id, base.loan_amount\n",
" ) AS user_base\n",
" LEFT JOIN\n",
" (SELECT \n",
" user_id, snapshot_date, current_dpd\n",
" FROM \n",
" db_13080_public.user_level_archive\n",
" WHERE \n",
" current_dpd >= 61\n",
" ) AS ula_2 ON user_base.user_id = ula_2.user_id\n",
" WHERE \n",
" snapshot_date is null or snapshot_date > transaction_date\n",
" GROUP BY \n",
" user_base.user_id, user_base.transaction_date, user_base.payment_type, user_base.trx_id, user_base.loan_amount, user_base.performance_window, user_base.max_dpd\n",
" ) AS user_base_w_first_61\n",
" LEFT JOIN\n",
" (SELECT \n",
" transaction_id, user_id AS user_id_1, start_date, loan_disbursement, first_dpd_1, first_dpd_7, first_dpd_15, first_dpd_30, first_dpd_60, first_dpd_90 \n",
" FROM \n",
" dim_loan_vintage\n",
" ) AS dlv ON user_base_w_first_61.trx_id = dlv.transaction_id\n",
"WHERE datediff('day', transaction_date, CURRENT_DATE) >= 121\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"perf = pd.read_sql(get_performance, connect_periscope())"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(150672, 18)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"perf.shape"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['user_id', 'transaction_date', 'payment_type', 'trx_id', 'loan_amount',\n",
" 'performance_window', 'max_dpd', 'first_61_after_trx', 'transaction_id',\n",
" 'user_id_1', 'start_date', 'loan_disbursement', 'first_dpd_1',\n",
" 'first_dpd_7', 'first_dpd_15', 'first_dpd_30', 'first_dpd_60',\n",
" 'first_dpd_90'],\n",
" dtype='object')"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"perf.columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Double check that data fulfills expectations\n",
"- has been 121 days since the transaction was made\n",
"- transaction_date and start_date should be the same or close together"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True 150672\n",
"Name: transaction_date, dtype: int64"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"((datetime.now().date() - perf[\"transaction_date\"].dt.normalize().dt.date).dt.days >= 121).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True 150672\n",
"dtype: int64"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(perf[\"loan_amount\"] == perf[\"loan_disbursement\"]).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True 150672\n",
"dtype: int64"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(perf[\"user_id\"] == perf[\"user_id_1\"]).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True 150644\n",
"False 28\n",
"dtype: int64"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(perf[\"transaction_date\"].dt.date == perf[\"start_date\"].dt.date).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"perf[perf[\"transaction_date\"].dt.date != perf[\"start_date\"].dt.date]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create flag bad user and flag bad trx"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"perf[\"flag_bad_usr\"] = perf[\"max_dpd\"] > 60"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"perf[\"first_dpd_60\"] = pd.to_datetime(perf[\"first_dpd_60\"])\n",
"\n",
"perf[\"duration_to_default\"] = perf[\"first_dpd_60\"] - perf[\"start_date\"].dt.normalize()\n",
"perf[\"flag_bad_trx\"] = perf[\"duration_to_default\"].dt.days <= 121"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False 144230\n",
"True 6442\n",
"Name: flag_bad_usr, dtype: int64"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"perf['flag_bad_usr'].value_counts(dropna=False)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.04275512371243496"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"perf['flag_bad_usr'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False 144602\n",
"True 6070\n",
"Name: flag_bad_trx, dtype: int64"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"perf[\"flag_bad_trx\"].value_counts(dropna=False)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.04028618455983859"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"perf[\"flag_bad_trx\"].mean()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"perf.columns.tolist()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Combine Datasets"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"perf_columns = ['user_id',\n",
" 'transaction_date',\n",
" 'payment_type',\n",
" 'trx_id',\n",
" 'loan_amount',\n",
" 'performance_window',\n",
" 'max_dpd',\n",
" 'first_61_after_trx',\n",
" 'transaction_id',\n",
"# 'user_id_1',\n",
" 'start_date',\n",
"# 'loan_disbursement',\n",
" 'first_dpd_1',\n",
" 'first_dpd_7',\n",
" 'first_dpd_15',\n",
" 'first_dpd_30',\n",
" 'first_dpd_60',\n",
" 'first_dpd_90',\n",
" 'flag_bad_usr',\n",
"# 'duration_to_default',\n",
" 'flag_bad_trx'\n",
" ]\n",
"\n",
"from_perf = perf[perf_columns]"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"merge_df = from_perf.merge(raw_df, how='left', on=['user_id', 'trx_id', 'payment_type', \n",
" 'transaction_date', 'loan_amount'])"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['user_id', 'transaction_date', 'payment_type', 'trx_id', 'loan_amount',\n",
" 'performance_window', 'max_dpd', 'first_61_after_trx', 'transaction_id',\n",
" 'start_date', 'first_dpd_1', 'first_dpd_7', 'first_dpd_15',\n",
" 'first_dpd_30', 'first_dpd_60', 'first_dpd_90', 'flag_bad_usr',\n",
" 'flag_bad_trx'],\n",
" dtype='object')"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merge_df.columns"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"(150672, 18)"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merge_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"user_id 0.000000\n",
"transaction_date 0.000000\n",
"payment_type 0.000000\n",
"trx_id 0.000000\n",
"loan_amount 0.000000\n",
"performance_window 0.000000\n",
"max_dpd 0.000093\n",
"first_61_after_trx 0.923324\n",
"transaction_id 0.000000\n",
"start_date 0.000000\n",
"first_dpd_1 0.616936\n",
"first_dpd_7 0.825024\n",
"first_dpd_15 0.870354\n",
"first_dpd_30 0.904116\n",
"first_dpd_60 0.938177\n",
"first_dpd_90 0.960285\n",
"flag_bad_usr 0.000000\n",
"flag_bad_trx 0.000000\n",
"dtype: float64"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merge_df.isna().mean()"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timestamp('2019-09-28 23:49:58')"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merge_df['transaction_date'].max()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyarrow/pandas_compat.py:114: FutureWarning: A future version of pandas will default to `skipna=True`. To silence this warning, pass `skipna=True|False` explicitly.\n",
" result = infer_dtype(pandas_collection)\n"
]
}
],
"source": [
"merge_df.to_parquet('data/interim/pl_jumbo_mini_junjulaugsep_27012020.parquet', compression='gzip')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}