{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import pymysql"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import configparser\n",
"config = configparser.ConfigParser()\n",
"config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
"\n",
"host = config['MYSQL-ROOT']['HOST']\n",
"user = config['MYSQL-ROOT']['USER']\n",
"password = config['MYSQL-ROOT']['PASSWORD']\n",
"\n",
"def connect_sql():\n",
" cnx = pymysql.connect(host=host,\n",
" user=user,\n",
" password=password,\n",
" cursorclass=pymysql.cursors.DictCursor)\n",
" return cnx"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Get Rejected users"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"query = '''\n",
"SELECT\n",
"\tbase.*,\n",
"\tCOUNT(*) AS performance_window,\n",
"\tMAX(ula.current_dpd) AS max_dpd\n",
"FROM\n",
"\t(\n",
"\tSELECT\n",
"\t\ttar.*,\n",
"\t\ttr.user_id,\n",
"\t\ttr.transaction_date,\n",
"\t\ttr.amount,\n",
" tr.payment_type\n",
"\tFROM\n",
"\t\t(\n",
"\t\tSELECT\n",
"\t\t\t*\n",
"\t\tFROM\n",
"\t\t\tl2alpha.transaction_approval_rules\n",
"\t\tWHERE\n",
"\t\t\trule_name = 'pl_score'\n",
"\t\t\tAND `result` = 'REJECT'\n",
"\t\t\tAND remarks LIKE '%Personal Loan Score%' ) AS tar\n",
"\tLEFT JOIN l2alpha.`transaction` AS tr ON\n",
"\t\ttar.transaction_id = tr.id\n",
"\tWHERE\n",
"\t\tpayment_type <> '30_days'\n",
"\t\tAND transaction_date >= '2019-06-01'\n",
"\t\tAND transaction_date < '2019-10-01'\n",
"\t\tAND datediff(CURDATE(),\n",
"\t\ttransaction_date) >= 121 ) AS base\n",
"LEFT JOIN (\n",
"\tSELECT\n",
"\t\tuser_id,\n",
"\t\tsnapshot_date,\n",
"\t\tcurrent_dpd\n",
"\tFROM\n",
"\t\tds.user_level_archive ) AS ula ON\n",
"\tbase.user_id = ula.user_id\n",
"WHERE\n",
"\t(snapshot_date IS NULL)\n",
"\tOR (snapshot_date > transaction_date\n",
"\tAND datediff(ula.snapshot_date,\n",
"\tbase.transaction_date) <= 121)\n",
"GROUP BY\n",
"\tbase.user_id,\n",
"\tbase.transaction_date,\n",
"\tbase.transaction_id\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_sql(query, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(16190, 14)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Sanity check data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9218703 4\n",
"4964211 3\n",
"1608647 3\n",
"5677366 3\n",
"4769698 3\n",
" ..\n",
"7273493 1\n",
"8045586 1\n",
"10003470 1\n",
"9778188 1\n",
"9080834 1\n",
"Name: user_id, Length: 15307, dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['user_id'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"37849087 1\n",
"31492900 1\n",
"36354882 1\n",
"33608511 1\n",
"31120190 1\n",
" ..\n",
"37131688 1\n",
"31681540 1\n",
"36556195 1\n",
"32169188 1\n",
"33851393 1\n",
"Name: transaction_id, Length: 16190, dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['transaction_id'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"677"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['max_dpd'].isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True 16190\n",
"Name: remarks, dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['remarks'].str.contains('Personal Loan Score').value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"REJECT 16190\n",
"Name: result, dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['result'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pl_score 16190\n",
"Name: rule_name, dtype: int64"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['rule_name'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['id', 'rule_name', 'result', 'remarks', 'version', 'creation_time',\n",
" 'modification_time', 'transaction_id', 'user_id', 'transaction_date',\n",
" 'amount', 'payment_type', 'performance_window', 'max_dpd'],\n",
" dtype='object')"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save the dataset"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"final_df = df[['transaction_id', 'user_id', 'rule_name', 'result', 'remarks', \n",
" 'transaction_date', 'amount', 'performance_window', 'max_dpd']]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"final_df"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timestamp('2019-09-30 23:56:42')"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_df['transaction_date'].max()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timestamp('2019-06-01 00:44:53')"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_df['transaction_date'].min()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"final_df.to_parquet('data/pl_rejected_users_junjulaugsep_04022020.parquet', compression='gzip')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}