{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import pymysql\n",
"import psycopg2\n",
"from datetime import datetime\n",
"\n",
"import warnings\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import configparser\n",
"config = configparser.ConfigParser()\n",
"config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
"\n",
"host = config['MYSQL-ROOT']['HOST']\n",
"user = config['MYSQL-ROOT']['USER']\n",
"password = config['MYSQL-ROOT']['PASSWORD']\n",
"\n",
"def connect_sql():\n",
" cnx = pymysql.connect(host=host,\n",
" user=user,\n",
" password=password,\n",
" cursorclass=pymysql.cursors.DictCursor)\n",
" return cnx"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"host = config['PERISCOPE']['HOST']\n",
"user = config['PERISCOPE']['USER']\n",
"password = config['PERISCOPE']['PASSWORD']\n",
"port = config['PERISCOPE']['PORT']\n",
"dbname = config['PERISCOPE']['DB']\n",
"\n",
"def connect_periscope():\n",
" cnx = psycopg2.connect(dbname=dbname, \n",
" host=host, \n",
" port=port,\n",
" user=user,\n",
" password=password)\n",
" return cnx"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load Data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"pl_df = pd.read_parquet('data/interim/pl_model_v3_feats_14012020.parquet')\n",
"pf_df = pd.read_parquet('data/interim/pl_model_v3_pefindo_feats_08012020.parquet')\\\n",
" .rename(columns={'first_trx_date':'transaction_date'})\n",
"pf_df['transaction_date'] = pf_df['transaction_date'].astype('datetime64[ms]')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(105086, 31)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(105086, 10)"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(pl_df.shape)\n",
"display(pf_df.shape)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['user_id', 'transaction_date', 'payment_type', 'trx_id', 'loan_amount',\n",
" 'os_amount', 'current_dpd', 'os_amount_inc', 'delin_max_dpd_7d',\n",
" 'delin_max_dpd_15d', 'delin_max_dpd_1mo', 'delin_max_dpd_2mo',\n",
" 'delin_max_dpd_3mo', 'delin_max_dpd_amt_7d', 'delin_max_dpd_amt_15d',\n",
" 'delin_max_dpd_amt_1mo', 'delin_max_dpd_amt_2mo',\n",
" 'delin_max_dpd_amt_3mo', 'time_approve_to_pl_hour',\n",
" 'oth_last_trx_amount', 'oth_first_trx_amount', 'oth_last_rep_dpd',\n",
" 'oth_last_rep_days', 'oth_last_rep_channel', 'oth_last_rep_amount',\n",
" 'oth_last_rep_channel_raw', 'util_non_pl', 'util_pl',\n",
" 'calibrated_final', 'engine', 'ap_co_plo'],\n",
" dtype='object')"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pl_df.columns"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"raw_df = pl_df.merge(pf_df, how='left', on=['user_id', 'trx_id', 'transaction_date'])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(105086, 38)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"rename_cols = {'a_score':'calibrated_final', \n",
" 'pf_delin_dist_code_30_dpd_3mo':'pf_delin_dist_contractcode_30_dpd_3mo'}\n",
"raw_df = raw_df.rename(columns=rename_cols)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Fix oth_last_rep_channel\n",
"Channel to integer representation had unexpected value 5. The value was only supposed to run from 1-4. This was caused by a bug in the mapping dictionary."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"last_rep_channel_encoding = {'BUKALAPAK': 1, 'TOKOPEDIA': 1,\n",
" 'BCA_VA': 2, 'MANDIRI_OPEN_VA': 2, 'MANDIRI_VA': 2, 'PERMATA_VA': 2,\n",
" 'ALFAMART_VA': 3, 'INDOMARET_VA': 3,\n",
" 'KREDIVO': 4, 'MANUAL_ALFAMART VA': 4, 'MANUAL_INDOMARET_VA': 4, \n",
" 'MANUAL_BCA VA': 4, 'MANUAL_BCA_VA': 4,\n",
" 'MANUAL_MANDIRI': 4, 'MANUAL_MANDIRI_OPEN_': 4, 'MANUAL_MANDIRI_VA': 4, \n",
" 'MANUAL_PERMATA VA': 4, 'MANUAL_PERMATA_VA': 4,\n",
" 'MANUAL_PC': 4, 'MANUAL_EXCESS': 4, 'MANUAL_WAIVER': 4, 'MANUAL_WAIVER_CLCT': 4}\n",
"\n",
"default_value = 3"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"raw_df['oth_last_rep_channel'] = raw_df['oth_last_rep_channel_raw'].map(last_rep_channel_encoding)\\\n",
" .fillna(default_value).astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2 65857\n",
"3 30068\n",
"1 8988\n",
"4 173\n",
"Name: oth_last_rep_channel, dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_df['oth_last_rep_channel'].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Get Flag Bad"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"get_performance = '''\n",
"SELECT user_base_w_first_61.*, dlv.*\n",
"FROM\n",
" (SELECT user_base.*, MIN(snapshot_date) AS first_61_after_trx\n",
" FROM\n",
" (SELECT \n",
" base.*, COUNT(*) as performance_window, MAX(ula.current_dpd) as max_dpd\n",
" FROM\n",
" (SELECT \n",
" user_id, transaction_date, payment_type, id AS trx_id, loan_amount\n",
" FROM \n",
" (SELECT \n",
" id, user_id, status, transaction_type, transaction_date, payment_type\n",
" FROM\n",
" db_11466_public.transaction\n",
" WHERE \n",
" status = 4 AND transaction_type = 2 AND transaction_date >= '2019-06-01' AND transaction_date < '2019-10-01' AND payment_type != '30_days'\n",
" ) AS tr\n",
" JOIN\n",
" (SELECT \n",
" start_date, transaction_id, loan_amount\n",
" FROM \n",
" db_11466_public.loan\n",
" WHERE \n",
" loan_amount > 0 AND is_active = 1\n",
" ) AS lo ON lo.transaction_id = tr.id\n",
" ) AS base\n",
" LEFT JOIN\n",
" (SELECT\n",
" user_id, snapshot_date, current_dpd\n",
" FROM \n",
" db_13080_public.user_level_archive\n",
" ) AS ula ON base.user_id = ula.user_id\n",
" WHERE \n",
" (snapshot_date is NULL) OR \n",
" (snapshot_date > transaction_date AND datediff('day', base.transaction_date, ula.snapshot_date) <= 121)\n",
" GROUP BY \n",
" base.user_id, base.transaction_date, base.payment_type, base.trx_id, base.loan_amount\n",
" ) AS user_base\n",
" LEFT JOIN\n",
" (SELECT \n",
" user_id, snapshot_date, current_dpd\n",
" FROM \n",
" db_13080_public.user_level_archive\n",
" WHERE \n",
" current_dpd >= 61\n",
" ) AS ula_2 ON user_base.user_id = ula_2.user_id\n",
" WHERE \n",
" snapshot_date is null or snapshot_date > transaction_date\n",
" GROUP BY \n",
" user_base.user_id, user_base.transaction_date, user_base.payment_type, user_base.trx_id, user_base.loan_amount, user_base.performance_window, user_base.max_dpd\n",
" ) AS user_base_w_first_61\n",
" LEFT JOIN\n",
" (SELECT \n",
" transaction_id, user_id AS user_id_1, start_date, loan_disbursement, first_dpd_1, first_dpd_7, first_dpd_15, first_dpd_30, first_dpd_60, first_dpd_90 \n",
" FROM \n",
" dim_loan_vintage\n",
" ) AS dlv ON user_base_w_first_61.trx_id = dlv.transaction_id\n",
"WHERE datediff('day', transaction_date, CURRENT_DATE) >= 121\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"perf = pd.read_sql(get_performance, connect_periscope())"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"# perf = pd.read_csv('data/pl_users_performance_junjulaugsep_24012020.csv')\n",
"perf['transaction_date'] = perf['transaction_date'].astype('datetime64[ms]')\n",
"perf['start_date'] = perf['start_date'].astype('datetime64[ms]')"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(144281, 18)"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"perf.shape"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['user_id', 'transaction_date', 'payment_type', 'trx_id', 'loan_amount',\n",
" 'performance_window', 'max_dpd', 'first_61_after_trx', 'transaction_id',\n",
" 'user_id_1', 'start_date', 'loan_disbursement', 'first_dpd_1',\n",
" 'first_dpd_7', 'first_dpd_15', 'first_dpd_30', 'first_dpd_60',\n",
" 'first_dpd_90'],\n",
" dtype='object')"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"perf.columns"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True 144281\n",
"Name: transaction_date, dtype: int64"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"((datetime.now().date() - perf[\"transaction_date\"].dt.normalize().dt.date).dt.days >= 121).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True 144281\n",
"dtype: int64"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(perf[\"loan_amount\"] == perf[\"loan_disbursement\"]).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True 144281\n",
"dtype: int64"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(perf[\"user_id\"] == perf[\"user_id_1\"]).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True 144254\n",
"False 27\n",
"dtype: int64"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(perf[\"transaction_date\"].dt.date == perf[\"start_date\"].dt.date).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"perf[perf[\"transaction_date\"].dt.date != perf[\"start_date\"].dt.date]"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"perf[\"flag_bad_usr\"] = perf[\"max_dpd\"] > 60"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"perf[\"first_dpd_60\"] = pd.to_datetime(perf[\"first_dpd_60\"])"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False 138139\n",
"True 6142\n",
"Name: flag_bad_usr, dtype: int64"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"perf['flag_bad_usr'].value_counts(dropna=False)"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.04256970772312363"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"perf['flag_bad_usr'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['user_id',\n",
" 'transaction_date',\n",
" 'payment_type',\n",
" 'trx_id',\n",
" 'loan_amount',\n",
" 'performance_window',\n",
" 'max_dpd',\n",
" 'first_61_after_trx',\n",
" 'transaction_id',\n",
" 'user_id_1',\n",
" 'start_date',\n",
" 'loan_disbursement',\n",
" 'first_dpd_1',\n",
" 'first_dpd_7',\n",
" 'first_dpd_15',\n",
" 'first_dpd_30',\n",
" 'first_dpd_60',\n",
" 'first_dpd_90',\n",
" 'flag_bad_usr']"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"perf.columns.tolist()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Combine DataFrames"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"perf_columns = ['user_id',\n",
" 'transaction_date',\n",
" 'payment_type',\n",
" 'trx_id',\n",
" 'loan_amount',\n",
" 'performance_window',\n",
" 'max_dpd',\n",
" 'first_61_after_trx',\n",
" 'transaction_id',\n",
"# 'user_id_1',\n",
" 'start_date',\n",
"# 'loan_disbursement',\n",
" 'first_dpd_1',\n",
" 'first_dpd_7',\n",
" 'first_dpd_15',\n",
" 'first_dpd_30',\n",
" 'first_dpd_60',\n",
" 'first_dpd_90',\n",
" 'flag_bad_usr',\n",
"# 'duration_to_default',\n",
"# 'flag_bad_trx'\n",
" ]\n",
"\n",
"from_perf = perf[perf_columns]"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"feats_used = ['ap_co_plo',\n",
" 'calibrated_final',\n",
" 'current_dpd',\n",
" 'delin_max_dpd_3mo',\n",
" 'delin_max_dpd_amt_3mo',\n",
" 'flag_bad_pefindo',\n",
" 'flag_good_pefindo',\n",
" 'os_amount',\n",
" 'oth_first_trx_amount',\n",
" 'oth_last_rep_channel',\n",
" 'oth_last_rep_days',\n",
" 'oth_last_rep_dpd',\n",
" 'oth_last_rep_amount',\n",
" 'oth_last_trx_amount',\n",
" 'pf_con_open_count_12mo',\n",
" 'pf_delin_dist_contractcode_30_dpd_3mo',\n",
" 'pf_delin_max_dpd_12mo',\n",
" 'pf_util_creditcard_avg_12mo',\n",
" 'time_approve_to_pl_hour',\n",
" 'util_non_pl',\n",
" 'util_pl']"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(105086, 24)"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from_df = raw_df[['user_id', 'trx_id', 'payment_type']+feats_used].copy()\n",
"from_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"merge_df = from_perf.merge(from_df, how='left', on=['user_id', 'trx_id', 'payment_type'])"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(144281, 37)"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merge_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"user_id 0.000000\n",
"transaction_date 0.000000\n",
"payment_type 0.000000\n",
"trx_id 0.000000\n",
"performance_window 0.000000\n",
"max_dpd 0.000097\n",
"first_61_after_trx 0.924973\n",
"transaction_id 0.000000\n",
"start_date 0.000000\n",
"first_dpd_1 0.799121\n",
"first_dpd_7 0.929859\n",
"first_dpd_15 0.957985\n",
"first_dpd_30 0.977682\n",
"first_dpd_60 0.994788\n",
"first_dpd_90 0.999515\n",
"flag_bad_usr 0.000000\n",
"ap_co_plo 0.271955\n",
"calibrated_final 0.747569\n",
"current_dpd 0.463304\n",
"delin_max_dpd_3mo 0.351252\n",
"delin_max_dpd_amt_3mo 0.351273\n",
"flag_bad_pefindo 0.613151\n",
"flag_good_pefindo 0.613151\n",
"os_amount 0.463304\n",
"oth_first_trx_amount 0.343566\n",
"oth_last_rep_channel 0.271955\n",
"oth_last_rep_days 0.383509\n",
"oth_last_rep_dpd 0.383509\n",
"oth_last_rep_amount 0.383509\n",
"oth_last_trx_amount 0.343566\n",
"pf_con_open_count_12mo 0.613151\n",
"pf_delin_dist_contractcode_30_dpd_3mo 0.613151\n",
"pf_delin_max_dpd_12mo 0.613151\n",
"pf_util_creditcard_avg_12mo 0.613151\n",
"time_approve_to_pl_hour 0.271983\n",
"util_non_pl 0.271955\n",
"util_pl 0.271955\n",
"dtype: float64"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merge_df.isna().mean()"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timestamp('2019-09-25 23:59:11')"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merge_df['transaction_date'].max()"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"merge_df.to_parquet('data/feats_proba_user_data_24012020.parquet', compression='gzip')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}