past-data-projects / personal_loan_credit_risk / 000-Model_Definitions / 003 - Combine Datasets and Get flag_bad.ipynb
003 - Combine Datasets and Get flag_bad.ipynb
Raw
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import pymysql\n",
    "import psycopg2\n",
    "from datetime import datetime\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import configparser\n",
    "config = configparser.ConfigParser()\n",
    "config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
    "\n",
    "host = config['MYSQL-ROOT']['HOST']\n",
    "user = config['MYSQL-ROOT']['USER']\n",
    "password = config['MYSQL-ROOT']['PASSWORD']\n",
    "\n",
    "def connect_sql():\n",
    "    cnx = pymysql.connect(host=host,\n",
    "                          user=user,\n",
    "                          password=password,\n",
    "                          cursorclass=pymysql.cursors.DictCursor)\n",
    "    return cnx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "host = config['PERISCOPE']['HOST']\n",
    "user = config['PERISCOPE']['USER']\n",
    "password = config['PERISCOPE']['PASSWORD']\n",
    "port = config['PERISCOPE']['PORT']\n",
    "dbname = config['PERISCOPE']['DB']\n",
    "\n",
    "def connect_periscope():\n",
    "    cnx = psycopg2.connect(dbname=dbname, \n",
    "                            host=host, \n",
    "                            port=port,\n",
    "                            user=user,\n",
    "                            password=password)\n",
    "    return cnx"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "pl_df = pd.read_parquet('data/interim/pl_model_v3_feats_14012020.parquet')\n",
    "pf_df = pd.read_parquet('data/interim/pl_model_v3_pefindo_feats_08012020.parquet')\\\n",
    "          .rename(columns={'first_trx_date':'transaction_date'})\n",
    "pf_df['transaction_date'] = pf_df['transaction_date'].astype('datetime64[ms]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(105086, 31)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(105086, 10)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(pl_df.shape)\n",
    "display(pf_df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['user_id', 'transaction_date', 'payment_type', 'trx_id', 'loan_amount',\n",
       "       'os_amount', 'current_dpd', 'os_amount_inc', 'delin_max_dpd_7d',\n",
       "       'delin_max_dpd_15d', 'delin_max_dpd_1mo', 'delin_max_dpd_2mo',\n",
       "       'delin_max_dpd_3mo', 'delin_max_dpd_amt_7d', 'delin_max_dpd_amt_15d',\n",
       "       'delin_max_dpd_amt_1mo', 'delin_max_dpd_amt_2mo',\n",
       "       'delin_max_dpd_amt_3mo', 'time_approve_to_pl_hour',\n",
       "       'oth_last_trx_amount', 'oth_first_trx_amount', 'oth_last_rep_dpd',\n",
       "       'oth_last_rep_days', 'oth_last_rep_channel', 'oth_last_rep_amount',\n",
       "       'oth_last_rep_channel_raw', 'util_non_pl', 'util_pl',\n",
       "       'calibrated_final', 'engine', 'ap_co_plo'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pl_df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_df = pl_df.merge(pf_df, how='left', on=['user_id', 'trx_id', 'transaction_date'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(105086, 38)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "rename_cols = {'a_score':'calibrated_final', \n",
    "               'pf_delin_dist_code_30_dpd_3mo':'pf_delin_dist_contractcode_30_dpd_3mo'}\n",
    "raw_df = raw_df.rename(columns=rename_cols)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fix oth_last_rep_channel\n",
    "Channel to integer representation had unexpected value 5. The value was only supposed to run from 1-4. This was caused by a bug in the mapping dictionary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "last_rep_channel_encoding = {'BUKALAPAK': 1, 'TOKOPEDIA': 1,\n",
    "                             'BCA_VA': 2, 'MANDIRI_OPEN_VA': 2, 'MANDIRI_VA': 2, 'PERMATA_VA': 2,\n",
    "                             'ALFAMART_VA': 3, 'INDOMARET_VA': 3,\n",
    "                             'KREDIVO': 4, 'MANUAL_ALFAMART VA': 4, 'MANUAL_INDOMARET_VA': 4, \n",
    "                             'MANUAL_BCA VA': 4, 'MANUAL_BCA_VA': 4,\n",
    "                             'MANUAL_MANDIRI': 4, 'MANUAL_MANDIRI_OPEN_': 4, 'MANUAL_MANDIRI_VA': 4, \n",
    "                             'MANUAL_PERMATA VA': 4, 'MANUAL_PERMATA_VA': 4,\n",
    "                             'MANUAL_PC': 4, 'MANUAL_EXCESS': 4, 'MANUAL_WAIVER': 4, 'MANUAL_WAIVER_CLCT': 4}\n",
    "\n",
    "default_value = 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_df['oth_last_rep_channel'] = raw_df['oth_last_rep_channel_raw'].map(last_rep_channel_encoding)\\\n",
    "                                                                   .fillna(default_value).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2    65857\n",
       "3    30068\n",
       "1     8988\n",
       "4      173\n",
       "Name: oth_last_rep_channel, dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_df['oth_last_rep_channel'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get Flag Bad"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "get_performance = '''\n",
    "SELECT user_base_w_first_61.*, dlv.*\n",
    "FROM\n",
    "  (SELECT user_base.*, MIN(snapshot_date) AS first_61_after_trx\n",
    "  FROM\n",
    "    (SELECT \n",
    "      base.*, COUNT(*) as performance_window, MAX(ula.current_dpd) as max_dpd\n",
    "    FROM\n",
    "      (SELECT \n",
    "        user_id, transaction_date, payment_type, id AS trx_id, loan_amount\n",
    "      FROM \n",
    "        (SELECT \n",
    "          id, user_id, status, transaction_type, transaction_date, payment_type\n",
    "        FROM\n",
    "          db_11466_public.transaction\n",
    "        WHERE \n",
    "          status = 4 AND transaction_type = 2 AND transaction_date >= '2019-06-01' AND transaction_date < '2019-10-01' AND payment_type != '30_days'\n",
    "        ) AS tr\n",
    "        JOIN\n",
    "        (SELECT \n",
    "          start_date, transaction_id, loan_amount\n",
    "        FROM \n",
    "          db_11466_public.loan\n",
    "        WHERE \n",
    "          loan_amount > 0 AND is_active = 1\n",
    "        ) AS lo ON lo.transaction_id = tr.id\n",
    "      ) AS base\n",
    "      LEFT JOIN\n",
    "      (SELECT\n",
    "        user_id, snapshot_date, current_dpd\n",
    "       FROM \n",
    "        db_13080_public.user_level_archive\n",
    "      ) AS ula ON base.user_id = ula.user_id\n",
    "      WHERE \n",
    "        (snapshot_date is NULL) OR \n",
    "        (snapshot_date > transaction_date AND datediff('day', base.transaction_date, ula.snapshot_date) <= 121)\n",
    "      GROUP BY \n",
    "        base.user_id, base.transaction_date, base.payment_type, base.trx_id, base.loan_amount\n",
    "    ) AS user_base\n",
    "    LEFT JOIN\n",
    "    (SELECT \n",
    "      user_id, snapshot_date, current_dpd\n",
    "     FROM \n",
    "      db_13080_public.user_level_archive\n",
    "     WHERE \n",
    "      current_dpd >= 61\n",
    "    ) AS ula_2 ON user_base.user_id = ula_2.user_id\n",
    "    WHERE \n",
    "      snapshot_date is null or snapshot_date > transaction_date\n",
    "    GROUP BY \n",
    "      user_base.user_id, user_base.transaction_date, user_base.payment_type, user_base.trx_id, user_base.loan_amount, user_base.performance_window, user_base.max_dpd\n",
    "  ) AS user_base_w_first_61\n",
    "  LEFT JOIN\n",
    "  (SELECT \n",
    "    transaction_id, user_id AS user_id_1, start_date, loan_disbursement, first_dpd_1, first_dpd_7, first_dpd_15, first_dpd_30, first_dpd_60, first_dpd_90 \n",
    "  FROM \n",
    "    dim_loan_vintage\n",
    "  ) AS dlv ON user_base_w_first_61.trx_id = dlv.transaction_id\n",
    "WHERE datediff('day', transaction_date, CURRENT_DATE) >= 121\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "perf = pd.read_sql(get_performance, connect_periscope())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "# perf = pd.read_csv('data/pl_users_performance_junjulaugsep_24012020.csv')\n",
    "perf['transaction_date'] = perf['transaction_date'].astype('datetime64[ms]')\n",
    "perf['start_date'] = perf['start_date'].astype('datetime64[ms]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(144281, 18)"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "perf.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['user_id', 'transaction_date', 'payment_type', 'trx_id', 'loan_amount',\n",
       "       'performance_window', 'max_dpd', 'first_61_after_trx', 'transaction_id',\n",
       "       'user_id_1', 'start_date', 'loan_disbursement', 'first_dpd_1',\n",
       "       'first_dpd_7', 'first_dpd_15', 'first_dpd_30', 'first_dpd_60',\n",
       "       'first_dpd_90'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "perf.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True    144281\n",
       "Name: transaction_date, dtype: int64"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "((datetime.now().date() - perf[\"transaction_date\"].dt.normalize().dt.date).dt.days >= 121).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True    144281\n",
       "dtype: int64"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(perf[\"loan_amount\"] == perf[\"loan_disbursement\"]).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True    144281\n",
       "dtype: int64"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(perf[\"user_id\"] == perf[\"user_id_1\"]).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True     144254\n",
       "False        27\n",
       "dtype: int64"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(perf[\"transaction_date\"].dt.date == perf[\"start_date\"].dt.date).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "perf[perf[\"transaction_date\"].dt.date != perf[\"start_date\"].dt.date]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "perf[\"flag_bad_usr\"] = perf[\"max_dpd\"] > 60"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "perf[\"first_dpd_60\"] = pd.to_datetime(perf[\"first_dpd_60\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False    138139\n",
       "True       6142\n",
       "Name: flag_bad_usr, dtype: int64"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "perf['flag_bad_usr'].value_counts(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.04256970772312363"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "perf['flag_bad_usr'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['user_id',\n",
       " 'transaction_date',\n",
       " 'payment_type',\n",
       " 'trx_id',\n",
       " 'loan_amount',\n",
       " 'performance_window',\n",
       " 'max_dpd',\n",
       " 'first_61_after_trx',\n",
       " 'transaction_id',\n",
       " 'user_id_1',\n",
       " 'start_date',\n",
       " 'loan_disbursement',\n",
       " 'first_dpd_1',\n",
       " 'first_dpd_7',\n",
       " 'first_dpd_15',\n",
       " 'first_dpd_30',\n",
       " 'first_dpd_60',\n",
       " 'first_dpd_90',\n",
       " 'flag_bad_usr']"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "perf.columns.tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combine DataFrames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "perf_columns = ['user_id',\n",
    "                'transaction_date',\n",
    "                'payment_type',\n",
    "                'trx_id',\n",
    "                'loan_amount',\n",
    "                'performance_window',\n",
    "                'max_dpd',\n",
    "                'first_61_after_trx',\n",
    "                'transaction_id',\n",
    "#                 'user_id_1',\n",
    "                'start_date',\n",
    "#                 'loan_disbursement',\n",
    "                'first_dpd_1',\n",
    "                'first_dpd_7',\n",
    "                'first_dpd_15',\n",
    "                'first_dpd_30',\n",
    "                'first_dpd_60',\n",
    "                'first_dpd_90',\n",
    "                'flag_bad_usr',\n",
    "#                 'duration_to_default',\n",
    "#                 'flag_bad_trx'\n",
    "               ]\n",
    "\n",
    "from_perf = perf[perf_columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "feats_used = ['ap_co_plo',\n",
    "                'calibrated_final',\n",
    "                'current_dpd',\n",
    "                'delin_max_dpd_3mo',\n",
    "                'delin_max_dpd_amt_3mo',\n",
    "                'flag_bad_pefindo',\n",
    "                'flag_good_pefindo',\n",
    "                'os_amount',\n",
    "                'oth_first_trx_amount',\n",
    "                'oth_last_rep_channel',\n",
    "                'oth_last_rep_days',\n",
    "                'oth_last_rep_dpd',\n",
    "                'oth_last_rep_amount',\n",
    "                'oth_last_trx_amount',\n",
    "                'pf_con_open_count_12mo',\n",
    "                'pf_delin_dist_contractcode_30_dpd_3mo',\n",
    "                'pf_delin_max_dpd_12mo',\n",
    "                'pf_util_creditcard_avg_12mo',\n",
    "                'time_approve_to_pl_hour',\n",
    "                'util_non_pl',\n",
    "                'util_pl']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(105086, 24)"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from_df = raw_df[['user_id', 'trx_id', 'payment_type']+feats_used].copy()\n",
    "from_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "merge_df = from_perf.merge(from_df, how='left', on=['user_id', 'trx_id', 'payment_type'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(144281, 37)"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merge_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "user_id                                  0.000000\n",
       "transaction_date                         0.000000\n",
       "payment_type                             0.000000\n",
       "trx_id                                   0.000000\n",
       "performance_window                       0.000000\n",
       "max_dpd                                  0.000097\n",
       "first_61_after_trx                       0.924973\n",
       "transaction_id                           0.000000\n",
       "start_date                               0.000000\n",
       "first_dpd_1                              0.799121\n",
       "first_dpd_7                              0.929859\n",
       "first_dpd_15                             0.957985\n",
       "first_dpd_30                             0.977682\n",
       "first_dpd_60                             0.994788\n",
       "first_dpd_90                             0.999515\n",
       "flag_bad_usr                             0.000000\n",
       "ap_co_plo                                0.271955\n",
       "calibrated_final                         0.747569\n",
       "current_dpd                              0.463304\n",
       "delin_max_dpd_3mo                        0.351252\n",
       "delin_max_dpd_amt_3mo                    0.351273\n",
       "flag_bad_pefindo                         0.613151\n",
       "flag_good_pefindo                        0.613151\n",
       "os_amount                                0.463304\n",
       "oth_first_trx_amount                     0.343566\n",
       "oth_last_rep_channel                     0.271955\n",
       "oth_last_rep_days                        0.383509\n",
       "oth_last_rep_dpd                         0.383509\n",
       "oth_last_rep_amount                      0.383509\n",
       "oth_last_trx_amount                      0.343566\n",
       "pf_con_open_count_12mo                   0.613151\n",
       "pf_delin_dist_contractcode_30_dpd_3mo    0.613151\n",
       "pf_delin_max_dpd_12mo                    0.613151\n",
       "pf_util_creditcard_avg_12mo              0.613151\n",
       "time_approve_to_pl_hour                  0.271983\n",
       "util_non_pl                              0.271955\n",
       "util_pl                                  0.271955\n",
       "dtype: float64"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merge_df.isna().mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timestamp('2019-09-25 23:59:11')"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merge_df['transaction_date'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "merge_df.to_parquet('data/feats_proba_user_data_24012020.parquet', compression='gzip')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}