past-data-projects / personal_loan_credit_risk / 600-Threshold_Analysis / 601 - Get Threshold Dataset Jan2020.ipynb
601 - Get Threshold Dataset Jan2020.ipynb
Raw
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pymysql\n",
    "import psycopg2\n",
    "import pickle\n",
    "from datetime import datetime, timedelta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import configparser\n",
    "config = configparser.ConfigParser()\n",
    "config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
    "\n",
    "host = config['MYSQL-ROOT']['HOST']\n",
    "user = config['MYSQL-ROOT']['USER']\n",
    "password = config['MYSQL-ROOT']['PASSWORD']\n",
    "\n",
    "def connect_sql():\n",
    "    cnx = pymysql.connect(host=host,\n",
    "                          user=user,\n",
    "                          password=password,\n",
    "                          cursorclass=pymysql.cursors.DictCursor)\n",
    "    return cnx"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get Pass PL Trx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "pass_query = '''\n",
    "SELECT\n",
    "\tbase.*,\n",
    "\tCOUNT(*) AS performance_window,\n",
    "\tMAX(ula.current_dpd) AS max_dpd\n",
    "FROM\n",
    "\t(\n",
    "\tSELECT\n",
    "\t\tuser_id,\n",
    "\t\ttransaction_date,\n",
    "\t\tpayment_type,\n",
    "\t\tid AS trx_id,\n",
    "\t\tamount,\n",
    "\t\tdatediff(CURDATE(), transaction_date) AS dob\n",
    "\tFROM\n",
    "\t\t(\n",
    "\t\tSELECT\n",
    "\t\t\tid,\n",
    "\t\t\tuser_id,\n",
    "\t\t\tstatus,\n",
    "\t\t\tamount,\n",
    "\t\t\ttransaction_type,\n",
    "\t\t\ttransaction_date,\n",
    "\t\t\tpayment_type\n",
    "\t\tFROM\n",
    "\t\t\tl2alpha.`transaction`\n",
    "\t\tWHERE\n",
    "\t\t\tstatus = 4\n",
    "\t\t\tAND transaction_type = 2\n",
    "\t\t\tAND payment_type <> '30_days'\n",
    "\t\t\tAND transaction_date >= '2019-12-01'\n",
    "\t\t\tAND transaction_date < '2020-02-01' ) AS tr\n",
    "\tJOIN (\n",
    "\t\tSELECT\n",
    "\t\t\tstart_date,\n",
    "\t\t\ttransaction_id\n",
    "\t\tFROM\n",
    "\t\t\tl2alpha.loan\n",
    "\t\tWHERE\n",
    "\t\t\tloan_amount > 0\n",
    "\t\t\tAND is_active = 1 ) AS lo ON\n",
    "\t\tlo.transaction_id = tr.id ) AS base\n",
    "LEFT JOIN (\n",
    "\tSELECT\n",
    "\t\tuser_id,\n",
    "\t\tsnapshot_date,\n",
    "\t\tcurrent_dpd\n",
    "\tFROM\n",
    "\t\tds.user_level_archive ) AS ula ON\n",
    "\tbase.user_id = ula.user_id\n",
    "WHERE\n",
    "\t(snapshot_date IS NULL)\n",
    "\tOR (snapshot_date > transaction_date\n",
    "\tAND datediff(ula.snapshot_date, base.transaction_date) <= 121)\n",
    "GROUP BY\n",
    "\tbase.user_id,\n",
    "\tbase.transaction_date,\n",
    "\tbase.trx_id\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "pl_trx_pass = pd.read_sql(pass_query, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(138501, 8)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pl_trx_pass.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "user_id counts: 132873\n",
      "trx_id counts: 138501\n",
      "earliest trx date: 2019-12-01 00:00:53\n",
      "latest trx date: 2020-01-31 23:59:01\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1     71084\n",
       "12    67417\n",
       "Name: transaction_date, dtype: int64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "print('user_id counts:', pl_trx_pass['user_id'].nunique())\n",
    "print('trx_id counts:', pl_trx_pass['trx_id'].nunique())\n",
    "print('earliest trx date:', pl_trx_pass['transaction_date'].min())\n",
    "print('latest trx date:', pl_trx_pass['transaction_date'].max())\n",
    "display(pl_trx_pass['transaction_date'].dt.month.value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>transaction_date</th>\n",
       "      <th>payment_type</th>\n",
       "      <th>trx_id</th>\n",
       "      <th>amount</th>\n",
       "      <th>dob</th>\n",
       "      <th>performance_window</th>\n",
       "      <th>max_dpd</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1511</td>\n",
       "      <td>2019-12-02 19:56:22</td>\n",
       "      <td>3_months</td>\n",
       "      <td>48077443</td>\n",
       "      <td>3500000.0</td>\n",
       "      <td>71</td>\n",
       "      <td>71</td>\n",
       "      <td>-5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1590</td>\n",
       "      <td>2020-01-01 14:30:32</td>\n",
       "      <td>3_months</td>\n",
       "      <td>51947725</td>\n",
       "      <td>10500000.0</td>\n",
       "      <td>41</td>\n",
       "      <td>41</td>\n",
       "      <td>-7.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1860</td>\n",
       "      <td>2020-01-27 15:19:15</td>\n",
       "      <td>6_months</td>\n",
       "      <td>55263919</td>\n",
       "      <td>15000000.0</td>\n",
       "      <td>15</td>\n",
       "      <td>15</td>\n",
       "      <td>-15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1988</td>\n",
       "      <td>2020-01-08 13:51:13</td>\n",
       "      <td>6_months</td>\n",
       "      <td>52887383</td>\n",
       "      <td>13500000.0</td>\n",
       "      <td>34</td>\n",
       "      <td>34</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2711</td>\n",
       "      <td>2020-01-16 12:46:24</td>\n",
       "      <td>6_months</td>\n",
       "      <td>53892762</td>\n",
       "      <td>4000000.0</td>\n",
       "      <td>26</td>\n",
       "      <td>26</td>\n",
       "      <td>-4.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id    transaction_date payment_type    trx_id      amount  dob  \\\n",
       "0     1511 2019-12-02 19:56:22     3_months  48077443   3500000.0   71   \n",
       "1     1590 2020-01-01 14:30:32     3_months  51947725  10500000.0   41   \n",
       "2     1860 2020-01-27 15:19:15     6_months  55263919  15000000.0   15   \n",
       "3     1988 2020-01-08 13:51:13     6_months  52887383  13500000.0   34   \n",
       "4     2711 2020-01-16 12:46:24     6_months  53892762   4000000.0   26   \n",
       "\n",
       "   performance_window  max_dpd  \n",
       "0                  71     -5.0  \n",
       "1                  41     -7.0  \n",
       "2                  15    -15.0  \n",
       "3                  34      4.0  \n",
       "4                  26     -4.0  "
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pl_trx_pass.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get Reject PL Trx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "reject_query = '''\n",
    "SELECT\n",
    "\tbase.*,\n",
    "\tCOUNT(*) AS performance_window,\n",
    "\tMAX(ula.current_dpd) AS max_dpd\n",
    "FROM\n",
    "\t(\n",
    "\tSELECT\n",
    "\t\ttar.*,\n",
    "\t\ttr.user_id,\n",
    "\t\ttr.transaction_date,\n",
    "\t\ttr.amount,\n",
    "        tr.payment_type,\n",
    "        datediff(CURDATE(), tr.transaction_date) AS dob\n",
    "\tFROM\n",
    "\t\t(\n",
    "\t\tSELECT\n",
    "\t\t\t*\n",
    "\t\tFROM\n",
    "\t\t\tl2alpha.transaction_approval_rules\n",
    "\t\tWHERE\n",
    "\t\t\trule_name = 'pl_score'\n",
    "\t\t\tAND `result` = 'REJECT'\n",
    "\t\t\tAND remarks LIKE '%Personal Loan Score%' ) AS tar\n",
    "\tLEFT JOIN l2alpha.`transaction` AS tr ON\n",
    "\t\ttar.transaction_id = tr.id\n",
    "\tWHERE\n",
    "\t\tpayment_type <> '30_days'\n",
    "\t\tAND transaction_date >= '2019-12-01'\n",
    "\t\tAND transaction_date < '2020-02-01'\n",
    "\t\t) AS base\n",
    "LEFT JOIN (\n",
    "\tSELECT\n",
    "\t\tuser_id,\n",
    "\t\tsnapshot_date,\n",
    "\t\tcurrent_dpd\n",
    "\tFROM\n",
    "\t\tds.user_level_archive ) AS ula ON\n",
    "\tbase.user_id = ula.user_id\n",
    "WHERE\n",
    "\t(snapshot_date IS NULL)\n",
    "\tOR (snapshot_date > transaction_date\n",
    "\tAND datediff(ula.snapshot_date,\n",
    "\tbase.transaction_date) <= 121)\n",
    "GROUP BY\n",
    "\tbase.user_id,\n",
    "\tbase.transaction_date,\n",
    "\tbase.transaction_id\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "pl_trx_reject = pd.read_sql(reject_query, connect_sql())\n",
    "pl_trx_reject = pl_trx_reject.rename(columns={'transaction_id':'trx_id'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(16254, 15)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pl_trx_reject.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "user_id counts: 15488\n",
      "trx_id counts: 16254\n",
      "earliest trx date: 2019-12-01 00:00:57\n",
      "latest trx date: 2020-01-31 23:59:10\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1     8769\n",
       "12    7485\n",
       "Name: transaction_date, dtype: int64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "print('user_id counts:', pl_trx_reject['user_id'].nunique())\n",
    "print('trx_id counts:', pl_trx_reject['trx_id'].nunique())\n",
    "print('earliest trx date:', pl_trx_reject['transaction_date'].min())\n",
    "print('latest trx date:', pl_trx_reject['transaction_date'].max())\n",
    "display(pl_trx_reject['transaction_date'].dt.month.value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "pl_trx_reject.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combine Pass and Reject Trx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [],
   "source": [
    "pl_trx = pd.concat([pl_trx_pass, pl_trx_reject[pl_trx_pass.columns]]).reset_index(drop=True)\n",
    "pl_trx['flag_rejected'] = pl_trx['trx_id'].isin(pl_trx_reject['trx_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "pl_trx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "user_id counts: 145943\n",
      "trx_id counts: 154755\n",
      "earliest trx date: 2019-12-01 00:00:53\n",
      "latest trx date: 2020-01-31 23:59:10\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1     79853\n",
       "12    74902\n",
       "Name: transaction_date, dtype: int64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Percent of rejected by PD users: 0.10503053213143355\n"
     ]
    }
   ],
   "source": [
    "print('user_id counts:', pl_trx['user_id'].nunique())\n",
    "print('trx_id counts:', pl_trx['trx_id'].nunique())\n",
    "print('earliest trx date:', pl_trx['transaction_date'].min())\n",
    "print('latest trx date:', pl_trx['transaction_date'].max())\n",
    "display(pl_trx['transaction_date'].dt.month.value_counts())\n",
    "print()\n",
    "print('Percent of rejected by PD users:', pl_trx['flag_rejected'].mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get PL V2 Score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [],
   "source": [
    "tids = tuple(pl_trx['trx_id'])\n",
    "plv2_query = f'''\n",
    "SELECT trx_id, user_id, score, feature, calculation_date\n",
    "FROM ds.b_score_pl_log\n",
    "WHERE trx_id IN {tids}\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [],
   "source": [
    "pl_v2 = pd.read_sql(plv2_query, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "OG dataset size: 154755\n",
      "PLV2 Score dataset size: 156609\n",
      "user_id counts: 145942\n",
      "trx_id counts: 154753\n"
     ]
    }
   ],
   "source": [
    "print('OG dataset size:', pl_trx.shape[0])\n",
    "print('PLV2 Score dataset size:', pl_v2.shape[0])\n",
    "print('user_id counts:', pl_v2['user_id'].nunique())\n",
    "print('trx_id counts:', pl_v2['trx_id'].nunique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop duplicates\n",
    "pl_v2 = pl_v2.sort_values(by=['trx_id', 'user_id', 'calculation_date'])\\\n",
    "             .drop_duplicates(subset=['trx_id'], keep='last')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(154753, 5)"
      ]
     },
     "execution_count": 159,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pl_v2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Trx where score == -1: 618\n"
     ]
    }
   ],
   "source": [
    "print('Trx where score == -1:',pl_v2[pl_v2['score'] < 0].shape[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Update Dataset with Recalculated PL Scores\n",
    "There were random cases where users PL score were not calculated (caused by a bug). These users were granted a PL regardless they passed the PL score or not. The PLV2 score for these users must be recalculated to get a full picture. These users are denoted by \"score = -1\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [],
   "source": [
    "recalc = pl_v2[pl_v2['score'] == -1][['trx_id', 'user_id', 'score', 'feature']].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [],
   "source": [
    "nan = np.nan\n",
    "recalc['feature'] = recalc.apply(lambda x: eval(x['feature']), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [],
   "source": [
    "recalc_feats = pd.DataFrame(recalc['feature'].tolist(), index=recalc['trx_id']).reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [],
   "source": [
    "feats_used = ['ap_co_plo',\n",
    "                'calibrated_final',\n",
    "                'current_dpd',\n",
    "                'delin_max_dpd_3mo',\n",
    "                'delin_max_dpd_amt_3mo',\n",
    "                'flag_bad_pefindo',\n",
    "                'flag_good_pefindo',\n",
    "                'os_amount',\n",
    "                'oth_first_trx_amount',\n",
    "                'oth_last_rep_channel',\n",
    "                'oth_last_rep_days',\n",
    "                'oth_last_rep_dpd',\n",
    "                'oth_last_trx_amount',\n",
    "                'pf_con_open_count_12mo',\n",
    "                'pf_delin_dist_contractcode_30_dpd_3mo',\n",
    "                'pf_delin_max_dpd_12mo',\n",
    "                'pf_util_creditcard_avg_12mo',\n",
    "                'time_approve_to_pl_hour',\n",
    "                'util_non_pl',\n",
    "                'util_pl']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [],
   "source": [
    "mod_xgb = pickle.load(open('xgb_pl_model_v2.p', 'rb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [],
   "source": [
    "recalc_feats['recalc_score'] = mod_xgb.predict_proba(recalc_feats[feats_used],\n",
    "                                                     ntree_limit=mod_xgb.best_iteration)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(154753, 5)"
      ]
     },
     "execution_count": 167,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "comb_temp = pl_v2.set_index('trx_id')\n",
    "comb_temp.loc[comb_temp['score'] == -1, 'score'] = recalc_feats.set_index('trx_id')['recalc_score']\n",
    "comb_temp = comb_temp.reset_index()\n",
    "comb_temp.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(154753, 5)"
      ]
     },
     "execution_count": 168,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pl_v2 = comb_temp.copy()\n",
    "pl_v2.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Merge trx and plv2 score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(154755, 9)"
      ]
     },
     "execution_count": 169,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pl_trx.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(154753, 5)"
      ]
     },
     "execution_count": 170,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pl_v2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "metadata": {},
   "outputs": [],
   "source": [
    "comb_df = pl_trx.merge(pl_v2[['trx_id', 'user_id', 'score']], how='left', on=['user_id', 'trx_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 173,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>transaction_date</th>\n",
       "      <th>payment_type</th>\n",
       "      <th>trx_id</th>\n",
       "      <th>amount</th>\n",
       "      <th>dob</th>\n",
       "      <th>performance_window</th>\n",
       "      <th>max_dpd</th>\n",
       "      <th>flag_rejected</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1511</td>\n",
       "      <td>2019-12-02 19:56:22</td>\n",
       "      <td>3_months</td>\n",
       "      <td>48077443</td>\n",
       "      <td>3500000.0</td>\n",
       "      <td>71</td>\n",
       "      <td>71</td>\n",
       "      <td>-5.0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.009816</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1590</td>\n",
       "      <td>2020-01-01 14:30:32</td>\n",
       "      <td>3_months</td>\n",
       "      <td>51947725</td>\n",
       "      <td>10500000.0</td>\n",
       "      <td>41</td>\n",
       "      <td>41</td>\n",
       "      <td>-7.0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.132705</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1860</td>\n",
       "      <td>2020-01-27 15:19:15</td>\n",
       "      <td>6_months</td>\n",
       "      <td>55263919</td>\n",
       "      <td>15000000.0</td>\n",
       "      <td>15</td>\n",
       "      <td>15</td>\n",
       "      <td>-15.0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.015513</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1988</td>\n",
       "      <td>2020-01-08 13:51:13</td>\n",
       "      <td>6_months</td>\n",
       "      <td>52887383</td>\n",
       "      <td>13500000.0</td>\n",
       "      <td>34</td>\n",
       "      <td>34</td>\n",
       "      <td>4.0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.104434</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2711</td>\n",
       "      <td>2020-01-16 12:46:24</td>\n",
       "      <td>6_months</td>\n",
       "      <td>53892762</td>\n",
       "      <td>4000000.0</td>\n",
       "      <td>26</td>\n",
       "      <td>26</td>\n",
       "      <td>-4.0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.003002</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id    transaction_date payment_type    trx_id      amount  dob  \\\n",
       "0     1511 2019-12-02 19:56:22     3_months  48077443   3500000.0   71   \n",
       "1     1590 2020-01-01 14:30:32     3_months  51947725  10500000.0   41   \n",
       "2     1860 2020-01-27 15:19:15     6_months  55263919  15000000.0   15   \n",
       "3     1988 2020-01-08 13:51:13     6_months  52887383  13500000.0   34   \n",
       "4     2711 2020-01-16 12:46:24     6_months  53892762   4000000.0   26   \n",
       "\n",
       "   performance_window  max_dpd  flag_rejected     score  \n",
       "0                  71     -5.0          False  0.009816  \n",
       "1                  41     -7.0          False  0.132705  \n",
       "2                  15    -15.0          False  0.015513  \n",
       "3                  34      4.0          False  0.104434  \n",
       "4                  26     -4.0          False  0.003002  "
      ]
     },
     "execution_count": 173,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "comb_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11"
      ]
     },
     "execution_count": 174,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "comb_df['dob'].min()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get January Data Only"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "metadata": {},
   "outputs": [],
   "source": [
    "jan_df = comb_df[comb_df['transaction_date'].dt.month == 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 178,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "user_id counts: 78465\n",
      "trx_id counts: 79853\n",
      "earliest trx date: 2020-01-01 00:04:40\n",
      "latest trx date: 2020-01-31 23:59:10\n",
      "Percent of rejected users: 0.10981428374638398\n"
     ]
    }
   ],
   "source": [
    "print('user_id counts:', jan_df['user_id'].nunique())\n",
    "print('trx_id counts:', jan_df['trx_id'].nunique())\n",
    "print('earliest trx date:', jan_df['transaction_date'].min())\n",
    "print('latest trx date:', jan_df['transaction_date'].max())\n",
    "print('Percent of rejected users:', jan_df['flag_rejected'].mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 180,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(79853, 10)"
      ]
     },
     "execution_count": 180,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "jan_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 181,
   "metadata": {},
   "outputs": [],
   "source": [
    "jan_df.to_parquet('data/pl_threshold_jan_11022020.parquet', compression='gzip')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}