past-data-projects / personal_loan_credit_risk / 500-Calibration / 500 - Rejected Users for Calibration.ipynb
500 - Rejected Users for Calibration.ipynb
Raw
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pymysql"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import configparser\n",
    "config = configparser.ConfigParser()\n",
    "config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
    "\n",
    "host = config['MYSQL-ROOT']['HOST']\n",
    "user = config['MYSQL-ROOT']['USER']\n",
    "password = config['MYSQL-ROOT']['PASSWORD']\n",
    "\n",
    "def connect_sql():\n",
    "    cnx = pymysql.connect(host=host,\n",
    "                          user=user,\n",
    "                          password=password,\n",
    "                          cursorclass=pymysql.cursors.DictCursor)\n",
    "    return cnx"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get Rejected users"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = '''\n",
    "SELECT\n",
    "\tbase.*,\n",
    "\tCOUNT(*) AS performance_window,\n",
    "\tMAX(ula.current_dpd) AS max_dpd\n",
    "FROM\n",
    "\t(\n",
    "\tSELECT\n",
    "\t\ttar.*,\n",
    "\t\ttr.user_id,\n",
    "\t\ttr.transaction_date,\n",
    "\t\ttr.amount,\n",
    "        tr.payment_type\n",
    "\tFROM\n",
    "\t\t(\n",
    "\t\tSELECT\n",
    "\t\t\t*\n",
    "\t\tFROM\n",
    "\t\t\tl2alpha.transaction_approval_rules\n",
    "\t\tWHERE\n",
    "\t\t\trule_name = 'pl_score'\n",
    "\t\t\tAND `result` = 'REJECT'\n",
    "\t\t\tAND remarks LIKE '%Personal Loan Score%' ) AS tar\n",
    "\tLEFT JOIN l2alpha.`transaction` AS tr ON\n",
    "\t\ttar.transaction_id = tr.id\n",
    "\tWHERE\n",
    "\t\tpayment_type <> '30_days'\n",
    "\t\tAND transaction_date >= '2019-06-01'\n",
    "\t\tAND transaction_date < '2019-10-01'\n",
    "\t\tAND datediff(CURDATE(),\n",
    "\t\ttransaction_date) >= 121 ) AS base\n",
    "LEFT JOIN (\n",
    "\tSELECT\n",
    "\t\tuser_id,\n",
    "\t\tsnapshot_date,\n",
    "\t\tcurrent_dpd\n",
    "\tFROM\n",
    "\t\tds.user_level_archive ) AS ula ON\n",
    "\tbase.user_id = ula.user_id\n",
    "WHERE\n",
    "\t(snapshot_date IS NULL)\n",
    "\tOR (snapshot_date > transaction_date\n",
    "\tAND datediff(ula.snapshot_date,\n",
    "\tbase.transaction_date) <= 121)\n",
    "GROUP BY\n",
    "\tbase.user_id,\n",
    "\tbase.transaction_date,\n",
    "\tbase.transaction_id\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_sql(query, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(16190, 14)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sanity check data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9218703     4\n",
       "4964211     3\n",
       "1608647     3\n",
       "5677366     3\n",
       "4769698     3\n",
       "           ..\n",
       "7273493     1\n",
       "8045586     1\n",
       "10003470    1\n",
       "9778188     1\n",
       "9080834     1\n",
       "Name: user_id, Length: 15307, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['user_id'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "37849087    1\n",
       "31492900    1\n",
       "36354882    1\n",
       "33608511    1\n",
       "31120190    1\n",
       "           ..\n",
       "37131688    1\n",
       "31681540    1\n",
       "36556195    1\n",
       "32169188    1\n",
       "33851393    1\n",
       "Name: transaction_id, Length: 16190, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['transaction_id'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "677"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['max_dpd'].isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True    16190\n",
       "Name: remarks, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['remarks'].str.contains('Personal Loan Score').value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "REJECT    16190\n",
       "Name: result, dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['result'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pl_score    16190\n",
       "Name: rule_name, dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['rule_name'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['id', 'rule_name', 'result', 'remarks', 'version', 'creation_time',\n",
       "       'modification_time', 'transaction_id', 'user_id', 'transaction_date',\n",
       "       'amount', 'payment_type', 'performance_window', 'max_dpd'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_df = df[['transaction_id', 'user_id', 'rule_name', 'result', 'remarks', \n",
    "               'transaction_date', 'amount', 'performance_window', 'max_dpd']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timestamp('2019-09-30 23:56:42')"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_df['transaction_date'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timestamp('2019-06-01 00:44:53')"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_df['transaction_date'].min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_df.to_parquet('data/pl_rejected_users_junjulaugsep_04022020.parquet', compression='gzip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}