past-data-projects / personal_loan_credit_risk / 000-Model_Definitions / 000 - Determine flag_bad Definition.ipynb
000 - Determine flag_bad Definition.ipynb
Raw
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load Libraries and Query Utility Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/psycopg2/__init__.py:144: UserWarning: The psycopg2 wheel package will be renamed from release 2.8; in order to keep installing from binary please use \"pip install psycopg2-binary\" instead. For details see: <http://initd.org/psycopg/docs/install.html#binary-install-from-pypi>.\n",
      "  \"\"\")\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pymysql\n",
    "import psycopg2\n",
    "\n",
    "from datetime import datetime, timedelta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import configparser\n",
    "config = configparser.ConfigParser()\n",
    "config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
    "\n",
    "host = config['MYSQL']['HOST']\n",
    "user = config['MYSQL']['USER']\n",
    "password = config['MYSQL']['PASSWORD']\n",
    "\n",
    "def connect_sql():\n",
    "    cnx = pymysql.connect(host=host,\n",
    "                          user=user,\n",
    "                          password=password,\n",
    "                          cursorclass=pymysql.cursors.DictCursor)\n",
    "    return cnx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "host = config['PERISCOPE']['HOST']\n",
    "user = config['PERISCOPE']['USER']\n",
    "password = config['PERISCOPE']['PASSWORD']\n",
    "port = config['PERISCOPE']['PORT']\n",
    "dbname = config['PERISCOPE']['DB']\n",
    "\n",
    "def connect_periscope():\n",
    "    cnx = psycopg2.connect(dbname=dbname, \n",
    "                            host=host, \n",
    "                            port=port,\n",
    "                            user=user,\n",
    "                            password=password)\n",
    "    return cnx"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get Personal Loan transactions\n",
    "from 2019-06 to 2019-09"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = '''\n",
    "SELECT main.*\n",
    "FROM(\n",
    "    SELECT \n",
    "        user_id,\n",
    "        transaction_date, \n",
    "        payment_type,\n",
    "        id AS trx_id,\n",
    "        loan_amount\n",
    "    FROM \n",
    "            (\n",
    "            SELECT id, user_id, status, transaction_type, transaction_date, payment_type\n",
    "            FROM l2alpha.`transaction`\n",
    "            WHERE status = 4 AND transaction_type = 2 AND payment_type <> '30_days'\n",
    "            AND transaction_date >= '2019-06-01' AND transaction_date < '2019-10-01'\n",
    "            ) AS tr\n",
    "         JOIN\n",
    "            (\n",
    "            SELECT start_date, transaction_id, loan_amount\n",
    "            FROM l2alpha.loan\n",
    "            WHERE loan_amount > 0 AND is_active = 1\n",
    "            ) AS lo ON lo.transaction_id = tr.id\n",
    "    ) AS main\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_df = pd.read_sql(query,connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(154256, 5)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timestamp('2019-09-30 23:58:30')"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_df['transaction_date'].max()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get Flag Bad User\n",
    "from 2019-06 to 2019-09"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "get_performance = '''\n",
    "SELECT user_base_w_first_61.*, dlv.*\n",
    "FROM\n",
    "  (SELECT user_base.*, MIN(snapshot_date) AS first_61_after_trx\n",
    "  FROM\n",
    "    (SELECT \n",
    "      base.*, COUNT(*) as performance_window, MAX(ula.current_dpd) as max_dpd\n",
    "    FROM\n",
    "      (SELECT \n",
    "        user_id, transaction_date, payment_type, id AS trx_id, loan_amount\n",
    "      FROM \n",
    "        (SELECT \n",
    "          id, user_id, status, transaction_type, transaction_date, payment_type\n",
    "        FROM\n",
    "          db_11466_public.transaction\n",
    "        WHERE \n",
    "          status = 4 AND transaction_type = 2 AND transaction_date >= '2019-06-01' AND transaction_date < '2019-10-01' AND payment_type != '30_days'\n",
    "        ) AS tr\n",
    "        JOIN\n",
    "        (SELECT \n",
    "          start_date, transaction_id, loan_amount\n",
    "        FROM \n",
    "          db_11466_public.loan\n",
    "        WHERE \n",
    "          loan_amount > 0 AND is_active = 1\n",
    "        ) AS lo ON lo.transaction_id = tr.id\n",
    "      ) AS base\n",
    "      LEFT JOIN\n",
    "      (SELECT\n",
    "        user_id, snapshot_date, current_dpd\n",
    "       FROM \n",
    "        db_13080_public.user_level_archive\n",
    "      ) AS ula ON base.user_id = ula.user_id\n",
    "      WHERE \n",
    "        (snapshot_date is NULL) OR \n",
    "        (snapshot_date > transaction_date AND datediff('day', base.transaction_date, ula.snapshot_date) <= 121)\n",
    "      GROUP BY \n",
    "        base.user_id, base.transaction_date, base.payment_type, base.trx_id, base.loan_amount\n",
    "    ) AS user_base\n",
    "    LEFT JOIN\n",
    "    (SELECT \n",
    "      user_id, snapshot_date, current_dpd\n",
    "     FROM \n",
    "      db_13080_public.user_level_archive\n",
    "     WHERE \n",
    "      current_dpd >= 61\n",
    "    ) AS ula_2 ON user_base.user_id = ula_2.user_id\n",
    "    WHERE \n",
    "      snapshot_date is null or snapshot_date > transaction_date\n",
    "    GROUP BY \n",
    "      user_base.user_id, user_base.transaction_date, user_base.payment_type, user_base.trx_id, user_base.loan_amount, user_base.performance_window, user_base.max_dpd\n",
    "  ) AS user_base_w_first_61\n",
    "  LEFT JOIN\n",
    "  (SELECT \n",
    "    transaction_id, user_id AS user_id_1, start_date, loan_disbursement, first_dpd_1, first_dpd_7, first_dpd_15, first_dpd_30, first_dpd_60, first_dpd_90 \n",
    "  FROM \n",
    "    dim_loan_vintage\n",
    "  ) AS dlv ON user_base_w_first_61.trx_id = dlv.transaction_id\n",
    "WHERE datediff('day', transaction_date, CURRENT_DATE) >= 121\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "perf = pd.read_sql(get_performance, connect_periscope())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 18)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "perf.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['user_id', 'transaction_date', 'payment_type', 'trx_id', 'loan_amount',\n",
       "       'performance_window', 'max_dpd', 'first_61_after_trx', 'transaction_id',\n",
       "       'user_id_1', 'start_date', 'loan_disbursement', 'first_dpd_1',\n",
       "       'first_dpd_7', 'first_dpd_15', 'first_dpd_30', 'first_dpd_60',\n",
       "       'first_dpd_90'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "perf.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Double check that data fulfills expectations\n",
    "- has been 121 days since the transaction was made\n",
    "- transaction_date and start_date should be the same or close together"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True    150672\n",
       "Name: transaction_date, dtype: int64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "((datetime.now().date() - perf[\"transaction_date\"].dt.normalize().dt.date).dt.days >= 121).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True    150672\n",
       "dtype: int64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(perf[\"loan_amount\"] == perf[\"loan_disbursement\"]).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True    150672\n",
       "dtype: int64"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(perf[\"user_id\"] == perf[\"user_id_1\"]).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True     150644\n",
       "False        28\n",
       "dtype: int64"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(perf[\"transaction_date\"].dt.date == perf[\"start_date\"].dt.date).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "perf[perf[\"transaction_date\"].dt.date != perf[\"start_date\"].dt.date]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create flag bad user and flag bad trx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "perf[\"flag_bad_usr\"] = perf[\"max_dpd\"] > 60"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "perf[\"first_dpd_60\"] = pd.to_datetime(perf[\"first_dpd_60\"])\n",
    "\n",
    "perf[\"duration_to_default\"] = perf[\"first_dpd_60\"] - perf[\"start_date\"].dt.normalize()\n",
    "perf[\"flag_bad_trx\"] = perf[\"duration_to_default\"].dt.days <= 121"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False    144230\n",
       "True       6442\n",
       "Name: flag_bad_usr, dtype: int64"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "perf['flag_bad_usr'].value_counts(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.04275512371243496"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "perf['flag_bad_usr'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False    144602\n",
       "True       6070\n",
       "Name: flag_bad_trx, dtype: int64"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "perf[\"flag_bad_trx\"].value_counts(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.04028618455983859"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "perf[\"flag_bad_trx\"].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "perf.columns.tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combine Datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "perf_columns = ['user_id',\n",
    "                'transaction_date',\n",
    "                'payment_type',\n",
    "                'trx_id',\n",
    "                'loan_amount',\n",
    "                'performance_window',\n",
    "                'max_dpd',\n",
    "                'first_61_after_trx',\n",
    "                'transaction_id',\n",
    "#                 'user_id_1',\n",
    "                'start_date',\n",
    "#                 'loan_disbursement',\n",
    "                'first_dpd_1',\n",
    "                'first_dpd_7',\n",
    "                'first_dpd_15',\n",
    "                'first_dpd_30',\n",
    "                'first_dpd_60',\n",
    "                'first_dpd_90',\n",
    "                'flag_bad_usr',\n",
    "#                 'duration_to_default',\n",
    "                'flag_bad_trx'\n",
    "               ]\n",
    "\n",
    "from_perf = perf[perf_columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "merge_df = from_perf.merge(raw_df, how='left', on=['user_id', 'trx_id', 'payment_type', \n",
    "                                                   'transaction_date', 'loan_amount'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['user_id', 'transaction_date', 'payment_type', 'trx_id', 'loan_amount',\n",
       "       'performance_window', 'max_dpd', 'first_61_after_trx', 'transaction_id',\n",
       "       'start_date', 'first_dpd_1', 'first_dpd_7', 'first_dpd_15',\n",
       "       'first_dpd_30', 'first_dpd_60', 'first_dpd_90', 'flag_bad_usr',\n",
       "       'flag_bad_trx'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merge_df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 18)"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merge_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "user_id               0.000000\n",
       "transaction_date      0.000000\n",
       "payment_type          0.000000\n",
       "trx_id                0.000000\n",
       "loan_amount           0.000000\n",
       "performance_window    0.000000\n",
       "max_dpd               0.000093\n",
       "first_61_after_trx    0.923324\n",
       "transaction_id        0.000000\n",
       "start_date            0.000000\n",
       "first_dpd_1           0.616936\n",
       "first_dpd_7           0.825024\n",
       "first_dpd_15          0.870354\n",
       "first_dpd_30          0.904116\n",
       "first_dpd_60          0.938177\n",
       "first_dpd_90          0.960285\n",
       "flag_bad_usr          0.000000\n",
       "flag_bad_trx          0.000000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merge_df.isna().mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timestamp('2019-09-28 23:49:58')"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merge_df['transaction_date'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyarrow/pandas_compat.py:114: FutureWarning: A future version of pandas will default to `skipna=True`. To silence this warning, pass `skipna=True|False` explicitly.\n",
      "  result = infer_dtype(pandas_collection)\n"
     ]
    }
   ],
   "source": [
    "merge_df.to_parquet('data/interim/pl_jumbo_mini_junjulaugsep_27012020.parquet', compression='gzip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}