past-data-projects / personal_loan_credit_risk / 000-Model_Definitions / 001 - Generate PL Features.ipynb
001 - Generate PL Features.ipynb
Raw
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:36.988168Z",
     "start_time": "2019-11-07T03:55:36.706164Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pymysql"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:36.994804Z",
     "start_time": "2019-11-07T03:55:36.989531Z"
    }
   },
   "outputs": [],
   "source": [
    "import configparser\n",
    "config = configparser.ConfigParser()\n",
    "config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
    "\n",
    "host = config['MYSQL-ROOT']['HOST']\n",
    "user = config['MYSQL-ROOT']['USER']\n",
    "password = config['MYSQL-ROOT']['PASSWORD']\n",
    "\n",
    "def connect_sql():\n",
    "    cnx = pymysql.connect(host=host,\n",
    "                          user=user,\n",
    "                          password=password,\n",
    "                          cursorclass=pymysql.cursors.DictCursor)\n",
    "    return cnx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:37.026188Z",
     "start_time": "2019-11-07T03:55:36.997299Z"
    }
   },
   "outputs": [],
   "source": [
    "base_df = pd.read_parquet('data/interim/pl_jumbo_mini_junjulaugsep_27012020.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:37.034477Z",
     "start_time": "2019-11-07T03:55:37.027603Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 18)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:37.043587Z",
     "start_time": "2019-11-07T03:55:37.036542Z"
    }
   },
   "outputs": [],
   "source": [
    "uids = tuple(base_df['user_id'].unique().tolist())\n",
    "tids = tuple(base_df['trx_id'].unique().tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "135508"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(uids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "150672"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(tids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timestamp('2019-09-28 23:49:58')"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_df['transaction_date'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "user_id               0.000000\n",
       "transaction_date      0.000000\n",
       "payment_type          0.000000\n",
       "trx_id                0.000000\n",
       "loan_amount           0.000000\n",
       "performance_window    0.000000\n",
       "max_dpd               0.000093\n",
       "first_61_after_trx    0.923324\n",
       "transaction_id        0.000000\n",
       "start_date            0.000000\n",
       "first_dpd_1           0.616936\n",
       "first_dpd_7           0.825024\n",
       "first_dpd_15          0.870354\n",
       "first_dpd_30          0.904116\n",
       "first_dpd_60          0.938177\n",
       "first_dpd_90          0.960285\n",
       "flag_bad_usr          0.000000\n",
       "flag_bad_trx          0.000000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_df.isna().mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Snapshot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:37.078945Z",
     "start_time": "2019-11-07T03:55:37.074323Z"
    }
   },
   "outputs": [],
   "source": [
    "snapshot_q = f\"\"\"\n",
    "SELECT \n",
    "    base.user_id, \n",
    "    base.id AS trx_id,\n",
    "    os_amount,\n",
    "    current_dpd,\n",
    "    COALESCE(os_amount,0) + monthly_installment AS os_amount_inc\n",
    "FROM \n",
    "\t\t(SELECT user_id,id,transaction_date \n",
    "\t\tFROM l2alpha.`transaction` WHERE id IN {tids}) AS base\n",
    "\tLEFT JOIN \n",
    "\t\tds.user_level_archive AS ula \n",
    "\tON base.user_id = ula.user_id AND DATE(transaction_date) = snapshot_date \n",
    "\tLEFT JOIN\n",
    "\t\t(SELECT user_id,monthly_installment, transaction_id\n",
    "\t\tFROM l2alpha.loan \n",
    "\t\tWHERE transaction_id IN {tids}) AS lo \n",
    "\tON lo.transaction_id = base.id\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:08.943657Z",
     "start_time": "2019-11-07T03:55:37.081391Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ec2-user/anaconda3/envs/env_personalloan/lib/python3.7/site-packages/pymysql/cursors.py:329: Warning: (3170, \"Memory capacity of 8388608 bytes for 'range_optimizer_max_mem_size' exceeded. Range optimization was not done for this query.\")\n",
      "  self._do_get_result()\n"
     ]
    }
   ],
   "source": [
    "snapshot = pd.read_sql(snapshot_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:08.949079Z",
     "start_time": "2019-11-07T03:58:08.945547Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 5)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "snapshot.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:08.959459Z",
     "start_time": "2019-11-07T03:58:08.950846Z"
    }
   },
   "outputs": [],
   "source": [
    "snapshot.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Delinquency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:08.985180Z",
     "start_time": "2019-11-07T03:58:08.960895Z"
    }
   },
   "outputs": [],
   "source": [
    "delin_q = f'''\n",
    "SELECT \n",
    "    base.user_id,\n",
    "    base.id AS trx_id,\n",
    "    MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 7 AND\n",
    "                  datediff(DATE(transaction_date),snapshot_date) >=0   THEN current_dpd ELSE NULL END) AS delin_max_dpd_7d,\n",
    "    MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 15 AND\n",
    "                  datediff(DATE(transaction_date),snapshot_date) >=0  THEN current_dpd ELSE NULL END) AS delin_max_dpd_15d,\n",
    "    MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 30 AND\n",
    "                  datediff(DATE(transaction_date),snapshot_date) >=0  THEN current_dpd ELSE NULL END) AS delin_max_dpd_1mo,\n",
    "    MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 60 AND\n",
    "                  datediff(DATE(transaction_date),snapshot_date) >=0  THEN current_dpd ELSE NULL END) AS delin_max_dpd_2mo,\n",
    "    MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 90 AND\n",
    "                  datediff(DATE(transaction_date),snapshot_date) >=0  THEN current_dpd ELSE NULL END) AS delin_max_dpd_3mo,\n",
    "    \n",
    "    MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 7 AND\n",
    "                  datediff(DATE(transaction_date),snapshot_date) >=0   THEN dpd_amount ELSE NULL END) AS delin_max_dpd_amt_7d,\n",
    "    MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 15 AND\n",
    "                  datediff(DATE(transaction_date),snapshot_date) >=0  THEN dpd_amount ELSE NULL END) AS delin_max_dpd_amt_15d,\n",
    "    MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 30 AND\n",
    "                  datediff(DATE(transaction_date),snapshot_date) >=0  THEN dpd_amount ELSE NULL END) AS delin_max_dpd_amt_1mo,\n",
    "    MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 60 AND\n",
    "                  datediff(DATE(transaction_date),snapshot_date) >=0  THEN dpd_amount ELSE NULL END) AS delin_max_dpd_amt_2mo,\n",
    "    MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 90 AND\n",
    "                  datediff(DATE(transaction_date),snapshot_date) >=0  THEN dpd_amount ELSE NULL END) AS delin_max_dpd_amt_3mo\n",
    "FROM\n",
    "\t\t(SELECT user_id,id,transaction_date \n",
    "\t\tFROM l2alpha.`transaction` \n",
    "\t\tWHERE id IN {tids}) AS base\n",
    "\tLEFT JOIN \n",
    "\t    (SELECT user_id,\n",
    "\t        \tos_amount,\n",
    "\t        \tcurrent_dpd,\n",
    "\t        \tdpd_amount,\n",
    "\t        \tsnapshot_date\n",
    "\t    FROM ds.user_level_archive\n",
    "\t    ) AS delin ON base.user_id = delin.user_id AND snapshot_date <= DATE(transaction_date)\n",
    "GROUP BY base.id\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:37.439306Z",
     "start_time": "2019-11-07T03:58:08.992173Z"
    }
   },
   "outputs": [],
   "source": [
    "delin = pd.read_sql(delin_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:37.443941Z",
     "start_time": "2019-11-07T03:58:37.440532Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 12)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "delin.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:37.458294Z",
     "start_time": "2019-11-07T03:58:37.445289Z"
    }
   },
   "outputs": [],
   "source": [
    "delin.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. Other"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## time_approve_to_pl_hour"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:37.472597Z",
     "start_time": "2019-11-07T03:58:37.459708Z"
    }
   },
   "outputs": [],
   "source": [
    "time_pl_q = f'''\n",
    "SELECT \n",
    "    base.user_id,\n",
    "    base.id AS trx_id,\n",
    "    TIMESTAMPDIFF(HOUR, installment_ts, transaction_date) AS time_approve_to_pl_hour\n",
    "FROM\n",
    "\t\t(SELECT user_id,id,transaction_date \n",
    "\t\tFROM l2alpha.`transaction` WHERE id IN {tids}) AS base\n",
    "\tLEFT JOIN \n",
    "\t    (SELECT user_id, MAX(is_approved_timestamp) AS installment_ts \n",
    "\t    FROM l2alpha.web_userinstallmentapplicationdetails \n",
    "\t    WHERE user_id IN {uids}\n",
    "\t    GROUP BY user_id\n",
    "\t    ) AS wuiad \n",
    "\tON base.user_id = wuiad.user_id\n",
    "GROUP BY base.id\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:40.087947Z",
     "start_time": "2019-11-07T03:58:37.473685Z"
    }
   },
   "outputs": [],
   "source": [
    "time_pl = pd.read_sql(time_pl_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:40.092390Z",
     "start_time": "2019-11-07T03:58:40.089336Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 3)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "time_pl.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:40.104548Z",
     "start_time": "2019-11-07T03:58:40.093767Z"
    }
   },
   "outputs": [],
   "source": [
    "time_pl.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## oth_last_trx_amount"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:40.123372Z",
     "start_time": "2019-11-07T03:58:40.116350Z"
    }
   },
   "outputs": [],
   "source": [
    "oth_last_trx_q = f'''\n",
    "SELECT main.user_id,\n",
    "\t   main.id AS trx_id,\n",
    "       amount AS oth_last_trx_amount\n",
    "FROM\n",
    "\t(SELECT \n",
    "\t    base.user_id, \n",
    "\t    base.id,\n",
    "\t    MAX(trx.id) AS last_id\n",
    "\tFROM\n",
    "\t\t(SELECT user_id, \n",
    "\t\t\t\tid, \n",
    "\t\t\t\ttransaction_date \n",
    "\t\tFROM l2alpha.`transaction`\n",
    "\t\tWHERE id IN {tids}) AS base\n",
    "\tLEFT JOIN \n",
    "\t    (SELECT user_id,\n",
    "\t        \tid,\n",
    "\t        \ttransaction_date AS td,\n",
    "\t        \tamount,\n",
    "\t        \tstatus\n",
    "\t    FROM l2alpha.`transaction`\n",
    "\t    WHERE status = 4\n",
    "\t    ) AS trx \n",
    "\tON base.user_id = trx.user_id AND td < transaction_date\n",
    "\tGROUP BY base.id\n",
    "\t) AS main\n",
    "LEFT JOIN \n",
    "\t(SELECT id, amount\n",
    "\tFROM l2alpha.`transaction`\n",
    "\t) AS tr \n",
    "ON tr.id = main.last_id\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:08.704375Z",
     "start_time": "2019-11-07T03:58:40.124523Z"
    }
   },
   "outputs": [],
   "source": [
    "oth_last_trx = pd.read_sql(oth_last_trx_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:08.708913Z",
     "start_time": "2019-11-07T04:00:08.705517Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 3)"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "oth_last_trx.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:08.744539Z",
     "start_time": "2019-11-07T04:00:08.713156Z"
    }
   },
   "outputs": [],
   "source": [
    "oth_last_trx.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## oth_first_trx_amount"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:08.753397Z",
     "start_time": "2019-11-07T04:00:08.748442Z"
    }
   },
   "outputs": [],
   "source": [
    "oth_first_trx_q = f'''\n",
    "SELECT main.user_id,\n",
    "\t   main.id AS trx_id,\n",
    "       amount AS oth_first_trx_amount\n",
    "FROM\n",
    "\t(SELECT \n",
    "\t    base.user_id, \n",
    "\t    base.id,\n",
    "\t    MIN(trx.id) AS first_id\n",
    "\tFROM\n",
    "\t\t(SELECT user_id,\n",
    "\t\t\t\tid,\n",
    "\t\t\t\ttransaction_date \n",
    "\t\tFROM l2alpha.`transaction` \n",
    "\t\tWHERE id IN {tids}) AS base\n",
    "\tLEFT JOIN \n",
    "\t    (SELECT user_id,\n",
    "\t        \tid,\n",
    "\t        \ttransaction_date AS td,\n",
    "\t        \tamount,\n",
    "\t        \tstatus\n",
    "\t    FROM l2alpha.`transaction` \n",
    "\t    WHERE status = 4\n",
    "\t    ) AS trx \n",
    "\tON base.user_id = trx.user_id AND td < transaction_date\n",
    "\tGROUP BY base.id\n",
    "\t) AS main\n",
    "LEFT JOIN \n",
    "\t(SELECT id, amount\n",
    "\tFROM l2alpha.`transaction`\n",
    "\t) AS tr \n",
    "ON tr.id = main.first_id\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:11.643352Z",
     "start_time": "2019-11-07T04:00:08.756040Z"
    }
   },
   "outputs": [],
   "source": [
    "oth_first_trx = pd.read_sql(oth_first_trx_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:11.648365Z",
     "start_time": "2019-11-07T04:00:11.644573Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 3)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "oth_first_trx.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:11.658313Z",
     "start_time": "2019-11-07T04:00:11.650662Z"
    }
   },
   "outputs": [],
   "source": [
    "oth_first_trx.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## oth_last_rep_xxx:\n",
    "- dpd\n",
    "- days\n",
    "- channel\n",
    "- amount"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:11.673428Z",
     "start_time": "2019-11-07T04:00:11.659983Z"
    }
   },
   "outputs": [],
   "source": [
    "oth_last_rep_q = f'''\n",
    "SELECT main.user_id,\n",
    "\t   main.id AS trx_id,\n",
    "       MAX(dpd) AS oth_last_rep_dpd,\n",
    "       datediff(main.transaction_date,main.last_pd) AS oth_last_rep_days,\n",
    "       payment_channel AS oth_last_rep_channel,\n",
    "       SUM(amount) AS oth_last_rep_amount\n",
    "FROM\n",
    "\t(SELECT base.user_id,\n",
    "\t\t\tbase.id,\n",
    "\t    \tbase.transaction_date,\n",
    "\t    \tMAX(up.pd) AS last_pd\n",
    "\tFROM\n",
    "\t\t(SELECT user_id,\n",
    "                id,\n",
    "                transaction_date,\n",
    "                status\n",
    "\t\tFROM l2alpha.`transaction`\n",
    "\t\tWHERE id IN {tids}\n",
    "        ) AS base\n",
    "\tLEFT JOIN \n",
    "\t    (SELECT user_id,\n",
    "\t        \tid,\n",
    "\t        \tpayment_type,\n",
    "\t        \tpayment_channel,\n",
    "\t        \tpayment_date AS pd,\n",
    "\t        \tdpd,\n",
    "\t        \tstatus\n",
    "\t    FROM l2alpha.user_payment \n",
    "\t    WHERE status = 2 AND payment_type = 'INSTALLMENT' AND payment_channel <> 'CANCELLATION'\n",
    "\t    ) AS up \n",
    "\tON base.user_id = up.user_id AND pd < transaction_date\n",
    "\tGROUP BY base.id\n",
    "\t) AS main\n",
    "LEFT JOIN \n",
    "\t(SELECT user_id, \n",
    "\t\t\tid, \n",
    "\t\t\tdpd, \n",
    "\t\t\tpayment_date,\n",
    "            payment_channel,\n",
    "            amount\n",
    "\tFROM l2alpha.user_payment) AS up2\n",
    "ON up2.payment_date = main.last_pd AND main.user_id = up2.user_id\n",
    "GROUP BY main.id\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:45.110051Z",
     "start_time": "2019-11-07T04:00:11.674575Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "oth_last_rep = pd.read_sql(oth_last_rep_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:45.115239Z",
     "start_time": "2019-11-07T04:00:45.111412Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 6)"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "oth_last_rep.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Repayment Channels are grouped together based on channel similarity. Like ecommerce channels Tokopedia and Bukalapak are grouped together. \n",
    "\n",
    "There are special cases, as following:\n",
    "\n",
    "- KREDIVO = user bayar pake excess credit (ada bayar lebih sebelumnya, kyk semacem bayar dari wallet dia sendiri)\n",
    "- MANUAL_WAIVER = finance ada subsidi bayarin tagihan nya user (misal late fee yg muncul karena kesalahan sistem)\n",
    "- MANUAL_WAIVER_CLCT = subsidi dari tim collection (ada janjian gitu, user bayar sebagian, sisanya dibayarin collection biar lunas)\n",
    "- CANCELLATION = partial cancellation\n",
    "\n",
    "\n",
    "sisanya kalo ada MANUAL_[free text] adalah yg dijalankan manual oleh tim finance karena ada issue.. contoh user bayar, tp gak keproses.. besok nya tim Finance proses manual jadinya..\n",
    "\n",
    "\n",
    "MANDIRI khusus ada 2 karena dulu sistem nya closed amount (MANDIRI_VA), trus satu nya lagi open amount (MANDIRI_OPEN_VA).. sekarang semua nya udh open amount"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:45.140803Z",
     "start_time": "2019-11-07T04:00:45.116464Z"
    }
   },
   "outputs": [],
   "source": [
    "last_rep_channel_encoding = {'BUKALAPAK': 1, 'TOKOPEDIA': 1,\n",
    "                             'BCA_VA': 2, 'MANDIRI_OPEN_VA': 2, 'MANDIRI_VA': 2, 'PERMATA_VA': 2,\n",
    "                             'ALFAMART_VA': 3, 'INDOMARET_VA': 3,\n",
    "                             'KREDIVO': 4, 'MANUAL_ALFAMART VA': 4, 'MANUAL_INDOMARET_VA': 4, \n",
    "                             'MANUAL_BCA VA': 4, 'MANUAL_BCA_VA': 4,\n",
    "                             'MANUAL_MANDIRI': 4, 'MANUAL_MANDIRI_OPEN_': 4, 'MANUAL_MANDIRI_VA': 4, \n",
    "                             'MANUAL_PERMATA VA': 4, 'MANUAL_PERMATA_VA': 4,\n",
    "                             'MANUAL_PC': 4, 'MANUAL_EXCESS': 4, 'MANUAL_WAIVER': 4, 'MANUAL_WAIVER_CLCT': 4}\n",
    "\n",
    "default_value = 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:45.147923Z",
     "start_time": "2019-11-07T04:00:45.142356Z"
    }
   },
   "outputs": [],
   "source": [
    "oth_last_rep['oth_last_rep_channel_raw'] = oth_last_rep['oth_last_rep_channel']\n",
    "oth_last_rep['oth_last_rep_channel'] = oth_last_rep['oth_last_rep_channel'].map(last_rep_channel_encoding)\\\n",
    "                                                                           .fillna(default_value).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:45.176873Z",
     "start_time": "2019-11-07T04:00:45.149190Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2    93008\n",
       "3    45168\n",
       "1    12300\n",
       "4      196\n",
       "Name: oth_last_rep_channel, dtype: int64"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "oth_last_rep['oth_last_rep_channel'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4. Utilization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## util_non_pl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:45.181482Z",
     "start_time": "2019-11-07T04:00:45.178221Z"
    }
   },
   "outputs": [],
   "source": [
    "util_non_pl_q = f'''\n",
    "SELECT main.user_id,\n",
    "\t   main.trx_id,\n",
    "\t   COALESCE(SUM(loan_amount/transaction_limit), 0) AS util_non_pl\n",
    "FROM\n",
    "\t(SELECT user_id, \n",
    "\t\t   trx_id, \n",
    "\t\t   loan_id,\n",
    "\t\t   loan_amount,\n",
    "\t\t   tenure,\n",
    "\t\t   MAX(creation_time) AS latest_limit\n",
    "\tFROM\t\n",
    "\t\t(SELECT tr.user_id,\n",
    "\t\t       tr.id AS trx_id,\n",
    "\t\t       lo.loan_id,\n",
    "\t\t       loan_amount,\n",
    "\t\t       creation_time,\n",
    "\t\t       tenure,\n",
    "\t\t       COALESCE(TIMESTAMPDIFF(DAY, creation_time, transaction_date), 0) AS time_diff\n",
    "\t\tFROM \n",
    "\t\t\t\t(SELECT user_id,\n",
    "\t\t\t\t\t\ttransaction_date,\n",
    "\t\t\t\t    \tid\n",
    "\t\t\t\tFROM l2alpha.`transaction` \n",
    "\t\t\t\tWHERE id IN {tids}) AS tr\n",
    "\t\t\tLEFT JOIN \n",
    "\t\t\t\t(SELECT id AS loan_id,\n",
    "\t\t\t\t\t\tuser_id,\n",
    "\t\t\t\t    \tstart_date,\n",
    "\t\t\t\t    \tsettlement_date,\n",
    "\t\t\t\t    \tloan_amount,\n",
    "\t\t\t\t    \ttenure,\n",
    "\t\t\t\t    \ttransaction_id \n",
    "\t\t\t\tFROM l2alpha.loan\n",
    "\t\t\t\tWHERE is_active = 1 AND loan_amount > 0) AS lo\n",
    "\t\t\tON lo.user_id = tr.user_id AND start_date < transaction_date AND (settlement_date IS NULL OR settlement_date > transaction_date)\n",
    "\t\t\t\n",
    "\t\t\tLEFT JOIN \n",
    "\t\t\t\t(SELECT user_id,\n",
    "\t\t\t\t\t\tCASE WHEN installment_type = '30_days' THEN 30\n",
    "\t\t\t\t\t\tWHEN installment_type = '3_months' THEN 90 \n",
    "\t\t\t\t\t\tWHEN installment_type = '6_months' THEN 180 \n",
    "\t\t\t\t\t\tWHEN installment_type = '12_months' THEN 360 \n",
    "\t\t\t\t\t\tELSE NULL END AS  installment_type,\n",
    "\t\t\t\t\t\tcreation_time\n",
    "\t\t\t\tFROM l2alpha.transaction_limit_history\n",
    "\t\t\t\t) AS tlh\n",
    "\t\t\tON tlh.user_id = lo.user_id AND tlh.installment_type = lo.tenure\n",
    "\t\t) AS base\n",
    "\tWHERE time_diff >= 0\n",
    "\tGROUP BY trx_id, loan_id\n",
    "\t) AS main\n",
    "\t\n",
    "\tLEFT JOIN \n",
    "\t(SELECT user_id,\n",
    "\t\t\ttransaction_limit,\n",
    "\t\t\tCASE WHEN installment_type = '30_days' THEN 30\n",
    "\t\t\tWHEN installment_type = '3_months' THEN 90 \n",
    "\t\t\tWHEN installment_type = '6_months' THEN 180 \n",
    "\t\t\tWHEN installment_type = '12_months' THEN 360 \n",
    "\t\t\tELSE NULL END AS  installment_type,\n",
    "\t\t\tcreation_time\n",
    "\tFROM l2alpha.transaction_limit_history\n",
    "\tWHERE user_id IN {uids}\n",
    "\tGROUP BY user_id, transaction_limit, installment_type, creation_time\n",
    "\t) AS lim\n",
    "\tON lim.creation_time = main.latest_limit AND lim.user_id = main.user_id\n",
    "\tAND lim.installment_type = main.tenure\n",
    "GROUP BY main.trx_id\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:51.464862Z",
     "start_time": "2019-11-07T04:00:45.182833Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ec2-user/anaconda3/envs/env_personalloan/lib/python3.7/site-packages/pymysql/cursors.py:329: Warning: (1052, \"Column 'installment_type' in group statement is ambiguous\")\n",
      "  self._do_get_result()\n",
      "/home/ec2-user/anaconda3/envs/env_personalloan/lib/python3.7/site-packages/pymysql/cursors.py:329: Warning: (3170, \"Memory capacity of 8388608 bytes for 'range_optimizer_max_mem_size' exceeded. Range optimization was not done for this query.\")\n",
      "  self._do_get_result()\n"
     ]
    }
   ],
   "source": [
    "util_non_pl = pd.read_sql(util_non_pl_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:51.469712Z",
     "start_time": "2019-11-07T04:00:51.466148Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 3)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "util_non_pl.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:51.515275Z",
     "start_time": "2019-11-07T04:00:51.471223Z"
    }
   },
   "outputs": [],
   "source": [
    "util_non_pl.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## util_pl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:51.534178Z",
     "start_time": "2019-11-07T04:00:51.523536Z"
    }
   },
   "outputs": [],
   "source": [
    "util_pl_q = f'''\n",
    "SELECT main.user_id, \n",
    "\t   trx_id,\n",
    "\t   COALESCE((amount/transaction_limit), 0) AS util_pl\n",
    "FROM\n",
    "\t(SELECT user_id, trx_id, amount, payment_type, transaction_date,\n",
    "\t\t   MAX(creation_time) AS latest_limit\n",
    "\tFROM\n",
    "\t\t(SELECT tr.user_id, tr.id AS trx_id, tr.amount, tr.payment_type, tr.transaction_date,\n",
    "\t\t\t\ttlh.creation_time,\n",
    "\t\t\t   TIMESTAMPDIFF(DAY, creation_time, transaction_date) AS time_diff\n",
    "\t\tFROM\n",
    "\t\t\t(SELECT *\n",
    "\t\t\tFROM l2alpha.`transaction` \n",
    "\t\t\tWHERE id IN {tids}\n",
    "\t\t\t) AS tr\n",
    "\t\t\tLEFT JOIN l2alpha.transaction_limit_history AS tlh\n",
    "\t\t\tON tr.user_id = tlh.user_id AND tr.payment_type = tlh.installment_type\n",
    "\t\t) AS base\n",
    "\tWHERE time_diff >= 0\n",
    "\tGROUP BY base.trx_id\n",
    "\t) AS main\n",
    "\t\n",
    "\tLEFT JOIN \n",
    "\t(SELECT user_id,\n",
    "\t\t\ttransaction_limit,\n",
    "\t\t\tinstallment_type,\n",
    "\t\t\tcreation_time\n",
    "\tFROM l2alpha.transaction_limit_history\n",
    "\tWHERE user_id IN {uids}\n",
    "\tGROUP BY user_id, transaction_limit, installment_type, creation_time\n",
    "\t) AS lim\n",
    "\tON lim.creation_time = main.latest_limit AND lim.user_id = main.user_id \n",
    "    AND lim.installment_type = main.payment_type\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:54.715476Z",
     "start_time": "2019-11-07T04:00:51.536994Z"
    }
   },
   "outputs": [],
   "source": [
    "util_pl = pd.read_sql(util_pl_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:54.720364Z",
     "start_time": "2019-11-07T04:00:54.716934Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 3)"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "util_pl.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:54.733988Z",
     "start_time": "2019-11-07T04:00:54.722455Z"
    }
   },
   "outputs": [],
   "source": [
    "util_pl.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "util_pl = util_pl[['user_id', 'trx_id', 'util_pl']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5. A - Score Feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:54.753981Z",
     "start_time": "2019-11-07T04:00:54.735531Z"
    }
   },
   "outputs": [],
   "source": [
    "ascore_q = f'''\n",
    "SELECT asc2.user_id, calibrated_final, engine\n",
    "FROM ds.a_score_css_2 AS asc2\n",
    "JOIN\n",
    "(SELECT user_id, max(id) AS latest_id\n",
    "FROM ds.a_score_css_2 \n",
    "WHERE user_id IN {uids}\n",
    "AND engine NOT LIKE '%pd3%'\n",
    "AND engine NOT LIKE '%web%'\n",
    "GROUP BY user_id\n",
    ") AS asc1\n",
    "ON asc2.id = asc1.latest_id\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:01:06.104428Z",
     "start_time": "2019-11-07T04:00:54.755309Z"
    }
   },
   "outputs": [],
   "source": [
    "a_score = pd.read_sql(ascore_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:01:06.110083Z",
     "start_time": "2019-11-07T04:01:06.105797Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(129789, 3)"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a_score.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:01:06.120607Z",
     "start_time": "2019-11-07T04:01:06.113854Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "129789"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a_score['user_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:01:06.141338Z",
     "start_time": "2019-11-07T04:01:06.123192Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "a_score.head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "ios_model_used = []\n",
    "engines_used = a_score.engine.unique().tolist()\n",
    "for engine in engines_used:\n",
    "    if 'ios_' in engine:\n",
    "        ios_model_used.append(engine)\n",
    "engines_used = [eng for eng in engines_used if 'ios' not in eng]\n",
    "\n",
    "x = pd.DataFrame(engines_used)\n",
    "x['eng'] = x[0].apply(lambda x: None if pd.isna(x) else x.split()[0])\n",
    "x['vers'] = x[0].apply(lambda x: None if pd.isna(x) else x.split()[1].split('.')[-1])\n",
    "x['vers'] = np.where(x['vers']=='h2o',None,x['vers'])\n",
    "\n",
    "x.dropna(inplace=True)\n",
    "x['vers'] = x['vers'].astype(int)\n",
    "\n",
    "model_used = x.loc[x[['eng','vers']].groupby('eng').idxmax()['vers'].tolist()][0].tolist()\n",
    "model_used = model_used + ios_model_used\n",
    "model_used.remove('cicilan_fresh_normal 0.0.2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_used"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "a_score['calibrated_final'] = np.where(a_score['engine'].isin(model_used),\n",
    "                                       a_score['calibrated_final'], np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "a_score = a_score[['user_id', 'calibrated_final', 'engine']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 6. Installed Payday Loan App Count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:01:06.262833Z",
     "start_time": "2019-11-07T04:01:06.257660Z"
    }
   },
   "outputs": [],
   "source": [
    "ins_app_q = f'''\n",
    "SELECT user_id,\n",
    "       installed_applications\n",
    "FROM l2alpha.web_userdevicedetail\n",
    "WHERE user_id IN {uids}\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:40.041309Z",
     "start_time": "2019-11-07T04:01:06.264471Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "ins_app = pd.read_sql(ins_app_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:40.048118Z",
     "start_time": "2019-11-07T04:02:40.042945Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(135508, 2)"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ins_app.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "plo_app_list = pd.read_csv('data/payday_700_list.csv', header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:40.084424Z",
     "start_time": "2019-11-07T04:02:40.066336Z"
    }
   },
   "outputs": [],
   "source": [
    "payday_loan_apps = plo_app_list[0].unique().tolist()\n",
    "payday_loan_apps = [x.lower() for x in payday_loan_apps]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:40.109404Z",
     "start_time": "2019-11-07T04:02:40.097156Z"
    }
   },
   "outputs": [],
   "source": [
    "def create_feats_apps(raw_df,label='installed_applications'):\n",
    "\n",
    "    df = raw_df.copy()\n",
    "    df[label] = df[label].fillna('[]')\n",
    "    df[label] = df[label].apply(lambda x: eval(x))\n",
    "\n",
    "    def count_apps(x,list_apps):\n",
    "        return len([ft for ft in x if ft.lower() in list_apps])\n",
    "\n",
    "    df['ap_co_plo'] = df[label].apply(lambda x:count_apps(x,payday_loan_apps))\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:44.459722Z",
     "start_time": "2019-11-07T04:02:40.111155Z"
    }
   },
   "outputs": [],
   "source": [
    "plo_apps = create_feats_apps(ins_app)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:44.507926Z",
     "start_time": "2019-11-07T04:02:44.461220Z"
    }
   },
   "outputs": [],
   "source": [
    "plo_apps = plo_apps.drop(columns=['installed_applications'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combine all dataframes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:44.586494Z",
     "start_time": "2019-11-07T04:02:44.510100Z"
    }
   },
   "outputs": [],
   "source": [
    "final_df = base_df[['user_id', 'trx_id']].merge(snapshot, on=['trx_id', 'user_id'], how='left')\\\n",
    "                                          .merge(delin, on=['trx_id', 'user_id'], how='left')\\\n",
    "                                          .merge(time_pl, on=['trx_id', 'user_id'], how='left')\\\n",
    "                                          .merge(oth_last_trx, on=['trx_id', 'user_id'], how='left')\\\n",
    "                                          .merge(oth_first_trx, on=['trx_id', 'user_id'], how='left')\\\n",
    "                                          .merge(oth_last_rep, on=['trx_id', 'user_id'], how='left')\\\n",
    "                                          .merge(util_non_pl, on=['trx_id', 'user_id'], how='left')\\\n",
    "                                          .merge(util_pl, on=['trx_id', 'user_id'], how='left')\\\n",
    "                                          .merge(a_score, on='user_id', how='left')\\\n",
    "                                          .merge(plo_apps, on='user_id', how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:44.591815Z",
     "start_time": "2019-11-07T04:02:44.587913Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 28)"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 17)"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:44.617944Z",
     "start_time": "2019-11-07T04:02:44.593760Z"
    }
   },
   "outputs": [],
   "source": [
    "final_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:44.798454Z",
     "start_time": "2019-11-07T04:02:44.619626Z"
    }
   },
   "outputs": [],
   "source": [
    "final_df.to_parquet('data/interim/pl_model_v3_feats_27012020.parquet', compression='gzip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}