past-data-projects / personal_loan_credit_risk / 700-Productionization / 701 - Generate Features for Unit Test.ipynb
701 - Generate Features for Unit Test.ipynb
Raw
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:36.988168Z",
     "start_time": "2019-11-07T03:55:36.706164Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pymysql\n",
    "import boto3\n",
    "import json\n",
    "from datetime import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:36.994804Z",
     "start_time": "2019-11-07T03:55:36.989531Z"
    }
   },
   "outputs": [],
   "source": [
    "import configparser\n",
    "config = configparser.ConfigParser()\n",
    "config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
    "\n",
    "host = config['MYSQL-ROOT']['HOST']\n",
    "user = config['MYSQL-ROOT']['USER']\n",
    "password = config['MYSQL-ROOT']['PASSWORD']\n",
    "\n",
    "def connect_sql():\n",
    "    cnx = pymysql.connect(host=host,\n",
    "                          user=user,\n",
    "                          password=password,\n",
    "                          cursorclass=pymysql.cursors.DictCursor)\n",
    "    return cnx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:37.026188Z",
     "start_time": "2019-11-07T03:55:36.997299Z"
    }
   },
   "outputs": [],
   "source": [
    "base_df = pd.read_csv('test_pl_v3_19022020.csv')\n",
    "base_df = base_df.rename(columns={'trx_date':'transaction_date',\n",
    "                                  'trx id':'trx_id'})\n",
    "base_df['transaction_date'] = base_df['transaction_date'].astype('datetime64[ms]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:37.034477Z",
     "start_time": "2019-11-07T03:55:37.027603Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(35, 8)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "base_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:37.043587Z",
     "start_time": "2019-11-07T03:55:37.036542Z"
    }
   },
   "outputs": [],
   "source": [
    "uids = tuple(base_df['user_id'].unique().tolist())\n",
    "tids = tuple(base_df['trx_id'].unique().tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "34"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(uids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "35"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(tids)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Snapshot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:37.078945Z",
     "start_time": "2019-11-07T03:55:37.074323Z"
    }
   },
   "outputs": [],
   "source": [
    "snapshot_q = f\"\"\"\n",
    "SELECT \n",
    "    base.user_id, \n",
    "    base.id AS trx_id,\n",
    "    current_dpd\n",
    "FROM \n",
    "\t\t(SELECT user_id,id,transaction_date \n",
    "\t\tFROM l2alpha.`transaction` WHERE id IN {tids}) AS base\n",
    "\tLEFT JOIN \n",
    "\t\tds.user_level_archive AS ula \n",
    "\tON base.user_id = ula.user_id AND DATE(transaction_date) = snapshot_date \n",
    "\tLEFT JOIN\n",
    "\t\t(SELECT user_id,monthly_installment, transaction_id\n",
    "\t\tFROM l2alpha.loan \n",
    "\t\tWHERE transaction_id IN {tids}) AS lo \n",
    "\tON lo.transaction_id = base.id\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:08.943657Z",
     "start_time": "2019-11-07T03:55:37.081391Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "snapshot = pd.read_sql(snapshot_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:08.949079Z",
     "start_time": "2019-11-07T03:58:08.945547Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(35, 3)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "snapshot.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:08.959459Z",
     "start_time": "2019-11-07T03:58:08.950846Z"
    }
   },
   "outputs": [],
   "source": [
    "snapshot.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Delinquency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:08.985180Z",
     "start_time": "2019-11-07T03:58:08.960895Z"
    }
   },
   "outputs": [],
   "source": [
    "delin_q = f'''\n",
    "SELECT \n",
    "    base.user_id,\n",
    "    base.id AS trx_id,\n",
    "    MAX(CASE WHEN datediff(DATE(transaction_date),snapshot_date) <= 90 AND\n",
    "                  datediff(DATE(transaction_date),snapshot_date) >=0  THEN current_dpd ELSE NULL END) AS delin_max_dpd_90\n",
    "FROM\n",
    "\t\t(SELECT user_id,id,transaction_date \n",
    "\t\tFROM l2alpha.`transaction` \n",
    "\t\tWHERE id IN {tids}) AS base\n",
    "\tLEFT JOIN \n",
    "\t    (SELECT user_id,\n",
    "\t        \tos_amount,\n",
    "\t        \tcurrent_dpd,\n",
    "\t        \tdpd_amount,\n",
    "\t        \tsnapshot_date\n",
    "\t    FROM ds.user_level_archive\n",
    "\t    ) AS delin ON base.user_id = delin.user_id AND snapshot_date <= DATE(transaction_date)\n",
    "GROUP BY base.id\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:37.439306Z",
     "start_time": "2019-11-07T03:58:08.992173Z"
    }
   },
   "outputs": [],
   "source": [
    "delin = pd.read_sql(delin_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:37.443941Z",
     "start_time": "2019-11-07T03:58:37.440532Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(35, 3)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "delin.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:37.458294Z",
     "start_time": "2019-11-07T03:58:37.445289Z"
    }
   },
   "outputs": [],
   "source": [
    "delin.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. Other"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## time_approve_to_pl_hour"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:37.472597Z",
     "start_time": "2019-11-07T03:58:37.459708Z"
    }
   },
   "outputs": [],
   "source": [
    "time_pl_q = f'''\n",
    "SELECT \n",
    "    base.user_id,\n",
    "    base.id AS trx_id,\n",
    "    TIMESTAMPDIFF(HOUR, installment_ts, transaction_date) AS time_appr_to_pl_hour\n",
    "FROM\n",
    "\t\t(SELECT user_id,id,transaction_date \n",
    "\t\tFROM l2alpha.`transaction` WHERE id IN {tids}) AS base\n",
    "\tLEFT JOIN \n",
    "\t    (SELECT user_id, MAX(is_approved_timestamp) AS installment_ts \n",
    "\t    FROM l2alpha.web_userinstallmentapplicationdetails \n",
    "\t    WHERE user_id IN {uids}\n",
    "\t    GROUP BY user_id\n",
    "\t    ) AS wuiad \n",
    "\tON base.user_id = wuiad.user_id\n",
    "GROUP BY base.id\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:40.087947Z",
     "start_time": "2019-11-07T03:58:37.473685Z"
    }
   },
   "outputs": [],
   "source": [
    "time_pl = pd.read_sql(time_pl_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:40.092390Z",
     "start_time": "2019-11-07T03:58:40.089336Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(35, 3)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "time_pl.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:58:40.104548Z",
     "start_time": "2019-11-07T03:58:40.093767Z"
    }
   },
   "outputs": [],
   "source": [
    "time_pl.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## oth_last_rep_xxx:\n",
    "- dpd\n",
    "- days"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:11.673428Z",
     "start_time": "2019-11-07T04:00:11.659983Z"
    }
   },
   "outputs": [],
   "source": [
    "oth_last_rep_q = f'''\n",
    "SELECT main.user_id,\n",
    "\t   main.id AS trx_id,\n",
    "       MAX(dpd) AS oth_last_rep_dpd,\n",
    "       datediff(main.transaction_date,main.last_pd) AS oth_last_rep_days\n",
    "FROM\n",
    "\t(SELECT base.user_id,\n",
    "\t\t\tbase.id,\n",
    "\t    \tbase.transaction_date,\n",
    "\t    \tMAX(up.pd) AS last_pd\n",
    "\tFROM\n",
    "\t\t(SELECT user_id,\n",
    "                id,\n",
    "                transaction_date,\n",
    "                status\n",
    "\t\tFROM l2alpha.`transaction`\n",
    "\t\tWHERE id IN {tids}\n",
    "        ) AS base\n",
    "\tLEFT JOIN \n",
    "\t    (SELECT user_id,\n",
    "\t        \tid,\n",
    "\t        \tpayment_type,\n",
    "\t        \tpayment_channel,\n",
    "\t        \tpayment_date AS pd,\n",
    "\t        \tdpd,\n",
    "\t        \tstatus\n",
    "\t    FROM l2alpha.user_payment \n",
    "\t    WHERE status = 2 AND payment_type = 'INSTALLMENT' AND payment_channel <> 'CANCELLATION'\n",
    "\t    ) AS up \n",
    "\tON base.user_id = up.user_id AND pd < transaction_date\n",
    "\tGROUP BY base.id\n",
    "\t) AS main\n",
    "LEFT JOIN \n",
    "\t(SELECT user_id, \n",
    "\t\t\tid, \n",
    "\t\t\tdpd, \n",
    "\t\t\tpayment_date,\n",
    "            payment_channel,\n",
    "            amount\n",
    "\tFROM l2alpha.user_payment) AS up2\n",
    "ON up2.payment_date = main.last_pd AND main.user_id = up2.user_id\n",
    "GROUP BY main.id\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:45.110051Z",
     "start_time": "2019-11-07T04:00:11.674575Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "oth_last_rep = pd.read_sql(oth_last_rep_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:45.115239Z",
     "start_time": "2019-11-07T04:00:45.111412Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(35, 4)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "oth_last_rep.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "oth_last_rep.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4. Utilization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## util_non_pl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:45.181482Z",
     "start_time": "2019-11-07T04:00:45.178221Z"
    }
   },
   "outputs": [],
   "source": [
    "util_non_pl_q = f'''\n",
    "SELECT main.user_id,\n",
    "\t   main.trx_id,\n",
    "\t   COALESCE(SUM(loan_amount/transaction_limit), 0) AS util_non_pl\n",
    "FROM\n",
    "\t(SELECT user_id, \n",
    "\t\t   trx_id, \n",
    "\t\t   loan_id,\n",
    "\t\t   loan_amount,\n",
    "\t\t   tenure,\n",
    "\t\t   MAX(creation_time) AS latest_limit\n",
    "\tFROM\t\n",
    "\t\t(SELECT tr.user_id,\n",
    "\t\t       tr.id AS trx_id,\n",
    "\t\t       lo.loan_id,\n",
    "\t\t       loan_amount,\n",
    "\t\t       creation_time,\n",
    "\t\t       tenure,\n",
    "\t\t       COALESCE(TIMESTAMPDIFF(DAY, creation_time, transaction_date), 0) AS time_diff\n",
    "\t\tFROM \n",
    "\t\t\t\t(SELECT user_id,\n",
    "\t\t\t\t\t\ttransaction_date,\n",
    "\t\t\t\t    \tid\n",
    "\t\t\t\tFROM l2alpha.`transaction` \n",
    "\t\t\t\tWHERE id IN {tids}) AS tr\n",
    "\t\t\tLEFT JOIN \n",
    "\t\t\t\t(SELECT id AS loan_id,\n",
    "\t\t\t\t\t\tuser_id,\n",
    "\t\t\t\t    \tstart_date,\n",
    "\t\t\t\t    \tsettlement_date,\n",
    "\t\t\t\t    \tloan_amount,\n",
    "\t\t\t\t    \ttenure,\n",
    "\t\t\t\t    \ttransaction_id \n",
    "\t\t\t\tFROM l2alpha.loan\n",
    "\t\t\t\tWHERE is_active = 1 AND loan_amount > 0) AS lo\n",
    "\t\t\tON lo.user_id = tr.user_id AND start_date < transaction_date AND (settlement_date IS NULL OR settlement_date > transaction_date)\n",
    "\t\t\t\n",
    "\t\t\tLEFT JOIN \n",
    "\t\t\t\t(SELECT user_id,\n",
    "\t\t\t\t\t\tCASE WHEN installment_type = '30_days' THEN 30\n",
    "\t\t\t\t\t\tWHEN installment_type = '3_months' THEN 90 \n",
    "\t\t\t\t\t\tWHEN installment_type = '6_months' THEN 180 \n",
    "\t\t\t\t\t\tWHEN installment_type = '12_months' THEN 360 \n",
    "\t\t\t\t\t\tELSE NULL END AS  installment_type,\n",
    "\t\t\t\t\t\tcreation_time\n",
    "\t\t\t\tFROM l2alpha.transaction_limit_history\n",
    "\t\t\t\t) AS tlh\n",
    "\t\t\tON tlh.user_id = lo.user_id AND tlh.installment_type = lo.tenure\n",
    "\t\t) AS base\n",
    "\tWHERE time_diff >= 0\n",
    "\tGROUP BY trx_id, loan_id\n",
    "\t) AS main\n",
    "\t\n",
    "\tLEFT JOIN \n",
    "\t(SELECT user_id,\n",
    "\t\t\ttransaction_limit,\n",
    "\t\t\tCASE WHEN installment_type = '30_days' THEN 30\n",
    "\t\t\tWHEN installment_type = '3_months' THEN 90 \n",
    "\t\t\tWHEN installment_type = '6_months' THEN 180 \n",
    "\t\t\tWHEN installment_type = '12_months' THEN 360 \n",
    "\t\t\tELSE NULL END AS  installment_type,\n",
    "\t\t\tcreation_time\n",
    "\tFROM l2alpha.transaction_limit_history\n",
    "\tWHERE user_id IN {uids}\n",
    "\tGROUP BY user_id, transaction_limit, installment_type, creation_time\n",
    "\t) AS lim\n",
    "\tON lim.creation_time = main.latest_limit AND lim.user_id = main.user_id\n",
    "\tAND lim.installment_type = main.tenure\n",
    "GROUP BY main.trx_id\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:51.464862Z",
     "start_time": "2019-11-07T04:00:45.182833Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ec2-user/anaconda3/envs/env_personalloan/lib/python3.7/site-packages/pymysql/cursors.py:329: Warning: (1052, \"Column 'installment_type' in group statement is ambiguous\")\n",
      "  self._do_get_result()\n"
     ]
    }
   ],
   "source": [
    "util_non_pl = pd.read_sql(util_non_pl_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:51.469712Z",
     "start_time": "2019-11-07T04:00:51.466148Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(35, 3)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "util_non_pl.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:51.515275Z",
     "start_time": "2019-11-07T04:00:51.471223Z"
    }
   },
   "outputs": [],
   "source": [
    "util_non_pl.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## util_pl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:51.534178Z",
     "start_time": "2019-11-07T04:00:51.523536Z"
    }
   },
   "outputs": [],
   "source": [
    "util_pl_q = f'''\n",
    "SELECT main.user_id, \n",
    "\t   trx_id,\n",
    "\t   COALESCE((amount/transaction_limit), 0) AS util_pl\n",
    "FROM\n",
    "\t(SELECT user_id, trx_id, amount, payment_type, transaction_date,\n",
    "\t\t   MAX(creation_time) AS latest_limit\n",
    "\tFROM\n",
    "\t\t(SELECT tr.user_id, tr.id AS trx_id, tr.amount, tr.payment_type, tr.transaction_date,\n",
    "\t\t\t\ttlh.creation_time,\n",
    "\t\t\t   TIMESTAMPDIFF(DAY, creation_time, transaction_date) AS time_diff\n",
    "\t\tFROM\n",
    "\t\t\t(SELECT *\n",
    "\t\t\tFROM l2alpha.`transaction` \n",
    "\t\t\tWHERE id IN {tids}\n",
    "\t\t\t) AS tr\n",
    "\t\t\tLEFT JOIN l2alpha.transaction_limit_history AS tlh\n",
    "\t\t\tON tr.user_id = tlh.user_id AND tr.payment_type = tlh.installment_type\n",
    "\t\t) AS base\n",
    "\tWHERE time_diff >= 0\n",
    "\tGROUP BY base.trx_id\n",
    "\t) AS main\n",
    "\t\n",
    "\tLEFT JOIN \n",
    "\t(SELECT user_id,\n",
    "\t\t\ttransaction_limit,\n",
    "\t\t\tinstallment_type,\n",
    "\t\t\tcreation_time\n",
    "\tFROM l2alpha.transaction_limit_history\n",
    "\tWHERE user_id IN {uids}\n",
    "\tGROUP BY user_id, transaction_limit, installment_type, creation_time\n",
    "\t) AS lim\n",
    "\tON lim.creation_time = main.latest_limit AND lim.user_id = main.user_id \n",
    "    AND lim.installment_type = main.payment_type\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:54.715476Z",
     "start_time": "2019-11-07T04:00:51.536994Z"
    }
   },
   "outputs": [],
   "source": [
    "util_pl = pd.read_sql(util_pl_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:54.720364Z",
     "start_time": "2019-11-07T04:00:54.716934Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(35, 3)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "util_pl.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:00:54.733988Z",
     "start_time": "2019-11-07T04:00:54.722455Z"
    }
   },
   "outputs": [],
   "source": [
    "util_pl.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 6. Installed Payday Loan App Count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:01:06.262833Z",
     "start_time": "2019-11-07T04:01:06.257660Z"
    }
   },
   "outputs": [],
   "source": [
    "ins_app_q = f'''\n",
    "SELECT user_id,\n",
    "       installed_applications\n",
    "FROM l2alpha.web_userdevicedetail\n",
    "WHERE user_id IN {uids}\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:40.041309Z",
     "start_time": "2019-11-07T04:01:06.264471Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "ins_app = pd.read_sql(ins_app_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:40.048118Z",
     "start_time": "2019-11-07T04:02:40.042945Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(34, 2)"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ins_app.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "plo_app_list = pd.read_csv('data/payday_700_list.csv', header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:40.084424Z",
     "start_time": "2019-11-07T04:02:40.066336Z"
    }
   },
   "outputs": [],
   "source": [
    "payday_loan_apps = plo_app_list[0].unique().tolist()\n",
    "payday_loan_apps = [x.lower() for x in payday_loan_apps]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:40.109404Z",
     "start_time": "2019-11-07T04:02:40.097156Z"
    }
   },
   "outputs": [],
   "source": [
    "def create_feats_apps(raw_df,label='installed_applications'):\n",
    "\n",
    "    df = raw_df.copy()\n",
    "    df[label] = df[label].fillna('[]')\n",
    "    df[label] = df[label].apply(lambda x: eval(x))\n",
    "\n",
    "    def count_apps(x,list_apps):\n",
    "        return len([ft for ft in x if ft.lower() in list_apps])\n",
    "\n",
    "    df['ap_co_plo'] = df[label].apply(lambda x:count_apps(x,payday_loan_apps))\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:44.459722Z",
     "start_time": "2019-11-07T04:02:40.111155Z"
    }
   },
   "outputs": [],
   "source": [
    "plo_apps = create_feats_apps(ins_app)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:44.507926Z",
     "start_time": "2019-11-07T04:02:44.461220Z"
    }
   },
   "outputs": [],
   "source": [
    "plo_apps = plo_apps.drop(columns=['installed_applications'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(34, 2)"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "plo_apps.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Number of Previous PL Transactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_pl_history(base_df):\n",
    "    uids = tuple(base_df['user_id'])\n",
    "    max_trx_date = base_df['transaction_date'].max()\n",
    "\n",
    "    query = f\"\"\"\n",
    "        SELECT user_id,\n",
    "               id AS trx_id,\n",
    "               status,\n",
    "               payment_type,\n",
    "               amount,\n",
    "               transaction_date\n",
    "        FROM l2alpha.transaction\n",
    "        WHERE user_id IN {uids}\n",
    "        AND transaction_type = 2\n",
    "        AND transaction_date <= '{max_trx_date}'\n",
    "    \"\"\"\n",
    "\n",
    "    df = pd.read_sql(query, connect_sql())\n",
    "    df['gen_payment_type'] = df['payment_type'].apply(lambda x: 'mini' if x=='30_days' else 'jumbo')\n",
    "    \n",
    "    out_df = base_df[['user_id', 'trx_id', 'transaction_date']].merge(df, how='left', on='user_id')\\\n",
    "                     .rename(columns={'transaction_date_x':'trx_dt_current',\n",
    "                                      'transaction_date_y':'trx_dt_past',\n",
    "                                      'trx_id_x':'trx_id',\n",
    "                                      'trx_id_y':'trx_id_ref'})\n",
    "    out_df['time_diff'] = (out_df['trx_dt_current'] - out_df['trx_dt_past']).dt.total_seconds()\n",
    "    return out_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_prev_pl_feats(df, base_df, window='all'):\n",
    "    if isinstance(window, int):\n",
    "        df = df[(df['time_diff'] <= window*(24*60*60)) & (df['time_diff'] > 0)]\n",
    "        name = str(window)\n",
    "    elif window == 'all':\n",
    "        df = df[df['time_diff'] > 0]\n",
    "        name = 'ever'\n",
    "    \n",
    "    prev_df = df.groupby(['user_id', 'trx_id', 'status', 'gen_payment_type']).size()\\\n",
    "                .unstack(level=[3,2]).fillna(0)\n",
    "    prev_df.columns = [f'pl_trx_co_{col[0]}_{col[1]}_{name}' for col in prev_df.columns.values]\n",
    "    \n",
    "    keywords = [2, 4, 5, 6, 7, 11, 'jumbo', 'mini']\n",
    "    stat_cols_dict = {}\n",
    "    for word in keywords:\n",
    "        str_word = str(word) if isinstance(word, int) else word\n",
    "        stat_cols_dict[word] = [col for col in prev_df if str_word in col]\n",
    "    \n",
    "    name_status = 'pl_trx_co_jumbomini_{}_{}'\n",
    "    name_pl_type = 'pl_trx_co_{}_all_{}'\n",
    "    for k, v in stat_cols_dict.items():\n",
    "        if isinstance(k, int):\n",
    "            prev_df[name_status.format(str(k), name)] = prev_df[v].sum(axis=1)\n",
    "        elif isinstance(k, str):\n",
    "            prev_df[name_pl_type.format(k, name)] = prev_df[v].sum(axis=1)\n",
    "            \n",
    "    prev_df[f'pl_trx_co_jumbomini_all_{name}'] = prev_df[f'pl_trx_co_jumbo_all_{name}'] + \\\n",
    "                                         prev_df[f'pl_trx_co_mini_all_{name}']\n",
    "    \n",
    "    prev_df = base_df[['user_id', 'trx_id']].merge(prev_df, how='left', on=['user_id', 'trx_id']).fillna(0)\n",
    "    \n",
    "    return prev_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "pl_hist = get_pl_history(base_df)\n",
    "prev_obs_feats_90 = create_prev_pl_feats(pl_hist, base_df, window=90)\n",
    "prev_obs_feats_90 = prev_obs_feats_90.rename(columns={'pl_trx_co_jumbomini_4_90':'pl_trx_suc_co_90'})\n",
    "prev_obs_feats_90 = prev_obs_feats_90[['user_id', 'trx_id', 'pl_trx_suc_co_90']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(35, 3)"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prev_obs_feats_90.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Early Settlement"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_trx_settle_data(base_df):\n",
    "    uids = tuple(base_df['user_id'])\n",
    "    max_trx_date = base_df['transaction_date'].max()\n",
    "\n",
    "    early_settle_q = f'''\n",
    "    SELECT *,\n",
    "            DATE(transaction_date) + INTERVAL payment_type_int DAY AS due_date\n",
    "    FROM\n",
    "        (SELECT tr.user_id, tr.trx_id, transaction_date, payment_type, transaction_type, status,\n",
    "                CASE WHEN payment_type = '30_days' THEN '30'\n",
    "                WHEN payment_type = '3_months' THEN '90'\n",
    "                WHEN payment_type = '6_months' THEN '180' \n",
    "                WHEN payment_type = '12_months' THEN '360' ELSE NULL END AS payment_type_int,\n",
    "\n",
    "                settlement_date, loan_amount, is_active\n",
    "        FROM\n",
    "            (SELECT user_id,\n",
    "                   id AS trx_id,\n",
    "                   transaction_date,\n",
    "                   payment_type,\n",
    "                   transaction_type,\n",
    "                   status\n",
    "            FROM l2alpha.`transaction`\n",
    "            WHERE user_id IN {uids}\n",
    "            AND transaction_type = 2\n",
    "            AND status = 4\n",
    "            AND transaction_date <= '{max_trx_date}'\n",
    "            ) AS tr\n",
    "\n",
    "            LEFT JOIN\n",
    "\n",
    "            (SELECT transaction_id AS trx_id,\n",
    "                   settlement_date,\n",
    "                   loan_amount,\n",
    "                   is_active\n",
    "            FROM l2alpha.loan\n",
    "            ) AS lo\n",
    "\n",
    "            ON tr.trx_id = lo.trx_id\n",
    "            ) AS base\n",
    "        WHERE loan_amount > 0 AND is_active = 1\n",
    "    '''\n",
    "    query_df = pd.read_sql(early_settle_q, connect_sql())\n",
    "    query_df['settle_to_due'] = (query_df['due_date'] - query_df['settlement_date'].dt.date).dt.days\n",
    "    df = base_df[['user_id', 'trx_id', 'transaction_date']].merge(query_df, how='left', on='user_id')\\\n",
    "                                         .rename(columns={'transaction_date_x':'trx_dt_current',\n",
    "                                          'transaction_date_y':'trx_dt_past',\n",
    "                                          'trx_id_x':'trx_id',\n",
    "                                          'trx_id_y':'trx_id_ref'})\n",
    "    df['time_diff'] = (df['trx_dt_current'] - df['trx_dt_past']).dt.total_seconds()\n",
    "    df = df.sort_values(['user_id', 'trx_dt_past']).reset_index(drop=True)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_early_settle_feats(df, base_df, window='all'):\n",
    "    \"\"\"Build early-settlement features per PL transaction.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pd.DataFrame\n",
    "        Output of get_trx_settle_data: one row per (current trx, past trx)\n",
    "        pair, with 'time_diff' in seconds and 'settle_to_due' in days.\n",
    "    base_df : pd.DataFrame\n",
    "        Base population with 'user_id' and 'trx_id'; one output row per row here.\n",
    "    window : int or 'all', default 'all'\n",
    "        Lookback window in days; 'all' uses the full history.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pd.DataFrame\n",
    "        base_df keys plus the early-settle count and settle_to_due stats;\n",
    "        NaN where a trx has no qualifying history.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If window is neither an int nor the string 'all'.\n",
    "    \"\"\"\n",
    "    if isinstance(window, int):\n",
    "        df = df[(df['time_diff'] <= window*(24*60*60)) & (df['time_diff'] > 0)]\n",
    "        name = str(window)\n",
    "    elif window == 'all':\n",
    "        df = df[df['time_diff'] > 0]\n",
    "        name = 'ever'\n",
    "    else:\n",
    "        # Previously an invalid window fell through both branches and later\n",
    "        # raised a confusing NameError on `name`; fail fast instead.\n",
    "        raise ValueError(f'The input \"{window}\" is wrong. Window value must be an integer or \"all\".')\n",
    "    df = df.sort_values(['user_id', 'trx_id', 'trx_dt_past'])\n",
    "    \n",
    "    # Count how many times the user settled >= 30 days early in the window.\n",
    "    # Rows with NaN settle_to_due are kept NaN so the groupby-sum skips them.\n",
    "    co_df = df.copy()\n",
    "    co_df['flag_early_settle'] = np.where(co_df['settle_to_due'] >= 30, 1, 0)\n",
    "    co_df['flag_early_settle'] = np.where(co_df['settle_to_due'].isna(), np.nan, co_df['flag_early_settle'])\n",
    "    co_df = co_df.groupby('trx_id').agg({'flag_early_settle':'sum'}).reset_index()\n",
    "    co_df.columns = ['trx_id', f'early_settle_co_{name}']\n",
    "    \n",
    "    # Min/max/mean/median days settled early in the window, plus the most\n",
    "    # recent value. String aggregator names ('max'/'min') are used instead of\n",
    "    # the builtins, which newer pandas versions deprecate.\n",
    "    es_df = df.groupby('trx_id').agg({'settle_to_due':['max', 'min', 'mean', 'median', 'last']})\n",
    "    col_names = ['settle_to_due_max_{}', 'settle_to_due_min_{}', 'settle_to_due_avg_{}', \n",
    "                 'settle_to_due_med_{}', 'settle_to_due_last_pl_{}']\n",
    "    es_df.columns = [col.format(name) for col in col_names]\n",
    "    es_df = es_df.reset_index()\n",
    "    out_df = base_df[['user_id', 'trx_id']].merge(es_df, how='left', on='trx_id')\\\n",
    "                                           .merge(co_df, how='left', on='trx_id')\n",
    "    return out_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the pairwise settlement frame, derive the 180-day early-settlement\n",
    "# features, and keep only the last settle-to-due value under its serving name.\n",
    "trx_settle = get_trx_settle_data(base_df)\n",
    "early_settle_feats_180 = create_early_settle_feats(trx_settle, base_df, window=180)\n",
    "early_settle_feats_180 = early_settle_feats_180.rename(columns={'settle_to_due_last_pl_180':'pl_settle_to_due_last_180'})\n",
    "early_settle_feats_180 = early_settle_feats_180[['user_id', 'trx_id', 'pl_settle_to_due_last_180']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(35, 3)"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "early_settle_feats_180.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Time from last PL Settlement"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_time_from_last_pl_feat(df, base):\n",
    "    \"\"\"Hours between each PL transaction and the user's previous PL settlement.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pd.DataFrame\n",
    "        Pairwise trx frame from get_trx_settle_data ('trx_id_ref',\n",
    "        'time_diff' in seconds, 'settlement_date', 'trx_dt_current', ...).\n",
    "    base : pd.DataFrame\n",
    "        Base population with 'trx_id'; reference trx that belong to the base\n",
    "        itself are excluded so only genuinely past PLs count.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pd.DataFrame with 'user_id', 'trx_id', 'time_from_prev_pl_settle' (hours).\n",
    "    \"\"\"\n",
    "    # Copy after slicing: the original assigned into a boolean-mask slice of\n",
    "    # `df`, which raises SettingWithCopyWarning and can silently fail to (or\n",
    "    # unexpectedly) mutate the caller's frame.\n",
    "    time_from_last_pl = df[(~df['trx_id_ref'].isin(base['trx_id'])) & (df['time_diff'] >= 0)].copy()\n",
    "    # time_diff == 0 means the 'past' trx is the current one; blank its\n",
    "    # settlement_date so it is never counted as a previous settlement.\n",
    "    time_from_last_pl.loc[(time_from_last_pl['time_diff'] == 0) & \n",
    "                          (time_from_last_pl['settlement_date'].notna()), 'settlement_date'] = np.nan\n",
    "    time_from_last_pl = time_from_last_pl.sort_values(['user_id', 'trx_id', 'trx_dt_past'])\n",
    "\n",
    "    # tail(1) per trx_id = the most recent past PL; copy before adding a column.\n",
    "    time_from_last_pl_grp = time_from_last_pl.groupby('trx_id').tail(1).copy()\n",
    "    time_from_last_pl_grp['time_from_prev_pl_settle'] = (time_from_last_pl_grp['trx_dt_current'] - \\\n",
    "                                                         time_from_last_pl_grp['settlement_date'])\\\n",
    "                                                            .dt.total_seconds()/3600\n",
    "    time_from_last_pl_final = time_from_last_pl_grp[['user_id', 'trx_id', 'time_from_prev_pl_settle']]\n",
    "    return time_from_last_pl_final"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "time_from_prev_pl_settle = create_time_from_last_pl_feat(trx_settle, base_df)\n",
    "time_from_prev_pl_settle = time_from_prev_pl_settle.rename(columns={'time_from_prev_pl_settle':'time_from_last_sett_pl_hour'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(23, 3)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(time_from_prev_pl_settle.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Date of month of transaction date"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Day-of-month of the PL transaction date, as a simple calendar feature.\n",
    "date_feats = base_df[['user_id', 'trx_id', 'transaction_date']].copy()\n",
    "date_feats['date_of_month'] = date_feats['transaction_date'].dt.day\n",
    "date_feats = date_feats[['user_id', 'trx_id', 'date_of_month']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(35, 3)"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "date_feats.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Denied Transactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count distinct denied transactions (status = 5) in the 90 days up to and\n",
    "# including each PL transaction date. `tids` must hold the tuple of PL trx ids\n",
    "# (defined in an earlier cell).\n",
    "# Grouped by pl_trx_id — the key actually selected — rather than the\n",
    "# transaction date, so two PLs at the same timestamp never collapse into one\n",
    "# row and the query is valid under ONLY_FULL_GROUP_BY.\n",
    "denied_trx_q = f'''\n",
    "SELECT base.user_id,\n",
    "base.pl_trx_id AS trx_id,\n",
    "COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 90 AND \n",
    "               datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
    "               THEN base.transaction_id ELSE NULL END)) AS trx_denied_co_90                \n",
    "FROM\n",
    "    (SELECT tr1.user_id,\n",
    "            tr1.id AS pl_trx_id,\n",
    "            tr1.pl_transaction_date,\n",
    "            tr2.id AS transaction_id,\n",
    "            tr2.transaction_date,\n",
    "            tr2.amount\n",
    "    FROM\n",
    "        (SELECT user_id, \n",
    "              id, \n",
    "              transaction_date AS pl_transaction_date\n",
    "        FROM l2alpha.`transaction` WHERE id IN {tids}\n",
    "        ) AS tr1\n",
    "    LEFT JOIN\n",
    "        (SELECT * \n",
    "        FROM l2alpha.`transaction`\n",
    "        WHERE status = 5\n",
    "        ) AS tr2 \n",
    "    ON tr1.user_id = tr2.user_id \n",
    "    AND tr2.transaction_date <= tr1.pl_transaction_date\n",
    "    ) AS base\n",
    "GROUP BY base.user_id, base.pl_trx_id\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "denied_trx = pd.read_sql(denied_trx_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(35, 3)"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "denied_trx.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Settled Transactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sum of loan amounts started in the 30 days before each PL transaction,\n",
    "# restricted to active loans with positive amounts. `tids` must hold the\n",
    "# tuple of PL trx ids (defined in an earlier cell).\n",
    "# Grouped by pl_trx_id — the key actually selected — for the same reason as\n",
    "# denied_trx_q: correct under ONLY_FULL_GROUP_BY and no collapsing of PLs\n",
    "# sharing a timestamp.\n",
    "paid_trx_q = f'''\n",
    "SELECT base.user_id,\n",
    "base.pl_trx_id AS trx_id,\n",
    "SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 30 AND \n",
    "              datediff(DATE(pl_transaction_date),start_date) >= 0 \n",
    "    THEN base.loan_amount ELSE 0 END) AS trx_sett_sum_30\n",
    "FROM\n",
    "    (SELECT tr.user_id,\n",
    "     tr.id AS pl_trx_id,\n",
    "     tr.pl_transaction_date, \n",
    "     l.transaction_id,\n",
    "     l.start_date,\n",
    "     l.loan_amount,\n",
    "     l.tenure\n",
    "\n",
    "\tFROM \n",
    "\t    (SELECT user_id, \n",
    "\t            id, \n",
    "\t            transaction_date AS pl_transaction_date\n",
    "\t    FROM l2alpha.`transaction` WHERE id IN {tids}\n",
    "\t    ) AS tr\n",
    "\tINNER JOIN \n",
    "\t    (SELECT user_id,\n",
    "\t            start_date, \n",
    "\t            settlement_date,\n",
    "\t            loan_amount,\n",
    "\t            tenure,\n",
    "\t            transaction_id \n",
    "\t    FROM l2alpha.loan\n",
    "\t    WHERE is_active = 1 AND loan_amount > 0\n",
    "\t    ) AS l\n",
    "\tON l.user_id = tr.user_id AND settlement_date <= pl_transaction_date\n",
    "\t) AS base\n",
    "GROUP BY base.user_id, base.pl_trx_id\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "paid_trx = pd.read_sql(paid_trx_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(27, 3)"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "paid_trx.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Digital and Non-Digital Transactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_trx_history(base_df):\n",
    "    \"\"\"Query each base user's transaction history and pair every base PL\n",
    "    transaction with that user's historical transactions.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    base_df : pd.DataFrame\n",
    "        Must contain 'user_id', 'trx_id' and 'transaction_date'.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pd.DataFrame\n",
    "        One row per (base trx, historical trx) pair, with 'time_diff' =\n",
    "        seconds from the historical trx to the base trx.\n",
    "    \"\"\"\n",
    "    uids = tuple(base_df['user_id'])\n",
    "    # Cap the history at the latest base trx date so no future data leaks in.\n",
    "    max_trx_date = base_df['transaction_date'].max()\n",
    "    \n",
    "    # Remove all promotional and service related fees\n",
    "    excluded_product = ('biaya layanan'\n",
    "                        ,'shipping fee'\n",
    "                        ,'admin fee' \n",
    "                        ,'biaya kirim yang dibebankan' \n",
    "                        ,'biaya tambahan' \n",
    "                        ,'discount'             \n",
    "                        ,'adminfee'       \n",
    "                        ,'shippingfee'\n",
    "                        ,'additionalfee'                    \n",
    "                        ,'diskon tambahan' \n",
    "                        ,'donasi'\n",
    "                        ,'donation'                    \n",
    "                        ,'charge kredivo 2.3%'  \n",
    "                        ,'lainnya'   \n",
    "                        ,'user credit'\n",
    "                        ,'deals')  \n",
    "    \n",
    "    # NOTE(review): status = 4, transaction_type <> 2 and sku_type = 0 are\n",
    "    # magic schema codes — confirm their meanings (successful trx, non-PL\n",
    "    # type, regular SKU?) against the warehouse docs before reuse.\n",
    "    query = f\"\"\"\n",
    "    SELECT\n",
    "        t.user_id,\n",
    "        t.transaction_id AS trx_id,\n",
    "        t.transaction_date,\n",
    "        LOWER(p.name) AS product_name,\n",
    "        LOWER(p.category) AS category,\n",
    "        l.loan_amount,\n",
    "        t.merchant_id\n",
    "    FROM\n",
    "            (\n",
    "            SELECT\n",
    "                user_id,\n",
    "                id AS transaction_id,\n",
    "                transaction_date,\n",
    "                merchant_id\n",
    "            FROM\n",
    "                l2alpha.`transaction`\n",
    "            WHERE\n",
    "                user_id IN {uids}\n",
    "                AND status = 4\n",
    "                AND transaction_type <> 2\n",
    "                AND transaction_date <= '{max_trx_date}') AS t\n",
    "        JOIN (\n",
    "            SELECT\n",
    "                name,\n",
    "                id AS product_id,\n",
    "                transaction_id,\n",
    "                category,\n",
    "                sku\n",
    "            FROM\n",
    "                l2alpha.product\n",
    "            WHERE\n",
    "                sku_type = 0\n",
    "                AND name NOT IN {excluded_product}\n",
    "                AND sku NOT IN {excluded_product}) AS p ON\n",
    "            t.transaction_id = p.transaction_id\n",
    "        JOIN (\n",
    "            SELECT\n",
    "                transaction_id,\n",
    "                loan_amount\n",
    "            FROM\n",
    "                l2alpha.loan\n",
    "            WHERE\n",
    "                is_active = 1\n",
    "                AND loan_amount > 0) AS l ON\n",
    "            p.transaction_id = l.transaction_id\n",
    "        JOIN (\n",
    "            SELECT\n",
    "                id,\n",
    "                name\n",
    "            FROM\n",
    "                l2alpha.merchant) AS m ON\n",
    "            t.merchant_id = m.id\n",
    "    \"\"\"\n",
    "    \n",
    "    df = pd.read_sql(query, connect_sql())\n",
    "    # Left-join history on user_id; pandas suffixes the overlapping 'trx_id'\n",
    "    # and 'transaction_date' columns with _x (base) / _y (history), renamed\n",
    "    # here to current/past naming.\n",
    "    out_df = base_df[['user_id', 'trx_id', 'transaction_date']].merge(df, how='left', on='user_id')\\\n",
    "                    .rename(columns={'transaction_date_x':'trx_dt_curr',\n",
    "                                     'transaction_date_y':'trx_dt_past',\n",
    "                                     'trx_id_x':'trx_id',\n",
    "                                     'trx_id_y':'trx_id_ref'})\n",
    "    out_df['time_diff'] = (out_df['trx_dt_curr'] - out_df['trx_dt_past']).dt.total_seconds()\n",
    "    return out_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_prev_digital_trx_per_type(df, keyword, window='all'):\n",
    "    \"\"\"Aggregate prior digital transactions of one category per PL trx.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pd.DataFrame\n",
    "        Transaction history with boolean 'trx_cat_<keyword>' flag columns,\n",
    "        'time_diff' (seconds) and 'loan_amount'.\n",
    "    keyword : str\n",
    "        One of 'ticket', 'game', 'pulsa', 'utilities', 'digital'.\n",
    "    window : int or 'all', default 'all'\n",
    "        Lookback window in days; 'all' uses the full history.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pd.DataFrame keyed by ('user_id', 'trx_id') with count / sum / mean\n",
    "    loan_amount features for the requested category and window.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError for an unknown keyword or an invalid window.\n",
    "    \"\"\"\n",
    "    if keyword not in ['ticket', 'game', 'pulsa', 'utilities', 'digital']:\n",
    "        raise ValueError(f'{keyword} not available. Try ticket, game, pulsa, utilities, digital.')\n",
    "\n",
    "    if isinstance(window, int):\n",
    "        df = df[(df['time_diff'] <= window*(24*60*60)) & (df['time_diff'] > 0)]\n",
    "        name = str(window)\n",
    "    elif window == 'all':\n",
    "        df = df[df['time_diff'] > 0]\n",
    "        name = 'ever'\n",
    "    else:\n",
    "        # Anything that is neither an int nor 'all' is rejected. (The original\n",
    "        # elif condition was always true when reached, so this is equivalent.)\n",
    "        raise ValueError(f'The input \"{window}\" is wrong. Window value must be an integer or \"all\".')\n",
    "        \n",
    "    key = 'trx_cat_'+ keyword\n",
    "    digital = df.groupby(['user_id', 'trx_id', key]).agg({key:'count',\n",
    "                                                          'loan_amount':['sum', 'mean']})\\\n",
    "                .unstack().fillna(0)\n",
    "    \n",
    "    # Flatten the (measure, aggfunc, flag-value) MultiIndex columns into\n",
    "    # 'measure-agg-flag' strings.\n",
    "    digital.columns = ['-'.join(str(item) for item in groups)\n",
    "                       for groups in digital.columns.values]\n",
    "    \n",
    "    if keyword == 'digital':\n",
    "        # 'trx_cat_digital' is boolean, so unstack yields a False (non-digital,\n",
    "        # non-PL) and a True (digital) column per measure.\n",
    "        final = digital.reset_index()\n",
    "        final.columns = ['user_id', 'trx_id',\n",
    "                         f'nondgtl_nonpl_trx_co_{name}',\n",
    "                         f'dgtl_trx_co_all_{name}',\n",
    "                         f'nondgtl_nonpl_trx_sum_amt_{name}',\n",
    "                         f'dgtl_trx_sum_amt_all_{name}',\n",
    "                         f'nondgtl_nonpl_trx_avg_amt_{name}',\n",
    "                         f'dgtl_trx_avg_amt_all_{name}']\n",
    "        \n",
    "    else:\n",
    "        # For a sub-category only the True (flag set) columns are informative.\n",
    "        relevant_columns = [col for col in digital.columns if 'True' in col]\n",
    "        final = digital[relevant_columns].reset_index()\n",
    "        final.columns = ['user_id', 'trx_id', \n",
    "                         f'dgtl_trx_co_{keyword}_{name}', \n",
    "                         f'dgtl_trx_sum_amt_{keyword}_{name}',\n",
    "                         f'dgtl_trx_avg_amt_{keyword}_{name}']\n",
    "    \n",
    "    return final"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_digital_trx_feats(df):\n",
    "    \"\"\"End-to-end pipeline for the digital-transaction features.\n",
    "\n",
    "    Queries transaction history, flags digital products, builds the 180-day\n",
    "    digital / non-digital aggregates and left-joins them onto the base keys.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pd.DataFrame\n",
    "        Base population with 'user_id', 'trx_id', 'transaction_date'.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pd.DataFrame keyed by ('user_id', 'trx_id') with the digital trx features.\n",
    "\n",
    "    NOTE(review): relies on get_digital_product_status, defined in an earlier\n",
    "    cell not visible here — confirm it runs before this one on a fresh kernel.\n",
    "    \"\"\"\n",
    "    # Load Raw Data\n",
    "    st_tm_1 = datetime.now()\n",
    "    print('Querying Transaction Data')\n",
    "    \n",
    "    trx_hist = get_trx_history(df)\n",
    "    # Drop pairs with any missing field (users with no qualifying history).\n",
    "    trx_hist = trx_hist.dropna(how='any')\n",
    "    display(trx_hist.shape)\n",
    "    \n",
    "    print('query time:', datetime.now()-st_tm_1)\n",
    "    \n",
    "    #Create Digital Product Flags\n",
    "    st_tm_2 = datetime.now()\n",
    "    print('Create Digital Product Flags')\n",
    "    \n",
    "    trx_hist = get_digital_product_status(trx_hist)\n",
    "    display(trx_hist.shape)\n",
    "    \n",
    "    print('Create Digital Product Flags time:', datetime.now()-st_tm_2)\n",
    "    \n",
    "    #Create Features\n",
    "    st_tm_3 = datetime.now()\n",
    "    print('Create Features')\n",
    "    \n",
    "    digital_180 = create_prev_digital_trx_per_type(trx_hist, 'digital', window=180)\n",
    "    \n",
    "    print('Create Features time:', datetime.now()-st_tm_3)\n",
    "    \n",
    "    #Combine DataFrames\n",
    "    st_tm_4 = datetime.now()\n",
    "    print('Combine DataFrames')\n",
    "    \n",
    "    df_out = df[['user_id', 'trx_id']].merge(digital_180, how='left', on=['user_id', 'trx_id'])\n",
    "    \n",
    "    print('combine dataframe time:', datetime.now()-st_tm_4)\n",
    "    print('total processing time:', datetime.now()-st_tm_1)\n",
    "    \n",
    "    return df_out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Querying Transaction Data\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(1359, 10)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "query time: 0:00:02.621247\n",
      "Create Digital Product Flags\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(1359, 15)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Create Digital Product Flags time: 0:00:00.905178\n",
      "Create Features\n",
      "Create Features time: 0:00:00.017356\n",
      "Combine DataFrames\n",
      "combine dataframe time: 0:00:00.003762\n",
      "total processing time: 0:00:03.547844\n"
     ]
    }
   ],
   "source": [
    "dgtl_trx_feats = create_digital_trx_feats(base_df)[['user_id', 'trx_id', 'nondgtl_nonpl_trx_co_180']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(35, 3)"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dgtl_trx_feats.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pefindo Feats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('/home/ec2-user/SageMaker/zhilal/adhoc/pl_score_features_and_proba/27-11-2019/pl_bscore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "def connect_s3():\n",
    "    \"\"\"Return a boto3 S3 client built from the environment's default credentials.\"\"\"\n",
    "    return boto3.client('s3')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `uids` is not defined at notebook scope anywhere above (it only exists as\n",
    "# a local inside get_trx_history), so this cell failed on Restart & Run All.\n",
    "# Derive it from the base population here.\n",
    "uids = tuple(base_df['user_id'])\n",
    "\n",
    "# Map each user to their css_user_id, the key for raw Pefindo files on S3.\n",
    "query_pf = f'''\n",
    "SELECT user_id, css_user_id\n",
    "FROM l2alpha.css_users\n",
    "WHERE user_id IN {uids}\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "css_uid_df = pd.read_sql(query_pf, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "css_uid_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach each user's css_user_id; the left merge preserves the base rows\n",
    "# (users without a css_user_id get NaN and later yield no Pefindo features).\n",
    "pf_df = base_df.merge(css_uid_df, how='left', on='user_id')\n",
    "pf_df = pf_df[['user_id', 'trx_id', 'transaction_date', 'css_user_id']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "pf_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "# S3 location of the raw Pefindo reports, one JSON file per css_user_id.\n",
    "AWS_S3_BUCKET_NAME = 'csk-production'\n",
    "AWS_S3_FILE_PATH = 'userdata/pefindo/raw/'\n",
    "\n",
    "def get_read_json_from_s3(s3_client, bucket, key):\n",
    "    \"\"\"Fetch an S3 object and parse it as UTF-8 JSON.\n",
    "\n",
    "    Returns the parsed object, or None when the fetch/parse fails. Missing\n",
    "    keys are expected for users without a Pefindo report, so failures are\n",
    "    printed rather than raised.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        obj = s3_client.get_object(\n",
    "            Bucket=bucket,\n",
    "            Key=key\n",
    "        )\n",
    "        js_raw = json.loads(obj['Body'].read().decode('utf-8'))\n",
    "        return js_raw\n",
    "    except Exception as e:\n",
    "        # Broad catch is deliberate best-effort: one missing/corrupt report\n",
    "        # should not abort the whole feature run.\n",
    "        print(\"Can't get object from S3, error: {}\".format(e))\n",
    "        return None\n",
    "\n",
    "def get_raw_pefindo(css_user_id):\n",
    "    \"\"\"Return the raw Pefindo JSON for one css_user_id, or None if absent.\"\"\"\n",
    "    key = AWS_S3_FILE_PATH + css_user_id + '.json'\n",
    "    print(key)\n",
    "    raw_json = get_read_json_from_s3(connect_s3(), AWS_S3_BUCKET_NAME, key)\n",
    "    return raw_json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pl_bscore.feature_creation import create_pefindo_features\n",
    "\n",
    "def get_pf_feats(df):\n",
    "    \"\"\"Compute Pefindo features for one row (intended for DataFrame.apply).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pd.Series\n",
    "        Row with 'transaction_date' (str) and 'pefindo_raw' (parsed JSON or None).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict with 'pf_delin_max_dpd_12mo' (NaN when no raw report is available).\n",
    "    \"\"\"\n",
    "    pf_feat_dict = {}\n",
    "    # Use `is not None` (PEP 8) and only build PFFeatures when a raw report\n",
    "    # exists; the original constructed it unconditionally, even for None input.\n",
    "    if df['pefindo_raw'] is not None:\n",
    "        pefindo_feats = create_pefindo_features.PFFeatures(df['transaction_date'], df['pefindo_raw'])\n",
    "        pf_feat_dict['pf_delin_max_dpd_12mo'] = pefindo_feats.pf_delin_max_dpd_12mo()\n",
    "    else:\n",
    "        pf_feat_dict['pf_delin_max_dpd_12mo'] = np.nan\n",
    "    return pf_feat_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_pef_features(df):\n",
    "    \"\"\"Fetch raw Pefindo reports from S3 and turn them into feature columns.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pd.DataFrame\n",
    "        Must contain 'css_user_id' and 'transaction_date'. Not modified.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pd.DataFrame: input columns plus one column per computed Pefindo feature.\n",
    "    \"\"\"\n",
    "    # Work on a copy: the original mutated the caller's frame in place\n",
    "    # (added 'pefindo_raw' and cast 'transaction_date' to str).\n",
    "    df = df.copy()\n",
    "\n",
    "    # Load Raw Data\n",
    "    st_tm_1 = datetime.now()\n",
    "    print('Loading Pefindo Data')\n",
    "    df['pefindo_raw'] = df.apply(lambda x: get_raw_pefindo(x['css_user_id']), axis=1)\n",
    "    display(df.shape)\n",
    "    print('load files from S3 time:', datetime.now()-st_tm_1)\n",
    "    \n",
    "    # Process Data into Features (one dict of features per row)\n",
    "    st_tm_2 = datetime.now()\n",
    "    print('Processing Features')\n",
    "    df['transaction_date'] = df['transaction_date'].astype(str)\n",
    "    df['raw_feats'] = df.apply(get_pf_feats, axis=1)\n",
    "    print('feature processing time:', datetime.now()-st_tm_2)\n",
    "    \n",
    "    # Normalize the feature dicts into real columns, drop the intermediates.\n",
    "    st_tm_3 = datetime.now()\n",
    "    print('Normalizing DataFrame')\n",
    "    df_out = df.merge(pd.DataFrame(df['raw_feats'].tolist()), left_index=True, right_index=True)\n",
    "    df_out = df_out.drop(columns=['pefindo_raw', 'raw_feats'])\n",
    "    print('normalize dataframe:', datetime.now()-st_tm_3)\n",
    "    print('total processing time:', datetime.now()-st_tm_1)\n",
    "    \n",
    "    return df_out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading Pefindo Data\n",
      "userdata/pefindo/raw/774edde6-0936-47d6-bc7b-c8b3899a9cab.json\n",
      "userdata/pefindo/raw/d5a2c84f-cf07-4c7b-977b-5792c36cccbb.json\n",
      "userdata/pefindo/raw/cfeb31b8-cc9e-4003-8afb-cbbd3d5e8a43.json\n",
      "Can't get object from S3, error: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.\n",
      "userdata/pefindo/raw/4b445f35-8349-41b3-8ba0-90169b9f979b.json\n",
      "Can't get object from S3, error: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.\n",
      "userdata/pefindo/raw/a411710f-eb55-4145-b2b9-bdf02989543c.json\n",
      "userdata/pefindo/raw/37c1d182-288c-4774-bd7d-fb14572d095a.json\n",
      "userdata/pefindo/raw/e26c2a05-3fa8-4508-b74c-5ad1d6baeb97.json\n",
      "userdata/pefindo/raw/70d3910d-07d2-4621-96ec-060bebe4be84.json\n",
      "Can't get object from S3, error: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.\n",
      "userdata/pefindo/raw/30788cc2-a54e-46ab-85c4-239653ccd519.json\n",
      "Can't get object from S3, error: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.\n",
      "userdata/pefindo/raw/a32e8bd2-0625-4e7a-abd4-803d646edd51.json\n",
      "userdata/pefindo/raw/ac311ca1-2fce-4a13-9eb6-b4174c8aba40.json\n",
      "userdata/pefindo/raw/0c5af50a-aa0e-4467-9166-25a4331add5e.json\n",
      "userdata/pefindo/raw/75654aa1-e860-4efb-a68d-da9ef5c82e6f.json\n",
      "userdata/pefindo/raw/008c6224-8e81-4679-bc63-1ce7e89fd289.json\n",
      "userdata/pefindo/raw/410c21b7-c768-4ab0-b674-76037c5e277d.json\n",
      "Can't get object from S3, error: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.\n",
      "userdata/pefindo/raw/ab2d3433-89f9-4b16-8cb8-0de3e502e10e.json\n",
      "userdata/pefindo/raw/7b52f866-df09-4f35-b7b1-bcf725ad8c1f.json\n",
      "userdata/pefindo/raw/62a08a0d-aed9-4b7d-afc7-d25d20d23c3d.json\n",
      "Can't get object from S3, error: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.\n",
      "userdata/pefindo/raw/65d59d44-7a85-4879-83f2-54e62598126e.json\n",
      "userdata/pefindo/raw/ab728a05-d104-46e4-b7aa-6b2c4218b6b8.json\n",
      "Can't get object from S3, error: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.\n",
      "userdata/pefindo/raw/8aed5328-5fec-4337-ad9d-aeb80f2e71ba.json\n",
      "userdata/pefindo/raw/80312aeb-03b2-4a23-a503-12b18cfed533.json\n",
      "userdata/pefindo/raw/68b3efef-87ea-4770-b2f5-6c78d9299464.json\n",
      "userdata/pefindo/raw/12b79dc7-7c7d-438c-8aba-0e3c2dddb9b5.json\n",
      "userdata/pefindo/raw/d09a454c-b22c-4285-ae2b-21244b12249b.json\n",
      "userdata/pefindo/raw/def3935e-f60d-46fb-98a1-b18ab48880ca.json\n",
      "userdata/pefindo/raw/5481a8b6-cc9a-42cd-93ea-abedc000f4b7.json\n",
      "userdata/pefindo/raw/4971f6f2-915f-4290-85e9-233d37e80e2d.json\n",
      "userdata/pefindo/raw/dc7689fc-147b-41f9-a268-1b9fcf694b38.json\n",
      "Can't get object from S3, error: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.\n",
      "userdata/pefindo/raw/7a45c107-2934-42a1-b2c7-dc18da87e1cc.json\n",
      "userdata/pefindo/raw/58c2b810-6654-4d56-be70-49f7da790f0b.json\n",
      "Can't get object from S3, error: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.\n",
      "userdata/pefindo/raw/2dcb24a5-cd24-4e14-8046-cb6e09163d3b.json\n",
      "userdata/pefindo/raw/2dcb24a5-cd24-4e14-8046-cb6e09163d3b.json\n",
      "userdata/pefindo/raw/899d6d0c-2076-4b09-b8d8-aa9a99a24bed.json\n",
      "userdata/pefindo/raw/8dfe25c0-1bff-4631-a382-8b8f18e404f5.json\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(35, 5)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "load files from S3 time: 0:00:02.392329\n",
      "Processing Features\n",
      "feature processing time: 0:00:00.253351\n",
      "Normalizing DataFrame\n",
      "normalize dataframe: 0:00:00.002585\n",
      "total processing time: 0:00:02.648828\n"
     ]
    }
   ],
   "source": [
    "pefindo_feats = create_pef_features(pf_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(35, 5)"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pefindo_feats.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "pefindo_feats = pefindo_feats[['user_id', 'trx_id', 'pf_delin_max_dpd_12mo']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Payment_type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Payment type for each PL transaction. Assumes `tids` (tuple of PL trx ids)\n",
    "# is defined in an earlier cell — TODO confirm on a fresh kernel.\n",
    "payment_type_q = f'''\n",
    "SELECT user_id, id AS trx_id, payment_type\n",
    "FROM l2alpha.transaction\n",
    "WHERE id IN {tids}\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "payment_type_df = pd.read_sql(payment_type_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "payment_type_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combine all dataframes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:44.586494Z",
     "start_time": "2019-11-07T04:02:44.510100Z"
    }
   },
   "outputs": [],
   "source": [
    "# Assemble the final unit-test feature frame: start from the base keys and\n",
    "# left-join every feature block so the base row count is preserved.\n",
    "# NOTE(review): snapshot, delin, time_pl, oth_last_rep, util_non_pl, util_pl,\n",
    "# plo_apps and prev_obs_feats_90 come from earlier cells not shown here.\n",
    "final_df = base_df[['trx_id', 'user_id']]\\\n",
    "                  .merge(snapshot, on=['trx_id', 'user_id'], how='left')\\\n",
    "                  .merge(delin, on=['trx_id', 'user_id'], how='left')\\\n",
    "                  .merge(time_pl, on=['trx_id', 'user_id'], how='left')\\\n",
    "                  .merge(oth_last_rep, on=['trx_id', 'user_id'], how='left')\\\n",
    "                  .merge(util_non_pl, on=['trx_id', 'user_id'], how='left')\\\n",
    "                  .merge(util_pl, on=['trx_id', 'user_id'], how='left')\\\n",
    "                  .merge(plo_apps, on='user_id', how='left')\\\n",
    "                  .merge(prev_obs_feats_90, how='left', on=['user_id', 'trx_id'])\\\n",
    "                  .merge(early_settle_feats_180, how='left', on=['user_id', 'trx_id'])\\\n",
    "                  .merge(time_from_prev_pl_settle, how='left', on=['user_id', 'trx_id'])\\\n",
    "                  .merge(date_feats, how='left', on=['user_id', 'trx_id'])\\\n",
    "                  .merge(denied_trx, how='left', on=['user_id', 'trx_id'])\\\n",
    "                  .merge(paid_trx, how='left', on=['user_id', 'trx_id'])\\\n",
    "                  .merge(dgtl_trx_feats, how='left', on=['user_id', 'trx_id'])\\\n",
    "                  .merge(pefindo_feats, how='left', on=['user_id', 'trx_id'])\\\n",
    "                  .merge(payment_type_df, how='left', on=['user_id', 'trx_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:44.591815Z",
     "start_time": "2019-11-07T04:02:44.587913Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(35, 19)"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(35, 8)"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:44.617944Z",
     "start_time": "2019-11-07T04:02:44.593760Z"
    }
   },
   "outputs": [],
   "source": [
    "final_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['ap_co_plo',\n",
       " 'current_dpd',\n",
       " 'date_of_month',\n",
       " 'delin_max_dpd_90',\n",
       " 'nondgtl_nonpl_trx_co_180',\n",
       " 'oth_last_rep_days',\n",
       " 'oth_last_rep_dpd',\n",
       " 'payment_type',\n",
       " 'pf_delin_max_dpd_12mo',\n",
       " 'pl_settle_to_due_last_180',\n",
       " 'pl_trx_suc_co_90',\n",
       " 'time_appr_to_pl_hour',\n",
       " 'time_from_last_sett_pl_hour',\n",
       " 'trx_denied_co_90',\n",
       " 'trx_id',\n",
       " 'trx_sett_sum_30',\n",
       " 'user_id',\n",
       " 'util_non_pl',\n",
       " 'util_pl']"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sorted(final_df.columns.tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model feature list for the unit tests — final_df's columns minus the\n",
    "# identifier columns ('user_id', 'trx_id'); keep in sync with the sorted\n",
    "# column listing above.\n",
    "features = ['ap_co_plo',\n",
    "            'current_dpd',\n",
    "            'date_of_month',\n",
    "            'delin_max_dpd_90',\n",
    "            'nondgtl_nonpl_trx_co_180',\n",
    "            'oth_last_rep_days',\n",
    "            'oth_last_rep_dpd',\n",
    "            'payment_type',\n",
    "            'pf_delin_max_dpd_12mo',\n",
    "            'pl_settle_to_due_last_180',\n",
    "            'pl_trx_suc_co_90',\n",
    "            'time_appr_to_pl_hour',\n",
    "            'time_from_last_sett_pl_hour',\n",
    "            'trx_denied_co_90',\n",
    "            'trx_sett_sum_30',\n",
    "            'util_non_pl',\n",
    "            'util_pl'\n",
    "           ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:44.798454Z",
     "start_time": "2019-11-07T04:02:44.619626Z"
    }
   },
   "outputs": [],
   "source": [
    "final_df.to_parquet('data/unittest_feats_19022020.parquet', compression='gzip')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}