past-data-projects / personal_loan_credit_risk / 200-Feature_Creation / 202 - Additional New Features transaction related.ipynb
202 - Additional New Features transaction related.ipynb
Raw
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pymysql\n",
    "import re\n",
    "from datetime import datetime\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import configparser\n",
    "config = configparser.ConfigParser()\n",
    "config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
    "\n",
    "host = config['MYSQL-ROOT']['HOST']\n",
    "user = config['MYSQL-ROOT']['USER']\n",
    "password = config['MYSQL-ROOT']['PASSWORD']\n",
    "\n",
    "def connect_sql():\n",
    "    cnx = pymysql.connect(host=host,\n",
    "                          user=user,\n",
    "                          password=password,\n",
    "                          cursorclass=pymysql.cursors.DictCursor)\n",
    "    return cnx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_df = pd.read_parquet('data/interim/pl_jumbo_mini_junjulaugsep_27012020.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 17)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "135508"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_df['user_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "150672"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_df['trx_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timestamp('2019-09-28 23:49:58')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_df['transaction_date'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "tids = tuple(raw_df['trx_id'].unique())\n",
    "uids = tuple(raw_df['user_id'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "150672"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(tids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "135508"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(uids)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Denied Transactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "denied_trx_q = f'''\n",
    "SELECT base.user_id,\n",
    "base.pl_trx_id AS trx_id,\n",
    "COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 7 AND \n",
    "               datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
    "               THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_7d,\n",
    "COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 15 AND \n",
    "               datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
    "               THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_15d,\n",
    "COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 30 AND \n",
    "               datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
    "               THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_1mo,\n",
    "COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 60 AND \n",
    "               datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
    "               THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_2mo,\n",
    "COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 90 AND \n",
    "               datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
    "               THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_3mo,\n",
    "COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 120 AND \n",
    "               datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
    "               THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_4mo,\n",
    "COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 180 AND \n",
    "               datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
    "               THEN base.transaction_id ELSE NULL END)) AS denied_trx_co_6mo,\n",
    "SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 7 AND \n",
    "                   datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
    "                   THEN base.amount ELSE 0 END) AS denied_trx_sum_7d,\n",
    "SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 15 AND \n",
    "                   datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
    "                   THEN base.amount ELSE 0 END) AS denied_trx_sum_15d,\n",
    "SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 30 AND \n",
    "                   datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
    "                   THEN base.amount ELSE 0 END) AS denied_trx_sum_1mo,\n",
    "SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 60 AND \n",
    "                   datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
    "                   THEN base.amount ELSE 0 END) AS denied_trx_sum_2mo,\n",
    "SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 90 AND \n",
    "                   datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
    "                   THEN base.amount ELSE 0 END) AS denied_trx_sum_3mo,\n",
    "SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 120 AND \n",
    "                   datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
    "                   THEN base.amount ELSE 0 END) AS denied_trx_sum_4mo,\n",
    "SUM(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 180 AND \n",
    "                   datediff(DATE(pl_transaction_date),transaction_date) >=0 \n",
    "                   THEN base.amount ELSE 0 END) AS denied_trx_sum_6mo                    \n",
    "FROM\n",
    "    (SELECT tr1.user_id,\n",
    "            tr1.id AS pl_trx_id,\n",
    "            tr1.pl_transaction_date,\n",
    "            tr2.id AS transaction_id,\n",
    "            tr2.transaction_date,\n",
    "            tr2.amount\n",
    "    FROM\n",
    "        (SELECT user_id, \n",
    "              id, \n",
    "              transaction_date AS pl_transaction_date\n",
    "        FROM l2alpha.`transaction` WHERE id IN {tids}\n",
    "        ) AS tr1\n",
    "    LEFT JOIN\n",
    "        (SELECT * \n",
    "        FROM l2alpha.`transaction`\n",
    "        WHERE status = 5\n",
    "        ) AS tr2 \n",
    "    ON tr1.user_id = tr2.user_id \n",
    "    AND tr2.transaction_date <= tr1.pl_transaction_date\n",
    "    ) AS base\n",
    "GROUP BY base.user_id, base.pl_transaction_date\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "denied_trx = pd.read_sql(denied_trx_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 16)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "denied_trx.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "135508"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "denied_trx['user_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "150672"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "denied_trx['trx_id'].nunique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Settled Transactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "paid_trx_q = f'''\n",
    "SELECT base.user_id,\n",
    "base.pl_trx_id AS trx_id,\n",
    "COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 7 AND \n",
    "                   datediff(DATE(pl_transaction_date),start_date) >=0 \n",
    "                   THEN base.transaction_id ELSE NULL END)) AS settled_trx_co_7d,\n",
    "COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 15 AND \n",
    "                   datediff(DATE(pl_transaction_date),start_date) >=0 \n",
    "                   THEN base.transaction_id ELSE NULL END)) AS settled_trx_co_15d,\n",
    "COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 30 AND \n",
    "                   datediff(DATE(pl_transaction_date),start_date) >=0 \n",
    "                   THEN base.transaction_id ELSE NULL END)) AS settled_trx_co_1mo,\n",
    "COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 60 AND \n",
    "                   datediff(DATE(pl_transaction_date),start_date) >=0 \n",
    "                   THEN base.transaction_id ELSE NULL END)) AS settled_trx_co_2mo,\n",
    "COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 90 AND \n",
    "                   datediff(DATE(pl_transaction_date),start_date) >=0 \n",
    "                   THEN base.transaction_id ELSE NULL END)) AS settled_trx_co_3mo,\n",
    "COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 180 AND \n",
    "                   datediff(DATE(pl_transaction_date),start_date) >=0 \n",
    "                   THEN base.transaction_id ELSE NULL END)) AS settled_trx_co_6mo,\n",
    "COUNT(DISTINCT(base.transaction_id)) AS settled_trx_co_ever,\n",
    "SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 7 AND \n",
    "                   datediff(DATE(pl_transaction_date),start_date) >=0 \n",
    "                   THEN base.loan_amount ELSE 0 END) AS settled_trx_sum_7d,\n",
    "SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 15 AND \n",
    "                   datediff(DATE(pl_transaction_date),start_date) >=0 \n",
    "                   THEN base.loan_amount ELSE 0 END) AS settled_trx_sum_15d,\n",
    "SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 30 AND \n",
    "                   datediff(DATE(pl_transaction_date),start_date) >=0 \n",
    "                   THEN base.loan_amount ELSE 0 END) AS settled_trx_sum_1mo,\n",
    "SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 60 AND \n",
    "                   datediff(DATE(pl_transaction_date),start_date) >=0 \n",
    "                   THEN base.loan_amount ELSE 0 END) AS settled_trx_sum_2mo,\n",
    "SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 90 AND \n",
    "                   datediff(DATE(pl_transaction_date),start_date) >=0 \n",
    "                   THEN base.loan_amount ELSE 0 END) AS settled_trx_sum_3mo,\n",
    "SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 180 AND \n",
    "                   datediff(DATE(pl_transaction_date),start_date) >=0 \n",
    "                   THEN base.loan_amount ELSE 0 END) AS settled_trx_sum_6mo,\n",
    "SUM(COALESCE(base.loan_amount,0)) AS settled_trx_sum_ever\n",
    "FROM\n",
    "    (SELECT tr.user_id,\n",
    "     tr.id AS pl_trx_id,\n",
    "     tr.pl_transaction_date, \n",
    "     l.transaction_id,\n",
    "     l.start_date,\n",
    "     l.loan_amount,\n",
    "     l.tenure\n",
    "\n",
    "\tFROM \n",
    "\t    (SELECT user_id, \n",
    "\t            id, \n",
    "\t            transaction_date AS pl_transaction_date\n",
    "\t    FROM l2alpha.`transaction` WHERE id IN {tids}\n",
    "\t    ) AS tr\n",
    "\tINNER JOIN \n",
    "\t    (SELECT user_id,\n",
    "\t            start_date, \n",
    "\t            settlement_date,\n",
    "\t            loan_amount,\n",
    "\t            tenure,\n",
    "\t            transaction_id \n",
    "\t    FROM l2alpha.loan\n",
    "\t    WHERE is_active = 1 AND loan_amount > 0\n",
    "\t    ) AS l\n",
    "\tON l.user_id = tr.user_id AND settlement_date <= pl_transaction_date\n",
    "\t) AS base\n",
    "GROUP BY base.user_id, base.pl_transaction_date\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "paid_trx = pd.read_sql(paid_trx_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(124816, 16)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "paid_trx.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "112438"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "paid_trx['user_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "124816"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "paid_trx['trx_id'].nunique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "trx_feats = raw_df[['user_id', 'trx_id']].merge(denied_trx, how='left', on=['user_id', 'trx_id'])\\\n",
    "                                         .merge(paid_trx, how='left', on=['user_id', 'trx_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 30)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trx_feats.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "135508"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trx_feats['user_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "150672"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trx_feats['trx_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "trx_feats.to_parquet('data/interim/pl_model_v3_new_trx_feats_27012020.parquet', compression='gzip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}