past-data-projects / personal_loan_credit_risk / 700-Productionization / 700 - Features for Production.ipynb
700 - Features for Production.ipynb
Raw
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pymysql"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import configparser\n",
    "config = configparser.ConfigParser()\n",
    "config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
    "\n",
    "host = config['MYSQL-ROOT']['HOST']\n",
    "user = config['MYSQL-ROOT']['USER']\n",
    "password = config['MYSQL-ROOT']['PASSWORD']\n",
    "\n",
    "def connect_sql():\n",
    "    cnx = pymysql.connect(host=host,\n",
    "                          user=user,\n",
    "                          password=password,\n",
    "                          cursorclass=pymysql.cursors.DictCursor)\n",
    "    return cnx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = ['ap_co_plo',\n",
    "            'current_dpd',\n",
    "            'date_of_month',\n",
    "            'delin_max_dpd_90',\n",
    "            'nondgtl_nonpl_trx_co_180',\n",
    "            'oth_last_rep_days',\n",
    "            'oth_last_rep_dpd',\n",
    "            'payment_type',\n",
    "            'pf_delin_max_dpd_12mo',\n",
    "            'pl_settle_to_due_last_180',\n",
    "            'pl_trx_jumbomini_suc_co_90',\n",
    "            'time_appr_to_pl_hour',\n",
    "            'time_from_last_sett_pl_hour',\n",
    "            'trx_denied_co_90',\n",
    "            'trx_sett_sum_30',\n",
    "            'util_non_pl',\n",
    "            'util_pl'\n",
    "           ]\n",
    "features = sorted(features)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- ap_co_plo\n",
    "    - --> same as PL V2\n",
    "\n",
    "\n",
    "- current_dpd\n",
    "- delin_max_dpd_90 (renamed from delin_max_dpd_3mo)\n",
    "    - --> same as PL V2 (less features)\n",
    "\n",
    "\n",
    "- oth_last_rep_days\n",
    "- oth_last_rep_dpd\n",
    "    - --> same as PL V2 (less features)\n",
    "\n",
    "\n",
    "- pf_delin_max_dpd_12mo\n",
    "    - --> same as PL V2\n",
    "\n",
    "\n",
    "- time_appr_to_pl_hour (renamed from time_approve_to_pl_hour)\n",
    "    - --> same as PL V2\n",
    "\n",
    "\n",
    "- util_non_pl\n",
    "    - --> same as PL V2\n",
    "\n",
    "\n",
    "- util_pl\n",
    "    - --> same as PL V2\n",
    "\n",
    "\n",
    "- pl_trx_jumbomini_suc_co_90 (in the code it's named pl_trx_co_jumbomini_4_90)\n",
    "    - new feature. No NaN values\n",
    "\n",
    "\n",
    "- date_of_month\n",
    "    - get date of day\n",
    "\n",
    "\n",
    "- payment_type\n",
    "    - get payment type\n",
    "\n",
    "\n",
    "- nondgtl_nonpl_trx_co_180\n",
    "    - --> requires adjustments\n",
    "\n",
    "\n",
    "- trx_denied_co_90\n",
    "    - --> new feature. No NaN values\n",
    "\n",
    "\n",
    "- trx_sett_sum_30\n",
    "    - --> new feature. NaN value possible. How to handle this?\n",
    "\n",
    "\n",
    "- pl_settle_to_due_last_180 (in the code it's named settle_to_due_last_pl_180)\n",
    "- time_from_last_sett_pl_hour (in the code it's named time_from_prev_pl_settle)\n",
    "    - --> requires adjustments"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Base Query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_base_pl_trx(tid):\n",
    "    query_kr = f'''\n",
    "    SELECT user_id, \n",
    "           id AS trx_id,\n",
    "           payment_type, \n",
    "           transaction_type,\n",
    "           transaction_date\n",
    "    FROM l2alpha.`transaction`\n",
    "    WHERE id = {tid}\n",
    "    '''\n",
    "    kr_df = pd.read_sql_query(query_kr, connect_sql())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# New Features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## pl_trx_jumbomini_suc_co_90"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_pl_trx(tids):\n",
    "    pl_trx_query = f'''\n",
    "    SELECT\n",
    "        base.user_id,\n",
    "        base.pl_trx_id AS trx_id,\n",
    "        COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date), transaction_date) <= 90 \n",
    "        AND datediff(DATE(pl_transaction_date), transaction_date) >= 0 \n",
    "        THEN base.transaction_id ELSE NULL END)) AS pl_trx_jumbomini_suc_co_90\n",
    "    FROM\n",
    "        (\n",
    "        SELECT\n",
    "            tr1.user_id,\n",
    "            tr1.id AS pl_trx_id,\n",
    "            tr1.pl_transaction_date,\n",
    "            tr2.id AS transaction_id,\n",
    "            tr2.transaction_date,\n",
    "            tr2.amount\n",
    "        FROM\n",
    "            (\n",
    "            SELECT\n",
    "                user_id,\n",
    "                id,\n",
    "                transaction_date AS pl_transaction_date\n",
    "            FROM\n",
    "                l2alpha.`transaction`\n",
    "            WHERE\n",
    "                id = {tids} ) AS tr1\n",
    "        LEFT JOIN (\n",
    "            SELECT\n",
    "                *\n",
    "            FROM\n",
    "                l2alpha.`transaction`\n",
    "            WHERE\n",
    "                status = 4\n",
    "                and transaction_type = 2 ) AS tr2 ON\n",
    "            tr1.user_id = tr2.user_id\n",
    "            AND tr2.transaction_date < tr1.pl_transaction_date ) AS base\n",
    "    GROUP BY\n",
    "        base.user_id,\n",
    "        base.pl_transaction_date\n",
    "    '''\n",
    "    pl_trx = pd.read_sql(pl_trx_query, connect_sql())\n",
    "    return pl_trx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "get_pl_trx(tids)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## nondgtl_nonpl_trx_co_180"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_trx_history(base_df):\n",
    "    uids = tuple(base_df['user_id'])\n",
    "    max_trx_date = base_df['transaction_date'].max()\n",
    "    \n",
    "    # Remove all promotional and service related fees\n",
    "    excluded_product = ('biaya layanan'\n",
    "                        ,'shipping fee'\n",
    "                        ,'admin fee' \n",
    "                        ,'biaya kirim yang dibebankan' \n",
    "                        ,'biaya tambahan' \n",
    "                        ,'discount'             \n",
    "                        ,'adminfee'       \n",
    "                        ,'shippingfee'\n",
    "                        ,'additionalfee'                    \n",
    "                        ,'diskon tambahan' \n",
    "                        ,'donasi'\n",
    "                        ,'donation'                    \n",
    "                        ,'charge kredivo 2.3%'  \n",
    "                        ,'lainnya'   \n",
    "                        ,'user credit'\n",
    "                        ,'deals')  \n",
    "    \n",
    "    query = f\"\"\"\n",
    "    SELECT\n",
    "        t.user_id,\n",
    "        t.transaction_id AS trx_id,\n",
    "        t.transaction_date,\n",
    "        LOWER(p.name) AS product_name,\n",
    "        LOWER(p.category) AS category,\n",
    "        l.loan_amount,\n",
    "        t.merchant_id\n",
    "    FROM\n",
    "            (\n",
    "            SELECT\n",
    "                user_id,\n",
    "                id AS transaction_id,\n",
    "                transaction_date,\n",
    "                merchant_id\n",
    "            FROM\n",
    "                l2alpha.`transaction`\n",
    "            WHERE\n",
    "                user_id IN {uids}\n",
    "                AND status = 4\n",
    "                AND transaction_type <> 2\n",
    "                AND transaction_date <= '{max_trx_date}') AS t\n",
    "        JOIN (\n",
    "            SELECT\n",
    "                name,\n",
    "                id AS product_id,\n",
    "                transaction_id,\n",
    "                category,\n",
    "                sku\n",
    "            FROM\n",
    "                l2alpha.product\n",
    "            WHERE\n",
    "                sku_type = 0\n",
    "                AND name NOT IN {excluded_product}\n",
    "                AND sku NOT IN {excluded_product}) AS p ON\n",
    "            t.transaction_id = p.transaction_id\n",
    "        JOIN (\n",
    "            SELECT\n",
    "                transaction_id,\n",
    "                loan_amount\n",
    "            FROM\n",
    "                l2alpha.loan\n",
    "            WHERE\n",
    "                is_active = 1\n",
    "                AND loan_amount > 0) AS l ON\n",
    "            p.transaction_id = l.transaction_id\n",
    "        JOIN (\n",
    "            SELECT\n",
    "                id,\n",
    "                name\n",
    "            FROM\n",
    "                l2alpha.merchant) AS m ON\n",
    "            t.merchant_id = m.id\n",
    "    \"\"\"\n",
    "    \n",
    "    df = pd.read_sql(query, connect_sql())\n",
    "    out_df = base_df[['user_id', 'trx_id', 'transaction_date']].merge(df, how='left', on='user_id')\\\n",
    "                    .rename(columns={'transaction_date_x':'trx_dt_curr',\n",
    "                                     'transaction_date_y':'trx_dt_past',\n",
    "                                     'trx_id_x':'trx_id',\n",
    "                                     'trx_id_y':'trx_id_ref'})\n",
    "    out_df['time_diff'] = (out_df['trx_dt_curr'] - out_df['trx_dt_past']).dt.total_seconds()\n",
    "    return out_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# def get_ticket(x):\n",
    "#     if int(x['merchant_id']) in [154,428,48,87,432]:\n",
    "#         if 'hotel' not in x['product_name'] and 'hotel' not in x['category']:\n",
    "#             return True\n",
    "#     elif x['merchant_id'] in [270]:\n",
    "#         if 'flight' in x['product_name'] or 'flight' in x['category']:\n",
    "#             return True\n",
    "\n",
    "#     return False\n",
    "\n",
    "def get_utilities(x):\n",
    "    for w in ['pdam','pln','tv kabel', 'electricity','electricity_postpaid',\n",
    "              'listrik pln', 'pln','pdam','internet & tv kabel','air pdam', \n",
    "              '75061609','75062176', 'listrik prabayar', 'listrik lainnya','pulsa listrik']:\n",
    "        if w in x['product_name'] or w in x['category']:        \n",
    "            return True\n",
    "\n",
    "    if re.search(r'(tagihan).*(listrik|telkom)|(voucher|token).*(listrik|pln)|pln.*(prepaid|postpaid|token)|\\bpdam\\b', x['product_name']):\n",
    "        return True\n",
    "\n",
    "    return False\n",
    "\n",
    "def get_game(x):\n",
    "    if int(x['merchant_id']) in [426,197]:\n",
    "        return True\n",
    "    elif re.search(r'(voucher|data).*game|game.*(voucher|data)|\\bgamesmax\\b', x['product_name']):\n",
    "        return True\n",
    "    elif re.search('voucher', x['category']):\n",
    "        if re.search('game',x['product_name']):\n",
    "            return True\n",
    "    else:\n",
    "        return False\n",
    "\n",
    "def get_pulsa(x):\n",
    "    if int(x['merchant_id']) == 8:\n",
    "        for n in ['pln','listrik','pdam','game']:\n",
    "            if n in x['product_name']:\n",
    "                return False\n",
    "        if 'rp' in x['product_name']:\n",
    "            return True\n",
    "    elif x['merchant_id'] != 8:\n",
    "        for n in ['pln','listrik','pdam','game', 'flight','mobiles & tablets', 'komputer & aksesoris']:\n",
    "            if n in x['product_name'] or n in x['category']:\n",
    "                return False\n",
    "        for w in ['mobile', 'digital products','pulsa', \n",
    "              'virtual','75061635','paket data','digital utilities','pasca bayar']:\n",
    "            if w in x['category']:\n",
    "                return True\n",
    "\n",
    "        if re.search(r'\\bpulsa\\b', x['product_name']):\n",
    "            return True\n",
    "\n",
    "        for w in ['paket data', 'kuota', 'xl xtra', 'indosat' , 'telkomsel', 'voucher 3',\n",
    "                  'smartfren', 'voucher tri', 'xl xtra combo', 'axis', 'paket internet']:\n",
    "            if w in x['product_name']:\n",
    "                return True\n",
    "\n",
    "    else:\n",
    "        return False\n",
    "\n",
    "def get_digital_product_status(df):\n",
    "#     df['trx_cat_ticket'] = df.apply(lambda x: get_ticket(x), axis=1)\n",
    "    df['trx_cat_game'] = df.apply(lambda x: get_game(x), axis=1)\n",
    "    df['trx_cat_pulsa'] = df.apply(lambda x: get_pulsa(x), axis=1)\n",
    "    df['trx_cat_utilities'] = df.apply(lambda x: get_utilities(x), axis=1)\n",
    "    \n",
    "    df['trx_cat_digital'] = df[['trx_cat_game', 'trx_cat_pulsa', 'trx_cat_utilities']].any(1)\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_prev_digital_trx_per_type(df, keyword, window='all'):\n",
    "    if keyword not in ['ticket', 'game', 'pulsa', 'utilities', 'digital']:\n",
    "        raise ValueError(f'{keyword} not available. Try ticket, game, pulsa, utilities, digital.')\n",
    "                         \n",
    "    if isinstance(window, int):\n",
    "        df = df[(df['time_diff'] <= window*(24*60*60)) & (df['time_diff'] > 0)]\n",
    "        name = str(window)\n",
    "    elif window == 'all':\n",
    "        df = df[df['time_diff'] > 0]\n",
    "        name = 'ever'\n",
    "    elif isinstance(window, int) == False or window != 'all':\n",
    "        raise ValueError(f'The input \"{window}\" is wrong. Window value must be an integer or \"all\".')\n",
    "        \n",
    "    key = 'trx_cat_'+ keyword\n",
    "    digital = df.groupby(['user_id', 'trx_id', key]).agg({key:'count',\n",
    "                                                          'loan_amount':['sum', 'mean']})\\\n",
    "                .unstack().fillna(0)\n",
    "    \n",
    "    columns = []\n",
    "    for groups in digital.columns.values:\n",
    "        col_name = []\n",
    "        for item in groups:\n",
    "            col_name.append(str(item))\n",
    "        new_name = '-'.join(col_name)\n",
    "        columns.append(new_name)\n",
    "    digital.columns = columns\n",
    "    \n",
    "    if keyword == 'digital':\n",
    "        final = digital.reset_index()\n",
    "        final.columns = ['user_id', 'trx_id',\n",
    "                         f'nondgtl_nonpl_trx_co_{name}',\n",
    "                         f'dgtl_trx_co_all_{name}',\n",
    "                         f'nondgtl_nonpl_trx_sum_amt_{name}',\n",
    "                         f'dgtl_trx_sum_amt_all_{name}',\n",
    "                         f'nondgtl_nonpl_trx_avg_amt_{name}',\n",
    "                         f'dgtl_trx_avg_amt_all_{name}']\n",
    "        \n",
    "    else:\n",
    "        relevant_columns = [col for col in digital.columns if 'True' in col]\n",
    "        final = digital[relevant_columns].reset_index()\n",
    "        final.columns = ['user_id', 'trx_id', \n",
    "                         f'dgtl_trx_co_{keyword}_{name}', \n",
    "                         f'dgtl_trx_sum_amt_{keyword}_{name}',\n",
    "                         f'dgtl_trx_avg_amt_{keyword}_{name}']\n",
    "    \n",
    "    return final"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "create_prev_digital_trx_per_type(trx_hist, 'digital', window=180)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## trx_denied_co_90"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_denied_trx(tid):\n",
    "    denied_trx_q = f'''\n",
    "    SELECT\n",
    "        base.user_id,\n",
    "        base.pl_trx_id AS trx_id,\n",
    "        COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date), transaction_date) <= 90 \n",
    "        AND datediff(DATE(pl_transaction_date), transaction_date) >= 0 \n",
    "        THEN base.transaction_id ELSE NULL END)) AS trx_denied_co_90\n",
    "    FROM\n",
    "        (\n",
    "        SELECT\n",
    "            tr1.user_id,\n",
    "            tr1.id AS pl_trx_id,\n",
    "            tr1.pl_transaction_date,\n",
    "            tr2.id AS transaction_id,\n",
    "            tr2.transaction_date,\n",
    "            tr2.amount\n",
    "        FROM\n",
    "            (\n",
    "            SELECT\n",
    "                user_id,\n",
    "                id,\n",
    "                transaction_date AS pl_transaction_date\n",
    "            FROM\n",
    "                l2alpha.`transaction`\n",
    "            WHERE\n",
    "                id = {tid} ) AS tr1\n",
    "        LEFT JOIN (\n",
    "            SELECT\n",
    "                *\n",
    "            FROM\n",
    "                l2alpha.`transaction`\n",
    "            WHERE\n",
    "                status = 5 ) AS tr2 ON\n",
    "            tr1.user_id = tr2.user_id\n",
    "            AND tr2.transaction_date <= tr1.pl_transaction_date ) AS base\n",
    "    GROUP BY\n",
    "        base.user_id,\n",
    "        base.pl_transaction_date\n",
    "    '''\n",
    "    denied_trx = pd.read_sql(denied_trx_q, connect_sql())\n",
    "    return denied_trx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "get_denied_trx(tids)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## trx_sett_sum_30"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_settled_trx(tid):\n",
    "    paid_trx_q = f'''\n",
    "    SELECT base.user_id,\n",
    "    base.pl_trx_id AS trx_id,\n",
    "    SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 30 AND \n",
    "                       datediff(DATE(pl_transaction_date),start_date) >=0 \n",
    "                       THEN base.loan_amount ELSE 0 END) AS trx_sett_sum_30\n",
    "    FROM\n",
    "        (SELECT tr.user_id,\n",
    "         tr.id AS pl_trx_id,\n",
    "         tr.pl_transaction_date, \n",
    "         l.transaction_id,\n",
    "         l.start_date,\n",
    "         l.loan_amount,\n",
    "         l.tenure\n",
    "\n",
    "        FROM \n",
    "            (SELECT user_id, \n",
    "                    id, \n",
    "                    transaction_date AS pl_transaction_date\n",
    "            FROM l2alpha.`transaction` WHERE id = {tid}\n",
    "            ) AS tr\n",
    "        INNER JOIN \n",
    "            (SELECT user_id,\n",
    "                    start_date, \n",
    "                    settlement_date,\n",
    "                    loan_amount,\n",
    "                    tenure,\n",
    "                    transaction_id \n",
    "            FROM l2alpha.loan\n",
    "            WHERE is_active = 1 AND loan_amount > 0\n",
    "            ) AS l\n",
    "        ON l.user_id = tr.user_id AND settlement_date <= pl_transaction_date\n",
    "        ) AS base\n",
    "    GROUP BY base.user_id, base.pl_transaction_date\n",
    "    '''\n",
    "    settled_trx = pd.read_sql(paid_trx_q, connect_sql())\n",
    "    return settled_trx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "get_settled_trx(tids)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# pl_settle_to_due_last_180 (from settle_to_due_last_pl_180)\n",
    "# time_from_last_sett_pl_hour (from time_from_prev_pl_settle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_trx_settle_data(base_df):\n",
    "    uids = tuple(base_df['user_id'])\n",
    "    max_trx_date = base_df['transaction_date'].max()\n",
    "\n",
    "    early_settle_q = f'''\n",
    "    SELECT *,\n",
    "            DATE(transaction_date) + INTERVAL payment_type_int DAY AS due_date\n",
    "    FROM\n",
    "        (SELECT tr.user_id, tr.trx_id, transaction_date, payment_type, transaction_type, status,\n",
    "                CASE WHEN payment_type = '30_days' THEN '30'\n",
    "                WHEN payment_type = '3_months' THEN '90'\n",
    "                WHEN payment_type = '6_months' THEN '180' \n",
    "                WHEN payment_type = '12_months' THEN '360' ELSE NULL END AS payment_type_int,\n",
    "\n",
    "                settlement_date, loan_amount, is_active\n",
    "        FROM\n",
    "            (SELECT user_id,\n",
    "                   id AS trx_id,\n",
    "                   transaction_date,\n",
    "                   payment_type,\n",
    "                   transaction_type,\n",
    "                   status\n",
    "            FROM l2alpha.`transaction`\n",
    "            WHERE user_id IN {uids}\n",
    "            AND transaction_type = 2\n",
    "            AND status = 4\n",
    "            AND transaction_date <= '{max_trx_date}'\n",
    "            ) AS tr\n",
    "\n",
    "            LEFT JOIN\n",
    "\n",
    "            (SELECT transaction_id AS trx_id,\n",
    "                   settlement_date,\n",
    "                   loan_amount,\n",
    "                   is_active\n",
    "            FROM l2alpha.loan\n",
    "            ) AS lo\n",
    "\n",
    "            ON tr.trx_id = lo.trx_id\n",
    "            ) AS base\n",
    "        WHERE loan_amount > 0 AND is_active = 1\n",
    "    '''\n",
    "    query_df = pd.read_sql(early_settle_q, connect_sql())\n",
    "    query_df['settle_to_due'] = (query_df['due_date'] - query_df['settlement_date'].dt.date).dt.days\n",
    "    df = base_df[['user_id', 'trx_id', 'transaction_date']].merge(query_df, how='left', on='user_id')\\\n",
    "                                         .rename(columns={'transaction_date_x':'trx_dt_current',\n",
    "                                          'transaction_date_y':'trx_dt_past',\n",
    "                                          'trx_id_x':'trx_id',\n",
    "                                          'trx_id_y':'trx_id_ref'})\n",
    "    df['time_diff'] = (df['trx_dt_current'] - df['trx_dt_past']).dt.total_seconds()\n",
    "    df = df.sort_values(['user_id', 'trx_dt_past']).reset_index(drop=True)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_early_settle_feats(df, base_df, window='all'):\n",
    "    if isinstance(window, int):\n",
    "        df = df[(df['time_diff'] <= window*(24*60*60)) & (df['time_diff'] > 0)]\n",
    "        name = str(window)\n",
    "    elif window == 'all':\n",
    "        df = df[df['time_diff'] > 0]\n",
    "        name = 'ever'\n",
    "    df = df.sort_values(['user_id', 'trx_id', 'trx_dt_past'])\n",
    "    \n",
    "    # Calculate how many times the user has settled early in the last XX days\n",
    "    co_df = df.copy()\n",
    "    co_df['flag_early_settle'] = np.where(co_df['settle_to_due'] >= 30, 1, 0)\n",
    "    co_df['flag_early_settle'] = np.where(co_df['settle_to_due'].isna(), np.nan, co_df['flag_early_settle'])\n",
    "    co_df = co_df.groupby('trx_id').agg({'flag_early_settle':'sum'}).reset_index()\n",
    "    co_df.columns = ['trx_id', f'early_settle_co_{name}']\n",
    "    \n",
    "    # Calculate the min, max, average, and median days the user has settled early in the last XX days\n",
    "    # Also get the last time the user settled early (how many days early).\n",
    "    es_df = df.groupby('trx_id').agg({'settle_to_due':[max, min, 'mean', 'median', 'last']})\n",
    "    col_names = ['settle_to_due_max_{}', 'settle_to_due_min_{}', 'settle_to_due_avg_{}', \n",
    "                 'settle_to_due_med_{}', 'settle_to_due_last_pl_{}']\n",
    "    es_df.columns = [col.format(name) for col in col_names]\n",
    "    es_df = es_df.reset_index()\n",
    "    out_df = base_df[['user_id', 'trx_id']].merge(es_df, how='left', on='trx_id')\\\n",
    "                                           .merge(co_df, how='left', on='trx_id')\n",
    "    return out_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "create_early_settle_feats(trx_settle, raw_df, window=180)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_time_from_last_pl_feat(df):\n",
    "    time_from_last_pl = df[df['time_diff'] >= 0]\n",
    "    time_from_last_pl.loc[(time_from_last_pl['time_diff'] == 0) & \n",
    "                          (time_from_last_pl['settlement_date'].notna()), 'settlement_date'] = np.nan\n",
    "    time_from_last_pl = time_from_last_pl.sort_values(['user_id', 'trx_id', 'trx_dt_past'])\n",
    "\n",
    "    time_from_last_pl_grp = time_from_last_pl.groupby('trx_id').tail(2)\n",
    "    time_from_last_pl_grp['settle_shift'] = time_from_last_pl_grp['settlement_date'].shift()\n",
    "    time_from_last_pl_grp['time_from_prev_pl_settle'] = (time_from_last_pl_grp['trx_dt_current'] - \\\n",
    "                                                          time_from_last_pl_grp['settle_shift']).dt.total_seconds()/3600\n",
    "    \n",
    "    time_from_last_pl_final = time_from_last_pl_grp.groupby('trx_id').tail(1)\n",
    "    time_from_last_pl_final = time_from_last_pl_final[['user_id', 'trx_id', \n",
    "                                                       'time_from_prev_pl_settle']].reset_index(drop=True)\n",
    "    return time_from_last_pl_final"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "create_time_from_last_pl_feat(trx_settle)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}