past-data-projects / life_time_value / 500-Productionization / 510-SplineCoxReg_Feature_Functions.ipynb
510-SplineCoxReg_Feature_Functions.ipynb
Raw
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pymysql\n",
    "import configparser\n",
    "\n",
    "from datetime import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = configparser.ConfigParser()\n",
    "config.read('/home/ec2-user/SageMaker/zhilal_working_directory/config.ini')\n",
    "\n",
    "host_l2alpha = config['MYSQL-ROOT']['HOST']\n",
    "user_l2alpha = config['MYSQL-ROOT']['USER']\n",
    "password_l2alpha = config['MYSQL-ROOT']['PASSWORD']\n",
    "\n",
    "def connect_sql():\n",
    "    cnx = pymysql.connect(host=host_l2alpha,\n",
    "                          user=user_l2alpha,\n",
    "                          password=password_l2alpha,\n",
    "                          cursorclass=pymysql.cursors.DictCursor)\n",
    "    return cnx"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Premium Model Features\n",
    "features_prm = [\n",
    "    'ap_co_plo',\n",
    "    'de_education',\n",
    "    'delin_dpd_max_90',\n",
    "    'rep_count_30',\n",
    "    'snapshot_current_dpd',\n",
    "    'snapshot_days_premium',\n",
    "    'snapshot_os_amount',\n",
    "    'trx_active_amt_avg',\n",
    "    'trx_denied_count_120',\n",
    "    'trx_merch_count_dist_60',\n",
    "    'trx_pl_count_90',\n",
    "    'trx_suc_amt_sum_30',\n",
    "    'util_latest'\n",
    "]\n",
    "\n",
    "# Basic Model Features\n",
    "features_bsc = [\n",
    "    'ap_co_plo',\n",
    "    'de_education',\n",
    "    'delin_dpd_max_90',\n",
    "    'rep_full_paid_count_60',\n",
    "    'snapshot_current_dpd',\n",
    "    'snapshot_dob',\n",
    "    'trx_active_count',\n",
    "    'trx_denied_count_120',\n",
    "    'trx_merch_count_dist_120',\n",
    "    'trx_pl_count_60',\n",
    "    'trx_suc_amt_sum_120',\n",
    "    'util_latest'\n",
    "]\n",
    "\n",
    "# Putting it all together Feature\n",
    "features_npl = ['snapshot_os_principal']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = sorted(list(set(features_prm + features_bsc + features_npl)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature Functions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## ap_co_plo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_feats_apps(raw_df, plo_app_list, label='installed_applications'):\n",
    "    df = raw_df.copy()\n",
    "    df[label] = df[label].fillna('[]')\n",
    "    df[label] = df[label].apply(lambda x: eval(x))\n",
    "\n",
    "    def count_apps(x,list_apps):\n",
    "        return len([ft for ft in x if ft.lower() in list_apps])\n",
    "\n",
    "    payday_loan_apps = plo_app_list[0].unique().tolist()\n",
    "    payday_loan_apps = [x.lower() for x in payday_loan_apps]\n",
    "\n",
    "    df['ap_co_plo'] = df[label].apply(lambda x:count_apps(x,payday_loan_apps))\n",
    "\n",
    "    return df\n",
    "\n",
    "def get_payday_apps(uids):\n",
    "    ins_app_q = f'''\n",
    "    SELECT user_id,\n",
    "           installed_applications\n",
    "    FROM l2alpha.web_userdevicedetail\n",
    "    WHERE user_id IN {uids}\n",
    "    '''\n",
    "    ins_app = pd.read_sql(ins_app_q, connect_sql())\n",
    "\n",
    "    plo_app_list = pd.read_csv('payday_700_list.csv', header=None)\n",
    "\n",
    "    plo_apps = create_feats_apps(ins_app, plo_app_list)\n",
    "    plo_apps = plo_apps.drop(columns=['installed_applications'])\n",
    "    return plo_apps"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## de_education"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_demographic_feats(uids):\n",
    "    query = f'''\n",
    "    select\n",
    "        user_id\n",
    "        , education as de_education\n",
    "    from l2alpha.web_userpersonaldetail\n",
    "    where user_id in {uids}\n",
    "    '''\n",
    "    result = pd.read_sql(query, connect_sql())\n",
    "    return result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## delin_dpd_max_90"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_delin(uids, obs_date):\n",
    "    query = f'''\n",
    "    SELECT \n",
    "        user_id\n",
    "        , MAX(CASE WHEN datediff('{obs_date}',snapshot_date) <= 90 AND\n",
    "                      datediff('{obs_date}',snapshot_date) >=0 \n",
    "                   THEN current_dpd \n",
    "                   ELSE NULL END) AS delin_dpd_max_90\n",
    "    FROM ds.user_level_archive\n",
    "    WHERE user_id in {uids} \n",
    "    AND snapshot_date < '{obs_date}'\n",
    "    GROUP BY user_id\n",
    "    '''\n",
    "    result = pd.read_sql(query, connect_sql())\n",
    "    return result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## rep_count_30"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_repayments(uids, obs_date):\n",
    "    query = f'''\n",
    "    SELECT \n",
    "        user_id              \n",
    "        , COUNT(DISTINCT(CASE WHEN datediff(DATE('{obs_date}'),payment_date) <= 30 \n",
    "                              AND datediff(DATE('{obs_date}'),payment_date) >=0\n",
    "                              THEN payment_date ELSE NULL END)) as rep_count_30\n",
    "\n",
    "    FROM l2alpha.user_payment\n",
    "    WHERE status = 2 \n",
    "    AND payment_type = 'INSTALLMENT' \n",
    "    AND payment_channel <> 'CANCELLATION' \n",
    "    AND user_id in {uids}\n",
    "    AND payment_date < '{obs_date}'\n",
    "    GROUP BY user_id\n",
    "    '''\n",
    "    result = pd.read_sql(query, connect_sql())\n",
    "    return result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## rep_full_paid_count_60"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_paid_trx(uids, obs_date):\n",
    "    query = f'''\n",
    "    SELECT \n",
    "        user_id\n",
    "        , COUNT(DISTINCT(CASE WHEN datediff(DATE('{obs_date}'),start_date) <= 60 \n",
    "                              AND datediff(DATE('{obs_date}'),start_date) >=0 \n",
    "                              THEN transaction_id \n",
    "                              ELSE NULL END)) AS rep_full_paid_count_60\n",
    "    FROM l2alpha.loan\n",
    "    WHERE is_active = 1 \n",
    "    AND loan_amount > 0\n",
    "    AND user_id in {uids} \n",
    "    AND settlement_date < '{obs_date}'\n",
    "    GROUP BY user_id\n",
    "    '''\n",
    "    result = pd.read_sql(query, connect_sql())\n",
    "    return result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## snapshot\n",
    "- snapshot_current_dpd\n",
    "- snapshot_days_premium\n",
    "- snapshot_dob\n",
    "- snapshot_os_amount\n",
    "- snapshot_os_principal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_dpd_snapshot(uids, obs_date):\n",
    "    query = f'''\n",
    "    SELECT \n",
    "        user_id\n",
    "        , os_amount  AS snapshot_os_amount\n",
    "        , current_dpd AS snapshot_current_dpd\n",
    "        , os_principal AS snapshot_os_principal\n",
    "    FROM ds.user_level_archive\n",
    "    WHERE user_id IN {uids}\n",
    "    AND snapshot_date = '{obs_date}'\n",
    "    '''\n",
    "    result = pd.read_sql(query, connect_sql())\n",
    "    return result\n",
    "\n",
    "\n",
    "def get_dob(uids, obs_date):\n",
    "    query = f'''\n",
    "    SELECT\n",
    "        wud.user_id\n",
    "        , TIMESTAMPDIFF(DAY, DATE(set_password_timestamp), '{obs_date}') AS snapshot_dob\n",
    "        , TIMESTAMPDIFF(DAY, DATE(is_approved_timestamp), '{obs_date}') AS snapshot_days_premium\n",
    "    FROM\n",
    "        (SELECT\n",
    "            user_id\n",
    "            , set_password_timestamp\n",
    "        FROM l2alpha.web_userdetail\n",
    "        WHERE user_id IN {uids}\n",
    "        ) AS wud\n",
    "        LEFT JOIN\n",
    "        (SELECT\n",
    "            user_id\n",
    "            , is_approved_timestamp\n",
    "        FROM l2alpha.web_userinstallmentapplicationdetails\n",
    "        ) AS wuid\n",
    "        ON wud.user_id = wuid.user_id\n",
    "    '''\n",
    "    result = pd.read_sql(query, connect_sql())\n",
    "    return result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## trx_active\n",
    "- trx_active_amt_avg\n",
    "- trx_active_count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_trx_info(uids, obs_date):\n",
    "    query = f'''\n",
    "    select \n",
    "        user_id\n",
    "        , count(distinct(transaction_id)) as trx_active_count\n",
    "        , sum(loan_amount)/count(distinct(transaction_id)) as trx_active_amt_avg\n",
    "    from l2alpha.loan\n",
    "    where is_active = 1\n",
    "    and loan_amount > 0\n",
    "    and user_id in {uids}\n",
    "    and start_date < '{obs_date}'\n",
    "    and (settlement_date is null or settlement_date > '{obs_date}')\n",
    "    group by user_id\n",
    "    '''\n",
    "    result = pd.read_sql(query, connect_sql())\n",
    "    return result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## trx_denied_count_120"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_previous_trx_co(uids, obs_date):\n",
    "    query = f'''\n",
    "    SELECT \n",
    "        user_id\n",
    "        , COUNT(CASE WHEN datediff('{obs_date}', DATE(transaction_date)) <= 120\n",
    "                     AND datediff('{obs_date}', DATE(transaction_date)) >=0\n",
    "                     AND status = 5\n",
    "                     THEN id ELSE NULL END) AS trx_denied_count_120\n",
    "\n",
    "    FROM l2alpha.transaction\n",
    "    WHERE user_id IN {uids}\n",
    "    AND transaction_date < '{obs_date}'\n",
    "    GROUP BY user_id\n",
    "    '''\n",
    "    result = pd.read_sql(query, connect_sql())\n",
    "    return result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## trx_merch_count_dist\n",
    "- trx_merch_count_dist_60\n",
    "- trx_merch_count_dist_120"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_count_merch_trx(uids, obs_date):\n",
    "    query = f'''\n",
    "    SELECT \n",
    "        base.user_id\n",
    "        , COUNT(DISTINCT(CASE WHEN datediff(DATE('{obs_date}'),start_date) <= 60 \n",
    "                              AND datediff(DATE('{obs_date}'),start_date) >=0 \n",
    "                              THEN base.merchant_id \n",
    "                              ELSE NULL END)) as trx_merch_count_dist_60\n",
    "        , COUNT(DISTINCT(CASE WHEN datediff(DATE('{obs_date}'),start_date) <= 120 \n",
    "                              AND datediff(DATE('{obs_date}'),start_date) >=0 \n",
    "                              THEN base.merchant_id \n",
    "                              ELSE NULL END)) as trx_merch_count_dist_120                                   \n",
    "    FROM\n",
    "        (SELECT \n",
    "            bs.*,\n",
    "            trx.merchant_id\n",
    "         FROM\n",
    "            (SELECT \n",
    "                user_id\n",
    "                , start_date\n",
    "                , transaction_id \n",
    "            FROM l2alpha.loan\n",
    "            WHERE is_active = 1 \n",
    "            AND loan_amount > 0\n",
    "            AND user_id in {uids} \n",
    "            AND start_date < '{obs_date}') AS bs\n",
    "        LEFT JOIN l2alpha.transaction as trx\n",
    "        ON bs.transaction_id = trx.id) as base\n",
    "    GROUP BY base.user_id\n",
    "    '''\n",
    "    result = pd.read_sql(query, connect_sql())\n",
    "    return result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## trx_pl_count\n",
    "- trx_pl_count_60\n",
    "- trx_pl_count_90"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_count_pl(uids, obs_date):\n",
    "    query = f'''\n",
    "    SELECT \n",
    "        user_id\n",
    "        , COUNT(DISTINCT(CASE WHEN datediff(DATE('{obs_date}'),transaction_date) <= 60 \n",
    "                              AND datediff(DATE('{obs_date}'),transaction_date) >=0 \n",
    "                              THEN id \n",
    "                              ELSE NULL END)) as trx_pl_count_60\n",
    "        , COUNT(DISTINCT(CASE WHEN datediff(DATE('{obs_date}'),transaction_date) <= 90 \n",
    "                              AND datediff(DATE('{obs_date}'),transaction_date) >=0 \n",
    "                              THEN id \n",
    "                              ELSE NULL END)) as trx_pl_count_90                 \n",
    "    FROM l2alpha.transaction\n",
    "    WHERE transaction_type = 2\n",
    "    AND status = 4 \n",
    "    AND user_id in {uids} \n",
    "    AND transaction_date < '{obs_date}'\n",
    "    GROUP BY user_id\n",
    "    '''\n",
    "    result = pd.read_sql(query, connect_sql())\n",
    "    return result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## trx_suc_amt_sum\n",
    "- trx_suc_amt_sum_30\n",
    "- trx_suc_amt_sum_120"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_previous_trx_sum(uids, obs_date):\n",
    "    query = f'''\n",
    "    SELECT \n",
    "        user_id\n",
    "        , SUM(CASE WHEN datediff('{obs_date}', DATE(transaction_date)) <= 30\n",
    "                   AND datediff('{obs_date}', DATE(transaction_date)) >=0\n",
    "                   AND status = 4\n",
    "                   THEN amount \n",
    "                   ELSE NULL END) AS trx_suc_amt_sum_30\n",
    "        , SUM(CASE WHEN datediff('{obs_date}', DATE(transaction_date)) <= 120\n",
    "                   AND datediff('{obs_date}', DATE(transaction_date)) >=0\n",
    "                   AND status = 4\n",
    "                   THEN amount \n",
    "                   ELSE NULL END) AS trx_suc_amt_sum_120\n",
    "    FROM l2alpha.transaction\n",
    "    WHERE user_id IN {uids}\n",
    "    AND transaction_date < '{obs_date}'\n",
    "    GROUP BY user_id\n",
    "    '''\n",
    "    result = pd.read_sql(query, connect_sql())\n",
    "    return result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## util_latest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_utilization(uids, obs_date):\n",
    "    query = f'''\n",
    "    SELECT main.user_id,\n",
    "           COALESCE(SUM(loan_amount/transaction_limit), 0) AS util_latest\n",
    "    FROM\n",
    "        (SELECT user_id, \n",
    "               trx_id, \n",
    "               loan_id,\n",
    "               loan_amount,\n",
    "               tenure,\n",
    "               MIN(creation_time) AS latest_limit\n",
    "        FROM\t\n",
    "            (SELECT lo.user_id,\n",
    "                   lo.transaction_id AS trx_id,\n",
    "                   lo.loan_id,\n",
    "                   loan_amount,\n",
    "                   creation_time,\n",
    "                   tenure\n",
    "            FROM \n",
    "                    (SELECT id AS loan_id,\n",
    "                            user_id,\n",
    "                            start_date,\n",
    "                            settlement_date,\n",
    "                            loan_amount,\n",
    "                            tenure,\n",
    "                            transaction_id \n",
    "                    FROM l2alpha.loan\n",
    "                    WHERE user_id IN {uids}\n",
    "                    AND is_active = 1 \n",
    "                    AND loan_amount > 0\n",
    "                    AND start_date < '{obs_date}' \n",
    "                    AND (settlement_date IS NULL OR settlement_date > '{obs_date}')\n",
    "                    ) AS lo\n",
    "                LEFT JOIN \n",
    "                    (SELECT user_id,\n",
    "                            CASE WHEN installment_type = '30_days' THEN 30\n",
    "                            WHEN installment_type = '3_months' THEN 90 \n",
    "                            WHEN installment_type = '6_months' THEN 180 \n",
    "                            WHEN installment_type = '12_months' THEN 360 \n",
    "                            ELSE NULL END AS  installment_type,\n",
    "                            creation_time\n",
    "                    FROM l2alpha.transaction_limit_history\n",
    "                    ) AS tlh\n",
    "                ON tlh.user_id = lo.user_id AND tlh.installment_type = lo.tenure\n",
    "            ) AS base\n",
    "        GROUP BY trx_id, loan_id\n",
    "        ) AS main\n",
    "\n",
    "        LEFT JOIN \n",
    "        (SELECT user_id,\n",
    "                transaction_limit,\n",
    "                CASE WHEN installment_type = '30_days' THEN 30\n",
    "                WHEN installment_type = '3_months' THEN 90 \n",
    "                WHEN installment_type = '6_months' THEN 180 \n",
    "                WHEN installment_type = '12_months' THEN 360 \n",
    "                ELSE NULL END AS installment_type,\n",
    "                creation_time\n",
    "        FROM l2alpha.transaction_limit_history\n",
    "        WHERE user_id IN {uids}\n",
    "        GROUP BY user_id, transaction_limit, installment_type, creation_time\n",
    "        ) AS lim\n",
    "        ON lim.creation_time = main.latest_limit \n",
    "        AND lim.user_id = main.user_id\n",
    "        AND lim.installment_type = main.tenure\n",
    "    GROUP BY main.user_id\n",
    "    '''\n",
    "    result = pd.read_sql(query, connect_sql())\n",
    "    result['util_latest'] = np.where(result['util_latest'] > 1, 1, result['util_latest'])\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_features(df, obs_date):\n",
    "    st_tm = datetime.now()\n",
    "    uids = tuple(df['user_id'].unique())\n",
    "    \n",
    "    print('Payday Loan feature start')\n",
    "    plo_df = get_payday_apps(uids)\n",
    "    print('plo_df size:', plo_df.shape[0])\n",
    "    \n",
    "    print('Demographic features start')\n",
    "    de_df = get_demographic_feats(uids)\n",
    "    print('de_df size:', de_df.shape[0])\n",
    "    \n",
    "    print('Delinquency features start')\n",
    "    delin_df = get_delin(uids, obs_date)\n",
    "    print('delin_df size:', delin_df.shape[0])\n",
    "    \n",
    "    print('Repayment features start')\n",
    "    rep_df = get_repayments(uids, obs_date)\n",
    "    print('rep_df size:', rep_df.shape[0])\n",
    "\n",
    "    print('Fully Paid features start')\n",
    "    fully_paid_df = get_paid_trx(uids, obs_date)\n",
    "    print('fully_paid_df size:', fully_paid_df.shape[0])\n",
    "    \n",
    "    print('DOB features start')\n",
    "    dob_df = get_dob(uids, obs_date)\n",
    "    print('dob_df size:', dob_df.shape[0])\n",
    "\n",
    "    print('DPD Snapshot features start')\n",
    "    dpd_df = get_dpd_snapshot(uids, obs_date)\n",
    "    print('dpd_df size:', dpd_df.shape[0])\n",
    "    \n",
    "    print('Trx info features start')\n",
    "    trx_info_df = get_trx_info(uids, obs_date)\n",
    "    print('trx_info_df size:', trx_info_df.shape[0])\n",
    "    \n",
    "    print('Trx Counts features start')\n",
    "    trx_counts_df = get_previous_trx_co(uids, obs_date)\n",
    "    print('trx_counts_df size:', trx_counts_df.shape[0])\n",
    "    \n",
    "    print('Merchant Trx features start')\n",
    "    merch_trx_df = get_count_merch_trx(uids, obs_date)\n",
    "    print('merch_trx_df size:', merch_trx_df.shape[0])\n",
    "    \n",
    "    print('PL Trx features start')\n",
    "    pl_trx_df = get_count_pl(uids, obs_date)\n",
    "    print('pl_trx_df size:', pl_trx_df.shape[0])\n",
    "    \n",
    "    print('Trx Sum features start')\n",
    "    trx_sum_df = get_previous_trx_sum(uids, obs_date)\n",
    "    print('trx_sum_df size:', trx_sum_df.shape[0])\n",
    "    \n",
    "    print('Utilization Trx features start')\n",
    "    util_df = get_utilization(uids, obs_date)\n",
    "    print('util_df size:', util_df.shape[0])\n",
    "    \n",
    "    print('Merge all dataframes')\n",
    "    final_df = df[['user_id']]\\\n",
    "                .merge(plo_df, how='left', on='user_id')\\\n",
    "                .merge(de_df, how='left', on='user_id')\\\n",
    "                .merge(delin_df, how='left', on='user_id')\\\n",
    "                .merge(rep_df, how='left', on='user_id')\\\n",
    "                .merge(fully_paid_df, how='left', on='user_id')\\\n",
    "                .merge(dob_df, how='left', on='user_id')\\\n",
    "                .merge(dpd_df, how='left', on='user_id')\\\n",
    "                .merge(trx_info_df, how='left', on='user_id')\\\n",
    "                .merge(trx_counts_df, how='left', on='user_id')\\\n",
    "                .merge(merch_trx_df, how='left', on='user_id')\\\n",
    "                .merge(pl_trx_df, how='left', on='user_id')\\\n",
    "                .merge(trx_sum_df, how='left', on='user_id')\\\n",
    "                .merge(util_df, how='left', on='user_id')\n",
    "    print('Feature Creation DONE')\n",
    "    print('Total time elapsed:', datetime.now() - st_tm)\n",
    "    print()\n",
    "    return final_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['ap_co_plo',\n",
       " 'de_education',\n",
       " 'delin_dpd_max_90',\n",
       " 'rep_count_30',\n",
       " 'rep_full_paid_count_60',\n",
       " 'snapshot_current_dpd',\n",
       " 'snapshot_days_premium',\n",
       " 'snapshot_dob',\n",
       " 'snapshot_os_amount',\n",
       " 'snapshot_os_principal',\n",
       " 'trx_active_amt_avg',\n",
       " 'trx_active_count',\n",
       " 'trx_denied_count_120',\n",
       " 'trx_merch_count_dist_120',\n",
       " 'trx_merch_count_dist_60',\n",
       " 'trx_pl_count_60',\n",
       " 'trx_pl_count_90',\n",
       " 'trx_suc_amt_sum_120',\n",
       " 'trx_suc_amt_sum_30',\n",
       " 'util_latest']"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sanity Check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "base = pd.read_parquet('s3://finaccel-ml-model-data-production/zhilal/ltv_final/raw_cohorts/user_set_2018-07.parquet')\n",
    "\n",
    "# Make sure there is no user with user_status = 3 or Rejected \n",
    "base = base.query(\"user_status in (2,5,11,9,7)\")\n",
    "\n",
    "base['user_status_code'] = base['user_status']\n",
    "user_status_mapping = {2: 'activated', 5: 'suspended', 11:'blacklisted', 9:'closed', 7:'expired'}\n",
    "base = base.replace({'user_status': user_status_mapping})\n",
    "base['user_status'] = base['user_status'].astype('str')\n",
    "\n",
    "base['application_type_code'] = base['application_type']\n",
    "base['application_type'] = np.where(base['application_type'].isin([150, 160, 170, 180]),\n",
    "                                    'upgrade', np.where(base['application_type'].isin([100, 110, 120]), 'fresh_premium', 'basic'))\n",
    "base['application_type'] = base['application_type'].astype('str')\n",
    "\n",
    "base = base.rename(columns={'first_dpd_dt_91':'first_dpd_91_dt'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "obs_date = '2019-04-01'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "test = base.sample(250, random_state=99)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Payday Loan feature start\n",
      "plo_df size: 250\n",
      "Demographic features start\n",
      "de_df size: 250\n",
      "Delinquency features start\n",
      "delin_df size: 211\n",
      "Repayment features start\n",
      "rep_df size: 205\n",
      "Fully Paid features start\n",
      "fully_paid_df size: 200\n",
      "DOB features start\n",
      "dob_df size: 250\n",
      "DPD Snapshot features start\n",
      "dpd_df size: 177\n",
      "Trx info features start\n",
      "trx_info_df size: 176\n",
      "Trx Counts features start\n",
      "trx_counts_df size: 224\n",
      "Merchant Trx features start\n",
      "merch_trx_df size: 211\n",
      "PL Trx features start\n",
      "pl_trx_df size: 83\n",
      "Trx Sum features start\n",
      "trx_sum_df size: 224\n",
      "Utilization Trx features start\n",
      "util_df size: 176\n",
      "Merge all dataframes\n",
      "Feature Creation DONE\n",
      "Total time elapsed: 0:02:12.986406\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ec2-user/anaconda3/envs/ltv/lib/python3.8/site-packages/pymysql/cursors.py:329: Warning: (1052, \"Column 'installment_type' in group statement is ambiguous\")\n",
      "  self._do_get_result()\n"
     ]
    }
   ],
   "source": [
    "checks = get_features(test, obs_date)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>ap_co_plo</th>\n",
       "      <th>de_education</th>\n",
       "      <th>delin_dpd_max_90</th>\n",
       "      <th>rep_count_30</th>\n",
       "      <th>rep_full_paid_count_60</th>\n",
       "      <th>snapshot_dob</th>\n",
       "      <th>snapshot_days_premium</th>\n",
       "      <th>snapshot_os_amount</th>\n",
       "      <th>snapshot_current_dpd</th>\n",
       "      <th>...</th>\n",
       "      <th>trx_active_count</th>\n",
       "      <th>trx_active_amt_avg</th>\n",
       "      <th>trx_denied_count_120</th>\n",
       "      <th>trx_merch_count_dist_60</th>\n",
       "      <th>trx_merch_count_dist_120</th>\n",
       "      <th>trx_pl_count_60</th>\n",
       "      <th>trx_pl_count_90</th>\n",
       "      <th>trx_suc_amt_sum_30</th>\n",
       "      <th>trx_suc_amt_sum_120</th>\n",
       "      <th>util_latest</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4001368</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>2.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>255</td>\n",
       "      <td>256.0</td>\n",
       "      <td>1310711.20</td>\n",
       "      <td>-4.0</td>\n",
       "      <td>...</td>\n",
       "      <td>2.0</td>\n",
       "      <td>6.957495e+05</td>\n",
       "      <td>3.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>344557.0</td>\n",
       "      <td>2919188.0</td>\n",
       "      <td>0.162211</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3922443</td>\n",
       "      <td>9</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>264</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4119795</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>249</td>\n",
       "      <td>NaN</td>\n",
       "      <td>525041.67</td>\n",
       "      <td>-18.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>5.000000e+05</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>500000.0</td>\n",
       "      <td>2500000.0</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4027138</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>68.0</td>\n",
       "      <td>257</td>\n",
       "      <td>-157.0</td>\n",
       "      <td>320997.00</td>\n",
       "      <td>-24.0</td>\n",
       "      <td>...</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.566667e+04</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1300000.0</td>\n",
       "      <td>3741322.0</td>\n",
       "      <td>0.534998</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4052800</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>160.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>256</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2617789.78</td>\n",
       "      <td>161.0</td>\n",
       "      <td>...</td>\n",
       "      <td>10.0</td>\n",
       "      <td>2.201060e+05</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.880424</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>245</th>\n",
       "      <td>4176715</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>244</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>246</th>\n",
       "      <td>4107006</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>250</td>\n",
       "      <td>250.0</td>\n",
       "      <td>2082870.54</td>\n",
       "      <td>-14.0</td>\n",
       "      <td>...</td>\n",
       "      <td>4.0</td>\n",
       "      <td>8.024360e+05</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>293924.0</td>\n",
       "      <td>625999.0</td>\n",
       "      <td>0.324064</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>247</th>\n",
       "      <td>4117848</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>247</td>\n",
       "      <td>247.0</td>\n",
       "      <td>4658910.86</td>\n",
       "      <td>-25.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.126952e+07</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.971511</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>248</th>\n",
       "      <td>4152725</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "      <td>160.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>246</td>\n",
       "      <td>NaN</td>\n",
       "      <td>513684.11</td>\n",
       "      <td>161.0</td>\n",
       "      <td>...</td>\n",
       "      <td>6.0</td>\n",
       "      <td>7.198583e+04</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.863830</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>249</th>\n",
       "      <td>3798310</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>274</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>250 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     user_id  ap_co_plo  de_education  delin_dpd_max_90  rep_count_30  \\\n",
       "0    4001368          0             5               2.0           5.0   \n",
       "1    3922443          9             3               NaN           NaN   \n",
       "2    4119795          0             4               0.0           1.0   \n",
       "3    4027138          0             2              -1.0           3.0   \n",
       "4    4052800          2             3             160.0           0.0   \n",
       "..       ...        ...           ...               ...           ...   \n",
       "245  4176715          0             3               NaN           NaN   \n",
       "246  4107006          0             2               0.0           1.0   \n",
       "247  4117848          0             2               0.0           1.0   \n",
       "248  4152725          5             2             160.0           0.0   \n",
       "249  3798310          1             3               NaN           0.0   \n",
       "\n",
       "     rep_full_paid_count_60  snapshot_dob  snapshot_days_premium  \\\n",
       "0                       6.0           255                  256.0   \n",
       "1                       NaN           264                    NaN   \n",
       "2                       1.0           249                    NaN   \n",
       "3                      68.0           257                 -157.0   \n",
       "4                       0.0           256                    NaN   \n",
       "..                      ...           ...                    ...   \n",
       "245                     NaN           244                    NaN   \n",
       "246                     5.0           250                  250.0   \n",
       "247                     NaN           247                  247.0   \n",
       "248                     0.0           246                    NaN   \n",
       "249                     0.0           274                    NaN   \n",
       "\n",
       "     snapshot_os_amount  snapshot_current_dpd  ...  trx_active_count  \\\n",
       "0            1310711.20                  -4.0  ...               2.0   \n",
       "1                   NaN                   NaN  ...               NaN   \n",
       "2             525041.67                 -18.0  ...               1.0   \n",
       "3             320997.00                 -24.0  ...               9.0   \n",
       "4            2617789.78                 161.0  ...              10.0   \n",
       "..                  ...                   ...  ...               ...   \n",
       "245                 NaN                   NaN  ...               NaN   \n",
       "246          2082870.54                 -14.0  ...               4.0   \n",
       "247          4658910.86                 -25.0  ...               1.0   \n",
       "248           513684.11                 161.0  ...               6.0   \n",
       "249                 NaN                   NaN  ...               NaN   \n",
       "\n",
       "     trx_active_amt_avg  trx_denied_count_120  trx_merch_count_dist_60  \\\n",
       "0          6.957495e+05                   3.0                      5.0   \n",
       "1                   NaN                   0.0                      NaN   \n",
       "2          5.000000e+05                   1.0                      1.0   \n",
       "3          3.566667e+04                   0.0                      2.0   \n",
       "4          2.201060e+05                   0.0                      0.0   \n",
       "..                  ...                   ...                      ...   \n",
       "245                 NaN                   0.0                      NaN   \n",
       "246        8.024360e+05                   0.0                      4.0   \n",
       "247        1.126952e+07                   0.0                      0.0   \n",
       "248        7.198583e+04                   0.0                      0.0   \n",
       "249                 NaN                   0.0                      0.0   \n",
       "\n",
       "     trx_merch_count_dist_120  trx_pl_count_60  trx_pl_count_90  \\\n",
       "0                         5.0              NaN              NaN   \n",
       "1                         NaN              NaN              NaN   \n",
       "2                         1.0              2.0              3.0   \n",
       "3                         2.0              NaN              NaN   \n",
       "4                         0.0              NaN              NaN   \n",
       "..                        ...              ...              ...   \n",
       "245                       NaN              NaN              NaN   \n",
       "246                       4.0              NaN              NaN   \n",
       "247                       0.0              NaN              NaN   \n",
       "248                       0.0              NaN              NaN   \n",
       "249                       0.0              NaN              NaN   \n",
       "\n",
       "     trx_suc_amt_sum_30  trx_suc_amt_sum_120  util_latest  \n",
       "0              344557.0            2919188.0     0.162211  \n",
       "1                   NaN                  NaN          NaN  \n",
       "2              500000.0            2500000.0     1.000000  \n",
       "3             1300000.0            3741322.0     0.534998  \n",
       "4                   NaN                  NaN     0.880424  \n",
       "..                  ...                  ...          ...  \n",
       "245                 NaN                  NaN          NaN  \n",
       "246            293924.0             625999.0     0.324064  \n",
       "247                 NaN                  NaN     0.971511  \n",
       "248                 NaN                  NaN     0.863830  \n",
       "249                 NaN                  NaN          NaN  \n",
       "\n",
       "[250 rows x 21 columns]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "checks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_ltv",
   "language": "python",
   "name": "conda_ltv"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}