{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import pymysql\n",
"import configparser\n",
"\n",
"from datetime import datetime"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"config = configparser.ConfigParser()\n",
"config.read('/home/ec2-user/SageMaker/zhilal_working_directory/config.ini')\n",
"\n",
"host_l2alpha = config['MYSQL-ROOT']['HOST']\n",
"user_l2alpha = config['MYSQL-ROOT']['USER']\n",
"password_l2alpha = config['MYSQL-ROOT']['PASSWORD']\n",
"\n",
"def connect_sql():\n",
" cnx = pymysql.connect(host=host_l2alpha,\n",
" user=user_l2alpha,\n",
" password=password_l2alpha,\n",
" cursorclass=pymysql.cursors.DictCursor)\n",
" return cnx"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Features"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Premium Model Features\n",
"features_prm = [\n",
" 'ap_co_plo',\n",
" 'de_education',\n",
" 'delin_dpd_max_90',\n",
" 'rep_count_30',\n",
" 'snapshot_current_dpd',\n",
" 'snapshot_days_premium',\n",
" 'snapshot_os_amount',\n",
" 'trx_active_amt_avg',\n",
" 'trx_denied_count_120',\n",
" 'trx_merch_count_dist_60',\n",
" 'trx_pl_count_90',\n",
" 'trx_suc_amt_sum_30',\n",
" 'util_latest'\n",
"]\n",
"\n",
"# Basic Model Features\n",
"features_bsc = [\n",
" 'ap_co_plo',\n",
" 'de_education',\n",
" 'delin_dpd_max_90',\n",
" 'rep_full_paid_count_60',\n",
" 'snapshot_current_dpd',\n",
" 'snapshot_dob',\n",
" 'trx_active_count',\n",
" 'trx_denied_count_120',\n",
" 'trx_merch_count_dist_120',\n",
" 'trx_pl_count_60',\n",
" 'trx_suc_amt_sum_120',\n",
" 'util_latest'\n",
"]\n",
"\n",
"# Putting it all together Feature\n",
"features_npl = ['snapshot_os_principal']"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"features = sorted(list(set(features_prm + features_bsc + features_npl)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Feature Functions"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ap_co_plo"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def create_feats_apps(raw_df, plo_app_list, label='installed_applications'):\n",
"    \"\"\"Add an 'ap_co_plo' column: count of installed payday-loan apps per row.\n",
"\n",
"    raw_df: frame whose `label` column holds a stringified Python list of app names.\n",
"    plo_app_list: single-column frame (column 0) of known payday-loan app names.\n",
"    Returns a copy of raw_df with the new column; raw_df is not mutated.\n",
"    \"\"\"\n",
"    import ast  # local import so this cell stays self-contained\n",
"\n",
"    df = raw_df.copy()\n",
"    df[label] = df[label].fillna('[]')\n",
"    # ast.literal_eval instead of eval: this column comes from user devices, so\n",
"    # evaluating it as arbitrary Python would be a code-execution risk.\n",
"    df[label] = df[label].apply(ast.literal_eval)\n",
"\n",
"    # Set for O(1) case-insensitive membership checks.\n",
"    payday_loan_apps = {x.lower() for x in plo_app_list[0].unique().tolist()}\n",
"\n",
"    def count_apps(apps, known):\n",
"        return len([app for app in apps if app.lower() in known])\n",
"\n",
"    df['ap_co_plo'] = df[label].apply(lambda apps: count_apps(apps, payday_loan_apps))\n",
"\n",
"    return df\n",
"\n",
"def get_payday_apps(uids):\n",
"    \"\"\"Fetch installed-app lists for `uids` and derive ap_co_plo per user/device row.\n",
"\n",
"    NOTE(review): `uids` is f-string interpolated into SQL — safe only because the\n",
"    ids are numeric; a one-element tuple renders as '(x,)', which MySQL rejects.\n",
"    \"\"\"\n",
"    ins_app_q = f'''\n",
"        SELECT user_id,\n",
"               installed_applications\n",
"        FROM l2alpha.web_userdevicedetail\n",
"        WHERE user_id IN {uids}\n",
"    '''\n",
"    ins_app = pd.read_sql(ins_app_q, connect_sql())\n",
"\n",
"    # Curated payday-loan app list shipped next to the notebook (no header row).\n",
"    plo_app_list = pd.read_csv('payday_700_list.csv', header=None)\n",
"\n",
"    plo_apps = create_feats_apps(ins_app, plo_app_list)\n",
"    plo_apps = plo_apps.drop(columns=['installed_applications'])\n",
"    return plo_apps"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## de_education"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def get_demographic_feats(uids):\n",
" query = f'''\n",
" select\n",
" user_id\n",
" , education as de_education\n",
" from l2alpha.web_userpersonaldetail\n",
" where user_id in {uids}\n",
" '''\n",
" result = pd.read_sql(query, connect_sql())\n",
" return result"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## delin_dpd_max_90"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def get_delin(uids, obs_date):\n",
" query = f'''\n",
" SELECT \n",
" user_id\n",
" , MAX(CASE WHEN datediff('{obs_date}',snapshot_date) <= 90 AND\n",
" datediff('{obs_date}',snapshot_date) >=0 \n",
" THEN current_dpd \n",
" ELSE NULL END) AS delin_dpd_max_90\n",
" FROM ds.user_level_archive\n",
" WHERE user_id in {uids} \n",
" AND snapshot_date < '{obs_date}'\n",
" GROUP BY user_id\n",
" '''\n",
" result = pd.read_sql(query, connect_sql())\n",
" return result"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## rep_count_30"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def get_repayments(uids, obs_date):\n",
" query = f'''\n",
" SELECT \n",
" user_id \n",
" , COUNT(DISTINCT(CASE WHEN datediff(DATE('{obs_date}'),payment_date) <= 30 \n",
" AND datediff(DATE('{obs_date}'),payment_date) >=0\n",
" THEN payment_date ELSE NULL END)) as rep_count_30\n",
"\n",
" FROM l2alpha.user_payment\n",
" WHERE status = 2 \n",
" AND payment_type = 'INSTALLMENT' \n",
" AND payment_channel <> 'CANCELLATION' \n",
" AND user_id in {uids}\n",
" AND payment_date < '{obs_date}'\n",
" GROUP BY user_id\n",
" '''\n",
" result = pd.read_sql(query, connect_sql())\n",
" return result"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## rep_full_paid_count_60"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def get_paid_trx(uids, obs_date):\n",
" query = f'''\n",
" SELECT \n",
" user_id\n",
" , COUNT(DISTINCT(CASE WHEN datediff(DATE('{obs_date}'),start_date) <= 60 \n",
" AND datediff(DATE('{obs_date}'),start_date) >=0 \n",
" THEN transaction_id \n",
" ELSE NULL END)) AS rep_full_paid_count_60\n",
" FROM l2alpha.loan\n",
" WHERE is_active = 1 \n",
" AND loan_amount > 0\n",
" AND user_id in {uids} \n",
" AND settlement_date < '{obs_date}'\n",
" GROUP BY user_id\n",
" '''\n",
" result = pd.read_sql(query, connect_sql())\n",
" return result"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## snapshot\n",
"- snapshot_current_dpd\n",
"- snapshot_days_premium\n",
"- snapshot_dob\n",
"- snapshot_os_amount\n",
"- snapshot_os_principal"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def get_dpd_snapshot(uids, obs_date):\n",
"    \"\"\"Balance snapshot taken exactly at obs_date: os_amount, current_dpd, os_principal.\n",
"\n",
"    Users with no archive row on obs_date are absent (NaN after the left merge).\n",
"    \"\"\"\n",
"    query = f'''\n",
"        SELECT\n",
"            user_id\n",
"            , os_amount AS snapshot_os_amount\n",
"            , current_dpd AS snapshot_current_dpd\n",
"            , os_principal AS snapshot_os_principal\n",
"        FROM ds.user_level_archive\n",
"        WHERE user_id IN {uids}\n",
"        AND snapshot_date = '{obs_date}'\n",
"    '''\n",
"    result = pd.read_sql(query, connect_sql())\n",
"    return result\n",
"\n",
"\n",
"def get_dob(uids, obs_date):\n",
"    \"\"\"Days since sign-up (snapshot_dob) and since premium approval (snapshot_days_premium).\n",
"\n",
"    NOTE(review): is_approved_timestamp is not bounded by obs_date, so a user\n",
"    approved after obs_date gets a negative snapshot_days_premium (visible in the\n",
"    sanity-check output below) — confirm this is intended.\n",
"    \"\"\"\n",
"    # The wuid subquery is filtered to {uids} too: the LEFT JOIN is keyed on\n",
"    # user_id, so this cannot change the result but avoids scanning every user's\n",
"    # installment-application row.\n",
"    query = f'''\n",
"        SELECT\n",
"            wud.user_id\n",
"            , TIMESTAMPDIFF(DAY, DATE(set_password_timestamp), '{obs_date}') AS snapshot_dob\n",
"            , TIMESTAMPDIFF(DAY, DATE(is_approved_timestamp), '{obs_date}') AS snapshot_days_premium\n",
"        FROM\n",
"            (SELECT user_id,\n",
"                    set_password_timestamp\n",
"             FROM l2alpha.web_userdetail\n",
"             WHERE user_id IN {uids}\n",
"            ) AS wud\n",
"        LEFT JOIN\n",
"            (SELECT user_id,\n",
"                    is_approved_timestamp\n",
"             FROM l2alpha.web_userinstallmentapplicationdetails\n",
"             WHERE user_id IN {uids}\n",
"            ) AS wuid\n",
"        ON wud.user_id = wuid.user_id\n",
"    '''\n",
"    result = pd.read_sql(query, connect_sql())\n",
"    return result"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## trx_active\n",
"- trx_active_amt_avg\n",
"- trx_active_count"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def get_trx_info(uids, obs_date):\n",
" query = f'''\n",
" select \n",
" user_id\n",
" , count(distinct(transaction_id)) as trx_active_count\n",
" , sum(loan_amount)/count(distinct(transaction_id)) as trx_active_amt_avg\n",
" from l2alpha.loan\n",
" where is_active = 1\n",
" and loan_amount > 0\n",
" and user_id in {uids}\n",
" and start_date < '{obs_date}'\n",
" and (settlement_date is null or settlement_date > '{obs_date}')\n",
" group by user_id\n",
" '''\n",
" result = pd.read_sql(query, connect_sql())\n",
" return result"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## trx_denied_count_120"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def get_previous_trx_co(uids, obs_date):\n",
" query = f'''\n",
" SELECT \n",
" user_id\n",
" , COUNT(CASE WHEN datediff('{obs_date}', DATE(transaction_date)) <= 120\n",
" AND datediff('{obs_date}', DATE(transaction_date)) >=0\n",
" AND status = 5\n",
" THEN id ELSE NULL END) AS trx_denied_count_120\n",
"\n",
" FROM l2alpha.transaction\n",
" WHERE user_id IN {uids}\n",
" AND transaction_date < '{obs_date}'\n",
" GROUP BY user_id\n",
" '''\n",
" result = pd.read_sql(query, connect_sql())\n",
" return result"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## trx_merch_count_dist\n",
"- trx_merch_count_dist_60\n",
"- trx_merch_count_dist_120"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def get_count_merch_trx(uids, obs_date):\n",
" query = f'''\n",
" SELECT \n",
" base.user_id\n",
" , COUNT(DISTINCT(CASE WHEN datediff(DATE('{obs_date}'),start_date) <= 60 \n",
" AND datediff(DATE('{obs_date}'),start_date) >=0 \n",
" THEN base.merchant_id \n",
" ELSE NULL END)) as trx_merch_count_dist_60\n",
" , COUNT(DISTINCT(CASE WHEN datediff(DATE('{obs_date}'),start_date) <= 120 \n",
" AND datediff(DATE('{obs_date}'),start_date) >=0 \n",
" THEN base.merchant_id \n",
" ELSE NULL END)) as trx_merch_count_dist_120 \n",
" FROM\n",
" (SELECT \n",
" bs.*,\n",
" trx.merchant_id\n",
" FROM\n",
" (SELECT \n",
" user_id\n",
" , start_date\n",
" , transaction_id \n",
" FROM l2alpha.loan\n",
" WHERE is_active = 1 \n",
" AND loan_amount > 0\n",
" AND user_id in {uids} \n",
" AND start_date < '{obs_date}') AS bs\n",
" LEFT JOIN l2alpha.transaction as trx\n",
" ON bs.transaction_id = trx.id) as base\n",
" GROUP BY base.user_id\n",
" '''\n",
" result = pd.read_sql(query, connect_sql())\n",
" return result"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## trx_pl_count\n",
"- trx_pl_count_60\n",
"- trx_pl_count_90"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def get_count_pl(uids, obs_date):\n",
" query = f'''\n",
" SELECT \n",
" user_id\n",
" , COUNT(DISTINCT(CASE WHEN datediff(DATE('{obs_date}'),transaction_date) <= 60 \n",
" AND datediff(DATE('{obs_date}'),transaction_date) >=0 \n",
" THEN id \n",
" ELSE NULL END)) as trx_pl_count_60\n",
" , COUNT(DISTINCT(CASE WHEN datediff(DATE('{obs_date}'),transaction_date) <= 90 \n",
" AND datediff(DATE('{obs_date}'),transaction_date) >=0 \n",
" THEN id \n",
" ELSE NULL END)) as trx_pl_count_90 \n",
" FROM l2alpha.transaction\n",
" WHERE transaction_type = 2\n",
" AND status = 4 \n",
" AND user_id in {uids} \n",
" AND transaction_date < '{obs_date}'\n",
" GROUP BY user_id\n",
" '''\n",
" result = pd.read_sql(query, connect_sql())\n",
" return result"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## trx_suc_amt_sum\n",
"- trx_suc_amt_sum_30\n",
"- trx_suc_amt_sum_120"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def get_previous_trx_sum(uids, obs_date):\n",
" query = f'''\n",
" SELECT \n",
" user_id\n",
" , SUM(CASE WHEN datediff('{obs_date}', DATE(transaction_date)) <= 30\n",
" AND datediff('{obs_date}', DATE(transaction_date)) >=0\n",
" AND status = 4\n",
" THEN amount \n",
" ELSE NULL END) AS trx_suc_amt_sum_30\n",
" , SUM(CASE WHEN datediff('{obs_date}', DATE(transaction_date)) <= 120\n",
" AND datediff('{obs_date}', DATE(transaction_date)) >=0\n",
" AND status = 4\n",
" THEN amount \n",
" ELSE NULL END) AS trx_suc_amt_sum_120\n",
" FROM l2alpha.transaction\n",
" WHERE user_id IN {uids}\n",
" AND transaction_date < '{obs_date}'\n",
" GROUP BY user_id\n",
" '''\n",
" result = pd.read_sql(query, connect_sql())\n",
" return result"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## util_latest"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def get_utilization(uids, obs_date):\n",
"    \"\"\"util_latest: credit-line utilization per user at obs_date, capped at 1.\n",
"\n",
"    For each loan still open at obs_date, matches the loan's tenure (days) to a\n",
"    transaction_limit_history row and sums loan_amount / transaction_limit per user.\n",
"\n",
"    Fix: the limit-history CASE alias is now `installment_days` instead of\n",
"    `installment_type` — the old alias collided with the source column and made\n",
"    the GROUP BY ambiguous (MySQL warning 1052, visible in the sanity-check\n",
"    output). The mapping is one-to-one for the four known values, so grouping\n",
"    behavior for them is unchanged.\n",
"\n",
"    NOTE(review): latest_limit is MIN(creation_time), i.e. the EARLIEST history\n",
"    row — confirm the 'latest' naming; and the inner GROUP BY trx_id, loan_id\n",
"    relies on MySQL's loose GROUP BY for the non-grouped columns.\n",
"    \"\"\"\n",
"    query = f'''\n",
"    SELECT main.user_id,\n",
"           COALESCE(SUM(loan_amount/transaction_limit), 0) AS util_latest\n",
"    FROM\n",
"        (SELECT user_id,\n",
"                trx_id,\n",
"                loan_id,\n",
"                loan_amount,\n",
"                tenure,\n",
"                MIN(creation_time) AS latest_limit\n",
"         FROM\n",
"            (SELECT lo.user_id,\n",
"                    lo.transaction_id AS trx_id,\n",
"                    lo.loan_id,\n",
"                    loan_amount,\n",
"                    creation_time,\n",
"                    tenure\n",
"             FROM\n",
"                (SELECT id AS loan_id,\n",
"                        user_id,\n",
"                        start_date,\n",
"                        settlement_date,\n",
"                        loan_amount,\n",
"                        tenure,\n",
"                        transaction_id\n",
"                 FROM l2alpha.loan\n",
"                 WHERE user_id IN {uids}\n",
"                 AND is_active = 1\n",
"                 AND loan_amount > 0\n",
"                 AND start_date < '{obs_date}'\n",
"                 AND (settlement_date IS NULL OR settlement_date > '{obs_date}')\n",
"                ) AS lo\n",
"             LEFT JOIN\n",
"                (SELECT user_id,\n",
"                        CASE WHEN installment_type = '30_days' THEN 30\n",
"                             WHEN installment_type = '3_months' THEN 90\n",
"                             WHEN installment_type = '6_months' THEN 180\n",
"                             WHEN installment_type = '12_months' THEN 360\n",
"                             ELSE NULL END AS installment_days,\n",
"                        creation_time\n",
"                 FROM l2alpha.transaction_limit_history\n",
"                ) AS tlh\n",
"             ON tlh.user_id = lo.user_id AND tlh.installment_days = lo.tenure\n",
"            ) AS base\n",
"         GROUP BY trx_id, loan_id\n",
"        ) AS main\n",
"    LEFT JOIN\n",
"        (SELECT user_id,\n",
"                transaction_limit,\n",
"                CASE WHEN installment_type = '30_days' THEN 30\n",
"                     WHEN installment_type = '3_months' THEN 90\n",
"                     WHEN installment_type = '6_months' THEN 180\n",
"                     WHEN installment_type = '12_months' THEN 360\n",
"                     ELSE NULL END AS installment_days,\n",
"                creation_time\n",
"         FROM l2alpha.transaction_limit_history\n",
"         WHERE user_id IN {uids}\n",
"         GROUP BY user_id, transaction_limit, installment_days, creation_time\n",
"        ) AS lim\n",
"    ON lim.creation_time = main.latest_limit\n",
"    AND lim.user_id = main.user_id\n",
"    AND lim.installment_days = main.tenure\n",
"    GROUP BY main.user_id\n",
"    '''\n",
"    result = pd.read_sql(query, connect_sql())\n",
"    # Utilization above 100% is clipped to 1.\n",
"    result['util_latest'] = np.where(result['util_latest'] > 1, 1, result['util_latest'])\n",
"    return result"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"def get_features(df, obs_date):\n",
"    \"\"\"Build the full feature frame for every user_id in `df` as of obs_date.\n",
"\n",
"    Runs each feature query, prints its row count for eyeball checks, then\n",
"    left-merges everything onto the user_id list — users missing from a query\n",
"    result get NaN for that query's columns.\n",
"    \"\"\"\n",
"    st_tm = datetime.now()\n",
"\n",
"    # Build the SQL IN-list string explicitly: tuple() of a single user renders\n",
"    # as '(x,)', which MySQL rejects; int() also guards the downstream f-string\n",
"    # interpolation against non-numeric ids.\n",
"    uids = '({})'.format(', '.join(str(int(u)) for u in df['user_id'].unique()))\n",
"\n",
"    # (print label, frame name, fetch function, needs obs_date)\n",
"    steps = [\n",
"        ('Payday Loan feature', 'plo_df', get_payday_apps, False),\n",
"        ('Demographic features', 'de_df', get_demographic_feats, False),\n",
"        ('Delinquency features', 'delin_df', get_delin, True),\n",
"        ('Repayment features', 'rep_df', get_repayments, True),\n",
"        ('Fully Paid features', 'fully_paid_df', get_paid_trx, True),\n",
"        ('DOB features', 'dob_df', get_dob, True),\n",
"        ('DPD Snapshot features', 'dpd_df', get_dpd_snapshot, True),\n",
"        ('Trx info features', 'trx_info_df', get_trx_info, True),\n",
"        ('Trx Counts features', 'trx_counts_df', get_previous_trx_co, True),\n",
"        ('Merchant Trx features', 'merch_trx_df', get_count_merch_trx, True),\n",
"        ('PL Trx features', 'pl_trx_df', get_count_pl, True),\n",
"        ('Trx Sum features', 'trx_sum_df', get_previous_trx_sum, True),\n",
"        ('Utilization Trx features', 'util_df', get_utilization, True),\n",
"    ]\n",
"\n",
"    frames = []\n",
"    for label, name, func, needs_date in steps:\n",
"        print(label + ' start')\n",
"        feat_df = func(uids, obs_date) if needs_date else func(uids)\n",
"        print(name + ' size:', feat_df.shape[0])\n",
"        frames.append(feat_df)\n",
"\n",
"    print('Merge all dataframes')\n",
"    final_df = df[['user_id']]\n",
"    for feat_df in frames:\n",
"        final_df = final_df.merge(feat_df, how='left', on='user_id')\n",
"    print('Feature Creation DONE')\n",
"    print('Total time elapsed:', datetime.now() - st_tm)\n",
"    print()\n",
"    return final_df"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ap_co_plo',\n",
" 'de_education',\n",
" 'delin_dpd_max_90',\n",
" 'rep_count_30',\n",
" 'rep_full_paid_count_60',\n",
" 'snapshot_current_dpd',\n",
" 'snapshot_days_premium',\n",
" 'snapshot_dob',\n",
" 'snapshot_os_amount',\n",
" 'snapshot_os_principal',\n",
" 'trx_active_amt_avg',\n",
" 'trx_active_count',\n",
" 'trx_denied_count_120',\n",
" 'trx_merch_count_dist_120',\n",
" 'trx_merch_count_dist_60',\n",
" 'trx_pl_count_60',\n",
" 'trx_pl_count_90',\n",
" 'trx_suc_amt_sum_120',\n",
" 'trx_suc_amt_sum_30',\n",
" 'util_latest']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"features"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Sanity Check"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"base = pd.read_parquet('s3://finaccel-ml-model-data-production/zhilal/ltv_final/raw_cohorts/user_set_2018-07.parquet')\n",
"\n",
"# Make sure there is no user with user_status = 3 or Rejected \n",
"base = base.query(\"user_status in (2,5,11,9,7)\")\n",
"\n",
"base['user_status_code'] = base['user_status']\n",
"user_status_mapping = {2: 'activated', 5: 'suspended', 11:'blacklisted', 9:'closed', 7:'expired'}\n",
"base = base.replace({'user_status': user_status_mapping})\n",
"base['user_status'] = base['user_status'].astype('str')\n",
"\n",
"base['application_type_code'] = base['application_type']\n",
"base['application_type'] = np.where(base['application_type'].isin([150, 160, 170, 180]),\n",
" 'upgrade', np.where(base['application_type'].isin([100, 110, 120]), 'fresh_premium', 'basic'))\n",
"base['application_type'] = base['application_type'].astype('str')\n",
"\n",
"base = base.rename(columns={'first_dpd_dt_91':'first_dpd_91_dt'})"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"obs_date = '2019-04-01'"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"test = base.sample(250, random_state=99)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Payday Loan feature start\n",
"plo_df size: 250\n",
"Demographic features start\n",
"de_df size: 250\n",
"Delinquency features start\n",
"delin_df size: 211\n",
"Repayment features start\n",
"rep_df size: 205\n",
"Fully Paid features start\n",
"fully_paid_df size: 200\n",
"DOB features start\n",
"dob_df size: 250\n",
"DPD Snapshot features start\n",
"dpd_df size: 177\n",
"Trx info features start\n",
"trx_info_df size: 176\n",
"Trx Counts features start\n",
"trx_counts_df size: 224\n",
"Merchant Trx features start\n",
"merch_trx_df size: 211\n",
"PL Trx features start\n",
"pl_trx_df size: 83\n",
"Trx Sum features start\n",
"trx_sum_df size: 224\n",
"Utilization Trx features start\n",
"util_df size: 176\n",
"Merge all dataframes\n",
"Feature Creation DONE\n",
"Total time elapsed: 0:02:12.986406\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ec2-user/anaconda3/envs/ltv/lib/python3.8/site-packages/pymysql/cursors.py:329: Warning: (1052, \"Column 'installment_type' in group statement is ambiguous\")\n",
" self._do_get_result()\n"
]
}
],
"source": [
"checks = get_features(test, obs_date)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>ap_co_plo</th>\n",
" <th>de_education</th>\n",
" <th>delin_dpd_max_90</th>\n",
" <th>rep_count_30</th>\n",
" <th>rep_full_paid_count_60</th>\n",
" <th>snapshot_dob</th>\n",
" <th>snapshot_days_premium</th>\n",
" <th>snapshot_os_amount</th>\n",
" <th>snapshot_current_dpd</th>\n",
" <th>...</th>\n",
" <th>trx_active_count</th>\n",
" <th>trx_active_amt_avg</th>\n",
" <th>trx_denied_count_120</th>\n",
" <th>trx_merch_count_dist_60</th>\n",
" <th>trx_merch_count_dist_120</th>\n",
" <th>trx_pl_count_60</th>\n",
" <th>trx_pl_count_90</th>\n",
" <th>trx_suc_amt_sum_30</th>\n",
" <th>trx_suc_amt_sum_120</th>\n",
" <th>util_latest</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4001368</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>2.0</td>\n",
" <td>5.0</td>\n",
" <td>6.0</td>\n",
" <td>255</td>\n",
" <td>256.0</td>\n",
" <td>1310711.20</td>\n",
" <td>-4.0</td>\n",
" <td>...</td>\n",
" <td>2.0</td>\n",
" <td>6.957495e+05</td>\n",
" <td>3.0</td>\n",
" <td>5.0</td>\n",
" <td>5.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>344557.0</td>\n",
" <td>2919188.0</td>\n",
" <td>0.162211</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3922443</td>\n",
" <td>9</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>264</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4119795</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>249</td>\n",
" <td>NaN</td>\n",
" <td>525041.67</td>\n",
" <td>-18.0</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>5.000000e+05</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" <td>500000.0</td>\n",
" <td>2500000.0</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4027138</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>-1.0</td>\n",
" <td>3.0</td>\n",
" <td>68.0</td>\n",
" <td>257</td>\n",
" <td>-157.0</td>\n",
" <td>320997.00</td>\n",
" <td>-24.0</td>\n",
" <td>...</td>\n",
" <td>9.0</td>\n",
" <td>3.566667e+04</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1300000.0</td>\n",
" <td>3741322.0</td>\n",
" <td>0.534998</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4052800</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>160.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>256</td>\n",
" <td>NaN</td>\n",
" <td>2617789.78</td>\n",
" <td>161.0</td>\n",
" <td>...</td>\n",
" <td>10.0</td>\n",
" <td>2.201060e+05</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.880424</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>245</th>\n",
" <td>4176715</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>244</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>246</th>\n",
" <td>4107006</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>5.0</td>\n",
" <td>250</td>\n",
" <td>250.0</td>\n",
" <td>2082870.54</td>\n",
" <td>-14.0</td>\n",
" <td>...</td>\n",
" <td>4.0</td>\n",
" <td>8.024360e+05</td>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" <td>4.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>293924.0</td>\n",
" <td>625999.0</td>\n",
" <td>0.324064</td>\n",
" </tr>\n",
" <tr>\n",
" <th>247</th>\n",
" <td>4117848</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>247</td>\n",
" <td>247.0</td>\n",
" <td>4658910.86</td>\n",
" <td>-25.0</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>1.126952e+07</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.971511</td>\n",
" </tr>\n",
" <tr>\n",
" <th>248</th>\n",
" <td>4152725</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>160.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>246</td>\n",
" <td>NaN</td>\n",
" <td>513684.11</td>\n",
" <td>161.0</td>\n",
" <td>...</td>\n",
" <td>6.0</td>\n",
" <td>7.198583e+04</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.863830</td>\n",
" </tr>\n",
" <tr>\n",
" <th>249</th>\n",
" <td>3798310</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>274</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>250 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" user_id ap_co_plo de_education delin_dpd_max_90 rep_count_30 \\\n",
"0 4001368 0 5 2.0 5.0 \n",
"1 3922443 9 3 NaN NaN \n",
"2 4119795 0 4 0.0 1.0 \n",
"3 4027138 0 2 -1.0 3.0 \n",
"4 4052800 2 3 160.0 0.0 \n",
".. ... ... ... ... ... \n",
"245 4176715 0 3 NaN NaN \n",
"246 4107006 0 2 0.0 1.0 \n",
"247 4117848 0 2 0.0 1.0 \n",
"248 4152725 5 2 160.0 0.0 \n",
"249 3798310 1 3 NaN 0.0 \n",
"\n",
" rep_full_paid_count_60 snapshot_dob snapshot_days_premium \\\n",
"0 6.0 255 256.0 \n",
"1 NaN 264 NaN \n",
"2 1.0 249 NaN \n",
"3 68.0 257 -157.0 \n",
"4 0.0 256 NaN \n",
".. ... ... ... \n",
"245 NaN 244 NaN \n",
"246 5.0 250 250.0 \n",
"247 NaN 247 247.0 \n",
"248 0.0 246 NaN \n",
"249 0.0 274 NaN \n",
"\n",
" snapshot_os_amount snapshot_current_dpd ... trx_active_count \\\n",
"0 1310711.20 -4.0 ... 2.0 \n",
"1 NaN NaN ... NaN \n",
"2 525041.67 -18.0 ... 1.0 \n",
"3 320997.00 -24.0 ... 9.0 \n",
"4 2617789.78 161.0 ... 10.0 \n",
".. ... ... ... ... \n",
"245 NaN NaN ... NaN \n",
"246 2082870.54 -14.0 ... 4.0 \n",
"247 4658910.86 -25.0 ... 1.0 \n",
"248 513684.11 161.0 ... 6.0 \n",
"249 NaN NaN ... NaN \n",
"\n",
" trx_active_amt_avg trx_denied_count_120 trx_merch_count_dist_60 \\\n",
"0 6.957495e+05 3.0 5.0 \n",
"1 NaN 0.0 NaN \n",
"2 5.000000e+05 1.0 1.0 \n",
"3 3.566667e+04 0.0 2.0 \n",
"4 2.201060e+05 0.0 0.0 \n",
".. ... ... ... \n",
"245 NaN 0.0 NaN \n",
"246 8.024360e+05 0.0 4.0 \n",
"247 1.126952e+07 0.0 0.0 \n",
"248 7.198583e+04 0.0 0.0 \n",
"249 NaN 0.0 0.0 \n",
"\n",
" trx_merch_count_dist_120 trx_pl_count_60 trx_pl_count_90 \\\n",
"0 5.0 NaN NaN \n",
"1 NaN NaN NaN \n",
"2 1.0 2.0 3.0 \n",
"3 2.0 NaN NaN \n",
"4 0.0 NaN NaN \n",
".. ... ... ... \n",
"245 NaN NaN NaN \n",
"246 4.0 NaN NaN \n",
"247 0.0 NaN NaN \n",
"248 0.0 NaN NaN \n",
"249 0.0 NaN NaN \n",
"\n",
" trx_suc_amt_sum_30 trx_suc_amt_sum_120 util_latest \n",
"0 344557.0 2919188.0 0.162211 \n",
"1 NaN NaN NaN \n",
"2 500000.0 2500000.0 1.000000 \n",
"3 1300000.0 3741322.0 0.534998 \n",
"4 NaN NaN 0.880424 \n",
".. ... ... ... \n",
"245 NaN NaN NaN \n",
"246 293924.0 625999.0 0.324064 \n",
"247 NaN NaN 0.971511 \n",
"248 NaN NaN 0.863830 \n",
"249 NaN NaN NaN \n",
"\n",
"[250 rows x 21 columns]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"checks"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "conda_ltv",
"language": "python",
"name": "conda_ltv"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}