{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('/home/ec2-user/SageMaker/zhilal_working_directory/15_ltv_final/utils')\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import boto3\n",
"\n",
"from datetime import timedelta\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"from lifelines.utils import datetimes_to_durations\n",
"from lifelines import CoxPHFitter\n",
"\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from utils import woe, preprocessing as pr\n",
"\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Parameters"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Snapshot date: users and features are taken as of this date\n",
"obs_date = '2019-04-01'\n",
"# NPL (dpd 91+) events on/after this date are censored for the spline model\n",
"spline_censor_date = '2020-03-01'\n",
"censor_date = '2020-08-01'\n",
"# application-type buckets to keep (see the mapping in the cohort-load cell)\n",
"app_type = ['fresh_premium', 'upgrade']"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"s3 = boto3.client('s3')\n",
"\n",
"def get_matching_s3_keys(bucket, prefix='', suffix=''):\n",
"    \"\"\"\n",
"    Generate the keys in an S3 bucket, paginating through every page.\n",
"\n",
"    :param bucket: Name of the S3 bucket.\n",
"    :param prefix: Only fetch keys that start with this prefix (optional).\n",
"    :param suffix: Only fetch keys that end with this suffix (optional).\n",
"    \"\"\"\n",
"    kwargs = {'Bucket': bucket, 'Prefix': prefix}\n",
"    while True:\n",
"        resp = s3.list_objects_v2(**kwargs)\n",
"        # A prefix that matches nothing returns a response without a\n",
"        # 'Contents' key at all, so guard with .get() instead of indexing\n",
"        for obj in resp.get('Contents', []):\n",
"            key = obj['Key']\n",
"            if key.endswith(suffix):\n",
"                yield key\n",
"\n",
"        # Absence of the continuation token means this was the last page\n",
"        if 'NextContinuationToken' not in resp:\n",
"            break\n",
"        kwargs['ContinuationToken'] = resp['NextContinuationToken']"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# materialize all matching cohort keys, then keep only the user-set files\n",
"key_ls = list(get_matching_s3_keys(bucket='finaccel-ml-model-data-production',\n",
"                                   prefix='zhilal/ltv_final/raw_cohorts',\n",
"                                   suffix='.parquet'))\n",
"main_ls = [key for key in key_ls if 'user_set_' in key]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# materialize all matching feature keys\n",
"feat_key_ls = list(get_matching_s3_keys(bucket='finaccel-ml-model-data-production',\n",
"                                        prefix='zhilal/ltv_final/features',\n",
"                                        suffix='.parquet'))\n",
"# alias kept for symmetry with main_ls\n",
"feat_ls = feat_key_ls"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# keep the first 17 cohort files (2017-07 through 2018-11, per the listing\n",
"# below) so every cohort is fully observed before obs_date\n",
"main_ls = sorted(main_ls)[:17]\n",
"feat_ls = sorted(feat_ls)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['zhilal/ltv_final/raw_cohorts/user_set_2017-07.parquet',\n",
" 'zhilal/ltv_final/raw_cohorts/user_set_2017-08.parquet',\n",
" 'zhilal/ltv_final/raw_cohorts/user_set_2017-09.parquet',\n",
" 'zhilal/ltv_final/raw_cohorts/user_set_2017-10.parquet',\n",
" 'zhilal/ltv_final/raw_cohorts/user_set_2017-11.parquet',\n",
" 'zhilal/ltv_final/raw_cohorts/user_set_2017-12.parquet',\n",
" 'zhilal/ltv_final/raw_cohorts/user_set_2018-01.parquet',\n",
" 'zhilal/ltv_final/raw_cohorts/user_set_2018-02.parquet',\n",
" 'zhilal/ltv_final/raw_cohorts/user_set_2018-03.parquet',\n",
" 'zhilal/ltv_final/raw_cohorts/user_set_2018-04.parquet',\n",
" 'zhilal/ltv_final/raw_cohorts/user_set_2018-05.parquet',\n",
" 'zhilal/ltv_final/raw_cohorts/user_set_2018-06.parquet',\n",
" 'zhilal/ltv_final/raw_cohorts/user_set_2018-07.parquet',\n",
" 'zhilal/ltv_final/raw_cohorts/user_set_2018-08.parquet',\n",
" 'zhilal/ltv_final/raw_cohorts/user_set_2018-09.parquet',\n",
" 'zhilal/ltv_final/raw_cohorts/user_set_2018-10.parquet',\n",
" 'zhilal/ltv_final/raw_cohorts/user_set_2018-11.parquet']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"main_ls"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['zhilal/ltv_final/features/features_2017-07.parquet',\n",
" 'zhilal/ltv_final/features/features_2017-08.parquet',\n",
" 'zhilal/ltv_final/features/features_2017-09.parquet',\n",
" 'zhilal/ltv_final/features/features_2017-10.parquet',\n",
" 'zhilal/ltv_final/features/features_2017-11.parquet',\n",
" 'zhilal/ltv_final/features/features_2017-12.parquet',\n",
" 'zhilal/ltv_final/features/features_2018-01.parquet',\n",
" 'zhilal/ltv_final/features/features_2018-02.parquet',\n",
" 'zhilal/ltv_final/features/features_2018-03.parquet',\n",
" 'zhilal/ltv_final/features/features_2018-04.parquet',\n",
" 'zhilal/ltv_final/features/features_2018-05.parquet',\n",
" 'zhilal/ltv_final/features/features_2018-06.parquet',\n",
" 'zhilal/ltv_final/features/features_2018-07.parquet',\n",
" 'zhilal/ltv_final/features/features_2018-08.parquet',\n",
" 'zhilal/ltv_final/features/features_2018-09.parquet',\n",
" 'zhilal/ltv_final/features/features_2018-10.parquet',\n",
" 'zhilal/ltv_final/features/features_2018-11.parquet']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feat_ls"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 0. Get all data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### user data"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"rel_cols = [\n",
"    'user_id'\n",
"    , 'set_password_timestamp'\n",
"    , 'is_approved_timestamp'\n",
"    , 'first_dpd_91_dt'\n",
"    , 'first_trx_date'\n",
"    , 'application_type'\n",
"    , 'user_status'\n",
"]\n",
"\n",
"# hoist the loop-invariant conversion out of the loop\n",
"obs_dt = pd.to_datetime(obs_date)\n",
"\n",
"# Accumulate per-cohort frames and concat ONCE at the end: growing a\n",
"# DataFrame with pd.concat inside the loop is quadratic in total rows.\n",
"cohort_frames = []\n",
"for key in main_ls:\n",
"    temp_df = pd.read_parquet('s3://finaccel-ml-model-data-production/'+key, columns=rel_cols)\n",
"\n",
"    # Make sure there is no user with user_status = 3 (Rejected) or 11 (blacklisted)\n",
"    temp_df = temp_df.query(\"user_status in (2,5,9,7)\")\n",
"\n",
"    # map raw application_type codes to coarse buckets, keeping the raw code\n",
"    temp_df['application_type_code'] = temp_df['application_type']\n",
"    temp_df['application_type'] = np.where(temp_df['application_type'].isin([150, 160, 170, 180]),\n",
"        'upgrade', np.where(temp_df['application_type'].isin([100, 110, 120]), 'fresh_premium', 'basic'))\n",
"    temp_df['application_type'] = temp_df['application_type'].astype('str')\n",
"\n",
"    # users approved after obs_date were still 'basic' at observation time\n",
"    temp_df['app_type_b4_obsdt'] = np.where(temp_df['is_approved_timestamp'] > obs_dt,\n",
"                                            'basic', temp_df['application_type'])\n",
"    # Select specific app_type only\n",
"    temp_df = temp_df[temp_df['app_type_b4_obsdt'].isin(app_type)].reset_index(drop=True)\n",
"\n",
"    # remove users who NPL before obs_date (~ keeps NaT rows too)\n",
"    temp_df = temp_df[~(temp_df['first_dpd_91_dt'] < obs_dt)].reset_index(drop=True)\n",
"\n",
"    # keep only users whose first transaction exists and happened\n",
"    # on/before obs_date\n",
"    temp_df = temp_df[~(temp_df['first_trx_date'] > obs_dt) &\n",
"                      temp_df['first_trx_date'].notna()]\n",
"\n",
"    cohort_frames.append(temp_df)\n",
"\n",
"main_df = pd.concat(cohort_frames).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(108965, 9)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"main_df.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### feature data"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# load every monthly feature file and stack them\n",
"temp_ls = []\n",
"for feat_key in feat_ls:\n",
"    temp_df = pd.read_parquet('s3://finaccel-ml-model-data-production/'+feat_key)\n",
"    temp_ls.append(temp_df)\n",
"\n",
"feat_df = pd.concat(temp_ls).reset_index(drop=True)\n",
"# keep each user's earliest feature row; pass keep= by keyword —\n",
"# positional `keep` is deprecated and keyword-only in pandas >= 2.0\n",
"feat_df = feat_df.drop_duplicates(subset='user_id', keep='first').reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(357579, 415)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feat_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# left join keeps every selected user even if their feature row is missing\n",
"main_df = main_df.merge(feat_df, how='left', on='user_id')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(108965, 423)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"main_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# free memory: per-file frames and the merged-in feature table\n",
"del temp_ls, temp_df, feat_df"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# censor NPL events that happen on/after the spline censor date\n",
"\n",
"main_df['first_dpd_91_censor'] = np.where(main_df['first_dpd_91_dt'] < pd.to_datetime(spline_censor_date), \n",
"                                          main_df['first_dpd_91_dt'], None)\n",
"# pd.to_datetime instead of .astype('datetime64'): unit-less datetime64\n",
"# casts are deprecated and raise in pandas >= 2.0\n",
"main_df['first_dpd_91_censor'] = pd.to_datetime(main_df['first_dpd_91_censor'])\n",
"\n",
"# snap the (censored) NPL date to the first day of its month; equivalent to\n",
"# the MonthEnd(0) - MonthBegin(normalize=True) offset trick but explicit\n",
"main_df['first_dpd_91_censor_bom'] = main_df['first_dpd_91_censor'].dt.to_period('M').dt.to_timestamp()\n",
"\n",
"main_df['t0'] = pd.to_datetime(obs_date)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Duration in whole months from obs_date; the obs_date month counts as 1,\n",
"# so an event in Apr-2019 -> 1 and the last observable month (Feb-2020,\n",
"# just before spline_censor_date) -> 11. Censored users get that same last\n",
"# month. Derived from the config dates instead of the previous hard-coded\n",
"# 11 and 2019*12 + 3, which silently broke if obs_date changed.\n",
"obs_dt = pd.to_datetime(obs_date)\n",
"censor_dt = pd.to_datetime(spline_censor_date)\n",
"censor_duration = (censor_dt.year - obs_dt.year) * 12 + censor_dt.month - obs_dt.month\n",
"\n",
"main_df['duration_t0'] = np.where(\n",
"    main_df['first_dpd_91_censor'].isna(), censor_duration,\n",
"    (main_df['first_dpd_91_censor'].dt.year - obs_dt.year) * 12 +\n",
"    main_df['first_dpd_91_censor'].dt.month - obs_dt.month + 1)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# event indicator for the Cox model: True when a (non-censored) NPL date exists\n",
"main_df['flag_churn'] = main_df['first_dpd_91_censor'].notna()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# cap utilisation at 100%; NaN passes through unchanged, same as np.where\n",
"main_df['util_latest'] = main_df['util_latest'].clip(upper=1)\n",
"\n",
"# ordinal-encode education level (unmapped values become NaN)\n",
"education_mapping = {'pre_high_school': 1, \n",
"                     'high_school': 2, \n",
"                     'diploma': 3,\n",
"                     'undergraduate': 4, \n",
"                     'postgraduate': 5}\n",
"main_df['de_education'] = main_df['de_education'].map(education_mapping)\n",
"\n",
"# default premium tenure of 120 days where missing\n",
"main_df['snapshot_days_premium'] = main_df['snapshot_days_premium'].fillna(120)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# 81/9/10 split: carve off 10% test, then 10% of the remainder as validation\n",
"train_val_temp, test_df = train_test_split(main_df, test_size=0.1, random_state=99)\n",
"train_temp, val_df = train_test_split(train_val_temp, test_size=0.1, random_state=99)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"train size: (88261, 428)\n",
"val size: (9807, 428)\n",
"test size: (10897, 428)\n"
]
}
],
"source": [
"print('train size:', train_temp.shape)\n",
"print('val size:', val_df.shape)\n",
"print('test size:', test_df.shape)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(13239, 428)"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# to reduce the size of the YAML file,\n",
"# the train data is sampled\n",
"train_df = train_temp.sample(frac=0.15, random_state=99)\n",
"train_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# Model features. Continuous features ('c') are WoE-binned by quantiles;\n",
"# discrete features ('d') are WoE-encoded per category.\n",
"cont_feats = [\n",
"    'ap_co_plo',\n",
"    'delin_dpd_max_90',\n",
"    'rep_count_30',\n",
"    'snapshot_current_dpd',\n",
"    'snapshot_days_premium',\n",
"    'snapshot_os_amount',\n",
"    'trx_active_amt_avg',\n",
"    'trx_denied_count_120',\n",
"    'trx_merch_count_dist_60',\n",
"    'trx_pl_count_90',\n",
"    'trx_suc_amt_sum_30',\n",
"    'util_latest',\n",
"]\n",
"\n",
"disc_feats = [\n",
"    'de_education'\n",
"]\n",
"\n",
"# feature name -> variable type, consumed by get_woe_bins / TransformerWoE\n",
"feat_dict = {**{x:'c' for x in cont_feats},\n",
"             **{x:'d' for x in disc_feats},\n",
"             }"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"def get_woe_bins(train, qnt_num, feature_dict):\n",
"    \"\"\"\n",
"    Compute WoE binning boundaries for each feature.\n",
"\n",
"    :param train: DataFrame containing the feature columns.\n",
"    :param qnt_num: number of quantile bins for continuous features.\n",
"    :param feature_dict: mapping feature name -> 'c' (continuous) or 'd' (discrete).\n",
"    :return: dict of feature name -> bin edges ('c') or category list ('d').\n",
"    \"\"\"\n",
"    bin_dict = {}\n",
"    for feature, v_type in feature_dict.items():\n",
"        if v_type == 'c':\n",
"            # duplicates='drop' guards against low-cardinality features\n",
"            _, feat_bins = pd.qcut(train[feature], qnt_num, retbins=True, duplicates='drop')\n",
"        elif v_type == 'd':\n",
"            feat_bins = train[feature].unique().tolist()\n",
"        else:\n",
"            # without this, an unknown type silently reused the previous\n",
"            # feature's bins (or raised NameError on the first feature)\n",
"            raise ValueError(f\"unknown variable type {v_type!r} for feature {feature!r}\")\n",
"        bin_dict[feature] = feat_bins\n",
"    return bin_dict"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# 10 quantile bins per continuous feature\n",
"feature_bins = get_woe_bins(train_df, 10, feat_dict)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"# Build one WoE pipeline per feature. The previous version bound\n",
"# pipe_transform to the Pipeline_transform CLASS first (dead code, it was\n",
"# overwritten with an instance below), re-bound Pipeline_base on every\n",
"# iteration, and duplicated the identical `steps` assignment in both\n",
"# branches — all removed here.\n",
"pipeline_base = pr.Pipeline_base\n",
"pipeline_dict = dict()\n",
"\n",
"target_series = train_df['flag_churn']\n",
"\n",
"for feature in feat_dict:\n",
"    feat_series = train_df[feature]\n",
"\n",
"    # demographic ('de_') features are discrete; the rest are continuous\n",
"    # with the precomputed quantile bins\n",
"    if feature.startswith('de_'):\n",
"        tf_woe = pr.TransformerWoE(v_type='d')\n",
"    else:\n",
"        tf_woe = pr.TransformerWoE(v_type='c', bins=feature_bins[feature])\n",
"    tf_woe.fit(feat_series, target_series)\n",
"\n",
"    pipeline_dict[feature] = pipeline_base([('tf_woe', tf_woe)])\n",
"\n",
"pipe_transform = pr.Pipeline_transform()\n",
"pipe_transform.pipeline_dict = pipeline_dict"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# add imputer for features that don't have missing values\n",
"# in the train dataset, just in case\n",
"impute_dict = {\n",
"    'ap_co_plo': 0,\n",
"    'snapshot_days_premium': 120,\n",
"    'de_education': 3,\n",
"    'trx_merch_count_dist_60': 0,\n",
"    'trx_denied_count_120': 0\n",
"}\n",
"\n",
"for feature, fill_value in impute_dict.items():\n",
"    tf_imp = pr.TransformerImputer()\n",
"    tf_imp.params['replacement_values'] = fill_value\n",
"\n",
"    # prepend so imputation runs before the WoE transform\n",
"    pipe_transform.pipeline_dict[feature].steps.insert(0, ('tf_imp', tf_imp))"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ap_co_plo 0\n",
"delin_dpd_max_90 504\n",
"rep_count_30 43\n",
"snapshot_current_dpd 1352\n",
"snapshot_days_premium 0\n",
"snapshot_os_amount 1352\n",
"trx_active_amt_avg 1362\n",
"trx_denied_count_120 0\n",
"trx_merch_count_dist_60 0\n",
"trx_pl_count_90 5691\n",
"trx_suc_amt_sum_30 4632\n",
"util_latest 1362\n",
"de_education 0\n",
"dtype: int64"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df[cont_feats+disc_feats].isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"13\n",
"13\n"
]
}
],
"source": [
"print(len(feat_dict))\n",
"print(len(pipe_transform.pipeline_dict))"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"# save the transformer YAML\n",
"pipe_transform.to_yaml('spline_cox_regression_transformer_premium.yaml')"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"train size: (13239, 15)\n",
"val size: (9807, 15)\n",
"test size: (10897, 15)\n"
]
}
],
"source": [
"# select targets + features, then run the impute/WoE pipeline on each split\n",
"final_feats = sorted(cont_feats + disc_feats)\n",
"model_cols = ['duration_t0', 'flag_churn'] + final_feats\n",
"\n",
"train = pipe_transform.transform(train_df.loc[:, model_cols])\n",
"val = pipe_transform.transform(val_df.loc[:, model_cols])\n",
"test = pipe_transform.transform(test_df.loc[:, model_cols])\n",
"\n",
"for split_name, frame in [('train', train), ('val', val), ('test', test)]:\n",
"    print(split_name + ' size:', frame.shape)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# save the train dataset for production\n",
"train = train.reset_index(drop=True)\n",
"train.to_parquet('spline_cox_regression_train_dataset_premium.parquet')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}