past-data-projects / life_time_value / 300-Feature_Transformations / 300-SplineCoxReg_Premium_YAML.ipynb
300-SplineCoxReg_Premium_YAML.ipynb
Raw
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports and environment setup.\n",
    "# NOTE(review): hardcoded absolute SageMaker path — breaks outside this instance.\n",
    "import sys\n",
    "sys.path.append('/home/ec2-user/SageMaker/zhilal_working_directory/15_ltv_final/utils')\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import boto3\n",
    "\n",
    "from datetime import timedelta\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "from lifelines.utils import datetimes_to_durations\n",
    "from lifelines import CoxPHFitter\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# project-local helpers: WoE binning and pipeline transformer classes\n",
    "from utils import woe, preprocessing as pr\n",
    "\n",
    "# NOTE(review): blanket suppression hides pandas FutureWarnings — consider narrowing\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cohort observation / censoring configuration (used by the cells below).\n",
    "obs_date = '2019-04-01'            # feature snapshot date; survival clock starts here\n",
    "spline_censor_date = '2020-03-01'  # dpd-91 events on/after this date are censored\n",
    "censor_date = '2020-08-01'         # NOTE(review): defined but never used in this notebook\n",
    "app_type = ['fresh_premium', 'upgrade']  # premium segments kept for this model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "s3 = boto3.client('s3')\n",
    "\n",
    "def get_matching_s3_keys(bucket, prefix='', suffix=''):\n",
    "    \"\"\"\n",
    "    Generate the keys in an S3 bucket.\n",
    "\n",
    "    :param bucket: Name of the S3 bucket.\n",
    "    :param prefix: Only fetch keys that start with this prefix (optional).\n",
    "    :param suffix: Only fetch keys that end with this suffix (optional).\n",
    "    \"\"\"\n",
    "    kwargs = {'Bucket': bucket, 'Prefix': prefix}\n",
    "    while True:\n",
    "        resp = s3.list_objects_v2(**kwargs)\n",
    "        # 'Contents' is absent when the prefix matches no keys; .get avoids a KeyError\n",
    "        for obj in resp.get('Contents', []):\n",
    "            key = obj['Key']\n",
    "            if key.endswith(suffix):\n",
    "                yield key\n",
    "\n",
    "        # list_objects_v2 pages at 1000 keys; follow the continuation token until exhausted\n",
    "        if 'NextContinuationToken' in resp:\n",
    "            kwargs['ContinuationToken'] = resp['NextContinuationToken']\n",
    "        else:\n",
    "            break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# collect every raw-cohort parquet key, then keep only the user_set files\n",
    "key_ls = list(get_matching_s3_keys(bucket='finaccel-ml-model-data-production',\n",
    "                                   prefix='zhilal/ltv_final/raw_cohorts',\n",
    "                                   suffix='.parquet'))\n",
    "main_ls = [k for k in key_ls if 'user_set_' in k]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# collect every monthly feature parquet key\n",
    "feat_key_ls = list(get_matching_s3_keys(bucket='finaccel-ml-model-data-production',\n",
    "                                        prefix='zhilal/ltv_final/features',\n",
    "                                        suffix='.parquet'))\n",
    "feat_ls = feat_key_ls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# keep the first 17 cohorts after sorting (2017-07 .. 2018-11, per the output below)\n",
    "# NOTE(review): the [:17] cutoff is positional — re-verify if files are added upstream\n",
    "main_ls = sorted(main_ls)[:17]\n",
    "feat_ls = sorted(feat_ls)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['zhilal/ltv_final/raw_cohorts/user_set_2017-07.parquet',\n",
       " 'zhilal/ltv_final/raw_cohorts/user_set_2017-08.parquet',\n",
       " 'zhilal/ltv_final/raw_cohorts/user_set_2017-09.parquet',\n",
       " 'zhilal/ltv_final/raw_cohorts/user_set_2017-10.parquet',\n",
       " 'zhilal/ltv_final/raw_cohorts/user_set_2017-11.parquet',\n",
       " 'zhilal/ltv_final/raw_cohorts/user_set_2017-12.parquet',\n",
       " 'zhilal/ltv_final/raw_cohorts/user_set_2018-01.parquet',\n",
       " 'zhilal/ltv_final/raw_cohorts/user_set_2018-02.parquet',\n",
       " 'zhilal/ltv_final/raw_cohorts/user_set_2018-03.parquet',\n",
       " 'zhilal/ltv_final/raw_cohorts/user_set_2018-04.parquet',\n",
       " 'zhilal/ltv_final/raw_cohorts/user_set_2018-05.parquet',\n",
       " 'zhilal/ltv_final/raw_cohorts/user_set_2018-06.parquet',\n",
       " 'zhilal/ltv_final/raw_cohorts/user_set_2018-07.parquet',\n",
       " 'zhilal/ltv_final/raw_cohorts/user_set_2018-08.parquet',\n",
       " 'zhilal/ltv_final/raw_cohorts/user_set_2018-09.parquet',\n",
       " 'zhilal/ltv_final/raw_cohorts/user_set_2018-10.parquet',\n",
       " 'zhilal/ltv_final/raw_cohorts/user_set_2018-11.parquet']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sanity-check the cohort keys that will be loaded\n",
    "main_ls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['zhilal/ltv_final/features/features_2017-07.parquet',\n",
       " 'zhilal/ltv_final/features/features_2017-08.parquet',\n",
       " 'zhilal/ltv_final/features/features_2017-09.parquet',\n",
       " 'zhilal/ltv_final/features/features_2017-10.parquet',\n",
       " 'zhilal/ltv_final/features/features_2017-11.parquet',\n",
       " 'zhilal/ltv_final/features/features_2017-12.parquet',\n",
       " 'zhilal/ltv_final/features/features_2018-01.parquet',\n",
       " 'zhilal/ltv_final/features/features_2018-02.parquet',\n",
       " 'zhilal/ltv_final/features/features_2018-03.parquet',\n",
       " 'zhilal/ltv_final/features/features_2018-04.parquet',\n",
       " 'zhilal/ltv_final/features/features_2018-05.parquet',\n",
       " 'zhilal/ltv_final/features/features_2018-06.parquet',\n",
       " 'zhilal/ltv_final/features/features_2018-07.parquet',\n",
       " 'zhilal/ltv_final/features/features_2018-08.parquet',\n",
       " 'zhilal/ltv_final/features/features_2018-09.parquet',\n",
       " 'zhilal/ltv_final/features/features_2018-10.parquet',\n",
       " 'zhilal/ltv_final/features/features_2018-11.parquet']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sanity-check the feature keys; they pair 1:1 with the cohort files above\n",
    "feat_ls"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 0. Get all data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### user data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "rel_cols = [\n",
    "    'user_id'\n",
    "    , 'set_password_timestamp'\n",
    "    , 'is_approved_timestamp'\n",
    "    , 'first_dpd_91_dt'\n",
    "    , 'first_trx_date'\n",
    "    , 'application_type'\n",
    "    , 'user_status'\n",
    "]\n",
    "\n",
    "# Load each monthly cohort, apply eligibility filters, and concatenate ONCE at\n",
    "# the end — pd.concat inside the loop copies the accumulated frame every pass\n",
    "# (quadratic in the number of cohorts).\n",
    "cohort_parts = []\n",
    "for key in main_ls:\n",
    "    temp_df = pd.read_parquet('s3://finaccel-ml-model-data-production/'+key, columns=rel_cols)\n",
    "    \n",
    "    # Make sure there is no user with user_status = 3 (Rejected) or 11 (blacklisted) \n",
    "    temp_df = temp_df.query(\"user_status in (2,5,9,7)\")\n",
    "    \n",
    "    # map raw application_type codes to segment labels, keeping the raw code\n",
    "    temp_df['application_type_code'] = temp_df['application_type']\n",
    "    temp_df['application_type'] = np.where(temp_df['application_type'].isin([150, 160, 170, 180]),\n",
    "                                        'upgrade', np.where(temp_df['application_type'].isin([100, 110, 120]), 'fresh_premium', 'basic'))\n",
    "    temp_df['application_type'] = temp_df['application_type'].astype('str')\n",
    "    \n",
    "    # determine app_type before obs_date (approvals after obs_date count as basic)\n",
    "    temp_df['app_type_b4_obsdt'] = np.where(temp_df['is_approved_timestamp'] > pd.to_datetime(obs_date),\n",
    "                                             'basic', temp_df['application_type'])\n",
    "    # Select specific app_type only\n",
    "    temp_df = temp_df[temp_df['app_type_b4_obsdt'].isin(app_type)].reset_index(drop=True)\n",
    "    \n",
    "    # remove users who NPL before obs_date\n",
    "    temp_df = temp_df[~(temp_df['first_dpd_91_dt'] < \n",
    "                      pd.to_datetime(obs_date))].reset_index(drop=True)\n",
    "\n",
    "    # remove users who have not made\n",
    "    # 1st trx before obs_date\n",
    "    temp_df = temp_df[~(temp_df['first_trx_date'] > pd.to_datetime(obs_date)) & \n",
    "                        temp_df['first_trx_date'].notna()]\n",
    "    \n",
    "    cohort_parts.append(temp_df)\n",
    "\n",
    "main_df = pd.concat(cohort_parts).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(108965, 9)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# cohort size after eligibility filtering (7 loaded cols + 2 derived)\n",
    "main_df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### feature data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# load all monthly feature files and deduplicate to one row per user\n",
    "temp_ls = []\n",
    "for feat_key in feat_ls:\n",
    "    temp_df = pd.read_parquet('s3://finaccel-ml-model-data-production/'+feat_key)\n",
    "    temp_ls.append(temp_df)\n",
    "    \n",
    "feat_df = pd.concat(temp_ls).reset_index(drop=True)\n",
    "# keep= is keyword-only since pandas 2.0 — passing 'first' positionally raises there\n",
    "feat_df = feat_df.drop_duplicates(subset='user_id', keep='first').reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(357579, 415)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# feature table: one row per user after dedup, 415 columns\n",
    "feat_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# left-join features onto the cohort; users absent from feat_df get NaN features\n",
    "main_df = main_df.merge(feat_df, how='left', on='user_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(108965, 423)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# row count unchanged by the left join (108965) — confirms one feature row per user\n",
    "main_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# free kernel memory: the merged main_df is the only frame used downstream\n",
    "del temp_ls, temp_df, feat_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# censor NPL events that happen on/after the spline censor date\n",
    "\n",
    "main_df['first_dpd_91_censor'] = np.where(main_df['first_dpd_91_dt'] < pd.to_datetime(spline_censor_date), \n",
    "                                         main_df['first_dpd_91_dt'], None)\n",
    "# unit-less 'datetime64' is rejected by pandas >= 2.0; be explicit about ns resolution\n",
    "main_df['first_dpd_91_censor'] = main_df['first_dpd_91_censor'].astype('datetime64[ns]')\n",
    "\n",
    "# also set the first_dpd_91 to 1st of month (snap to month end, then back to month begin)\n",
    "main_df['first_dpd_91_censor_bom'] = main_df['first_dpd_91_censor'] + pd.offsets.MonthEnd(0) - pd.offsets.MonthBegin(normalize=True)\n",
    "\n",
    "main_df['t0'] = pd.to_datetime(obs_date)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Duration in whole months from the observation month: an event in the obs month\n",
    "# itself is 1; censored users get the max horizon. The constants are derived from\n",
    "# the config dates instead of hardcoding 2019*12 - 3 and 11, so changing obs_date /\n",
    "# spline_censor_date keeps this cell consistent.\n",
    "_t0 = pd.to_datetime(obs_date)\n",
    "_censor = pd.to_datetime(spline_censor_date)\n",
    "_t0_months = _t0.year * 12 + _t0.month\n",
    "_max_duration = _censor.year * 12 + _censor.month - _t0_months  # = 11 for these dates\n",
    "\n",
    "main_df['duration_t0'] = np.where(main_df['first_dpd_91_censor'].isna(), _max_duration,\n",
    "                                  main_df['first_dpd_91_censor'].dt.year * 12 +\n",
    "                                  main_df['first_dpd_91_censor'].dt.month -\n",
    "                                  (_t0_months - 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# event indicator: True when a (within-censor-window) first dpd-91 date exists\n",
    "# NOTE(review): named 'flag_churn' but it flags the dpd-91 (NPL) event — confirm terminology\n",
    "main_df['flag_churn'] = main_df['first_dpd_91_censor'].notna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# cap utilization at 1 (NaN values pass through unchanged, as before)\n",
    "main_df['util_latest'] = main_df['util_latest'].clip(upper=1)\n",
    "\n",
    "# ordinal-encode education level; unmapped values become NaN\n",
    "education_mapping = {\n",
    "    'pre_high_school': 1,\n",
    "    'high_school': 2,\n",
    "    'diploma': 3,\n",
    "    'undergraduate': 4,\n",
    "    'postgraduate': 5,\n",
    "}\n",
    "main_df['de_education'] = main_df['de_education'].map(education_mapping)\n",
    "main_df['snapshot_days_premium'] = main_df['snapshot_days_premium'].fillna(120)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# two 90/10 splits -> ~81% train, ~9% val, 10% test; fixed seed for reproducibility\n",
    "train_val_temp, test_df = train_test_split(main_df, test_size=0.1, random_state=99)\n",
    "train_temp, val_df = train_test_split(train_val_temp, test_size=0.1, random_state=99)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train size: (88261, 428)\n",
      "val size: (9807, 428)\n",
      "test size: (10897, 428)\n"
     ]
    }
   ],
   "source": [
    "# sanity-check split sizes\n",
    "print('train size:', train_temp.shape)\n",
    "print('val size:', val_df.shape)\n",
    "print('test size:', test_df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(13239, 428)"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# to reduce the size of the YAML file,\n",
    "# the train data is sampled (15% of train, fixed seed)\n",
    "train_df = train_temp.sample(frac=0.15, random_state=99)\n",
    "train_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# model feature lists: 'c' = continuous (quantile-binned), 'd' = discrete\n",
    "cont_feats = [\n",
    "    'ap_co_plo',\n",
    "    'delin_dpd_max_90',\n",
    "    'rep_count_30',\n",
    "    'snapshot_current_dpd',\n",
    "    'snapshot_days_premium',\n",
    "    'snapshot_os_amount',\n",
    "    'trx_active_amt_avg',\n",
    "    'trx_denied_count_120',\n",
    "    'trx_merch_count_dist_60',\n",
    "    'trx_pl_count_90',\n",
    "    'trx_suc_amt_sum_30',\n",
    "    'util_latest',\n",
    "]\n",
    "\n",
    "disc_feats = [\n",
    "    'de_education'\n",
    "]\n",
    "\n",
    "# feature -> variable-type code, consumed by get_woe_bins and the pipeline builder\n",
    "feat_dict = {**{x:'c' for x in cont_feats},\n",
    "            **{x:'d' for x in disc_feats},\n",
    "            }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_woe_bins(train, qnt_num, feature_dict):\n",
    "    \"\"\"Return {feature: bins} for WoE encoding.\n",
    "\n",
    "    Continuous ('c') features get quantile edges (duplicate edges dropped);\n",
    "    discrete ('d') features get their unique observed values.\n",
    "\n",
    "    :param train: DataFrame holding the training columns to bin.\n",
    "    :param qnt_num: number of quantiles for continuous features.\n",
    "    :param feature_dict: mapping of feature name -> 'c' or 'd'.\n",
    "    :raises ValueError: on an unknown variable-type code.\n",
    "    \"\"\"\n",
    "    bin_dict = {}\n",
    "    for feature, v_type in feature_dict.items():\n",
    "        if v_type == 'c':\n",
    "            _, feat_bins = pd.qcut(train[feature], qnt_num, retbins=True, duplicates='drop')\n",
    "        elif v_type == 'd':\n",
    "            feat_bins = train[feature].unique().tolist()\n",
    "        else:\n",
    "            # previously an unknown v_type silently reused the prior iteration's bins\n",
    "            raise ValueError(f\"unknown variable type {v_type!r} for feature {feature!r}\")\n",
    "        bin_dict[feature] = feat_bins\n",
    "    return bin_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 10 quantile bins per continuous feature, fit on the sampled train set\n",
    "feature_bins = get_woe_bins(train_df, 10, feat_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit one WoE pipeline per feature: 'de_' features are discrete, everything else\n",
    "# is continuous and uses the quantile bins computed above.\n",
    "# (The old cell also bound pipe_transform to the Pipeline_transform CLASS up top —\n",
    "# a dead assignment immediately shadowed by the instance created at the end.)\n",
    "pipeline_dict = dict()\n",
    "\n",
    "for feature in feat_dict:\n",
    "    feat_series = train_df[feature]\n",
    "    target_series = train_df['flag_churn']\n",
    "\n",
    "    if feature.startswith('de_'):\n",
    "        tf_woe = pr.TransformerWoE(v_type='d')\n",
    "    else:\n",
    "        tf_woe = pr.TransformerWoE(v_type='c', bins=feature_bins[feature])\n",
    "    tf_woe.fit(feat_series, target_series)\n",
    "\n",
    "    pipeline_dict[feature] = pr.Pipeline_base([('tf_woe', tf_woe)])\n",
    "\n",
    "pipe_transform = pr.Pipeline_transform()\n",
    "pipe_transform.pipeline_dict = pipeline_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# add imputer for features that don't have missing values\n",
    "# in the train dataset, just in case\n",
    "impute_dict = {\n",
    "    'ap_co_plo':0,\n",
    "    'snapshot_days_premium':120,\n",
    "    'de_education':3,\n",
    "    'trx_merch_count_dist_60':0,\n",
    "    'trx_denied_count_120':0\n",
    "}\n",
    "\n",
    "for feature in impute_dict:\n",
    "    tf_imp = pr.TransformerImputer()\n",
    "    tf_imp.params['replacement_values'] = impute_dict[feature]\n",
    "    \n",
    "    imp_transformer = ('tf_imp', tf_imp)\n",
    "    \n",
    "    # prepend so imputation runs BEFORE the WoE step in each pipeline\n",
    "    pipe_transform.pipeline_dict[feature].steps = [imp_transformer] + pipe_transform.pipeline_dict[feature].steps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ap_co_plo                     0\n",
       "delin_dpd_max_90            504\n",
       "rep_count_30                 43\n",
       "snapshot_current_dpd       1352\n",
       "snapshot_days_premium         0\n",
       "snapshot_os_amount         1352\n",
       "trx_active_amt_avg         1362\n",
       "trx_denied_count_120          0\n",
       "trx_merch_count_dist_60       0\n",
       "trx_pl_count_90            5691\n",
       "trx_suc_amt_sum_30         4632\n",
       "util_latest                1362\n",
       "de_education                  0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# missing-value counts per model feature; the impute_dict above covers the\n",
    "# zero-count columns. NOTE(review): columns still showing NaN here presumably\n",
    "# rely on the WoE transformer's missing handling — confirm in pr.TransformerWoE\n",
    "train_df[cont_feats+disc_feats].isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "13\n",
      "13\n"
     ]
    }
   ],
   "source": [
    "# confirm every feature in feat_dict got a fitted pipeline\n",
    "print(len(feat_dict))\n",
    "print(len(pipe_transform.pipeline_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save the transformer YAML\n",
    "# NOTE(review): written to the notebook's working directory — confirm where the\n",
    "# production scoring job expects to find it\n",
    "pipe_transform.to_yaml('spline_cox_regression_transformer_premium.yaml')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train size: (13239, 15)\n",
      "val size: (9807, 15)\n",
      "test size: (10897, 15)\n"
     ]
    }
   ],
   "source": [
    "# transform features\n",
    "final_feats = sorted(cont_feats+disc_feats)\n",
    "\n",
    "# keep only the survival targets + model features, then apply the fitted\n",
    "# impute->WoE pipelines to each split\n",
    "train = train_df.loc[:, ['duration_t0','flag_churn']+final_feats]\n",
    "val = val_df.loc[:, ['duration_t0','flag_churn']+final_feats]\n",
    "test = test_df.loc[:, ['duration_t0','flag_churn']+final_feats]\n",
    "\n",
    "train = pipe_transform.transform(train)\n",
    "val = pipe_transform.transform(val)\n",
    "test = pipe_transform.transform(test)\n",
    "\n",
    "print('train size:', train.shape)\n",
    "print('val size:', val.shape)\n",
    "print('test size:', test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save the train dataset for production\n",
    "# (index dropped so the parquet file carries no stale sampled-row labels)\n",
    "train = train.reset_index(drop=True)\n",
    "train.to_parquet('spline_cox_regression_train_dataset_premium.parquet')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}