past-data-projects / personal_loan_credit_risk / 600-Threshold_Analysis / 603 - Threshold Generate Features Jan2020 Part 2.ipynb
603 - Threshold Generate Features Jan2020 Part 2.ipynb
Raw
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:36.988168Z",
     "start_time": "2019-11-07T03:55:36.706164Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pymysql\n",
    "from datetime import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:36.994804Z",
     "start_time": "2019-11-07T03:55:36.989531Z"
    }
   },
   "outputs": [],
   "source": [
    "import configparser\n",
    "config = configparser.ConfigParser()\n",
    "config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
    "\n",
    "host = config['MYSQL-ROOT']['HOST']\n",
    "user = config['MYSQL-ROOT']['USER']\n",
    "password = config['MYSQL-ROOT']['PASSWORD']\n",
    "\n",
    "def connect_sql():\n",
    "    cnx = pymysql.connect(host=host,\n",
    "                          user=user,\n",
    "                          password=password,\n",
    "                          cursorclass=pymysql.cursors.DictCursor)\n",
    "    return cnx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:37.026188Z",
     "start_time": "2019-11-07T03:55:36.997299Z"
    }
   },
   "outputs": [],
   "source": [
    "base_df = pd.read_parquet('data/pl_threshold_jan_11022020.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:37.034477Z",
     "start_time": "2019-11-07T03:55:37.027603Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(79853, 10)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:37.043587Z",
     "start_time": "2019-11-07T03:55:37.036542Z"
    }
   },
   "outputs": [],
   "source": [
    "uids = tuple(base_df['user_id'].unique().tolist())\n",
    "tids = tuple(base_df['trx_id'].unique().tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "78465"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(uids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "79853"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(tids)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Number of Previous PL Transactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_pl_history(base_df):\n",
    "    uids = tuple(base_df['user_id'])\n",
    "    max_trx_date = base_df['transaction_date'].max()\n",
    "\n",
    "    query = f\"\"\"\n",
    "        SELECT user_id,\n",
    "               id AS trx_id,\n",
    "               status,\n",
    "               payment_type,\n",
    "               amount,\n",
    "               transaction_date\n",
    "        FROM l2alpha.transaction\n",
    "        WHERE user_id IN {uids}\n",
    "        AND transaction_type = 2\n",
    "        AND transaction_date <= '{max_trx_date}'\n",
    "    \"\"\"\n",
    "\n",
    "    df = pd.read_sql(query, connect_sql())\n",
    "    df['gen_payment_type'] = df['payment_type'].apply(lambda x: 'mini' if x=='30_days' else 'jumbo')\n",
    "    \n",
    "    out_df = base_df[['user_id', 'trx_id', 'transaction_date']].merge(df, how='left', on='user_id')\\\n",
    "                     .rename(columns={'transaction_date_x':'trx_dt_current',\n",
    "                                      'transaction_date_y':'trx_dt_past',\n",
    "                                      'trx_id_x':'trx_id',\n",
    "                                      'trx_id_y':'trx_id_ref'})\n",
    "    out_df['time_diff'] = (out_df['trx_dt_current'] - out_df['trx_dt_past']).dt.total_seconds()\n",
    "    return out_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_prev_pl_feats(df, base_df, window='all'):\n",
    "    if isinstance(window, int):\n",
    "        df = df[(df['time_diff'] <= window*(24*60*60)) & (df['time_diff'] > 0)]\n",
    "        name = str(window)\n",
    "    elif window == 'all':\n",
    "        df = df[df['time_diff'] > 0]\n",
    "        name = 'ever'\n",
    "    \n",
    "    prev_df = df.groupby(['user_id', 'trx_id', 'status', 'gen_payment_type']).size()\\\n",
    "                .unstack(level=[3,2]).fillna(0)\n",
    "    prev_df.columns = [f'pl_trx_co_{col[0]}_{col[1]}_{name}' for col in prev_df.columns.values]\n",
    "    \n",
    "    keywords = [2, 4, 5, 6, 7, 11, 'jumbo', 'mini']\n",
    "    stat_cols_dict = {}\n",
    "    for word in keywords:\n",
    "        str_word = str(word) if isinstance(word, int) else word\n",
    "        stat_cols_dict[word] = [col for col in prev_df if str_word in col]\n",
    "    \n",
    "    name_status = 'pl_trx_co_jumbomini_{}_{}'\n",
    "    name_pl_type = 'pl_trx_co_{}_all_{}'\n",
    "    for k, v in stat_cols_dict.items():\n",
    "        if isinstance(k, int):\n",
    "            prev_df[name_status.format(str(k), name)] = prev_df[v].sum(axis=1)\n",
    "        elif isinstance(k, str):\n",
    "            prev_df[name_pl_type.format(k, name)] = prev_df[v].sum(axis=1)\n",
    "            \n",
    "    prev_df[f'pl_trx_co_jumbomini_all_{name}'] = prev_df[f'pl_trx_co_jumbo_all_{name}'] + \\\n",
    "                                         prev_df[f'pl_trx_co_mini_all_{name}']\n",
    "    \n",
    "    prev_df = base_df[['user_id', 'trx_id']].merge(prev_df, how='left', on=['user_id', 'trx_id']).fillna(0)\n",
    "    \n",
    "    return prev_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pl_hist = get_pl_history(base_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prev_obs_feats_90 = create_prev_pl_feats(pl_hist, base_df, window=90)\n",
    "prev_obs_feats_90 = prev_obs_feats_90.rename(columns={'pl_trx_co_jumbomini_4_90':'pl_trx_suc_co_90'})\n",
    "prev_obs_feats_90 = prev_obs_feats_90[['user_id', 'trx_id', 'pl_trx_suc_co_90']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(79853, 3)"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prev_obs_feats_90.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Early Settlement"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_trx_settle_data(base_df):\n",
    "    uids = tuple(base_df['user_id'])\n",
    "    max_trx_date = base_df['transaction_date'].max()\n",
    "\n",
    "    early_settle_q = f'''\n",
    "    SELECT *,\n",
    "            DATE(transaction_date) + INTERVAL payment_type_int DAY AS due_date\n",
    "    FROM\n",
    "        (SELECT tr.user_id, tr.trx_id, transaction_date, payment_type, transaction_type, status,\n",
    "                CASE WHEN payment_type = '30_days' THEN '30'\n",
    "                WHEN payment_type = '3_months' THEN '90'\n",
    "                WHEN payment_type = '6_months' THEN '180' \n",
    "                WHEN payment_type = '12_months' THEN '360' ELSE NULL END AS payment_type_int,\n",
    "\n",
    "                settlement_date, loan_amount, is_active\n",
    "        FROM\n",
    "            (SELECT user_id,\n",
    "                   id AS trx_id,\n",
    "                   transaction_date,\n",
    "                   payment_type,\n",
    "                   transaction_type,\n",
    "                   status\n",
    "            FROM l2alpha.`transaction`\n",
    "            WHERE user_id IN {uids}\n",
    "            AND transaction_type = 2\n",
    "            AND status = 4\n",
    "            AND transaction_date <= '{max_trx_date}'\n",
    "            ) AS tr\n",
    "\n",
    "            LEFT JOIN\n",
    "\n",
    "            (SELECT transaction_id AS trx_id,\n",
    "                   settlement_date,\n",
    "                   loan_amount,\n",
    "                   is_active\n",
    "            FROM l2alpha.loan\n",
    "            ) AS lo\n",
    "\n",
    "            ON tr.trx_id = lo.trx_id\n",
    "            ) AS base\n",
    "        WHERE loan_amount > 0 AND is_active = 1\n",
    "    '''\n",
    "    query_df = pd.read_sql(early_settle_q, connect_sql())\n",
    "    query_df['settle_to_due'] = (query_df['due_date'] - query_df['settlement_date'].dt.date).dt.days\n",
    "    df = base_df[['user_id', 'trx_id', 'transaction_date']].merge(query_df, how='left', on='user_id')\\\n",
    "                                         .rename(columns={'transaction_date_x':'trx_dt_current',\n",
    "                                          'transaction_date_y':'trx_dt_past',\n",
    "                                          'trx_id_x':'trx_id',\n",
    "                                          'trx_id_y':'trx_id_ref'})\n",
    "    df['time_diff'] = (df['trx_dt_current'] - df['trx_dt_past']).dt.total_seconds()\n",
    "    df = df.sort_values(['user_id', 'trx_dt_past']).reset_index(drop=True)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_early_settle_feats(df, base_df, window='all'):\n",
    "    if isinstance(window, int):\n",
    "        df = df[(df['time_diff'] <= window*(24*60*60)) & (df['time_diff'] > 0)]\n",
    "        name = str(window)\n",
    "    elif window == 'all':\n",
    "        df = df[df['time_diff'] > 0]\n",
    "        name = 'ever'\n",
    "    df = df.sort_values(['user_id', 'trx_id', 'trx_dt_past'])\n",
    "    \n",
    "    # Calculate how many times the user has settled early in the last XX days\n",
    "    co_df = df.copy()\n",
    "    co_df['flag_early_settle'] = np.where(co_df['settle_to_due'] >= 30, 1, 0)\n",
    "    co_df['flag_early_settle'] = np.where(co_df['settle_to_due'].isna(), np.nan, co_df['flag_early_settle'])\n",
    "    co_df = co_df.groupby('trx_id').agg({'flag_early_settle':'sum'}).reset_index()\n",
    "    co_df.columns = ['trx_id', f'early_settle_co_{name}']\n",
    "    \n",
    "    # Calculate the min, max, average, and median days the user has settled early in the last XX days\n",
    "    # Also get the last time the user settled early (how many days early).\n",
    "    es_df = df.groupby('trx_id').agg({'settle_to_due':[max, min, 'mean', 'median', 'last']})\n",
    "    col_names = ['settle_to_due_max_{}', 'settle_to_due_min_{}', 'settle_to_due_avg_{}', \n",
    "                 'settle_to_due_med_{}', 'settle_to_due_last_pl_{}']\n",
    "    es_df.columns = [col.format(name) for col in col_names]\n",
    "    es_df = es_df.reset_index()\n",
    "    out_df = base_df[['user_id', 'trx_id']].merge(es_df, how='left', on='trx_id')\\\n",
    "                                           .merge(co_df, how='left', on='trx_id')\n",
    "    return out_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trx_settle = get_trx_settle_data(base_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "early_settle_feats_180 = create_early_settle_feats(trx_settle, base_df, window=180)\n",
    "early_settle_feats_180 = early_settle_feats_180.rename(columns={'settle_to_due_last_pl_180':'pl_settle_to_due_last_180'})\n",
    "early_settle_feats_180 = early_settle_feats_180[['user_id', 'trx_id', 'pl_settle_to_due_last_180']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(79853, 3)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(early_settle_feats_180.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Time from last PL Settlement"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_time_from_last_pl_feat(df):\n",
    "    time_from_last_pl = df[df['time_diff'] >= 0]\n",
    "    time_from_last_pl.loc[(time_from_last_pl['time_diff'] == 0) & \n",
    "                          (time_from_last_pl['settlement_date'].notna()), 'settlement_date'] = np.nan\n",
    "    time_from_last_pl = time_from_last_pl.sort_values(['user_id', 'trx_id', 'trx_dt_past'])\n",
    "\n",
    "    time_from_last_pl_grp = time_from_last_pl.groupby('trx_id').tail(2)\n",
    "    time_from_last_pl_grp['settle_shift'] = time_from_last_pl_grp['settlement_date'].shift()\n",
    "    time_from_last_pl_grp['time_from_prev_pl_settle'] = (time_from_last_pl_grp['trx_dt_current'] - \\\n",
    "                                                          time_from_last_pl_grp['settle_shift']).dt.total_seconds()/3600\n",
    "    \n",
    "    time_from_last_pl_final = time_from_last_pl_grp.groupby('trx_id').tail(1)\n",
    "    time_from_last_pl_final = time_from_last_pl_final[['user_id', 'trx_id', \n",
    "                                                       'time_from_prev_pl_settle']].reset_index(drop=True)\n",
    "    return time_from_last_pl_final"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "time_from_prev_pl_settle = create_time_from_last_pl_feat(trx_settle)\n",
    "time_from_prev_pl_settle = time_from_prev_pl_settle.rename(columns={'time_from_prev_pl_settle':'time_from_last_sett_pl_hour'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(75884, 3)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(time_from_prev_pl_settle.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Date and day of week of transaction date"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "date_feats = base_df[['user_id', 'trx_id', 'transaction_date']].copy()\n",
    "date_feats['date_of_month'] = date_feats['transaction_date'].dt.day\n",
    "date_feats = date_feats[['user_id', 'trx_id', 'date_of_month']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(79853, 3)"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "date_feats.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Denied Transactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "denied_trx_q = f'''\n",
    "SELECT base.user_id,\n",
    "base.pl_trx_id AS trx_id,\n",
    "COUNT(DISTINCT(CASE WHEN datediff(DATE(pl_transaction_date),transaction_date) <= 90 AND \n",
    "               datediff(DATE(pl_transaction_date),transaction_date) >=0\n",
    "               THEN base.transaction_id ELSE NULL END)) AS trx_denied_co_90                \n",
    "FROM\n",
    "    (SELECT tr1.user_id,\n",
    "            tr1.id AS pl_trx_id,\n",
    "            tr1.pl_transaction_date,\n",
    "            tr2.id AS transaction_id,\n",
    "            tr2.transaction_date,\n",
    "            tr2.amount\n",
    "    FROM\n",
    "        (SELECT user_id, \n",
    "              id, \n",
    "              transaction_date AS pl_transaction_date\n",
    "        FROM l2alpha.`transaction` WHERE id IN {tids}\n",
    "        ) AS tr1\n",
    "    LEFT JOIN\n",
    "        (SELECT * \n",
    "        FROM l2alpha.`transaction`\n",
    "        WHERE status = 5\n",
    "        ) AS tr2 \n",
    "    ON tr1.user_id = tr2.user_id \n",
    "    AND tr2.transaction_date <= tr1.pl_transaction_date\n",
    "    ) AS base\n",
    "GROUP BY base.user_id, base.pl_transaction_date\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "denied_trx = pd.read_sql(denied_trx_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(79853, 3)"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "denied_trx.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "denied_trx.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Settled Transactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "paid_trx_q = f'''\n",
    "SELECT base.user_id,\n",
    "base.pl_trx_id AS trx_id,\n",
    "SUM(CASE WHEN datediff(DATE(pl_transaction_date),start_date) <= 30 AND \n",
    "                   datediff(DATE(pl_transaction_date),start_date) >=0 \n",
    "                   THEN base.loan_amount ELSE 0 END) AS trx_sett_sum_30\n",
    "FROM\n",
    "    (SELECT tr.user_id,\n",
    "     tr.id AS pl_trx_id,\n",
    "     tr.pl_transaction_date, \n",
    "     l.transaction_id,\n",
    "     l.start_date,\n",
    "     l.loan_amount,\n",
    "     l.tenure\n",
    "\n",
    "\tFROM \n",
    "\t    (SELECT user_id, \n",
    "\t            id, \n",
    "\t            transaction_date AS pl_transaction_date\n",
    "\t    FROM l2alpha.`transaction` WHERE id IN {tids}\n",
    "\t    ) AS tr\n",
    "\tINNER JOIN \n",
    "\t    (SELECT user_id,\n",
    "\t            start_date, \n",
    "\t            settlement_date,\n",
    "\t            loan_amount,\n",
    "\t            tenure,\n",
    "\t            transaction_id \n",
    "\t    FROM l2alpha.loan\n",
    "\t    WHERE is_active = 1 AND loan_amount > 0\n",
    "\t    ) AS l\n",
    "\tON l.user_id = tr.user_id AND settlement_date <= pl_transaction_date\n",
    "\t) AS base\n",
    "GROUP BY base.user_id, base.pl_transaction_date\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "paid_trx = pd.read_sql(paid_trx_q, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(64718, 3)"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "paid_trx.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "paid_trx.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combine all dataframes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:44.586494Z",
     "start_time": "2019-11-07T04:02:44.510100Z"
    }
   },
   "outputs": [],
   "source": [
    "final_df = base_df[['trx_id', 'user_id']]\\\n",
    "                  .merge(prev_obs_feats_90, how='left', on=['user_id', 'trx_id'])\\\n",
    "                  .merge(early_settle_feats_180, how='left', on=['user_id', 'trx_id'])\\\n",
    "                  .merge(time_from_prev_pl_settle, how='left', on=['user_id', 'trx_id'])\\\n",
    "                  .merge(date_feats, how='left', on=['user_id', 'trx_id'])\\\n",
    "                  .merge(denied_trx, how='left', on=['user_id', 'trx_id'])\\\n",
    "                  .merge(paid_trx, how='left', on=['user_id', 'trx_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:44.591815Z",
     "start_time": "2019-11-07T04:02:44.587913Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(79853, 8)"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(79853, 10)"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:44.617944Z",
     "start_time": "2019-11-07T04:02:44.593760Z"
    }
   },
   "outputs": [],
   "source": [
    "final_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "trx_id                         0.000000\n",
       "user_id                        0.000000\n",
       "pl_trx_suc_co_90               0.000000\n",
       "pl_settle_to_due_last_180      0.443352\n",
       "time_from_last_sett_pl_hour    0.351145\n",
       "date_of_month                  0.000000\n",
       "trx_denied_co_90               0.000000\n",
       "trx_sett_sum_30                0.189536\n",
       "dtype: float64"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_df.isna().mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:44.798454Z",
     "start_time": "2019-11-07T04:02:44.619626Z"
    }
   },
   "outputs": [],
   "source": [
    "final_df.to_parquet('data/pl_threshold_feats_11022020_part2.parquet', compression='gzip')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}