past-data-projects / personal_loan_credit_risk / 600-Threshold_Analysis / 603 - Threshold Generate Features Jan2020 Part 3.ipynb
603 - Threshold Generate Features Jan2020 Part 3.ipynb
Raw
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:36.988168Z",
     "start_time": "2019-11-07T03:55:36.706164Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pymysql\n",
    "from datetime import datetime\n",
    "from os import listdir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:36.994804Z",
     "start_time": "2019-11-07T03:55:36.989531Z"
    }
   },
   "outputs": [],
   "source": [
    "import configparser\n",
    "config = configparser.ConfigParser()\n",
    "config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
    "\n",
    "host = config['MYSQL-ROOT']['HOST']\n",
    "user = config['MYSQL-ROOT']['USER']\n",
    "password = config['MYSQL-ROOT']['PASSWORD']\n",
    "\n",
    "def connect_sql():\n",
    "    \"\"\"Open a new MySQL connection using the credentials loaded from config.ini.\n",
    "\n",
    "    DictCursor makes every fetched row a dict keyed by column name.\n",
    "    NOTE: callers are responsible for closing the returned connection.\n",
    "    \"\"\"\n",
    "    cnx = pymysql.connect(host=host,\n",
    "                          user=user,\n",
    "                          password=password,\n",
    "                          cursorclass=pymysql.cursors.DictCursor)\n",
    "    return cnx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "FOLDER_LOC = 'data/thresh_dgtl_trx_feats'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:37.026188Z",
     "start_time": "2019-11-07T03:55:36.997299Z"
    }
   },
   "outputs": [],
   "source": [
    "base_df = pd.read_parquet('data/pl_threshold_jan_11022020.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:37.034477Z",
     "start_time": "2019-11-07T03:55:37.027603Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(79853, 10)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T03:55:37.043587Z",
     "start_time": "2019-11-07T03:55:37.036542Z"
    }
   },
   "outputs": [],
   "source": [
    "uids = tuple(base_df['user_id'].unique().tolist())\n",
    "tids = tuple(base_df['trx_id'].unique().tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "78465"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(uids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "79853"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(tids)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Digital and Non-Digital Transactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_trx_history(base_df):\n",
    "    \"\"\"Pull every user's past purchase history and pair it with current rows.\n",
    "\n",
    "    Joins transaction, product, loan and merchant tables (excluding fee and\n",
    "    promo line items), then left-merges on user_id so each current\n",
    "    transaction in base_df is paired with all of that user's transactions up\n",
    "    to max(transaction_date). Adds time_diff = seconds between the current\n",
    "    and the past transaction (positive means the past one is earlier).\n",
    "    \"\"\"\n",
    "    # NOTE(review): a 1-element tuple renders as '(x,)' which is invalid SQL\n",
    "    # IN syntax -- assumed every batch has >1 distinct user; confirm for the\n",
    "    # final partial batch.\n",
    "    uids = tuple(base_df['user_id'])\n",
    "    max_trx_date = base_df['transaction_date'].max()\n",
    "    \n",
    "    # Remove all promotional and service related fees\n",
    "    excluded_product = ('biaya layanan'\n",
    "                        ,'shipping fee'\n",
    "                        ,'admin fee' \n",
    "                        ,'biaya kirim yang dibebankan' \n",
    "                        ,'biaya tambahan' \n",
    "                        ,'discount'             \n",
    "                        ,'adminfee'       \n",
    "                        ,'shippingfee'\n",
    "                        ,'additionalfee'                    \n",
    "                        ,'diskon tambahan' \n",
    "                        ,'donasi'\n",
    "                        ,'donation'                    \n",
    "                        ,'charge kredivo 2.3%'  \n",
    "                        ,'lainnya'   \n",
    "                        ,'user credit'\n",
    "                        ,'deals')  \n",
    "    \n",
    "    query = f\"\"\"\n",
    "    SELECT\n",
    "        t.user_id,\n",
    "        t.transaction_id AS trx_id,\n",
    "        t.transaction_date,\n",
    "        LOWER(p.name) AS product_name,\n",
    "        LOWER(p.category) AS category,\n",
    "        l.loan_amount,\n",
    "        t.merchant_id\n",
    "    FROM\n",
    "            (\n",
    "            SELECT\n",
    "                user_id,\n",
    "                id AS transaction_id,\n",
    "                transaction_date,\n",
    "                merchant_id\n",
    "            FROM\n",
    "                l2alpha.`transaction`\n",
    "            WHERE\n",
    "                user_id IN {uids}\n",
    "                AND status = 4\n",
    "                AND transaction_type <> 2\n",
    "                AND transaction_date <= '{max_trx_date}') AS t\n",
    "        JOIN (\n",
    "            SELECT\n",
    "                name,\n",
    "                id AS product_id,\n",
    "                transaction_id,\n",
    "                category,\n",
    "                sku\n",
    "            FROM\n",
    "                l2alpha.product\n",
    "            WHERE\n",
    "                sku_type = 0\n",
    "                AND name NOT IN {excluded_product}\n",
    "                AND sku NOT IN {excluded_product}) AS p ON\n",
    "            t.transaction_id = p.transaction_id\n",
    "        JOIN (\n",
    "            SELECT\n",
    "                transaction_id,\n",
    "                loan_amount\n",
    "            FROM\n",
    "                l2alpha.loan\n",
    "            WHERE\n",
    "                is_active = 1\n",
    "                AND loan_amount > 0) AS l ON\n",
    "            p.transaction_id = l.transaction_id\n",
    "        JOIN (\n",
    "            SELECT\n",
    "                id,\n",
    "                name\n",
    "            FROM\n",
    "                l2alpha.merchant) AS m ON\n",
    "            t.merchant_id = m.id\n",
    "    \"\"\"\n",
    "    \n",
    "    # Open, use and explicitly close the connection -- the original leaked\n",
    "    # one connection per call.\n",
    "    cnx = connect_sql()\n",
    "    try:\n",
    "        df = pd.read_sql(query, cnx)\n",
    "    finally:\n",
    "        cnx.close()\n",
    "    # Left-merge on user_id fans each current (user_id, trx_id) row out\n",
    "    # against every history row of that user; suffixes are then renamed.\n",
    "    out_df = base_df[['user_id', 'trx_id', 'transaction_date']].merge(df, how='left', on='user_id')\\\n",
    "                    .rename(columns={'transaction_date_x':'trx_dt_curr',\n",
    "                                     'transaction_date_y':'trx_dt_past',\n",
    "                                     'trx_id_x':'trx_id',\n",
    "                                     'trx_id_y':'trx_id_ref'})\n",
    "    out_df['time_diff'] = (out_df['trx_dt_curr'] - out_df['trx_dt_past']).dt.total_seconds()\n",
    "    return out_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "     \n",
    "def get_ticket(x):\n",
    "    \"\"\"Flag a row as a travel-ticket purchase.\n",
    "\n",
    "    Ticket merchants count unless the product looks like a hotel booking;\n",
    "    merchant 270 counts only when the product/category mentions flights.\n",
    "    \"\"\"\n",
    "    # Bugfix: normalise merchant_id once -- the original compared the raw\n",
    "    # value against [270], so a string id never matched that branch.\n",
    "    mid = int(x['merchant_id'])\n",
    "    if mid in [154, 428, 48, 87, 432]:\n",
    "        if 'hotel' not in x['product_name'] and 'hotel' not in x['category']:\n",
    "            return True\n",
    "    elif mid == 270:\n",
    "        if 'flight' in x['product_name'] or 'flight' in x['category']:\n",
    "            return True\n",
    "\n",
    "    return False\n",
    "\n",
    "def get_utilities(x):\n",
    "    \"\"\"Return True when the row looks like a utility payment (electricity, water, cable).\"\"\"\n",
    "    keywords = ('pdam', 'pln', 'tv kabel', 'electricity', 'electricity_postpaid',\n",
    "                'listrik pln', 'pln', 'pdam', 'internet & tv kabel', 'air pdam',\n",
    "                '75061609', '75062176', 'listrik prabayar', 'listrik lainnya', 'pulsa listrik')\n",
    "    if any(kw in x['product_name'] or kw in x['category'] for kw in keywords):\n",
    "        return True\n",
    "\n",
    "    # Fallback: free-text patterns such as 'tagihan listrik' or 'token pln'.\n",
    "    utility_pattern = r'(tagihan).*(listrik|telkom)|(voucher|token).*(listrik|pln)|pln.*(prepaid|postpaid|token)|\\bpdam\\b'\n",
    "    if re.search(utility_pattern, x['product_name']):\n",
    "        return True\n",
    "\n",
    "    return False\n",
    "\n",
    "def get_game(x):\n",
    "    \"\"\"Flag a row as a gaming purchase (game merchants, game vouchers, game data).\n",
    "\n",
    "    Bugfix: the original fell off the end (returning None) when the category\n",
    "    matched 'voucher' but the product name had no 'game'; every path now\n",
    "    returns an explicit bool (None was falsy downstream, so behaviour for\n",
    "    boolean consumers is unchanged).\n",
    "    \"\"\"\n",
    "    if int(x['merchant_id']) in [426, 197]:\n",
    "        return True\n",
    "    if re.search(r'(voucher|data).*game|game.*(voucher|data)|\\bgamesmax\\b', x['product_name']):\n",
    "        return True\n",
    "    if re.search('voucher', x['category']) and re.search('game', x['product_name']):\n",
    "        return True\n",
    "    return False\n",
    "\n",
    "def get_pulsa(x):\n",
    "    \"\"\"Flag a row as a phone-credit / data-package (pulsa) purchase.\n",
    "\n",
    "    Bugfixes: merchant_id is int-normalised once (the original compared\n",
    "    int(...) == 8 but then x['merchant_id'] != 8, so a string '8' fell into\n",
    "    the wrong branch), the unreachable trailing else is gone, and every\n",
    "    path returns an explicit bool instead of an implicit None.\n",
    "    \"\"\"\n",
    "    mid = int(x['merchant_id'])\n",
    "    if mid == 8:\n",
    "        # Merchant 8 sells mixed digital goods: exclude utility/game items,\n",
    "        # then treat priced ('rp ...') products as pulsa.\n",
    "        for n in ['pln', 'listrik', 'pdam', 'game']:\n",
    "            if n in x['product_name']:\n",
    "                return False\n",
    "        return 'rp' in x['product_name']\n",
    "\n",
    "    # Other merchants: reject known non-pulsa keywords first.\n",
    "    for n in ['pln', 'listrik', 'pdam', 'game', 'flight', 'mobiles & tablets', 'komputer & aksesoris']:\n",
    "        if n in x['product_name'] or n in x['category']:\n",
    "            return False\n",
    "    for w in ['mobile', 'digital products', 'pulsa',\n",
    "              'virtual', '75061635', 'paket data', 'digital utilities', 'pasca bayar']:\n",
    "        if w in x['category']:\n",
    "            return True\n",
    "\n",
    "    if re.search(r'\\bpulsa\\b', x['product_name']):\n",
    "        return True\n",
    "\n",
    "    for w in ['paket data', 'kuota', 'xl xtra', 'indosat', 'telkomsel', 'voucher 3',\n",
    "              'smartfren', 'voucher tri', 'xl xtra combo', 'axis', 'paket internet']:\n",
    "        if w in x['product_name']:\n",
    "            return True\n",
    "\n",
    "    return False\n",
    "    \n",
    "def get_digital_product_status(df):\n",
    "    \"\"\"Add boolean trx_cat_* flag columns per transaction row (in place).\n",
    "\n",
    "    trx_cat_digital is True when any of game/pulsa/utilities is True;\n",
    "    trx_cat_ticket is computed but not folded into trx_cat_digital.\n",
    "    \"\"\"\n",
    "    # Row-wise apply: each classifier reads merchant_id/product_name/category.\n",
    "    # (Lambda wrappers removed -- the functions take the row directly.)\n",
    "    df['trx_cat_ticket'] = df.apply(get_ticket, axis=1)\n",
    "    df['trx_cat_game'] = df.apply(get_game, axis=1)\n",
    "    df['trx_cat_pulsa'] = df.apply(get_pulsa, axis=1)\n",
    "    df['trx_cat_utilities'] = df.apply(get_utilities, axis=1)\n",
    "    \n",
    "    # axis spelled explicitly: the positional form .any(1) is deprecated and\n",
    "    # removed in modern pandas.\n",
    "    df['trx_cat_digital'] = df[['trx_cat_game', 'trx_cat_pulsa', 'trx_cat_utilities']].any(axis=1)\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_prev_digital_trx_per_type(df, keyword, window='all'):\n",
    "    \"\"\"Aggregate each user's *previous* transactions into count/sum/mean features.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : output of get_trx_history() + get_digital_product_status().\n",
    "    keyword : one of 'ticket', 'game', 'pulsa', 'utilities', 'digital';\n",
    "        selects which trx_cat_* flag to split the aggregation by.\n",
    "    window : lookback window in days (int), or 'all' for the full history.\n",
    "\n",
    "    Returns a frame keyed by (user_id, trx_id) with count / sum / mean of\n",
    "    prior loan amounts per flag value.\n",
    "    \"\"\"\n",
    "    if keyword not in ['ticket', 'game', 'pulsa', 'utilities', 'digital']:\n",
    "        raise ValueError(f'{keyword} not available. Try ticket, game, pulsa, utilities, digital.')\n",
    "\n",
    "    if isinstance(window, int):\n",
    "        # Keep only strictly-earlier transactions inside the window (days -> seconds).\n",
    "        df = df[(df['time_diff'] <= window*(24*60*60)) & (df['time_diff'] > 0)]\n",
    "        name = str(window)\n",
    "    elif window == 'all':\n",
    "        df = df[df['time_diff'] > 0]\n",
    "        name = 'ever'\n",
    "    else:  # simplified: the original third condition was always true when reached\n",
    "        raise ValueError(f'The input \"{window}\" is wrong. Window value must be an integer or \"all\".')\n",
    "\n",
    "    key = 'trx_cat_' + keyword\n",
    "    digital = df.groupby(['user_id', 'trx_id', key]).agg({key: 'count',\n",
    "                                                          'loan_amount': ['sum', 'mean']})\\\n",
    "                .unstack().fillna(0)\n",
    "\n",
    "    # Flatten MultiIndex columns, e.g. ('loan_amount', 'sum', True) -> 'loan_amount-sum-True'.\n",
    "    digital.columns = ['-'.join(str(item) for item in groups) for groups in digital.columns.values]\n",
    "\n",
    "    if keyword == 'digital':\n",
    "        # NOTE(review): this positional rename assumes both flag values\n",
    "        # (False and True) occur in the filtered data -- a batch with only\n",
    "        # one value would yield 5 columns and raise here; confirm upstream.\n",
    "        final = digital.reset_index()\n",
    "        final.columns = ['user_id', 'trx_id',\n",
    "                         f'nondgtl_nonpl_trx_co_{name}',\n",
    "                         f'dgtl_trx_co_all_{name}',\n",
    "                         f'nondgtl_nonpl_trx_sum_amt_{name}',\n",
    "                         f'dgtl_trx_sum_amt_all_{name}',\n",
    "                         f'nondgtl_nonpl_trx_avg_amt_{name}',\n",
    "                         f'dgtl_trx_avg_amt_all_{name}']\n",
    "\n",
    "    else:\n",
    "        # Keep only the flag==True aggregates for a single category.\n",
    "        relevant_columns = [col for col in digital.columns if 'True' in col]\n",
    "        final = digital[relevant_columns].reset_index()\n",
    "        final.columns = ['user_id', 'trx_id',\n",
    "                         f'dgtl_trx_co_{keyword}_{name}',\n",
    "                         f'dgtl_trx_sum_amt_{keyword}_{name}',\n",
    "                         f'dgtl_trx_avg_amt_{keyword}_{name}']\n",
    "\n",
    "    return final"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_digital_trx_feats(df_in, begin, finish):\n",
    "    \"\"\"Build the 180-day digital-transaction features for one batch of rows.\n",
    "\n",
    "    Slices df_in[begin:finish], pulls each user's transaction history from\n",
    "    MySQL, flags digital products, aggregates over a 180-day window and\n",
    "    left-merges the features back onto the batch keyed by (user_id, trx_id).\n",
    "    Prints/display calls provide per-stage progress and timing.\n",
    "    \"\"\"\n",
    "    # Load Raw Data\n",
    "    st_tm_1 = datetime.now()\n",
    "    print('Querying Transaction Data')\n",
    "    \n",
    "    df = df_in.iloc[begin:finish].reset_index(drop=True).copy()\n",
    "    display(df.shape)\n",
    "    trx_hist = get_trx_history(df)\n",
    "    # Drop rows with any NaN -- i.e. current transactions whose user had no\n",
    "    # matching history after the left merge.\n",
    "    trx_hist = trx_hist.dropna(how='any')\n",
    "    display(trx_hist.shape)\n",
    "    \n",
    "    print('query time:', datetime.now()-st_tm_1)\n",
    "    \n",
    "    #Create Digital Product Flags\n",
    "    st_tm_2 = datetime.now()\n",
    "    print('Create Digital Product Flags')\n",
    "    \n",
    "    trx_hist = get_digital_product_status(trx_hist)\n",
    "    display(trx_hist.shape)\n",
    "    \n",
    "    print('Create Digital Product Flags time:', datetime.now()-st_tm_2)\n",
    "    \n",
    "    #Create Features\n",
    "    st_tm_3 = datetime.now()\n",
    "    print('Create Features')\n",
    "    \n",
    "    # Only the combined 'digital' flag over a 180-day window is used here.\n",
    "    digital_180 = create_prev_digital_trx_per_type(trx_hist, 'digital', window=180)\n",
    "    \n",
    "    print('Create Features time:', datetime.now()-st_tm_3)\n",
    "    \n",
    "    #Combine DataFrames\n",
    "    st_tm_4 = datetime.now()\n",
    "    print('Combine DataFrames')\n",
    "    \n",
    "    # Left merge keeps every row of the batch; users without features get NaN.\n",
    "    df_out = df[['user_id', 'trx_id']].merge(digital_180, how='left', on=['user_id', 'trx_id'])\n",
    "    \n",
    "    print('combine dataframe time:', datetime.now()-st_tm_4)\n",
    "    print('total processing time:', datetime.now()-st_tm_1)\n",
    "    \n",
    "    return df_out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "58af6962f8b047f29c2f5d975aba2983",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "IntProgress(value=1, max=16, min=1)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "process data from 0 to 5000.\n",
      "Querying Transaction Data\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(5000, 10)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from ipywidgets import IntProgress\n",
    "from IPython.display import display\n",
    "\n",
    "# Process base_df in fixed-size batches, writing one parquet file per batch\n",
    "# so an interruption only loses the batch in flight.\n",
    "batch = 1\n",
    "batch_size = 5000\n",
    "\n",
    "# NOTE(review): round() can under-count the bar maximum when the remainder\n",
    "# is less than half a batch; ceil would be exact. Cosmetic only.\n",
    "max_count = round(base_df.shape[0]/batch_size)\n",
    "f = IntProgress(min=batch, max=max_count) # instantiate the bar\n",
    "display(f) # display the bar\n",
    "\n",
    "st_tm = datetime.now()\n",
    "\n",
    "# Loop condition is algebraically equivalent to start < base_df.shape[0],\n",
    "# i.e. run while the current batch still begins inside the data.\n",
    "while (batch*batch_size) - base_df.shape[0] < batch_size:\n",
    "    f.value += 1\n",
    "    start = (batch-1)*batch_size\n",
    "    end = batch*batch_size\n",
    "    print(f'process data from {start} to {end}.')\n",
    "    dgtl_trx_feats = create_digital_trx_feats(base_df, start, end)\n",
    "    dgtl_trx_feats.to_parquet(FOLDER_LOC + '/dgtl_trx_feats_batch'+str(batch)+'.parquet', compression='gzip')\n",
    "    print(f'batch {batch} done!')\n",
    "    print('================================================================================================================')\n",
    "    batch += 1\n",
    "    \n",
    "print('Total Processing Time:', datetime.now() - st_tm)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combine Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:44.586494Z",
     "start_time": "2019-11-07T04:02:44.510100Z"
    }
   },
   "outputs": [],
   "source": [
    "file_batches = [file for file in listdir(FOLDER_LOC) if file.endswith('.parquet')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "file_batches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read every per-batch parquet file and stack them into a single frame.\n",
    "dgtl_trx_feats = pd.concat(\n",
    "    [pd.read_parquet(FOLDER_LOC + '/' + part) for part in file_batches]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(79853, 3)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dgtl_trx_feats.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "dgtl_trx_feats.head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "user_id                     0.000000\n",
       "trx_id                      0.000000\n",
       "nondgtl_nonpl_trx_co_180    0.225139\n",
       "dtype: float64"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dgtl_trx_feats.isna().mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dgtl_trx_feats = dgtl_trx_feats[['user_id', 'trx_id', 'nondgtl_nonpl_trx_co_180']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:02:44.798454Z",
     "start_time": "2019-11-07T04:02:44.619626Z"
    }
   },
   "outputs": [],
   "source": [
    "dgtl_trx_feats.to_parquet('data/pl_threshold_feats_18022020_part3.parquet', compression='gzip')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}