past-data-projects / personal_loan_credit_risk / 200-Feature_Creation / 201 - Additional New Features Digital Trx.ipynb
201 - Additional New Features Digital Trx.ipynb
Raw
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pymysql\n",
    "import re\n",
    "from datetime import datetime\n",
    "from os import listdir\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import configparser\n",
    "config = configparser.ConfigParser()\n",
    "config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
    "\n",
    "host = config['MYSQL-ROOT']['HOST']\n",
    "user = config['MYSQL-ROOT']['USER']\n",
    "password = config['MYSQL-ROOT']['PASSWORD']\n",
    "\n",
    "def connect_sql():\n",
    "    cnx = pymysql.connect(host=host,\n",
    "                          user=user,\n",
    "                          password=password,\n",
    "                          cursorclass=pymysql.cursors.DictCursor)\n",
    "    return cnx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "FOLDER_LOC = 'data/dgtl_trx_features'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_df = pd.read_parquet('data/interim/pl_jumbo_mini_junjulaugsep_27012020.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 18)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "135508"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_df['user_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "150672"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_df['trx_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timestamp('2019-09-28 23:49:58')"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_df['transaction_date'].max()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Digital and Non-Digital Transactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_this = raw_df.sample(1000, random_state=99)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_trx_history(base_df):\n",
    "    uids = tuple(base_df['user_id'])\n",
    "    max_trx_date = base_df['transaction_date'].max()\n",
    "    \n",
    "    # Remove all promotional and service related fees\n",
    "    excluded_product = ('biaya layanan'\n",
    "                        ,'shipping fee'\n",
    "                        ,'admin fee' \n",
    "                        ,'biaya kirim yang dibebankan' \n",
    "                        ,'biaya tambahan' \n",
    "                        ,'discount'             \n",
    "                        ,'adminfee'       \n",
    "                        ,'shippingfee'\n",
    "                        ,'additionalfee'                    \n",
    "                        ,'diskon tambahan' \n",
    "                        ,'donasi'\n",
    "                        ,'donation'                    \n",
    "                        ,'charge kredivo 2.3%'  \n",
    "                        ,'lainnya'   \n",
    "                        ,'user credit'\n",
    "                        ,'deals')  \n",
    "    \n",
    "    query = f\"\"\"\n",
    "    SELECT\n",
    "        t.user_id,\n",
    "        t.transaction_id AS trx_id,\n",
    "        t.transaction_date,\n",
    "        LOWER(p.name) AS product_name,\n",
    "        LOWER(p.category) AS category,\n",
    "        l.loan_amount,\n",
    "        t.merchant_id\n",
    "    FROM\n",
    "            (\n",
    "            SELECT\n",
    "                user_id,\n",
    "                id AS transaction_id,\n",
    "                transaction_date,\n",
    "                merchant_id\n",
    "            FROM\n",
    "                l2alpha.`transaction`\n",
    "            WHERE\n",
    "                user_id IN {uids}\n",
    "                AND status = 4\n",
    "                AND transaction_type <> 2\n",
    "                AND transaction_date <= '{max_trx_date}') AS t\n",
    "        JOIN (\n",
    "            SELECT\n",
    "                name,\n",
    "                id AS product_id,\n",
    "                transaction_id,\n",
    "                category,\n",
    "                sku\n",
    "            FROM\n",
    "                l2alpha.product\n",
    "            WHERE\n",
    "                sku_type = 0\n",
    "                AND name NOT IN {excluded_product}\n",
    "                AND sku NOT IN {excluded_product}) AS p ON\n",
    "            t.transaction_id = p.transaction_id\n",
    "        JOIN (\n",
    "            SELECT\n",
    "                transaction_id,\n",
    "                loan_amount\n",
    "            FROM\n",
    "                l2alpha.loan\n",
    "            WHERE\n",
    "                is_active = 1\n",
    "                AND loan_amount > 0) AS l ON\n",
    "            p.transaction_id = l.transaction_id\n",
    "        JOIN (\n",
    "            SELECT\n",
    "                id,\n",
    "                name\n",
    "            FROM\n",
    "                l2alpha.merchant) AS m ON\n",
    "            t.merchant_id = m.id\n",
    "    \"\"\"\n",
    "    \n",
    "    df = pd.read_sql(query, connect_sql())\n",
    "    out_df = base_df[['user_id', 'trx_id', 'transaction_date']].merge(df, how='left', on='user_id')\\\n",
    "                    .rename(columns={'transaction_date_x':'trx_dt_curr',\n",
    "                                     'transaction_date_y':'trx_dt_past',\n",
    "                                     'trx_id_x':'trx_id',\n",
    "                                     'trx_id_y':'trx_id_ref'})\n",
    "    out_df['time_diff'] = (out_df['trx_dt_curr'] - out_df['trx_dt_past']).dt.total_seconds()\n",
    "    return out_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# trx_hist = get_trx_history(test_this)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# trx_hist.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Digital Product Flag Feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_ticket(x):\n",
    "    if int(x['merchant_id']) in [154,428,48,87,432]:\n",
    "        if 'hotel' not in x['product_name'] and 'hotel' not in x['category']:\n",
    "            return True\n",
    "    elif x['merchant_id'] in [270]:\n",
    "        if 'flight' in x['product_name'] or 'flight' in x['category']:\n",
    "            return True\n",
    "\n",
    "    return False\n",
    "\n",
    "def get_utilities(x):\n",
    "    for w in ['pdam','pln','tv kabel', 'electricity','electricity_postpaid',\n",
    "              'listrik pln', 'pln','pdam','internet & tv kabel','air pdam', \n",
    "              '75061609','75062176', 'listrik prabayar', 'listrik lainnya','pulsa listrik']:\n",
    "        if w in x['product_name'] or w in x['category']:        \n",
    "            return True\n",
    "\n",
    "    if re.search(r'(tagihan).*(listrik|telkom)|(voucher|token).*(listrik|pln)|pln.*(prepaid|postpaid|token)|\\bpdam\\b', x['product_name']):\n",
    "        return True\n",
    "\n",
    "    return False\n",
    "\n",
    "def get_game(x):\n",
    "    if int(x['merchant_id']) in [426,197]:\n",
    "        return True\n",
    "    elif re.search(r'(voucher|data).*game|game.*(voucher|data)|\\bgamesmax\\b', x['product_name']):\n",
    "        return True\n",
    "    elif re.search('voucher', x['category']):\n",
    "        if re.search('game',x['product_name']):\n",
    "            return True\n",
    "    else:\n",
    "        return False\n",
    "\n",
    "def get_pulsa(x):\n",
    "    if int(x['merchant_id']) == 8:\n",
    "        for n in ['pln','listrik','pdam','game']:\n",
    "            if n in x['product_name']:\n",
    "                return False\n",
    "        if 'rp' in x['product_name']:\n",
    "            return True\n",
    "    elif x['merchant_id'] != 8:\n",
    "        for n in ['pln','listrik','pdam','game', 'flight','mobiles & tablets', 'komputer & aksesoris']:\n",
    "            if n in x['product_name'] or n in x['category']:\n",
    "                return False\n",
    "        for w in ['mobile', 'digital products','pulsa', \n",
    "              'virtual','75061635','paket data','digital utilities','pasca bayar']:\n",
    "            if w in x['category']:\n",
    "                return True\n",
    "\n",
    "        if re.search(r'\\bpulsa\\b', x['product_name']):\n",
    "            return True\n",
    "\n",
    "        for w in ['paket data', 'kuota', 'xl xtra', 'indosat' , 'telkomsel', 'voucher 3',\n",
    "                  'smartfren', 'voucher tri', 'xl xtra combo', 'axis', 'paket internet']:\n",
    "            if w in x['product_name']:\n",
    "                return True\n",
    "\n",
    "    else:\n",
    "        return False\n",
    "    \n",
    "def get_digital_product_status(df):\n",
    "    df['trx_cat_ticket'] = df.apply(lambda z: get_ticket(z), axis=1)\n",
    "    df['trx_cat_game'] = df.apply(lambda z: get_game(z), axis=1)\n",
    "    df['trx_cat_pulsa'] = df.apply(lambda z: get_pulsa(z), axis=1)\n",
    "    df['trx_cat_utilities'] = df.apply(lambda z: get_utilities(z), axis=1)\n",
    "    \n",
    "    df['trx_cat_digital'] = df[['trx_cat_game', 'trx_cat_pulsa', 'trx_cat_utilities']].any(1)\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# st_tm = datetime.now()\n",
    "# digital_product = get_digital_product_status(trx_hist)\n",
    "# print('Processing time:', datetime.now() - st_tm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# digital_product.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_prev_digital_trx_per_type(df, keyword, window='all'):\n",
    "    if keyword not in ['ticket', 'game', 'pulsa', 'utilities', 'digital']:\n",
    "        raise ValueError(f'{keyword} not available. Try ticket, game, pulsa, utilities, digital.')\n",
    "                         \n",
    "    if isinstance(window, int):\n",
    "        df = df[(df['time_diff'] <= window*(24*60*60)) & (df['time_diff'] > 0)]\n",
    "        name = str(window)\n",
    "    elif window == 'all':\n",
    "        df = df[df['time_diff'] > 0]\n",
    "        name = 'ever'\n",
    "    elif isinstance(window, int) == False or window != 'all':\n",
    "        raise ValueError(f'The input \"{window}\" is wrong. Window value must be an integer or \"all\".')\n",
    "        \n",
    "    key = 'trx_cat_'+ keyword\n",
    "    digital = df.groupby(['user_id', 'trx_id', key]).agg({key:'count',\n",
    "                                                          'loan_amount':['sum', 'mean']})\\\n",
    "                .unstack().fillna(0)\n",
    "    \n",
    "    columns = []\n",
    "    for groups in digital.columns.values:\n",
    "        col_name = []\n",
    "        for item in groups:\n",
    "            col_name.append(str(item))\n",
    "        new_name = '-'.join(col_name)\n",
    "        columns.append(new_name)\n",
    "    digital.columns = columns\n",
    "    \n",
    "    if keyword == 'digital':\n",
    "        final = digital.reset_index()\n",
    "        final.columns = ['user_id', 'trx_id',\n",
    "                         f'nondgtl_nonpl_trx_co_{name}',\n",
    "                         f'dgtl_trx_co_all_{name}',\n",
    "                         f'nondgtl_nonpl_trx_sum_amt_{name}',\n",
    "                         f'dgtl_trx_sum_amt_all_{name}',\n",
    "                         f'nondgtl_nonpl_trx_avg_amt_{name}',\n",
    "                         f'dgtl_trx_avg_amt_all_{name}']\n",
    "        \n",
    "    else:\n",
    "        relevant_columns = [col for col in digital.columns if 'True' in col]\n",
    "        final = digital[relevant_columns].reset_index()\n",
    "        final.columns = ['user_id', 'trx_id', \n",
    "                         f'dgtl_trx_co_{keyword}_{name}', \n",
    "                         f'dgtl_trx_sum_amt_{keyword}_{name}',\n",
    "                         f'dgtl_trx_avg_amt_{keyword}_{name}']\n",
    "    \n",
    "    return final"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_digital_trx_feats(df_in, begin, finish):\n",
    "    # Load Raw Data\n",
    "    st_tm_1 = datetime.now()\n",
    "    print('Querying Transaction Data')\n",
    "    \n",
    "    df = df_in.iloc[begin:finish].reset_index(drop=True).copy()\n",
    "    display(df.shape)\n",
    "    trx_hist = get_trx_history(df)\n",
    "    trx_hist = trx_hist.dropna(how='any')\n",
    "    display(trx_hist.shape)\n",
    "    \n",
    "    print('query time:', datetime.now()-st_tm_1)\n",
    "    \n",
    "    #Create Digital Product Flags\n",
    "    st_tm_2 = datetime.now()\n",
    "    print('Create Digital Product Flags')\n",
    "    \n",
    "    trx_hist = get_digital_product_status(trx_hist)\n",
    "    display(trx_hist.shape)\n",
    "    \n",
    "    print('Create Digital Product Flags time:', datetime.now()-st_tm_2)\n",
    "    \n",
    "    #Create Features\n",
    "    st_tm_3 = datetime.now()\n",
    "    print('Create Features')\n",
    "    \n",
    "    ticket_all = create_prev_digital_trx_per_type(trx_hist, 'ticket', window='all')\n",
    "    ticket_180 = create_prev_digital_trx_per_type(trx_hist, 'ticket', window=180)\n",
    "    ticket_90 = create_prev_digital_trx_per_type(trx_hist, 'ticket', window=90)\n",
    "    \n",
    "    game_all = create_prev_digital_trx_per_type(trx_hist, 'game', window='all')\n",
    "    game_180 = create_prev_digital_trx_per_type(trx_hist, 'game', window=180)\n",
    "    game_90 = create_prev_digital_trx_per_type(trx_hist, 'game', window=90)\n",
    "    \n",
    "    pulsa_all = create_prev_digital_trx_per_type(trx_hist, 'pulsa', window='all')\n",
    "    pulsa_180 = create_prev_digital_trx_per_type(trx_hist, 'pulsa', window=180)\n",
    "    pulsa_90 = create_prev_digital_trx_per_type(trx_hist, 'pulsa', window=90)\n",
    "    \n",
    "    utilities_all = create_prev_digital_trx_per_type(trx_hist, 'utilities', window='all')\n",
    "    utilities_180 = create_prev_digital_trx_per_type(trx_hist, 'utilities', window=180)\n",
    "    utilities_90 = create_prev_digital_trx_per_type(trx_hist, 'utilities', window=90)\n",
    "    \n",
    "    digital_all = create_prev_digital_trx_per_type(trx_hist, 'digital', window='all')\n",
    "    digital_180 = create_prev_digital_trx_per_type(trx_hist, 'digital', window=180)\n",
    "    digital_90 = create_prev_digital_trx_per_type(trx_hist, 'digital', window=90)\n",
    "    \n",
    "    print('Create Features time:', datetime.now()-st_tm_3)\n",
    "    \n",
    "    #Combine DataFrames\n",
    "    st_tm_4 = datetime.now()\n",
    "    print('Combine DataFrames')\n",
    "    \n",
    "    df_out = df[['user_id', 'trx_id']].merge(ticket_all, how='left', on=['user_id', 'trx_id'])\\\n",
    "                                       .merge(ticket_180, how='left', on=['user_id', 'trx_id'])\\\n",
    "                                       .merge(ticket_90, how='left', on=['user_id', 'trx_id'])\\\n",
    "                                       .merge(game_all, how='left', on=['user_id', 'trx_id'])\\\n",
    "                                       .merge(game_180, how='left', on=['user_id', 'trx_id'])\\\n",
    "                                       .merge(game_90, how='left', on=['user_id', 'trx_id'])\\\n",
    "                                       .merge(pulsa_all, how='left', on=['user_id', 'trx_id'])\\\n",
    "                                       .merge(pulsa_180, how='left', on=['user_id', 'trx_id'])\\\n",
    "                                       .merge(pulsa_90, how='left', on=['user_id', 'trx_id'])\\\n",
    "                                       .merge(utilities_all, how='left', on=['user_id', 'trx_id'])\\\n",
    "                                       .merge(utilities_180, how='left', on=['user_id', 'trx_id'])\\\n",
    "                                       .merge(utilities_90, how='left', on=['user_id', 'trx_id'])\\\n",
    "                                       .merge(digital_all, how='left', on=['user_id', 'trx_id'])\\\n",
    "                                       .merge(digital_180, how='left', on=['user_id', 'trx_id'])\\\n",
    "                                       .merge(digital_90, how='left', on=['user_id', 'trx_id'])\n",
    "    \n",
    "    print('combine dataframe time:', datetime.now()-st_tm_4)\n",
    "    print('total processing time:', datetime.now()-st_tm_1)\n",
    "    \n",
    "    return df_out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c25a9d52465d4aefa672df9a972ccc4f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "IntProgress(value=1, max=30, min=1)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "process data from 0 to 5000.\n",
      "Querying Transaction Data\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(5000, 18)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(455992, 10)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "query time: 0:04:55.938345\n",
      "Create Digital Product Flags\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(455992, 15)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Create Digital Product Flags time: 0:05:21.252515\n",
      "Create Features\n",
      "Create Features time: 0:00:01.223702\n",
      "Combine DataFrames\n",
      "combine dataframe time: 0:00:00.072141\n",
      "total processing time: 0:10:18.487048\n",
      "batch 1 done!\n",
      "================================================================================================================\n",
      "process data from 5000 to 10000.\n",
      "Querying Transaction Data\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(5000, 18)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(459664, 10)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "query time: 0:02:12.641469\n",
      "Create Digital Product Flags\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(459664, 15)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Create Digital Product Flags time: 0:05:24.357342\n",
      "Create Features\n",
      "Create Features time: 0:00:01.300519\n",
      "Combine DataFrames\n",
      "combine dataframe time: 0:00:00.073073\n",
      "total processing time: 0:07:38.373357\n",
      "batch 2 done!\n",
      "================================================================================================================\n",
      "process data from 10000 to 15000.\n",
      "Querying Transaction Data\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(5000, 18)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(471261, 10)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "query time: 0:02:08.431688\n",
      "Create Digital Product Flags\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(471261, 15)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Create Digital Product Flags time: 0:05:35.378707\n",
      "Create Features\n",
      "Create Features time: 0:00:01.241208\n",
      "Combine DataFrames\n",
      "combine dataframe time: 0:00:00.070994\n",
      "total processing time: 0:07:45.123189\n",
      "batch 3 done!\n",
      "================================================================================================================\n",
      "process data from 15000 to 20000.\n",
      "Querying Transaction Data\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(5000, 18)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(444996, 10)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "query time: 0:02:24.834021\n",
      "Create Digital Product Flags\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(444996, 15)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Create Digital Product Flags time: 0:05:18.165194\n",
      "Create Features\n",
      "Create Features time: 0:00:01.192006\n",
      "Combine DataFrames\n",
      "combine dataframe time: 0:00:00.070798\n",
      "total processing time: 0:07:44.262679\n",
      "batch 4 done!\n",
      "================================================================================================================\n",
      "process data from 20000 to 25000.\n",
      "Querying Transaction Data\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(5000, 18)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(432241, 10)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "query time: 0:02:08.947830\n",
      "Create Digital Product Flags\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(432241, 15)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Create Digital Product Flags time: 0:05:04.784391\n",
      "Create Features\n",
      "Create Features time: 0:00:01.125733\n",
      "Combine DataFrames\n",
      "combine dataframe time: 0:00:00.076657\n",
      "total processing time: 0:07:14.934969\n",
      "batch 5 done!\n",
      "================================================================================================================\n",
      "process data from 25000 to 30000.\n",
      "Querying Transaction Data\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(5000, 18)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(458071, 10)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "query time: 0:01:55.028927\n",
      "Create Digital Product Flags\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(458071, 15)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Create Digital Product Flags time: 0:05:22.558154\n",
      "Create Features\n",
      "Create Features time: 0:00:01.255805\n",
      "Combine DataFrames\n",
      "combine dataframe time: 0:00:00.074059\n",
      "total processing time: 0:07:18.917834\n",
      "batch 6 done!\n",
      "================================================================================================================\n",
      "process data from 30000 to 35000.\n",
      "Querying Transaction Data\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(5000, 18)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(433427, 10)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "query time: 0:04:52.383475\n",
      "Create Digital Product Flags\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(433427, 15)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Create Digital Product Flags time: 0:05:06.224930\n",
      "Create Features\n",
      "Create Features time: 0:00:01.190107\n",
      "Combine DataFrames\n",
      "combine dataframe time: 0:00:00.071389\n",
      "total processing time: 0:09:59.870217\n",
      "batch 7 done!\n",
      "================================================================================================================\n",
      "process data from 35000 to 40000.\n",
      "Querying Transaction Data\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(5000, 18)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(461523, 10)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "query time: 0:05:01.467008\n",
      "Create Digital Product Flags\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(461523, 15)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Create Digital Product Flags time: 0:05:36.692313\n",
      "Create Features\n",
      "Create Features time: 0:00:01.266789\n",
      "Combine DataFrames\n",
      "combine dataframe time: 0:00:00.072731\n",
      "total processing time: 0:10:39.499160\n",
      "batch 8 done!\n",
      "================================================================================================================\n",
      "process data from 40000 to 45000.\n",
      "Querying Transaction Data\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(5000, 18)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(412916, 10)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "query time: 0:05:05.307356\n",
      "Create Digital Product Flags\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(412916, 15)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Create Digital Product Flags time: 0:04:57.675533\n",
      "Create Features\n",
      "Create Features time: 0:00:01.135864\n",
      "Combine DataFrames\n",
      "combine dataframe time: 0:00:00.070807\n",
      "total processing time: 0:10:04.190402\n",
      "batch 9 done!\n",
      "================================================================================================================\n",
      "process data from 45000 to 50000.\n",
      "Querying Transaction Data\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(5000, 18)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(440193, 10)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "query time: 0:05:17.737760\n",
      "Create Digital Product Flags\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(440193, 15)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Create Digital Product Flags time: 0:05:08.174624\n",
      "Create Features\n",
      "Create Features time: 0:00:01.226927\n",
      "Combine DataFrames\n",
      "combine dataframe time: 0:00:00.074362\n",
      "total processing time: 0:10:27.214390\n",
      "batch 10 done!\n",
      "================================================================================================================\n",
      "process data from 50000 to 55000.\n",
      "Querying Transaction Data\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(5000, 18)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(449711, 10)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "query time: 0:06:31.009001\n",
      "Create Digital Product Flags\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(449711, 15)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Create Digital Product Flags time: 0:05:17.469777\n",
      "Create Features\n",
      "Create Features time: 0:00:01.191957\n",
      "Combine DataFrames\n",
      "combine dataframe time: 0:00:00.071894\n",
      "total processing time: 0:11:49.742952\n",
      "batch 11 done!\n",
      "================================================================================================================\n",
      "process data from 55000 to 60000.\n",
      "Querying Transaction Data\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(5000, 18)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(455257, 10)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "query time: 0:06:19.175089\n",
      "Create Digital Product Flags\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(455257, 15)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Create Digital Product Flags time: 0:05:20.482020\n",
      "Create Features\n",
      "Create Features time: 0:00:01.244580\n",
      "Combine DataFrames\n",
      "combine dataframe time: 0:00:00.070326\n",
      "total processing time: 0:11:40.972786\n",
      "batch 12 done!\n",
      "================================================================================================================\n",
      "process data from 60000 to 65000.\n",
      "Querying Transaction Data\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(5000, 18)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from ipywidgets import IntProgress\n",
    "from IPython.display import display\n",
    "\n",
    "batch = 1\n",
    "batch_size = 5000\n",
    "\n",
    "max_count = round(raw_df.shape[0]/batch_size)\n",
    "f = IntProgress(min=batch, max=max_count) # instantiate the bar\n",
    "display(f) # display the bar\n",
    "\n",
    "st_tm = datetime.now()\n",
    "\n",
    "while (batch*batch_size) - raw_df.shape[0] < batch_size:\n",
    "    f.value += 1\n",
    "    start = (batch-1)*batch_size\n",
    "    end = batch*batch_size\n",
    "    print(f'process data from {start} to {end}.')\n",
    "    dgtl_trx_feats = create_digital_trx_feats(raw_df, start, end)\n",
    "    dgtl_trx_feats.to_parquet(FOLDER_LOC + '/dgtl_trx_feats_batch'+str(batch)+'.parquet', compression='gzip')\n",
    "    print(f'batch {batch} done!')\n",
    "    print('================================================================================================================')\n",
    "    batch += 1\n",
    "    \n",
    "print('Total Processing Time:', datetime.now() - st_tm)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combine Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "file_batches = [file for file in listdir(FOLDER_LOC) if file.endswith('.parquet')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "file_batches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "comb = []\n",
    "for part in file_batches:\n",
    "    raw = pd.read_parquet(FOLDER_LOC+'/'+part)\n",
    "    comb.append(raw)\n",
    "    \n",
    "dgtl_trx_feats = pd.concat(comb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 56)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dgtl_trx_feats.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "dgtl_trx_feats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "user_id                            0.000000\n",
       "trx_id                             0.000000\n",
       "dgtl_trx_co_ticket_ever            0.178859\n",
       "dgtl_trx_sum_amt_ticket_ever       0.178859\n",
       "dgtl_trx_avg_amt_ticket_ever       0.178859\n",
       "dgtl_trx_co_ticket_180             0.207882\n",
       "dgtl_trx_sum_amt_ticket_180        0.207882\n",
       "dgtl_trx_avg_amt_ticket_180        0.207882\n",
       "dgtl_trx_co_ticket_90              0.251241\n",
       "dgtl_trx_sum_amt_ticket_90         0.251241\n",
       "dgtl_trx_avg_amt_ticket_90         0.251241\n",
       "dgtl_trx_co_game_ever              0.179164\n",
       "dgtl_trx_sum_amt_game_ever         0.179164\n",
       "dgtl_trx_avg_amt_game_ever         0.179164\n",
       "dgtl_trx_co_game_180               0.208207\n",
       "dgtl_trx_sum_amt_game_180          0.208207\n",
       "dgtl_trx_avg_amt_game_180          0.208207\n",
       "dgtl_trx_co_game_90                0.251666\n",
       "dgtl_trx_sum_amt_game_90           0.251666\n",
       "dgtl_trx_avg_amt_game_90           0.251666\n",
       "dgtl_trx_co_pulsa_ever             0.296824\n",
       "dgtl_trx_sum_amt_pulsa_ever        0.296824\n",
       "dgtl_trx_avg_amt_pulsa_ever        0.296824\n",
       "dgtl_trx_co_pulsa_180              0.329072\n",
       "dgtl_trx_sum_amt_pulsa_180         0.329072\n",
       "dgtl_trx_avg_amt_pulsa_180         0.329072\n",
       "dgtl_trx_co_pulsa_90               0.374555\n",
       "dgtl_trx_sum_amt_pulsa_90          0.374555\n",
       "dgtl_trx_avg_amt_pulsa_90          0.374555\n",
       "dgtl_trx_co_utilities_ever         0.178859\n",
       "dgtl_trx_sum_amt_utilities_ever    0.178859\n",
       "dgtl_trx_avg_amt_utilities_ever    0.178859\n",
       "dgtl_trx_co_utilities_180          0.207882\n",
       "dgtl_trx_sum_amt_utilities_180     0.207882\n",
       "dgtl_trx_avg_amt_utilities_180     0.207882\n",
       "dgtl_trx_co_utilities_90           0.251241\n",
       "dgtl_trx_sum_amt_utilities_90      0.251241\n",
       "dgtl_trx_avg_amt_utilities_90      0.251241\n",
       "nondgtl_nonpl_trx_co_ever          0.178859\n",
       "dgtl_trx_co_all_ever               0.178859\n",
       "nondgtl_nonpl_trx_sum_amt_ever     0.178859\n",
       "dgtl_trx_sum_amt_all_ever          0.178859\n",
       "nondgtl_nonpl_trx_avg_amt_ever     0.178859\n",
       "dgtl_trx_avg_amt_all_ever          0.178859\n",
       "nondgtl_nonpl_trx_co_180           0.207882\n",
       "dgtl_trx_co_all_180                0.207882\n",
       "nondgtl_nonpl_trx_sum_amt_180      0.207882\n",
       "dgtl_trx_sum_amt_all_180           0.207882\n",
       "nondgtl_nonpl_trx_avg_amt_180      0.207882\n",
       "dgtl_trx_avg_amt_all_180           0.207882\n",
       "nondgtl_nonpl_trx_co_90            0.251241\n",
       "dgtl_trx_co_all_90                 0.251241\n",
       "nondgtl_nonpl_trx_sum_amt_90       0.251241\n",
       "dgtl_trx_sum_amt_all_90            0.251241\n",
       "nondgtl_nonpl_trx_avg_amt_90       0.251241\n",
       "dgtl_trx_avg_amt_all_90            0.251241\n",
       "dtype: float64"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dgtl_trx_feats.isna().mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dgtl_trx_feats.to_parquet('data/interim/pl_model_v3_dgtl_trx_feats_18022020.parquet', compression='gzip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}