{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:36.988168Z",
"start_time": "2019-11-07T03:55:36.706164Z"
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import pymysql\n",
"from datetime import datetime\n",
"from os import listdir"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:36.994804Z",
"start_time": "2019-11-07T03:55:36.989531Z"
}
},
"outputs": [],
"source": [
"import configparser\n",
"config = configparser.ConfigParser()\n",
"config.read('/home/ec2-user/SageMaker/zhilal/config.ini')\n",
"\n",
"host = config['MYSQL-ROOT']['HOST']\n",
"user = config['MYSQL-ROOT']['USER']\n",
"password = config['MYSQL-ROOT']['PASSWORD']\n",
"\n",
"def connect_sql():\n",
" cnx = pymysql.connect(host=host,\n",
" user=user,\n",
" password=password,\n",
" cursorclass=pymysql.cursors.DictCursor)\n",
" return cnx"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"FOLDER_LOC = 'data/thresh_dgtl_trx_feats'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:37.026188Z",
"start_time": "2019-11-07T03:55:36.997299Z"
}
},
"outputs": [],
"source": [
"base_df = pd.read_parquet('data/pl_threshold_jan_11022020.parquet')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:37.034477Z",
"start_time": "2019-11-07T03:55:37.027603Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 10)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"base_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T03:55:37.043587Z",
"start_time": "2019-11-07T03:55:37.036542Z"
}
},
"outputs": [],
"source": [
"uids = tuple(base_df['user_id'].unique().tolist())\n",
"tids = tuple(base_df['trx_id'].unique().tolist())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"78465"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(uids)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"79853"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(tids)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Digital and Non-Digital Transactions"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def get_trx_history(base_df):\n",
" uids = tuple(base_df['user_id'])\n",
" max_trx_date = base_df['transaction_date'].max()\n",
" \n",
" # Remove all promotional and service related fees\n",
" excluded_product = ('biaya layanan'\n",
" ,'shipping fee'\n",
" ,'admin fee' \n",
" ,'biaya kirim yang dibebankan' \n",
" ,'biaya tambahan' \n",
" ,'discount' \n",
" ,'adminfee' \n",
" ,'shippingfee'\n",
" ,'additionalfee' \n",
" ,'diskon tambahan' \n",
" ,'donasi'\n",
" ,'donation' \n",
" ,'charge kredivo 2.3%' \n",
" ,'lainnya' \n",
" ,'user credit'\n",
" ,'deals') \n",
" \n",
" query = f\"\"\"\n",
" SELECT\n",
" t.user_id,\n",
" t.transaction_id AS trx_id,\n",
" t.transaction_date,\n",
" LOWER(p.name) AS product_name,\n",
" LOWER(p.category) AS category,\n",
" l.loan_amount,\n",
" t.merchant_id\n",
" FROM\n",
" (\n",
" SELECT\n",
" user_id,\n",
" id AS transaction_id,\n",
" transaction_date,\n",
" merchant_id\n",
" FROM\n",
" l2alpha.`transaction`\n",
" WHERE\n",
" user_id IN {uids}\n",
" AND status = 4\n",
" AND transaction_type <> 2\n",
" AND transaction_date <= '{max_trx_date}') AS t\n",
" JOIN (\n",
" SELECT\n",
" name,\n",
" id AS product_id,\n",
" transaction_id,\n",
" category,\n",
" sku\n",
" FROM\n",
" l2alpha.product\n",
" WHERE\n",
" sku_type = 0\n",
" AND name NOT IN {excluded_product}\n",
" AND sku NOT IN {excluded_product}) AS p ON\n",
" t.transaction_id = p.transaction_id\n",
" JOIN (\n",
" SELECT\n",
" transaction_id,\n",
" loan_amount\n",
" FROM\n",
" l2alpha.loan\n",
" WHERE\n",
" is_active = 1\n",
" AND loan_amount > 0) AS l ON\n",
" p.transaction_id = l.transaction_id\n",
" JOIN (\n",
" SELECT\n",
" id,\n",
" name\n",
" FROM\n",
" l2alpha.merchant) AS m ON\n",
" t.merchant_id = m.id\n",
" \"\"\"\n",
" \n",
" df = pd.read_sql(query, connect_sql())\n",
" out_df = base_df[['user_id', 'trx_id', 'transaction_date']].merge(df, how='left', on='user_id')\\\n",
" .rename(columns={'transaction_date_x':'trx_dt_curr',\n",
" 'transaction_date_y':'trx_dt_past',\n",
" 'trx_id_x':'trx_id',\n",
" 'trx_id_y':'trx_id_ref'})\n",
" out_df['time_diff'] = (out_df['trx_dt_curr'] - out_df['trx_dt_past']).dt.total_seconds()\n",
" return out_df"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
" \n",
"def get_ticket(x):\n",
" if int(x['merchant_id']) in [154,428,48,87,432]:\n",
" if 'hotel' not in x['product_name'] and 'hotel' not in x['category']:\n",
" return True\n",
" elif x['merchant_id'] in [270]:\n",
" if 'flight' in x['product_name'] or 'flight' in x['category']:\n",
" return True\n",
"\n",
" return False\n",
"\n",
"def get_utilities(x):\n",
" for w in ['pdam','pln','tv kabel', 'electricity','electricity_postpaid',\n",
" 'listrik pln', 'pln','pdam','internet & tv kabel','air pdam', \n",
" '75061609','75062176', 'listrik prabayar', 'listrik lainnya','pulsa listrik']:\n",
" if w in x['product_name'] or w in x['category']: \n",
" return True\n",
"\n",
" if re.search(r'(tagihan).*(listrik|telkom)|(voucher|token).*(listrik|pln)|pln.*(prepaid|postpaid|token)|\\bpdam\\b', x['product_name']):\n",
" return True\n",
"\n",
" return False\n",
"\n",
"def get_game(x):\n",
" if int(x['merchant_id']) in [426,197]:\n",
" return True\n",
" elif re.search(r'(voucher|data).*game|game.*(voucher|data)|\\bgamesmax\\b', x['product_name']):\n",
" return True\n",
" elif re.search('voucher', x['category']):\n",
" if re.search('game',x['product_name']):\n",
" return True\n",
" else:\n",
" return False\n",
"\n",
"def get_pulsa(x):\n",
" if int(x['merchant_id']) == 8:\n",
" for n in ['pln','listrik','pdam','game']:\n",
" if n in x['product_name']:\n",
" return False\n",
" if 'rp' in x['product_name']:\n",
" return True\n",
" elif x['merchant_id'] != 8:\n",
" for n in ['pln','listrik','pdam','game', 'flight','mobiles & tablets', 'komputer & aksesoris']:\n",
" if n in x['product_name'] or n in x['category']:\n",
" return False\n",
" for w in ['mobile', 'digital products','pulsa', \n",
" 'virtual','75061635','paket data','digital utilities','pasca bayar']:\n",
" if w in x['category']:\n",
" return True\n",
"\n",
" if re.search(r'\\bpulsa\\b', x['product_name']):\n",
" return True\n",
"\n",
" for w in ['paket data', 'kuota', 'xl xtra', 'indosat' , 'telkomsel', 'voucher 3',\n",
" 'smartfren', 'voucher tri', 'xl xtra combo', 'axis', 'paket internet']:\n",
" if w in x['product_name']:\n",
" return True\n",
"\n",
" else:\n",
" return False\n",
" \n",
"def get_digital_product_status(df):\n",
" df['trx_cat_ticket'] = df.apply(lambda z: get_ticket(z), axis=1)\n",
" df['trx_cat_game'] = df.apply(lambda z: get_game(z), axis=1)\n",
" df['trx_cat_pulsa'] = df.apply(lambda z: get_pulsa(z), axis=1)\n",
" df['trx_cat_utilities'] = df.apply(lambda z: get_utilities(z), axis=1)\n",
" \n",
" df['trx_cat_digital'] = df[['trx_cat_game', 'trx_cat_pulsa', 'trx_cat_utilities']].any(1)\n",
" \n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def create_prev_digital_trx_per_type(df, keyword, window='all'):\n",
" if keyword not in ['ticket', 'game', 'pulsa', 'utilities', 'digital']:\n",
" raise ValueError(f'{keyword} not available. Try ticket, game, pulsa, utilities, digital.')\n",
" \n",
" if isinstance(window, int):\n",
" df = df[(df['time_diff'] <= window*(24*60*60)) & (df['time_diff'] > 0)]\n",
" name = str(window)\n",
" elif window == 'all':\n",
" df = df[df['time_diff'] > 0]\n",
" name = 'ever'\n",
" elif isinstance(window, int) == False or window != 'all':\n",
" raise ValueError(f'The input \"{window}\" is wrong. Window value must be an integer or \"all\".')\n",
" \n",
" key = 'trx_cat_'+ keyword\n",
" digital = df.groupby(['user_id', 'trx_id', key]).agg({key:'count',\n",
" 'loan_amount':['sum', 'mean']})\\\n",
" .unstack().fillna(0)\n",
" \n",
" columns = []\n",
" for groups in digital.columns.values:\n",
" col_name = []\n",
" for item in groups:\n",
" col_name.append(str(item))\n",
" new_name = '-'.join(col_name)\n",
" columns.append(new_name)\n",
" digital.columns = columns\n",
" \n",
" if keyword == 'digital':\n",
" final = digital.reset_index()\n",
" final.columns = ['user_id', 'trx_id',\n",
" f'nondgtl_nonpl_trx_co_{name}',\n",
" f'dgtl_trx_co_all_{name}',\n",
" f'nondgtl_nonpl_trx_sum_amt_{name}',\n",
" f'dgtl_trx_sum_amt_all_{name}',\n",
" f'nondgtl_nonpl_trx_avg_amt_{name}',\n",
" f'dgtl_trx_avg_amt_all_{name}']\n",
" \n",
" else:\n",
" relevant_columns = [col for col in digital.columns if 'True' in col]\n",
" final = digital[relevant_columns].reset_index()\n",
" final.columns = ['user_id', 'trx_id', \n",
" f'dgtl_trx_co_{keyword}_{name}', \n",
" f'dgtl_trx_sum_amt_{keyword}_{name}',\n",
" f'dgtl_trx_avg_amt_{keyword}_{name}']\n",
" \n",
" return final"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def create_digital_trx_feats(df_in, begin, finish):\n",
" # Load Raw Data\n",
" st_tm_1 = datetime.now()\n",
" print('Querying Transaction Data')\n",
" \n",
" df = df_in.iloc[begin:finish].reset_index(drop=True).copy()\n",
" display(df.shape)\n",
" trx_hist = get_trx_history(df)\n",
" trx_hist = trx_hist.dropna(how='any')\n",
" display(trx_hist.shape)\n",
" \n",
" print('query time:', datetime.now()-st_tm_1)\n",
" \n",
" #Create Digital Product Flags\n",
" st_tm_2 = datetime.now()\n",
" print('Create Digital Product Flags')\n",
" \n",
" trx_hist = get_digital_product_status(trx_hist)\n",
" display(trx_hist.shape)\n",
" \n",
" print('Create Digital Product Flags time:', datetime.now()-st_tm_2)\n",
" \n",
" #Create Features\n",
" st_tm_3 = datetime.now()\n",
" print('Create Features')\n",
" \n",
" digital_180 = create_prev_digital_trx_per_type(trx_hist, 'digital', window=180)\n",
" \n",
" print('Create Features time:', datetime.now()-st_tm_3)\n",
" \n",
" #Combine DataFrames\n",
" st_tm_4 = datetime.now()\n",
" print('Combine DataFrames')\n",
" \n",
" df_out = df[['user_id', 'trx_id']].merge(digital_180, how='left', on=['user_id', 'trx_id'])\n",
" \n",
" print('combine dataframe time:', datetime.now()-st_tm_4)\n",
" print('total processing time:', datetime.now()-st_tm_1)\n",
" \n",
" return df_out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "58af6962f8b047f29c2f5d975aba2983",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"IntProgress(value=1, max=16, min=1)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"process data from 0 to 5000.\n",
"Querying Transaction Data\n"
]
},
{
"data": {
"text/plain": [
"(5000, 10)"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from ipywidgets import IntProgress\n",
"from IPython.display import display\n",
"\n",
"batch = 1\n",
"batch_size = 5000\n",
"\n",
"max_count = round(base_df.shape[0]/batch_size)\n",
"f = IntProgress(min=batch, max=max_count) # instantiate the bar\n",
"display(f) # display the bar\n",
"\n",
"st_tm = datetime.now()\n",
"\n",
"while (batch*batch_size) - base_df.shape[0] < batch_size:\n",
" f.value += 1\n",
" start = (batch-1)*batch_size\n",
" end = batch*batch_size\n",
" print(f'process data from {start} to {end}.')\n",
" dgtl_trx_feats = create_digital_trx_feats(base_df, start, end)\n",
" dgtl_trx_feats.to_parquet(FOLDER_LOC + '/dgtl_trx_feats_batch'+str(batch)+'.parquet', compression='gzip')\n",
" print(f'batch {batch} done!')\n",
" print('================================================================================================================')\n",
" batch += 1\n",
" \n",
"print('Total Processing Time:', datetime.now() - st_tm)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Combine Files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:44.586494Z",
"start_time": "2019-11-07T04:02:44.510100Z"
}
},
"outputs": [],
"source": [
"file_batches = [file for file in listdir(FOLDER_LOC) if file.endswith('.parquet')]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"file_batches"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"comb = []\n",
"for part in file_batches:\n",
" raw = pd.read_parquet(FOLDER_LOC+'/'+part)\n",
" comb.append(raw)\n",
" \n",
"dgtl_trx_feats = pd.concat(comb)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(79853, 3)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dgtl_trx_feats.shape"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"dgtl_trx_feats.head(20)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"user_id 0.000000\n",
"trx_id 0.000000\n",
"nondgtl_nonpl_trx_co_180 0.225139\n",
"dtype: float64"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dgtl_trx_feats.isna().mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dgtl_trx_feats = dgtl_trx_feats[['user_id', 'trx_id', 'nondgtl_nonpl_trx_co_180']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:02:44.798454Z",
"start_time": "2019-11-07T04:02:44.619626Z"
}
},
"outputs": [],
"source": [
"dgtl_trx_feats.to_parquet('data/pl_threshold_feats_18022020_part3.parquet', compression='gzip')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}