{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:30:47.225039Z",
"start_time": "2019-11-07T04:30:47.212091Z"
}
},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('/home/ec2-user/SageMaker/zhilal/adhoc/pl_score_features_and_proba/27-11-2019/pl_bscore')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:30:47.811761Z",
"start_time": "2019-11-07T04:30:47.413000Z"
}
},
"outputs": [],
"source": [
"from datetime import datetime\n",
"from os import listdir\n",
"import pandas as pd\n",
"import numpy as np\n",
"import pymysql\n",
"import boto3\n",
"import json\n",
"\n",
"import configparser"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"FOLDER_LOC = 'data/pefindo_features'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['/home/ec2-user/SageMaker/zhilal/config.ini']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"config = configparser.ConfigParser()\n",
"config.read('/home/ec2-user/SageMaker/zhilal/config.ini')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:30:47.854397Z",
"start_time": "2019-11-07T04:30:47.851040Z"
}
},
"outputs": [],
"source": [
"host = config['MYSQL-ROOT']['HOST']\n",
"user = config['MYSQL-ROOT']['USER']\n",
"password = config['MYSQL-ROOT']['PASSWORD']\n",
"\n",
"def connect_sql():\n",
" cnx = pymysql.connect(host=host,\n",
" user=user,\n",
" password=password,\n",
" cursorclass=pymysql.cursors.DictCursor)\n",
" return cnx"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:30:48.243116Z",
"start_time": "2019-11-07T04:30:48.237668Z"
}
},
"outputs": [],
"source": [
"def connect_s3():\n",
" s3_client = boto3.client('s3')\n",
" return s3_client"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:30:49.557482Z",
"start_time": "2019-11-07T04:30:48.542457Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyarrow/pandas_compat.py:708: FutureWarning: .labels was deprecated in version 0.24.0. Use .codes instead.\n",
" labels = getattr(columns, 'labels', None) or [\n",
"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyarrow/pandas_compat.py:735: FutureWarning: the 'labels' keyword is deprecated, use 'codes' instead\n",
" return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)\n",
"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyarrow/pandas_compat.py:752: FutureWarning: .labels was deprecated in version 0.24.0. Use .codes instead.\n",
" labels, = index.labels\n"
]
}
],
"source": [
"base_df = pd.read_parquet('data/interim/pl_jumbo_mini_junjulaugsep_27012020.parquet')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(150672, 17)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"base_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:30:52.327140Z",
"start_time": "2019-11-07T04:30:52.322654Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"uids = tuple(base_df['user_id'].unique().tolist())\n",
"tids = tuple(base_df['trx_id'].unique().tolist())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get Pefindo Raw Data"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:30:53.581233Z",
"start_time": "2019-11-07T04:30:53.577601Z"
}
},
"outputs": [],
"source": [
"query_pf = f'''\n",
"SELECT user_id, css_user_id\n",
"FROM l2alpha.css_users\n",
"WHERE user_id IN {uids}\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:30:54.907843Z",
"start_time": "2019-11-07T04:30:54.062681Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pymysql/cursors.py:329: Warning: (3170, \"Memory capacity of 8388608 bytes for 'range_optimizer_max_mem_size' exceeded. Range optimization was not done for this query.\")\n",
" self._do_get_result()\n"
]
}
],
"source": [
"pf_df = pd.read_sql(query_pf, connect_sql())"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:30:55.037967Z",
"start_time": "2019-11-07T04:30:55.023353Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"pf_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:30:55.549902Z",
"start_time": "2019-11-07T04:30:55.524225Z"
}
},
"outputs": [],
"source": [
"base_df = base_df.merge(pf_df, how='left', on='user_id')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:30:56.175306Z",
"start_time": "2019-11-07T04:30:56.140285Z"
}
},
"outputs": [],
"source": [
"base_df = base_df[['user_id', 'trx_id', 'transaction_date', 'css_user_id']]\n",
"# base_df = base_df.rename(columns={'transaction_date':'first_trx_date'})"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:30:56.697122Z",
"start_time": "2019-11-07T04:30:56.676671Z"
}
},
"outputs": [],
"source": [
"base_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T04:31:02.269101Z",
"start_time": "2019-11-07T04:31:02.242701Z"
}
},
"outputs": [],
"source": [
"AWS_S3_BUCKET_NAME = 'csk-production'\n",
"AWS_S3_FILE_PATH = 'userdata/pefindo/raw/'\n",
"\n",
"def get_read_json_from_s3(s3_client, bucket, key):\n",
" try:\n",
" obj = s3_client.get_object(\n",
" Bucket=bucket,\n",
" Key=key\n",
" )\n",
" js_raw = json.loads(obj['Body'].read().decode('utf-8'))\n",
" return js_raw\n",
" except Exception as e:\n",
" print(\"Can't get object from S3, error: {}\".format(e))\n",
" return None\n",
"\n",
"def get_raw_pefindo(css_user_id):\n",
" key = AWS_S3_FILE_PATH + css_user_id + '.json'\n",
" print(key)\n",
" raw_json = get_read_json_from_s3(connect_s3(), AWS_S3_BUCKET_NAME, key)\n",
" return raw_json"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"ExecuteTime": {
"end_time": "2019-11-07T05:51:44.979297Z",
"start_time": "2019-11-07T05:51:44.958247Z"
}
},
"outputs": [],
"source": [
"from pl_bscore.feature_creation import create_pefindo_features\n",
"\n",
"def get_pf_feats(df):\n",
" pf_feat_dict = {}\n",
" pefindo_feats = create_pefindo_features.PFFeatures(df['transaction_date'], df['pefindo_raw'])\n",
" if df['pefindo_raw'] != None:\n",
" pf_feat_dict['pf_util_creditcard_avg_12mo'] = pefindo_feats.pf_util_creditcard_avg_12mo()\n",
" pf_feat_dict['pf_delin_max_dpd_12mo'] = pefindo_feats.pf_delin_max_dpd_12mo()\n",
" pf_feat_dict['pf_delin_dist_contractcode_30_dpd_3mo'] = pefindo_feats.pf_delin_dist_contractcode_30_dpd_3mo()\n",
" pf_feat_dict['pf_con_open_count_12mo'] = pefindo_feats.pf_con_open_count_12mo()\n",
" pf_feat_dict['flag_good_pefindo'] = pefindo_feats.flag_good_pefindo()\n",
" pf_feat_dict['flag_bad_pefindo'] = pefindo_feats.flag_bad_pefindo()\n",
" else:\n",
" pf_feat_dict['pf_util_creditcard_avg_12mo'] = np.nan\n",
" pf_feat_dict['pf_delin_max_dpd_12mo'] = np.nan\n",
" pf_feat_dict['pf_delin_dist_contractcode_30_dpd_3mo'] = np.nan\n",
" pf_feat_dict['pf_con_open_count_12mo'] = np.nan\n",
" pf_feat_dict['flag_good_pefindo'] = np.nan\n",
" pf_feat_dict['flag_bad_pefindo'] = np.nan\n",
" return pf_feat_dict"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def create_pef_features(df_in, begin, finish):\n",
" #Load Raw Data\n",
" st_tm_1 = datetime.now()\n",
" print('Loading Pefindo Data')\n",
" df = df_in.iloc[begin:finish].reset_index(drop=True).copy()\n",
" display(df.shape)\n",
" df['pefindo_raw'] = df.apply(lambda x: get_raw_pefindo(x['css_user_id']), axis=1)\n",
" display(df.shape)\n",
" print('load files from S3 time:', datetime.now()-st_tm_1)\n",
" \n",
" #Process Data into Features\n",
" st_tm_2 = datetime.now()\n",
" print('Processing Features')\n",
" df['transaction_date'] = df['transaction_date'].astype(str)\n",
" df['raw_feats'] = df.apply(get_pf_feats, axis=1)\n",
" print('feature processing time:', datetime.now()-st_tm_2)\n",
" \n",
" #Normalize DataFrame\n",
" st_tm_3 = datetime.now()\n",
" print('Normalizing DataFrame')\n",
" df_out = df.merge(pd.DataFrame(df['raw_feats'].tolist()), left_index=True, right_index=True)\n",
" df_out = df_out.drop(columns=['pefindo_raw', 'raw_feats'])\n",
" print('normalize dataframe:', datetime.now()-st_tm_3)\n",
" print('total processing time:', datetime.now()-st_tm_1)\n",
" \n",
" return df_out"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from ipywidgets import IntProgress\n",
"from IPython.display import display\n",
"\n",
"batch = 1\n",
"batch_size = 5000\n",
"\n",
"max_count = round(base_df.shape[0]/batch_size)\n",
"f = IntProgress(min=batch, max=max_count) # instantiate the bar\n",
"display(f) # display the bar\n",
"\n",
"st_tm = datetime.now()\n",
"\n",
"while (batch*batch_size) - base_df.shape[0] < batch_size:\n",
" f.value += 1\n",
" start = (batch-1)*batch_size\n",
" end = batch*batch_size\n",
" print(f'process data from {start} to {end}.')\n",
" pef_feats = create_pef_features(base_df, start, end)\n",
" pef_feats.to_parquet(FOLDER_LOC + '/pl_score_pef_feats_batch'+str(batch)+'.parquet', compression='gzip')\n",
" print(f'batch {batch} done!')\n",
" print('================================================================================================================')\n",
" batch += 1\n",
" \n",
"print('Total Processing Time:', datetime.now() - st_tm)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Combine Files"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"pef_batchlist = [file for file in listdir(FOLDER_LOC) if file.endswith('.parquet')]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['pl_score_pef_feats_batch13.parquet',\n",
" 'pl_score_pef_feats_batch7.parquet',\n",
" 'pl_score_pef_feats_batch22.parquet',\n",
" 'pl_score_pef_feats_batch30.parquet',\n",
" 'pl_score_pef_feats_batch9.parquet',\n",
" 'pl_score_pef_feats_batch24.parquet',\n",
" 'pl_score_pef_feats_batch29.parquet',\n",
" 'pl_score_pef_feats_batch17.parquet',\n",
" 'pl_score_pef_feats_batch27.parquet',\n",
" 'pl_score_pef_feats_batch6.parquet',\n",
" 'pl_score_pef_feats_batch10.parquet',\n",
" 'pl_score_pef_feats_batch8.parquet',\n",
" 'pl_score_pef_feats_batch23.parquet',\n",
" 'pl_score_pef_feats_batch19.parquet',\n",
" 'pl_score_pef_feats_batch28.parquet',\n",
" 'pl_score_pef_feats_batch3.parquet',\n",
" 'pl_score_pef_feats_batch18.parquet',\n",
" 'pl_score_pef_feats_batch14.parquet',\n",
" 'pl_score_pef_feats_batch4.parquet',\n",
" 'pl_score_pef_feats_batch21.parquet',\n",
" 'pl_score_pef_feats_batch5.parquet',\n",
" 'pl_score_pef_feats_batch12.parquet',\n",
" 'pl_score_pef_feats_batch2.parquet',\n",
" 'pl_score_pef_feats_batch15.parquet',\n",
" 'pl_score_pef_feats_batch11.parquet',\n",
" 'pl_score_pef_feats_batch25.parquet',\n",
" 'pl_score_pef_feats_batch16.parquet',\n",
" 'pl_score_pef_feats_batch26.parquet',\n",
" 'pl_score_pef_feats_batch1.parquet',\n",
" 'pl_score_pef_feats_batch20.parquet',\n",
" 'pl_score_pef_feats_batch31.parquet']"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pef_batchlist"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyarrow/pandas_compat.py:708: FutureWarning: .labels was deprecated in version 0.24.0. Use .codes instead.\n",
" labels = getattr(columns, 'labels', None) or [\n",
"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyarrow/pandas_compat.py:735: FutureWarning: the 'labels' keyword is deprecated, use 'codes' instead\n",
" return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)\n",
"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyarrow/pandas_compat.py:752: FutureWarning: .labels was deprecated in version 0.24.0. Use .codes instead.\n",
" labels, = index.labels\n"
]
}
],
"source": [
"pef_comb = []\n",
"for part in pef_batchlist:\n",
" raw = pd.read_parquet(FOLDER_LOC+'/'+part)\n",
" pef_comb.append(raw)\n",
" \n",
"pef_feats = pd.concat(pef_comb)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(150672, 10)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pef_feats.shape"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyarrow/pandas_compat.py:114: FutureWarning: A future version of pandas will default to `skipna=True`. To silence this warning, pass `skipna=True|False` explicitly.\n",
" result = infer_dtype(pandas_collection)\n"
]
}
],
"source": [
"pef_feats.to_parquet('data/interim/pl_model_v3_pefindo_feats_27012020.parquet')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}