past-data-projects / personal_loan_credit_risk / 000-Model_Definitions / 002 - Generate Pefindo Features.ipynb
002 - Generate Pefindo Features.ipynb
Raw
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:30:47.225039Z",
     "start_time": "2019-11-07T04:30:47.212091Z"
    }
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('/home/ec2-user/SageMaker/zhilal/adhoc/pl_score_features_and_proba/27-11-2019/pl_bscore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:30:47.811761Z",
     "start_time": "2019-11-07T04:30:47.413000Z"
    }
   },
   "outputs": [],
   "source": [
    "from datetime import datetime\n",
    "from os import listdir\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pymysql\n",
    "import boto3\n",
    "import json\n",
    "\n",
    "import configparser"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "FOLDER_LOC = 'data/pefindo_features'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['/home/ec2-user/SageMaker/zhilal/config.ini']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "config = configparser.ConfigParser()\n",
    "config.read('/home/ec2-user/SageMaker/zhilal/config.ini')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:30:47.854397Z",
     "start_time": "2019-11-07T04:30:47.851040Z"
    }
   },
   "outputs": [],
   "source": [
    "host = config['MYSQL-ROOT']['HOST']\n",
    "user = config['MYSQL-ROOT']['USER']\n",
    "password = config['MYSQL-ROOT']['PASSWORD']\n",
    "\n",
    "def connect_sql():\n",
    "    cnx = pymysql.connect(host=host,\n",
    "                          user=user,\n",
    "                          password=password,\n",
    "                          cursorclass=pymysql.cursors.DictCursor)\n",
    "    return cnx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:30:48.243116Z",
     "start_time": "2019-11-07T04:30:48.237668Z"
    }
   },
   "outputs": [],
   "source": [
    "def connect_s3():\n",
    "    s3_client = boto3.client('s3')\n",
    "    return s3_client"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:30:49.557482Z",
     "start_time": "2019-11-07T04:30:48.542457Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyarrow/pandas_compat.py:708: FutureWarning: .labels was deprecated in version 0.24.0. Use .codes instead.\n",
      "  labels = getattr(columns, 'labels', None) or [\n",
      "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyarrow/pandas_compat.py:735: FutureWarning: the 'labels' keyword is deprecated, use 'codes' instead\n",
      "  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)\n",
      "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyarrow/pandas_compat.py:752: FutureWarning: .labels was deprecated in version 0.24.0. Use .codes instead.\n",
      "  labels, = index.labels\n"
     ]
    }
   ],
   "source": [
    "base_df = pd.read_parquet('data/interim/pl_jumbo_mini_junjulaugsep_27012020.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 17)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:30:52.327140Z",
     "start_time": "2019-11-07T04:30:52.322654Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "uids = tuple(base_df['user_id'].unique().tolist())\n",
    "tids = tuple(base_df['trx_id'].unique().tolist())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get Pefindo Raw Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:30:53.581233Z",
     "start_time": "2019-11-07T04:30:53.577601Z"
    }
   },
   "outputs": [],
   "source": [
    "query_pf = f'''\n",
    "SELECT user_id, css_user_id\n",
    "FROM l2alpha.css_users\n",
    "WHERE user_id IN {uids}\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:30:54.907843Z",
     "start_time": "2019-11-07T04:30:54.062681Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pymysql/cursors.py:329: Warning: (3170, \"Memory capacity of 8388608 bytes for 'range_optimizer_max_mem_size' exceeded. Range optimization was not done for this query.\")\n",
      "  self._do_get_result()\n"
     ]
    }
   ],
   "source": [
    "pf_df = pd.read_sql(query_pf, connect_sql())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:30:55.037967Z",
     "start_time": "2019-11-07T04:30:55.023353Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "pf_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:30:55.549902Z",
     "start_time": "2019-11-07T04:30:55.524225Z"
    }
   },
   "outputs": [],
   "source": [
    "base_df = base_df.merge(pf_df, how='left', on='user_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:30:56.175306Z",
     "start_time": "2019-11-07T04:30:56.140285Z"
    }
   },
   "outputs": [],
   "source": [
    "base_df = base_df[['user_id', 'trx_id', 'transaction_date', 'css_user_id']]\n",
    "# base_df = base_df.rename(columns={'transaction_date':'first_trx_date'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:30:56.697122Z",
     "start_time": "2019-11-07T04:30:56.676671Z"
    }
   },
   "outputs": [],
   "source": [
    "base_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T04:31:02.269101Z",
     "start_time": "2019-11-07T04:31:02.242701Z"
    }
   },
   "outputs": [],
   "source": [
    "AWS_S3_BUCKET_NAME = 'csk-production'\n",
    "AWS_S3_FILE_PATH = 'userdata/pefindo/raw/'\n",
    "\n",
    "def get_read_json_from_s3(s3_client, bucket, key):\n",
    "    try:\n",
    "        obj = s3_client.get_object(\n",
    "            Bucket=bucket,\n",
    "            Key=key\n",
    "        )\n",
    "        js_raw = json.loads(obj['Body'].read().decode('utf-8'))\n",
    "        return js_raw\n",
    "    except Exception as e:\n",
    "        print(\"Can't get object from S3, error: {}\".format(e))\n",
    "        return None\n",
    "\n",
    "def get_raw_pefindo(css_user_id):\n",
    "    key = AWS_S3_FILE_PATH + css_user_id + '.json'\n",
    "    print(key)\n",
    "    raw_json = get_read_json_from_s3(connect_s3(), AWS_S3_BUCKET_NAME, key)\n",
    "    return raw_json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-11-07T05:51:44.979297Z",
     "start_time": "2019-11-07T05:51:44.958247Z"
    }
   },
   "outputs": [],
   "source": [
    "from pl_bscore.feature_creation import create_pefindo_features\n",
    "\n",
    "def get_pf_feats(df):\n",
    "    pf_feat_dict = {}\n",
    "    pefindo_feats = create_pefindo_features.PFFeatures(df['transaction_date'], df['pefindo_raw'])\n",
    "    if df['pefindo_raw'] != None:\n",
    "        pf_feat_dict['pf_util_creditcard_avg_12mo'] = pefindo_feats.pf_util_creditcard_avg_12mo()\n",
    "        pf_feat_dict['pf_delin_max_dpd_12mo'] = pefindo_feats.pf_delin_max_dpd_12mo()\n",
    "        pf_feat_dict['pf_delin_dist_contractcode_30_dpd_3mo'] = pefindo_feats.pf_delin_dist_contractcode_30_dpd_3mo()\n",
    "        pf_feat_dict['pf_con_open_count_12mo'] = pefindo_feats.pf_con_open_count_12mo()\n",
    "        pf_feat_dict['flag_good_pefindo'] = pefindo_feats.flag_good_pefindo()\n",
    "        pf_feat_dict['flag_bad_pefindo'] = pefindo_feats.flag_bad_pefindo()\n",
    "    else:\n",
    "        pf_feat_dict['pf_util_creditcard_avg_12mo'] = np.nan\n",
    "        pf_feat_dict['pf_delin_max_dpd_12mo'] = np.nan\n",
    "        pf_feat_dict['pf_delin_dist_contractcode_30_dpd_3mo'] = np.nan\n",
    "        pf_feat_dict['pf_con_open_count_12mo'] = np.nan\n",
    "        pf_feat_dict['flag_good_pefindo'] = np.nan\n",
    "        pf_feat_dict['flag_bad_pefindo'] = np.nan\n",
    "    return pf_feat_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_pef_features(df_in, begin, finish):\n",
    "    #Load Raw Data\n",
    "    st_tm_1 = datetime.now()\n",
    "    print('Loading Pefindo Data')\n",
    "    df = df_in.iloc[begin:finish].reset_index(drop=True).copy()\n",
    "    display(df.shape)\n",
    "    df['pefindo_raw'] = df.apply(lambda x: get_raw_pefindo(x['css_user_id']), axis=1)\n",
    "    display(df.shape)\n",
    "    print('load files from S3 time:', datetime.now()-st_tm_1)\n",
    "    \n",
    "    #Process Data into Features\n",
    "    st_tm_2 = datetime.now()\n",
    "    print('Processing Features')\n",
    "    df['transaction_date'] = df['transaction_date'].astype(str)\n",
    "    df['raw_feats'] = df.apply(get_pf_feats, axis=1)\n",
    "    print('feature processing time:', datetime.now()-st_tm_2)\n",
    "    \n",
    "    #Normalize DataFrame\n",
    "    st_tm_3 = datetime.now()\n",
    "    print('Normalizing DataFrame')\n",
    "    df_out = df.merge(pd.DataFrame(df['raw_feats'].tolist()), left_index=True, right_index=True)\n",
    "    df_out = df_out.drop(columns=['pefindo_raw', 'raw_feats'])\n",
    "    print('normalize dataframe:', datetime.now()-st_tm_3)\n",
    "    print('total processing time:', datetime.now()-st_tm_1)\n",
    "    \n",
    "    return df_out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ipywidgets import IntProgress\n",
    "from IPython.display import display\n",
    "\n",
    "batch = 1\n",
    "batch_size = 5000\n",
    "\n",
    "max_count = round(base_df.shape[0]/batch_size)\n",
    "f = IntProgress(min=batch, max=max_count) # instantiate the bar\n",
    "display(f) # display the bar\n",
    "\n",
    "st_tm = datetime.now()\n",
    "\n",
    "while (batch*batch_size) - base_df.shape[0] < batch_size:\n",
    "    f.value += 1\n",
    "    start = (batch-1)*batch_size\n",
    "    end = batch*batch_size\n",
    "    print(f'process data from {start} to {end}.')\n",
    "    pef_feats = create_pef_features(base_df, start, end)\n",
    "    pef_feats.to_parquet(FOLDER_LOC + '/pl_score_pef_feats_batch'+str(batch)+'.parquet', compression='gzip')\n",
    "    print(f'batch {batch} done!')\n",
    "    print('================================================================================================================')\n",
    "    batch += 1\n",
    "    \n",
    "print('Total Processing Time:', datetime.now() - st_tm)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Combine Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "pef_batchlist = [file for file in listdir(FOLDER_LOC) if file.endswith('.parquet')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['pl_score_pef_feats_batch13.parquet',\n",
       " 'pl_score_pef_feats_batch7.parquet',\n",
       " 'pl_score_pef_feats_batch22.parquet',\n",
       " 'pl_score_pef_feats_batch30.parquet',\n",
       " 'pl_score_pef_feats_batch9.parquet',\n",
       " 'pl_score_pef_feats_batch24.parquet',\n",
       " 'pl_score_pef_feats_batch29.parquet',\n",
       " 'pl_score_pef_feats_batch17.parquet',\n",
       " 'pl_score_pef_feats_batch27.parquet',\n",
       " 'pl_score_pef_feats_batch6.parquet',\n",
       " 'pl_score_pef_feats_batch10.parquet',\n",
       " 'pl_score_pef_feats_batch8.parquet',\n",
       " 'pl_score_pef_feats_batch23.parquet',\n",
       " 'pl_score_pef_feats_batch19.parquet',\n",
       " 'pl_score_pef_feats_batch28.parquet',\n",
       " 'pl_score_pef_feats_batch3.parquet',\n",
       " 'pl_score_pef_feats_batch18.parquet',\n",
       " 'pl_score_pef_feats_batch14.parquet',\n",
       " 'pl_score_pef_feats_batch4.parquet',\n",
       " 'pl_score_pef_feats_batch21.parquet',\n",
       " 'pl_score_pef_feats_batch5.parquet',\n",
       " 'pl_score_pef_feats_batch12.parquet',\n",
       " 'pl_score_pef_feats_batch2.parquet',\n",
       " 'pl_score_pef_feats_batch15.parquet',\n",
       " 'pl_score_pef_feats_batch11.parquet',\n",
       " 'pl_score_pef_feats_batch25.parquet',\n",
       " 'pl_score_pef_feats_batch16.parquet',\n",
       " 'pl_score_pef_feats_batch26.parquet',\n",
       " 'pl_score_pef_feats_batch1.parquet',\n",
       " 'pl_score_pef_feats_batch20.parquet',\n",
       " 'pl_score_pef_feats_batch31.parquet']"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pef_batchlist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyarrow/pandas_compat.py:708: FutureWarning: .labels was deprecated in version 0.24.0. Use .codes instead.\n",
      "  labels = getattr(columns, 'labels', None) or [\n",
      "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyarrow/pandas_compat.py:735: FutureWarning: the 'labels' keyword is deprecated, use 'codes' instead\n",
      "  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)\n",
      "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyarrow/pandas_compat.py:752: FutureWarning: .labels was deprecated in version 0.24.0. Use .codes instead.\n",
      "  labels, = index.labels\n"
     ]
    }
   ],
   "source": [
    "pef_comb = []\n",
    "for part in pef_batchlist:\n",
    "    raw = pd.read_parquet(FOLDER_LOC+'/'+part)\n",
    "    pef_comb.append(raw)\n",
    "    \n",
    "pef_feats = pd.concat(pef_comb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150672, 10)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pef_feats.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyarrow/pandas_compat.py:114: FutureWarning: A future version of pandas will default to `skipna=True`. To silence this warning, pass `skipna=True|False` explicitly.\n",
      "  result = infer_dtype(pandas_collection)\n"
     ]
    }
   ],
   "source": [
    "pef_feats.to_parquet('data/interim/pl_model_v3_pefindo_feats_27012020.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}