import uuid
import streamlit as st
import pandas as pd
from tools import *
import os
import json
# Build the sidebar. The call returns two values, unpacked here as
# `exp_id_ph` and `file_tree_ph`; `exp_id_ph` is later handed to
# update_exp_id() whenever the active experiment id changes.
# NOTE(review): sidebar() is not defined in this file -- presumably it comes
# from the star import of `tools`, and it may read or populate names in the
# globals() dict it receives. Confirm against tools.py.
exp_id_ph, file_tree_ph = sidebar(globals())
# Page heading.
st.title("Choose Your Dataset")
def on_file_upload():
    """Callback for the file uploader: clear the experiment selectbox value.

    The selectbox is registered under the ``selected_exp`` key, so resetting
    that session-state entry to ``None`` drops any previously chosen
    experiment once the uploader's contents change.
    """
    st.session_state.selected_exp = None
def save_config(exp_id):
    """Write the active experiment id to ``config.json``.

    The file is created in (or overwritten inside) the current working
    directory and holds a single JSON object of the form
    ``{"exp_id": <exp_id>}``.

    Args:
        exp_id: Identifier of the experiment to record; must be
            JSON-serializable.
    """
    payload = {'exp_id': exp_id}
    with open('config.json', 'w') as config_file:
        json.dump(payload, config_file)
def on_exp_select():
    """Callback for the experiment selectbox: switch the active experiment.

    Resets the module-level ``files`` list, then -- only when the selected
    experiment differs from the one stored under ``exp_id`` in session
    state -- records the new id, re-initialises the paths, refreshes the
    experiment-id placeholder and saves the id to ``config.json``.
    """
    global files
    files = []

    # FIXME: fix selection bug when selected several times
    selected = st.session_state.get('selected_exp')
    if selected != st.session_state.get('exp_id'):
        st.session_state['exp_id'] = selected
        init_paths(globals(), selected)
        update_exp_id(exp_id_ph, selected)
        # Remember the choice on disk as well.
        save_config(selected)
# Dataset source: upload new CSV files or pick an existing experiment.
with st.container(border=True):
    # FIXME: clear file uploader when experiment is chosen
    files = st.file_uploader(
        "Upload Your Dataset",
        type=['csv'],
        accept_multiple_files=True,
        on_change=on_file_upload,
    )

    # Every sub-directory of the data folder is offered as an experiment;
    # plain files in that folder are skipped.
    experiments = [
        name
        for name in os.listdir(CONST_PATH_TO_DATA)
        if os.path.isdir(os.path.join(CONST_PATH_TO_DATA, name))
    ]

    # Pre-select the active experiment when it is among the listed
    # directories; otherwise start with nothing selected.
    exp_id = st.session_state.get('exp_id')
    st.selectbox(
        "Or choose an experiment",
        experiments,
        index=experiments.index(exp_id) if exp_id in experiments else None,
        key="selected_exp",
        on_change=on_exp_select,
    )
# Build the working DataFrame, either from a fresh upload (which starts a new
# experiment) or from the dataset of the currently active experiment.
df = pd.DataFrame()

# Uploaded files stay in the uploader across reruns. Fingerprint the current
# upload so that a new experiment is generated only once per upload, instead
# of on every widget interaction (which also overrode any experiment picked
# from the selectbox while files were still listed in the uploader).
upload_signature = tuple((f.name, f.size) for f in files)
if not files and 'processed_upload' in st.session_state:
    # Uploader was emptied: forget the fingerprint so that uploading the same
    # files again counts as a new upload.
    del st.session_state['processed_upload']

if files and upload_signature != st.session_state.get('processed_upload'):
    # Read every file first and concatenate once; this avoids repeatedly
    # copying the growing frame and concatenating onto an empty DataFrame.
    # ignore_index gives the combined rows one clean 0..n-1 index.
    df = pd.concat([pd.read_csv(f) for f in files], axis=0, ignore_index=True)

    # New experiment id: the first 8 hex digits of a random UUID.
    exp_id = uuid.uuid4().hex[:8]
    st.session_state['exp_id'] = exp_id
    st.toast("New experiment has been generated")
    init_paths(globals(), exp_id)
    update_exp_id(exp_id_ph, exp_id)
    # save exp_id to config.json
    save_config(exp_id)
    df.to_csv(PATH_TO_DATASET, index=False)

    # Mark this upload as handled only after the dataset has been written.
    st.session_state['processed_upload'] = upload_signature
elif (exp_id := st.session_state.get('exp_id')) is not None:
    # Active experiment (selected earlier, or created by an upload that was
    # already processed): make sure the PATH_* names refer to it, then load
    # its stored dataset.
    # NOTE(review): assumes init_paths() is safe to call for an existing
    # experiment, as on_exp_select() already does -- confirm in tools.py.
    init_paths(globals(), exp_id)
    df = pd.read_csv(PATH_TO_DATASET)

# Nothing to show or edit yet: end this script run here.
if df.empty:
    st.stop()
# Editable preview of the dataset, rendered inside a placeholder so it can be
# redrawn after label encoding.
df_ph = st.empty()
df = df_ph.data_editor(df, use_container_width=True)

# Target column; defaults to the last column of the dataset.
chosen_target = st.selectbox(
    'Choose the Target Column', df.columns, key="preserve_chosen_target", index=len(df.columns)-1)

if st.checkbox("Encode labels", key="preserve_encode_labels", disabled=chosen_target is None):
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    # Replace the target column with integer class codes and redraw the editor.
    df[chosen_target] = le.fit_transform(df[chosen_target])
    df = df_ph.data_editor(df, use_container_width=True)

# Columns to remove before splitting and saving.
columns = st.multiselect(
    "Choose columns to drop", df.columns, key="preserve_columns_to_drop")

# Percentage of rows that goes into the training set.
st.slider("Choose train/eval split ratio", 0,
          100, 80, format="%f%%", key="preserve_split_ratio")

if st.button("Save"):
    # The target column is needed for the stratified split below, so it must
    # not be among the dropped columns (this used to raise a KeyError).
    if chosen_target in columns:
        st.error("The target column cannot be dropped.")
        st.stop()

    # train_test_split rejects a float test_size of 0.0 or 1.0, so both ends
    # of the slider range are refused up front instead of crashing.
    split_percent = st.session_state.get('preserve_split_ratio')
    if not 0 < split_percent < 100:
        st.error("The train/eval split ratio must be between 1% and 99%.")
        st.stop()

    if len(columns):
        st.info("Dropping columns...")
        df = df.drop(columns, axis=1)
        st.success("Columns dropped successfully")

    st.info("Split the data according to the ratio...")
    split_ratio = split_percent / 100
    from sklearn.model_selection import train_test_split
    # Stratify on the target only when identify_problem_type() reports a
    # classification problem.
    train, test = train_test_split(
        df,
        test_size=1 - split_ratio,
        stratify=df[chosen_target] if identify_problem_type(df) == 'classification' else None,
    )
    st.success("Data split successfully")

    st.info("Saving the data...")
    # index=False keeps the row index out of the CSV files.
    df.to_csv(PATH_TO_DATASET, index=False)
    train.to_csv(PATH_TO_TRAIN_DATASET, index=False)
    test.to_csv(PATH_TO_EVAL_DATASET, index=False)
    st.success("Data saved successfully")