auto-fl-fit / pages / 1_📂_Choose_Dataset.py
1_📂_Choose_Dataset.py
Raw
import uuid
import streamlit as st
import pandas as pd
from tools import *
import os
import json

exp_id_ph, file_tree_ph = sidebar(globals())
st.title("Choose Your Dataset")


def on_file_upload():
	"""File-uploader callback: reset the ``selected_exp`` session entry to None."""
	st.session_state.selected_exp = None


def save_config(exp_id):
	"""Write ``exp_id`` to ``config.json`` in the current working directory.

	The file is overwritten with a single JSON object holding the id under
	the ``exp_id`` key.
	"""
	payload = {'exp_id': exp_id}
	with open('config.json', 'w') as config_file:
		json.dump(payload, config_file)


def on_exp_select():
	"""Experiment-selectbox callback: make the picked experiment the active one.

	Always resets the module-level ``files`` list. If the picked value differs
	from the ``exp_id`` already held in session state, it is stored there,
	passed to ``init_paths`` and ``update_exp_id``, and persisted with
	``save_config``; otherwise nothing else happens.
	"""
	global files
	files = []

	# FIXME: fix selection bug when selected several times
	picked = st.session_state.get('selected_exp')
	current = st.session_state.get('exp_id')

	if picked != current:
		st.session_state['exp_id'] = picked
		init_paths(globals(), picked)
		update_exp_id(exp_id_ph, picked)

		# save exp_id to config.json
		save_config(picked)


# Bordered container holding the two ways of picking a dataset: uploading CSV
# files or selecting an experiment directory that already exists on disk.
with st.container(border=True):
	# FIXME: clear file uploader when experiment is chosen
	files = st.file_uploader(
		"Upload Your Dataset",
		type=['csv'],
		accept_multiple_files=True,
		on_change=on_file_upload,
	)

	# every sub-directory of the data folder is offered as an experiment;
	# plain files are skipped
	data_entries = os.listdir(CONST_PATH_TO_DATA)
	experiments = [
		name for name in data_entries
		if os.path.isdir(os.path.join(CONST_PATH_TO_DATA, name))
	]

	# preselect the session's experiment when it is one of the listed options
	exp_id = st.session_state.get('exp_id')
	if exp_id in experiments:
		preselected_index = experiments.index(exp_id)
	else:
		preselected_index = None

	st.selectbox(
		"Or choose an experiment",
		experiments,
		index=preselected_index,
		key="selected_exp",
		on_change=on_exp_select,
	)

# Load the working DataFrame, either from a fresh upload or from the
# experiment already recorded in the session.
df = pd.DataFrame()

# Streamlit re-executes this script on every widget interaction, and the
# uploader keeps returning the same files until the user removes them.
# Fingerprint the upload so that a new experiment is created once per distinct
# upload instead of once per rerun (which also overrode any experiment picked
# from the selectbox while files were still attached).
upload_signature = tuple(
	# `file_id` is read defensively; name and size are the fallback identity
	(getattr(file, 'file_id', None), file.name, file.size) for file in files
)

if not upload_signature and 'uploaded_files_signature' in st.session_state:
	# uploader is empty: forget the fingerprint so that re-uploading the same
	# files later counts as a new upload
	del st.session_state['uploaded_files_signature']

if upload_signature and upload_signature != st.session_state.get('uploaded_files_signature'):
	# read every file first and concatenate once; growing a frame inside the
	# loop re-copies the accumulated rows on each iteration
	df = pd.concat([pd.read_csv(file) for file in files], axis=0)

	# gen new exp id (first group of a random UUID)
	uid = uuid.uuid4()
	exp_id = str(uid).split('-')[0]
	st.session_state['exp_id'] = exp_id
	st.toast("New experiment has been generated")
	init_paths(globals(), exp_id)
	update_exp_id(exp_id_ph, exp_id)

	# save exp_id to config.json
	save_config(exp_id)

	df.to_csv(PATH_TO_DATASET, index=None)

	# remember the upload only after it has been fully processed, so a failure
	# above is retried on the next rerun
	st.session_state['uploaded_files_signature'] = upload_signature

elif (exp_id := st.session_state.get('exp_id')) is not None:
	# NOTE(review): relies on PATH_TO_DATASET already pointing at this
	# experiment (presumably set up via sidebar(globals())) — confirm
	df = pd.read_csv(PATH_TO_DATASET)

# nothing to show until a dataset is available
if df.empty:
	st.stop()

# Editable view of the dataset; the placeholder lets the table be re-rendered
# in the same spot after label encoding.
df_ph = st.empty()
df = df_ph.data_editor(df, use_container_width=True)

# target column picker, defaulting to the last column
target_options = df.columns
chosen_target = st.selectbox(
	'Choose the Target Column',
	target_options,
	index=len(target_options) - 1,
	key="preserve_chosen_target",
)

encode_labels = st.checkbox(
	"Encode labels",
	key="preserve_encode_labels",
	disabled=chosen_target is None,
)
if encode_labels:
	from sklearn.preprocessing import LabelEncoder
	le = LabelEncoder()
	# replace the chosen target column with its integer-encoded labels
	encoded_target = le.fit_transform(df[chosen_target])
	df[chosen_target] = encoded_target
	df = df_ph.data_editor(df, use_container_width=True)

# columns the user wants removed when saving
columns = st.multiselect(
	"Choose columns to drop",
	df.columns,
	key="preserve_columns_to_drop",
)

# train share of the split as a percentage (0-100, default 80); the value is
# read back from session state under its key when saving
st.slider(
	"Choose train/eval split ratio",
	0,
	100,
	80,
	format="%f%%",
	key="preserve_split_ratio",
)

# Save handler: optionally drop columns, split into train/eval sets, and write
# the full, train and eval CSV files for the current experiment.
if st.button("Save"):
	# the slider stores the train share as an integer percentage
	split_percent = st.session_state.get('preserve_split_ratio')
	split_ratio = split_percent / 100

	if chosen_target in columns:
		# dropping the target would make the df[chosen_target] lookup below
		# fail with a KeyError
		st.error("The target column cannot be dropped. Remove it from the columns to drop.")
	elif not 0 < split_percent < 100:
		# train_test_split rejects a float test_size of 0.0 or 1.0
		st.error("The train/eval split ratio must be between 1% and 99%.")
	else:
		st.info("Split the data according to the ratio...")

		if len(columns):
			st.info("Dropping columns...")
			df = df.drop(columns, axis=1)
			st.success("Columns dropped successfully")

		from sklearn.model_selection import train_test_split

		# derive the eval share from the integer percentage: `1 - split_ratio`
		# carries float error (1 - 0.7 == 0.30000000000000004) that can bump
		# the rounded-up eval row count by one
		test_size = (100 - split_percent) / 100
		# stratify on the target only for classification problems
		stratify_on = df[chosen_target] if identify_problem_type(df) == 'classification' else None
		train, test = train_test_split(df, test_size=test_size, stratify=stratify_on)

		st.success("Data split successfully")
		st.info("Saving the data...")
		df.to_csv(PATH_TO_DATASET, index=None)
		train.to_csv(PATH_TO_TRAIN_DATASET, index=None)
		test.to_csv(PATH_TO_EVAL_DATASET, index=None)
		st.success("Data saved successfully")