"""Streamlit page: choose (or upload) a dataset and split it into train/eval.

The page lets the user either upload one or more CSV files, which creates a
new experiment, or pick an existing experiment directory.  The resulting
dataframe can then be edited, label-encoded, pruned of columns and finally
split and saved.

NOTE(review): ``sidebar``, ``init_paths``, ``update_exp_id``,
``identify_problem_type`` and the ``CONST_PATH_TO_DATA`` / ``PATH_TO_*``
names are not defined in this file; presumably they come from
``from tools import *`` and/or are injected through ``globals()`` — confirm.
"""
# Import order is intentionally left as it was: ``from tools import *`` may
# (re)bind names, so reordering could change which binding wins.
import uuid
import streamlit as st
import pandas as pd
from tools import *
import os
import json

exp_id_ph, file_tree_ph = sidebar(globals())

st.title("Choose Your Dataset")

# Session-state key raised by the uploader's on_change callback and consumed
# exactly once by the script body below.
_PENDING_UPLOAD_KEY = 'pending_upload'


def on_file_upload():
    """Handle a change of the uploaded file set.

    Clears the experiment selectbox and marks the upload as not yet
    processed, so that the script body turns it into a new experiment once
    instead of on every rerun.
    """
    st.session_state['selected_exp'] = None
    st.session_state[_PENDING_UPLOAD_KEY] = True


def save_config(exp_id):
    """Write ``{'exp_id': exp_id}`` to ``config.json`` in the working directory."""
    with open('config.json', 'w', encoding='utf-8') as f:
        json.dump({'exp_id': exp_id}, f)


def on_exp_select():
    """Make the experiment picked in the selectbox the active experiment.

    Does nothing when the selection equals the experiment already stored in
    the session state.
    """
    # FIXME: fix selection bug when selected several times
    exp_id = st.session_state.get('selected_exp')
    if exp_id == st.session_state.get('exp_id'):
        return

    st.session_state['exp_id'] = exp_id
    init_paths(globals(), exp_id)
    update_exp_id(exp_id_ph, exp_id)
    # Persist the active experiment id to config.json.
    save_config(exp_id)


with st.container(border=True):
    # NOTE: the uploader keeps showing its files after an experiment is
    # chosen, but they no longer override that choice (see the pending-upload
    # flag handling below).
    files = st.file_uploader(
        "Upload Your Dataset",
        accept_multiple_files=True,
        type=['csv'],
        on_change=on_file_upload,
    )

    # Every sub-directory of the data folder is offered as an experiment.
    experiments = [
        d for d in os.listdir(CONST_PATH_TO_DATA)
        if os.path.isdir(os.path.join(CONST_PATH_TO_DATA, d))
    ]

    exp_id = st.session_state.get('exp_id')
    st.selectbox(
        "Or choose an experiment",
        experiments,
        key="selected_exp",
        index=experiments.index(exp_id) if exp_id in experiments else None,
        on_change=on_exp_select,
    )

df = pd.DataFrame()

# The uploader returns the same files on every rerun.  Only a *fresh* upload
# (flag raised by on_file_upload), or an upload with no active experiment yet,
# may create a new experiment; otherwise each widget interaction would
# generate another experiment id and override a selected experiment.
is_new_upload = st.session_state.pop(_PENDING_UPLOAD_KEY, False)
if files and (is_new_upload or st.session_state.get('exp_id') is None):
    # One concat over all files; row labels of each file are kept as read.
    df = pd.concat([pd.read_csv(file) for file in files], axis=0)

    # Short experiment id: first group of a random UUID.
    exp_id = str(uuid.uuid4()).split('-')[0]
    st.session_state['exp_id'] = exp_id
    st.toast("New experiment has been generated")
    init_paths(globals(), exp_id)
    update_exp_id(exp_id_ph, exp_id)
    # Persist the active experiment id to config.json.
    save_config(exp_id)

    df.to_csv(PATH_TO_DATASET, index=None)
elif (exp_id := st.session_state.get('exp_id')) is not None:
    df = pd.read_csv(PATH_TO_DATASET)

# Nothing to show or configure without data.
if df.empty:
    st.stop()

df_ph = st.empty()
df = df_ph.data_editor(df, use_container_width=True)

chosen_target = st.selectbox(
    'Choose the Target Column',
    df.columns,
    key="preserve_chosen_target",
    index=len(df.columns) - 1,
)

if st.checkbox("Encode labels", key="preserve_encode_labels",
               disabled=chosen_target is None):
    from sklearn.preprocessing import LabelEncoder

    # Replace the chosen target column with integer codes and redraw the
    # editor in the same placeholder.
    le = LabelEncoder()
    df[chosen_target] = le.fit_transform(df[chosen_target])
    df = df_ph.data_editor(df, use_container_width=True)

columns = st.multiselect(
    "Choose columns to drop", df.columns, key="preserve_columns_to_drop")

# Percentage of rows that goes to the training set.
train_pct = st.slider("Choose train/eval split ratio", 0, 100, 80,
                      format="%f%%", key="preserve_split_ratio")

if st.button("Save"):
    # train_test_split rejects a float test_size of 0.0 or 1.0, so report
    # the slider extremes instead of crashing.
    if not 0 < train_pct < 100:
        st.error("The train/eval split ratio must be between 1% and 99%.")
        st.stop()

    st.info("Split the data according to the ratio...")

    if len(columns):
        st.info("Dropping columns...")
        df = df.drop(columns, axis=1)
        st.success("Columns dropped successfully")

    from sklearn.model_selection import train_test_split

    # Derive the fraction from the integer percentage: `1 - 0.7` evaluates to
    # 0.30000000000000004, which would round the eval set up by one row.
    test_size = (100 - train_pct) / 100

    # Stratify classification problems by the target, unless the target
    # itself was among the dropped columns.
    stratify = None
    if (identify_problem_type(df) == 'classification'
            and chosen_target in df.columns):
        stratify = df[chosen_target]

    train, test = train_test_split(df, test_size=test_size, stratify=stratify)
    st.success("Data split successfully")

    st.info("Saving the data...")
    df.to_csv(PATH_TO_DATASET, index=None)
    train.to_csv(PATH_TO_TRAIN_DATASET, index=None)
    test.to_csv(PATH_TO_EVAL_DATASET, index=None)
    st.success("Data saved successfully")