"""Streamlit app for building and editing a JSONL dataset.

The working dataset is a list of dicts kept in ``st.session_state.data`` and
mirrored to ``temp/session_dataset.jsonl`` after every change, so a page
refresh can restore it. Records can be uploaded, edited in a table, added
through a form, extended with new fields, and exported to a chosen path.
"""

import json
import os

import pandas as pd
import streamlit as st

st.set_page_config(page_title="Dataset Builder", layout="wide")
# NOTE(review): the non-ASCII glyphs in the UI labels of this file look like
# mis-encoded emoji (UTF-8 bytes decoded with a legacy code page). They are
# left untouched because not all of them can be recovered unambiguously.
st.title("๐Ÿ“š JSONL Dataset Editor")

TMP_DIR = "temp"
TMP_FILE = os.path.join(TMP_DIR, "session_dataset.jsonl")
DEFAULT_FIELDS = ("context", "question", "answer")
# Fields rendered as wide text columns and coerced to strings in the editor.
LONG_TEXT_FIELDS = frozenset(DEFAULT_FIELDS)
os.makedirs(TMP_DIR, exist_ok=True)


def get_all_fields(data):
    """Return the sorted union of the keys used by the records in *data*."""
    all_keys = set()
    for record in data:
        all_keys.update(record.keys())
    return sorted(all_keys)


def _parse_jsonl(lines):
    """Parse an iterable of text lines into a list of JSON objects.

    Blank lines are skipped. Raises ``ValueError`` naming the offending line
    when a line is not valid JSON or does not hold a JSON object.
    """
    records = []
    for line_no, line in enumerate(lines, start=1):
        if not line.strip():
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError as err:
            raise ValueError(f"line {line_no}: invalid JSON ({err.msg})") from err
        if not isinstance(record, dict):
            raise ValueError(
                f"line {line_no}: expected a JSON object, "
                f"got {type(record).__name__}"
            )
        records.append(record)
    return records


def _dump_jsonl(records):
    """Serialize *records* to JSONL text, one object per line."""
    return "".join(
        json.dumps(record, ensure_ascii=False) + "\n" for record in records
    )


def _save_temp():
    """Mirror the working dataset to the temp file."""
    with open(TMP_FILE, "w", encoding="utf-8") as f:
        f.write(_dump_jsonl(st.session_state.data))


def _adopt_records(records):
    """Make *records* the working dataset and resync the derived state."""
    st.session_state.data = records
    st.session_state.all_fields = get_all_fields(records)
    st.session_state.prev_data = records.copy()


def _flash(message):
    """Queue a success message to be shown at the top of the next run.

    Messages emitted right before ``st.rerun()`` are never rendered, so they
    are parked in session state and displayed once after the rerun.
    """
    st.session_state.flash = [*st.session_state.get("flash", []), message]


def _reset_session():
    """Drop all session state and the temp file, and reset the uploader.

    The uploader's widget key is versioned; bumping the version gives the
    next run a fresh uploader, so a previously selected file is not imported
    again after the reset.
    """
    next_version = st.session_state.get("uploader_version", 0) + 1
    st.session_state.clear()
    st.session_state.uploader_version = next_version
    try:
        os.remove(TMP_FILE)
    except FileNotFoundError:
        pass


# --- Clear session handler ---
if st.button("๐Ÿงน Clear Session"):
    _reset_session()
    _flash("Session and temp file cleared!")
    st.rerun()

# --- Messages queued by the previous run ---
if "flash" in st.session_state:
    for message in st.session_state.flash:
        st.success(message)
    del st.session_state["flash"]

# --- Load session data from temp file if exists ---
if "data" not in st.session_state:
    restored = []
    if os.path.exists(TMP_FILE):
        try:
            with open(TMP_FILE, "r", encoding="utf-8") as f:
                restored = _parse_jsonl(f)
        except ValueError as err:
            st.error(f"Could not restore {TMP_FILE}: {err}")
    _adopt_records(restored)

# --- Upload JSONL File ---
uploader_version = st.session_state.get("uploader_version", 0)
uploaded_file = st.file_uploader(
    "Upload a JSONL file",
    type=["jsonl"],
    key=f"uploader_{uploader_version}",
)
if uploaded_file is None:
    if "upload_token" in st.session_state:
        del st.session_state["upload_token"]
else:
    # The uploader keeps returning the same file on every rerun. Import it
    # only once, otherwise each rerun would overwrite records and fields
    # added since the upload.
    upload_token = (
        getattr(uploaded_file, "file_id", None),
        uploaded_file.name,
        uploaded_file.size,
    )
    if st.session_state.get("upload_token") != upload_token:
        try:
            # utf-8-sig also accepts files that start with a BOM.
            content = uploaded_file.getvalue().decode("utf-8-sig")
            uploaded_records = _parse_jsonl(content.splitlines())
        except ValueError as err:  # also covers UnicodeDecodeError
            st.error(f"Could not load {uploaded_file.name}: {err}")
        else:
            _adopt_records(uploaded_records)
            _save_temp()
            st.session_state.upload_token = upload_token
            st.success(
                f"Loaded {len(st.session_state.data)} records with fields: {st.session_state.all_fields}"
            )

# Fallback default fields
if not st.session_state.data and not st.session_state.all_fields:
    st.session_state.all_fields = list(DEFAULT_FIELDS)

# --- Edit Existing Records ---
st.markdown("### โœ๏ธ Edit Records")
df = pd.DataFrame(st.session_state.data)
df = df.reindex(columns=st.session_state.all_fields)
for field in st.session_state.all_fields:
    if field.lower() in LONG_TEXT_FIELDS:
        # Plain astype(str) would turn missing values into the literal
        # strings "nan"/"None"; show and save them as empty text instead.
        column = df[field]
        df[field] = column.astype(str).where(column.notna(), "")

column_configs = {
    field: (
        st.column_config.TextColumn(label=field, width="large")
        if field.lower() in LONG_TEXT_FIELDS
        else None
    )
    for field in st.session_state.all_fields
}
edited_df = st.data_editor(
    df,
    use_container_width=True,
    num_rows="dynamic",
    column_config=column_configs,
    key="editable_table",
)

# --- Auto-save if any changes ---
new_data = edited_df.fillna("").to_dict(orient="records")
if new_data != st.session_state.prev_data:
    st.session_state.data = new_data
    st.session_state.prev_data = new_data.copy()
    _save_temp()
    # st.toast validates `icon`; it has to be a real emoji character.
    st.toast("โœ… Changes auto-saved!", icon="💾")

# --- Add New Entry ---
st.markdown("### โž• Add New Entry")
# clear_on_submit empties the inputs so a second click cannot silently add
# the same record again.
with st.form("new_entry_form", clear_on_submit=True):
    new_record = {}
    for field in st.session_state.all_fields:
        new_record[field] = st.text_area(f"{field}", key=f"input_{field}")
    submitted = st.form_submit_button("Add Entry")

if submitted:
    st.session_state.data.append(new_record)
    st.session_state.prev_data = st.session_state.data.copy()
    _save_temp()
    _flash("โœ… New entry added!")
    st.rerun()

# --- Add New Field ---
with st.expander("โž• Add New Field"):
    new_field = st.text_input("New field name", key="new_field_name")
    if st.button("Add Field"):
        field_name = new_field.strip()
        if field_name and field_name not in st.session_state.all_fields:
            st.session_state.all_fields.append(field_name)
            _flash(f"โœ… Field '{field_name}' added!")
            st.rerun()
        else:
            st.warning("Enter a field name that is not already in use.")

# --- Export JSONL ---
st.markdown("### ๐Ÿ“ค Export Dataset")
export_path = st.text_input(
    "Custom save path (e.g., ./exports/my_dataset.jsonl)",
    value="./exports/exported_dataset.jsonl",
)
col1, col2 = st.columns(2)

with col1:
    if st.button("๐Ÿ“ Export JSONL"):
        if not export_path.strip():
            st.error("Enter a file path to export to.")
        else:
            exported_content = _dump_jsonl(st.session_state.data)
            try:
                # dirname is "" for a bare file name; makedirs("") raises.
                export_dir = os.path.dirname(export_path)
                if export_dir:
                    os.makedirs(export_dir, exist_ok=True)
                with open(export_path, "w", encoding="utf-8") as f_out:
                    f_out.write(exported_content)
            except OSError as err:
                st.error(f"Could not write {export_path}: {err}")
            else:
                # Start fresh, but keep the exported payload so the download
                # button below is still available after the rerun.
                _reset_session()
                st.session_state.last_export = {
                    "name": os.path.basename(export_path),
                    "content": exported_content,
                }
                _flash(f"โœ… Dataset saved to {export_path}")
                _flash("๐Ÿงน Temporary session cleared. You're starting fresh!")
                st.rerun()

    last_export = st.session_state.get("last_export")
    if last_export:
        st.download_button(
            "โฌ‡๏ธ Download JSONL",
            last_export["content"],
            file_name=last_export["name"],
            mime="application/json",
        )

with col2:
    if os.path.exists(TMP_FILE):
        with open(TMP_FILE, "r", encoding="utf-8") as f_tmp:
            tmp_content = f_tmp.read()
        st.download_button(
            "โฌ‡๏ธ Download Temp File",
            tmp_content,
            file_name="session_dataset.jsonl",
            mime="application/json",
        )
    else:
        st.warning("โš ๏ธ No temp file found to download.")