# abubasith86's picture
# Update app.py
# 28572de verified
# raw
# history blame
# 5.74 kB
import streamlit as st
import json
import pandas as pd
import os
# --- Temp storage: the working dataset is mirrored to this JSONL file ---
TMP_DIR = "temp"
TMP_FILE = os.path.join(TMP_DIR, "session_dataset.jsonl")
# Create the temp directory up front; a no-op when it already exists.
os.makedirs(TMP_DIR, exist_ok=True)
# --- Page chrome (set_page_config stays the first Streamlit call) ---
st.set_page_config(page_title="Dataset Builder", layout="wide")
st.title("πŸ“š JSONL Dataset Editor")
# --- Helper: get all unique fields from records ---
def get_all_fields(data):
    """Return the sorted union of the keys of every record in ``data``.

    ``data`` is an iterable of mappings (anything exposing ``.keys()``).
    An empty iterable yields an empty list.
    """
    return sorted({key for record in data for key in record.keys()})
# --- Load session data from temp file if exists ---
# Runs only while "data" is missing from the session state, i.e. on the first
# script run of a session (or after the state was cleared). Later reruns keep
# using the in-memory copy.
if "data" not in st.session_state:
    restored_records = []
    if os.path.exists(TMP_FILE):
        with open(TMP_FILE, "r", encoding="utf-8") as f:
            # Skip blank lines: json.loads("") raises JSONDecodeError, which
            # would stop the app from starting for as long as the file exists.
            restored_records = [json.loads(line) for line in f if line.strip()]
    # Assign only after parsing succeeded, so a failed load is retried on the
    # next rerun instead of leaving a half-initialised session behind.
    st.session_state.data = restored_records
    st.session_state.all_fields = get_all_fields(restored_records)
# --- Upload JSONL File ---
uploaded_file = st.file_uploader("Upload a JSONL file", type=["jsonl"])
if uploaded_file is None:
    # Uploader is empty: forget the last upload so that adding the same file
    # again loads it again.
    st.session_state.pop("loaded_upload_id", None)
else:
    # The uploader keeps returning the same file on every rerun. Load each
    # upload only once; reloading it on every rerun would overwrite the
    # session data and discard entries and fields added further down the page.
    upload_id = getattr(uploaded_file, "file_id", None) or (
        uploaded_file.name,
        uploaded_file.size,
    )
    if st.session_state.get("loaded_upload_id") != upload_id:
        try:
            # utf-8-sig also accepts a leading byte-order mark.
            content = uploaded_file.read().decode("utf-8-sig")
            # Split on real line endings only: str.splitlines() also breaks on
            # characters such as U+2028, which may sit unescaped inside JSON
            # strings written with ensure_ascii=False.
            raw_lines = content.replace("\r\n", "\n").replace("\r", "\n").split("\n")
            # Blank lines anywhere in the file are ignored.
            records = [json.loads(line) for line in raw_lines if line.strip()]
            if not all(isinstance(record, dict) for record in records):
                raise ValueError("every line must be a JSON object")
        except ValueError as exc:
            # UnicodeDecodeError and json.JSONDecodeError are ValueError
            # subclasses; the session data is left untouched on failure.
            st.error(f"Could not load the uploaded file as JSONL: {exc}")
        else:
            st.session_state.data = records
            st.session_state.all_fields = get_all_fields(records)
            st.session_state.loaded_upload_id = upload_id
            # Save to temp
            with open(TMP_FILE, "w", encoding="utf-8") as f:
                for item in st.session_state.data:
                    f.write(json.dumps(item, ensure_ascii=False) + "\n")
            st.success(
                f"Loaded {len(st.session_state.data)} records with fields: {st.session_state.all_fields}"
            )
# If still no data, use safe fallback fields
if not st.session_state.data and not st.session_state.all_fields:
    st.session_state.all_fields = ["context", "question", "answer"]
# --- Edit Existing Records ---
st.markdown("### ✏️ Edit Records")
# Fields that are always shown as wide text columns.
text_fields = ("context", "answer", "question")
current_fields = list(st.session_state.all_fields)

# The editor is seeded from a snapshot that is refreshed only when something
# other than the editor changed the data or the field list (upload, new entry,
# new field). Feeding the editor its own output as input would change its
# input on every edit, which resets the widget's pending edit state.
if (
    st.session_state.get("editor_output") != st.session_state.data
    or st.session_state.get("editor_fields") != current_fields
):
    st.session_state.editor_seed = list(st.session_state.data)
    st.session_state.editor_fields = current_fields

df = pd.DataFrame(st.session_state.editor_seed)
df = df.reindex(columns=current_fields)
# Fix: Convert likely text fields to string to avoid StreamlitAPIException.
# Missing values become "" first; a plain astype(str) would store the literal
# text "nan" / "None" in the dataset.
for field in current_fields:
    if field.lower() in text_fields:
        column = df[field]
        df[field] = column.astype(object).where(column.notna(), "").astype(str)
# Only the text fields get an explicit config. A column mapped to None is
# hidden by st.data_editor, so all other fields are left out of the mapping
# and keep Streamlit's default rendering.
column_configs = {
    field: st.column_config.TextColumn(label=field, width="large")
    for field in current_fields
    if field.lower() in text_fields
}
# --- Use st.data_editor for editable table ---
edited_df = st.data_editor(
    df,
    use_container_width=True,
    num_rows="dynamic",
    column_config=column_configs,
)
# --- Save updated data ---
if edited_df is not None:
    edited_records = edited_df.fillna("").to_dict(orient="records")
    # Separate list object: an in-place append to session data elsewhere must
    # not also change this snapshot, or the change would go undetected.
    st.session_state.editor_output = list(edited_records)
    # Write and notify only when something actually changed, not on every rerun.
    if edited_records != st.session_state.data:
        st.session_state.data = edited_records
        # Save to temp file
        with open(TMP_FILE, "w", encoding="utf-8") as f:
            for item in st.session_state.data:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")
        # st.toast requires `icon` to be a single valid emoji.
        st.toast("βœ… Changes auto-saved!", icon="💾")
# --- Add New Entry ---
st.markdown("### βž• Add New Entry")
# Show form with current fields. clear_on_submit resets the text areas after
# each submit so the previous values are not re-added by accident.
with st.form("new_entry_form", clear_on_submit=True):
    new_record = {}
    for field in st.session_state.all_fields:
        new_record[field] = st.text_area(f"{field}", key=f"input_{field}")
    submitted = st.form_submit_button("Add Entry")
if submitted:
    if any(value.strip() for value in new_record.values()):
        st.session_state.data.append(new_record)
        # Save to temp
        with open(TMP_FILE, "w", encoding="utf-8") as f:
            for item in st.session_state.data:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")
        st.success("βœ… New entry added!")
        # Rerun so the editor above is rebuilt with the new record.
        st.rerun()
    else:
        # Do not append a record whose fields are all blank.
        st.warning("Nothing to add: every field is empty.")
# Option to add a new field
with st.expander("βž• Add New Field"):
    new_field = st.text_input("New field name", key="new_field_name")
    if st.button("Add Field"):
        # Surrounding whitespace would create a visually identical duplicate.
        new_field = new_field.strip()
        if not new_field:
            st.warning("Enter a field name first.")
        elif new_field in st.session_state.all_fields:
            st.warning(f"Field '{new_field}' already exists.")
        else:
            st.session_state.all_fields.append(new_field)
            st.success(f"βœ… Field '{new_field}' added!")
            # Rerun so the form and editor above pick up the new field.
            st.rerun()
# --- Export JSONL ---
st.markdown("### πŸ“€ Export Dataset")
# Let user define a custom export path
export_path = st.text_input(
    "Custom save path (e.g., ./exports/my_dataset.jsonl)",
    value="./exports/exported_dataset.jsonl",
)
col1, col2 = st.columns(2)
# --- Export Button ---
with col1:
    if st.button("πŸ“ Export JSONL"):
        target_path = export_path.strip()
        if not target_path:
            st.error("Enter a file path to export to.")
        else:
            # Serialise everything first so a failure cannot leave a
            # half-written export file behind.
            exported_content = "".join(
                json.dumps(row, ensure_ascii=False) + "\n"
                for row in st.session_state.data
            )
            try:
                # dirname is "" for a bare file name; os.makedirs("") raises.
                export_dir = os.path.dirname(target_path)
                if export_dir:
                    os.makedirs(export_dir, exist_ok=True)
                # Write to custom path
                with open(target_path, "w", encoding="utf-8") as f_out:
                    f_out.write(exported_content)
            except OSError as exc:
                st.error(f"Could not write {target_path}: {exc}")
            else:
                # Reset session and temp
                if os.path.exists(TMP_FILE):
                    os.remove(TMP_FILE)
                st.session_state.clear()
                # st.rerun() discards everything drawn in this run, so the
                # result is stored and rendered below on the following runs.
                st.session_state["last_export"] = {
                    "path": target_path,
                    "content": exported_content,
                    "announce_reset": True,
                }
                st.rerun()
    # Result of the most recent export; stays available until the next export
    # so the download button is not lost to a rerun.
    last_export = st.session_state.get("last_export")
    if last_export:
        st.success(f"βœ… Dataset saved to {last_export['path']}")
        st.download_button(
            "⬇️ Download JSONL",
            last_export["content"],
            file_name=os.path.basename(last_export["path"]),
            mime="application/json",
        )
        # Shown once, on the first run after the export.
        if last_export.pop("announce_reset", False):
            st.success("🧹 Temporary session cleared. You're starting fresh!")
# --- Download Temp Only Button ---
# Second column: offers the current temp file as a download, or warns when
# there is none on disk.
with col2:
    if not os.path.exists(TMP_FILE):
        st.warning("⚠️ No temp file found to download.")
    else:
        with open(TMP_FILE, "r", encoding="utf-8") as temp_handle:
            temp_snapshot = temp_handle.read()
        st.download_button(
            "⬇️ Download Temp File",
            temp_snapshot,
            file_name="session_dataset.jsonl",
            mime="application/json",
        )