abubasith86's picture
Update app.py
fc8b17b verified
raw
history blame
5.25 kB
import streamlit as st
import json
import pandas as pd
import os
# Page-level configuration is issued before any other Streamlit command.
st.set_page_config(page_title="Dataset Builder", layout="wide")
# The title emoji was mojibake (UTF-8 bytes decoded with the wrong codec);
# restored to the intended character.
st.title("📚 JSONL Dataset Editor")

# Location of the auto-saved working copy of the dataset. The directory is
# created eagerly so later writes never fail on a missing parent folder.
TMP_DIR = "temp"
TMP_FILE = os.path.join(TMP_DIR, "session_dataset.jsonl")
os.makedirs(TMP_DIR, exist_ok=True)
# --- Helpers ---
def get_all_fields(data):
    """Return the sorted union of keys used by the records in *data*.

    Args:
        data: Iterable of parsed JSONL records. Records that are not dicts
            (a JSONL line may hold any JSON value, e.g. a list or a number)
            contribute no fields and are skipped instead of raising.

    Returns:
        A sorted list of the distinct keys found; ``[]`` when *data* is empty
        or holds no dict records.
    """
    all_keys = set()
    for record in data:
        if isinstance(record, dict):
            # Updating a set from a dict adds its keys.
            all_keys.update(record)
    return sorted(all_keys)
# --- Session Initialization ---
# Seed each session-level list the script reads later. Every key receives its
# own fresh list, and keys that already exist are left untouched.
for state_key in ("data", "all_fields", "prev_data"):
    if state_key not in st.session_state:
        st.session_state[state_key] = []
# --- Load from temp if needed ---
# When the session holds no records yet, restore the auto-saved working copy.
if not st.session_state.data and os.path.exists(TMP_FILE):
    with open(TMP_FILE, "r", encoding="utf-8") as f:
        # Blank lines are skipped: json.loads("") raises JSONDecodeError and
        # would otherwise break every start-up until the file is deleted.
        st.session_state.data = [json.loads(line) for line in f if line.strip()]
    st.session_state.all_fields = get_all_fields(st.session_state.data)
    # Snapshot used by the auto-save step to detect edits.
    st.session_state.prev_data = st.session_state.data.copy()
# --- Upload JSONL ---
uploaded_file = st.file_uploader("Upload a JSONL file", type=["jsonl"])
if uploaded_file is None:
    # Uploader emptied: forget the last import so the same file can be
    # uploaded again later.
    if "last_upload_signature" in st.session_state:
        del st.session_state["last_upload_signature"]
else:
    # The uploader keeps returning the same file on every rerun. Without this
    # guard the block would re-import the file and call st.rerun() again on
    # each pass: an endless rerun loop that also throws away any edits.
    upload_signature = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("last_upload_signature") != upload_signature:
        try:
            # getvalue() does not depend on the current read position;
            # "utf-8-sig" also accepts files that start with a BOM.
            content = uploaded_file.getvalue().decode("utf-8-sig")
            records = [
                json.loads(line) for line in content.splitlines() if line.strip()
            ]
        except (UnicodeDecodeError, json.JSONDecodeError) as exc:
            st.error(f"Could not read the uploaded file as UTF-8 JSONL: {exc}")
        else:
            st.session_state.last_upload_signature = upload_signature
            st.session_state.data = records
            st.session_state.all_fields = get_all_fields(records)
            st.session_state.prev_data = records.copy()
            # Persist the import right away so it survives a page refresh.
            with open(TMP_FILE, "w", encoding="utf-8") as f:
                for item in records:
                    f.write(json.dumps(item, ensure_ascii=False) + "\n")
            # Rerun once so the widgets below are built from the new data.
            st.rerun()
# --- Fallback fields if none ---
# With no known fields the editor would have no columns, so start from a
# default question-answering layout.
fallback_fields = ["context", "question", "answer"]
if not st.session_state.all_fields:
    st.session_state.all_fields = fallback_fields
# --- Edit Records ---
st.markdown("### ✏️ Edit Records")
df = pd.DataFrame(st.session_state.data)
# Show one column per known field, in the stored field order; fields missing
# from the data appear as empty columns.
df = df.reindex(columns=st.session_state.all_fields)

# Free-text fields that are shown as wide text columns.
long_text_fields = [
    field
    for field in st.session_state.all_fields
    if field.lower() in ("context", "question", "answer")
]

# Ensure the free-text fields are strings for the editor. Missing values are
# blanked first: astype(str) alone would turn them into the literal text
# "nan"/"None", which would then be auto-saved into the dataset.
for field in long_text_fields:
    df[field] = df[field].fillna("").astype(str)

# Only the free-text fields get an explicit configuration. Mapping a column to
# None in column_config hides that column, so every other field must be left
# out of the dict to stay visible and editable.
column_configs = {
    field: st.column_config.TextColumn(label=field, width="large")
    for field in long_text_fields
}
edited_df = st.data_editor(
    df,
    use_container_width=True,
    num_rows="dynamic",
    column_config=column_configs,
    key="editable_table",
)
# Save if changed
# Convert the edited table back to a list of records, with empty cells stored
# as "" so every value is JSON-serialisable text rather than NaN.
new_data = edited_df.fillna("").to_dict(orient="records")
if new_data != st.session_state.prev_data:
    st.session_state.data = new_data
    # Separate list object so later appends to `data` do not alter the snapshot.
    st.session_state.prev_data = new_data.copy()
    with open(TMP_FILE, "w", encoding="utf-8") as f:
        for item in new_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    # Both strings were mojibake. The icon in particular has to be a real
    # emoji for st.toast, so the garbled text is restored to the intended
    # characters.
    st.toast("✅ Auto-saved!", icon="💾")
# --- Add New Entry ---
# Header and success text were mojibake; restored to the intended emoji.
st.markdown("### ➕ Add New Entry")
with st.form("new_entry_form"):
    # One text area per known field; the values are only read on submit.
    new_record = {}
    for field in st.session_state.all_fields:
        new_record[field] = st.text_area(f"{field}", key=f"input_{field}")
    submitted = st.form_submit_button("Add Entry")

if submitted:
    st.session_state.data.append(new_record)
    # Refresh the snapshot so the auto-save step does not see the new record
    # as an unsaved edit.
    st.session_state.prev_data = st.session_state.data.copy()
    with open(TMP_FILE, "w", encoding="utf-8") as f:
        for item in st.session_state.data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    # NOTE(review): the rerun below restarts the script right after this
    # message is issued, so it is presumably shown only briefly, if at all.
    st.success("✅ New entry added!")
    # The table above was already drawn without the new record; rerun to show it.
    st.rerun()
# --- Add New Field ---
# Expander label was mojibake; restored to the intended emoji.
with st.expander("➕ Add New Field"):
    new_field = st.text_input("New field name", key="new_field_name")
    if st.button("Add Field"):
        # Trim surrounding whitespace so blank names and padded duplicates of
        # an existing field are not added as new columns.
        field_name = new_field.strip()
        if field_name and field_name not in st.session_state.all_fields:
            st.session_state.all_fields.append(field_name)
            # Rerun so the editor and the entry form pick up the new field.
            st.rerun()
# --- Export Section ---
# Header emoji was mojibake; restored to the intended character.
st.markdown("### 📤 Export Dataset")
# Destination of the exported file on the machine running the app.
export_path = st.text_input("Save path", value="./exports/exported_dataset.jsonl")
# One column each for: export, download of the temp file, clear session.
col1, col2, col3 = st.columns(3)
# Export
with col1:
    if st.button("📁 Export JSONL"):
        if not export_path.strip():
            st.error("Please enter a save path.")
        else:
            # NOTE(review): export_path comes straight from a text input and is
            # written on the machine running the app; restrict it to a known
            # directory if the app is reachable by untrusted users.
            export_dir = os.path.dirname(export_path)
            # A bare filename has no directory part, and os.makedirs("") raises.
            if export_dir:
                os.makedirs(export_dir, exist_ok=True)
            # Serialise once and reuse the text for both the file and the
            # download, instead of reading the file back after writing it.
            content = "".join(
                json.dumps(row, ensure_ascii=False) + "\n"
                for row in st.session_state.data
            )
            with open(export_path, "w", encoding="utf-8") as f:
                f.write(content)
            # Exporting ends the editing session: drop the auto-save file and
            # reset all session state.
            if os.path.exists(TMP_FILE):
                os.remove(TMP_FILE)
            st.session_state.clear()
            # Stored after the clear so it survives the rerun. A download
            # button drawn here would be discarded by the rerun before the
            # user could click it.
            st.session_state.last_export = {
                "content": content,
                "file_name": os.path.basename(export_path),
            }
            st.rerun()
    # Offer the most recent export for download until the session is cleared.
    last_export = st.session_state.get("last_export")
    if last_export:
        st.download_button(
            "⬇️ Download JSONL",
            last_export["content"],
            file_name=last_export["file_name"],
        )
# Download temp
# Offer the auto-saved working copy as a download, or say that none exists.
with col2:
    if not os.path.exists(TMP_FILE):
        st.warning("⚠️ No temp file found.")
    else:
        with open(TMP_FILE, "r", encoding="utf-8") as tmp_handle:
            tmp_content = tmp_handle.read()
        st.download_button(
            label="⬇️ Download Temp File",
            data=tmp_content,
            file_name="session_dataset.jsonl",
            mime="application/json",
        )
# Clear session
with col3:
    # Button label was mojibake; restored to the intended emoji.
    if st.button("🗑️ Clear Session"):
        # Remove the auto-save file so it is not loaded again on the rerun.
        # Attempt the removal directly instead of checking first, so a file
        # that disappears in between does not raise.
        try:
            os.remove(TMP_FILE)
        except FileNotFoundError:
            pass
        st.session_state.clear()
        st.rerun()