File size: 5,770 Bytes
28572de e40e1e1 28572de f3a8b9c e40e1e1 fc8b17b e40e1e1 f3a8b9c 77d363f e40e1e1 28572de e40e1e1 fc8b17b 28572de e40e1e1 f3a8b9c e40e1e1 f3a8b9c e40e1e1 f3a8b9c e40e1e1 28572de e40e1e1 28572de e40e1e1 28572de e40e1e1 28572de e40e1e1 28572de e40e1e1 28572de e40e1e1 fc8b17b f4fbdf0 e40e1e1 28572de e40e1e1 28572de e40e1e1 28572de e40e1e1 28572de e40e1e1 28572de e40e1e1 28572de e40e1e1 28572de e40e1e1 c973974 e40e1e1 c973974 e40e1e1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
import streamlit as st
import json
import pandas as pd
import os
# Location of the JSONL file that mirrors the current session's dataset
TMP_DIR = "temp"
TMP_FILE = os.path.join(TMP_DIR, "session_dataset.jsonl")

# --- Page chrome (set_page_config stays the first Streamlit call) ---
st.set_page_config(page_title="Dataset Builder", layout="wide")
st.title("π JSONL Dataset Editor")

# --- Make sure the temp directory is there before anything reads or writes it ---
os.makedirs(TMP_DIR, exist_ok=True)
# --- Helper: get all unique fields from records ---
def get_all_fields(data):
    """Return the alphabetically sorted list of every key used by any record.

    *data* is an iterable of dict-like records; keys that occur in several
    records are reported once.
    """
    return sorted({key for record in data for key in record.keys()})
# --- Load session data from temp file if exists ---
# Runs once per browser session: restores the dataset auto-saved by an earlier
# run. Blank lines, lines that are not valid JSON, and lines that are not JSON
# objects are skipped (with a warning) so that a damaged temp file cannot
# prevent the app from starting.
if "data" not in st.session_state:
    restored_records = []
    skipped_lines = 0
    if os.path.exists(TMP_FILE):
        with open(TMP_FILE, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    skipped_lines += 1
                    continue
                # Only JSON objects can become table rows.
                if isinstance(record, dict):
                    restored_records.append(record)
                else:
                    skipped_lines += 1
    st.session_state.data = restored_records
    # get_all_fields([]) is [], which matches the "no temp file" case.
    st.session_state.all_fields = get_all_fields(restored_records)
    if skipped_lines:
        st.warning(
            f"Skipped {skipped_lines} unreadable line(s) while restoring {TMP_FILE}."
        )
# --- Upload JSONL File ---
# The uploader keeps returning the same file on every script rerun, so the
# file is parsed only the first time it is seen. Re-parsing on each rerun
# would overwrite st.session_state.data and throw away every edit made since
# the upload.
uploaded_file = st.file_uploader("Upload a JSONL file", type=["jsonl"])
if uploaded_file is None:
    # Uploader emptied: forget the marker so the same file can be loaded again.
    st.session_state.pop("loaded_upload", None)
else:
    upload_marker = (
        uploaded_file.name,
        uploaded_file.size,
        getattr(uploaded_file, "file_id", None),  # not present on every Streamlit version
    )
    if st.session_state.get("loaded_upload") != upload_marker:
        try:
            # utf-8-sig drops a leading BOM, which json.loads would reject.
            content = uploaded_file.read().decode("utf-8-sig")
            records = [
                json.loads(line) for line in content.splitlines() if line.strip()
            ]
        except (UnicodeDecodeError, json.JSONDecodeError) as exc:
            st.error(f"Could not parse the uploaded file: {exc}")
        else:
            if all(isinstance(record, dict) for record in records):
                st.session_state.loaded_upload = upload_marker
                st.session_state.data = records
                st.session_state.all_fields = get_all_fields(records)
                # Save to temp
                with open(TMP_FILE, "w", encoding="utf-8") as f:
                    for item in records:
                        f.write(json.dumps(item, ensure_ascii=False) + "\n")
                st.success(
                    f"Loaded {len(st.session_state.data)} records with fields: {st.session_state.all_fields}"
                )
            else:
                st.error("Every non-empty line of the file must be a JSON object.")
# Nothing loaded and no schema known yet: seed the editor with default fields
if not (st.session_state.data or st.session_state.all_fields):
    st.session_state.all_fields = ["context", "question", "answer"]
# --- Edit Existing Records ---
st.markdown("### ✏️ Edit Records")

# One column per known field, in the order kept in session state; fields a
# record lacks show up as missing cells.
df = pd.DataFrame(st.session_state.data).reindex(
    columns=st.session_state.all_fields
)

# Fields that hold free text and get a wide text column in the editor.
text_fields = [
    field
    for field in st.session_state.all_fields
    if field.lower() in ["context", "answer", "question"]
]

# Text columns must contain strings or the TextColumn config raises a
# StreamlitAPIException. Missing cells become "" — a plain astype(str) would
# turn them into the literal text "nan"/"None", which would then be saved.
for field in text_fields:
    missing = df[field].isna()
    df[field] = df[field].astype(str).mask(missing, "")

# Only the text fields get an explicit config. A None entry would HIDE the
# column in st.data_editor, so all other fields are simply left out and keep
# their default rendering.
column_configs = {
    field: st.column_config.TextColumn(label=field, width="large")
    for field in text_fields
}

# --- Use st.data_editor for editable table ---
edited_df = st.data_editor(
    df,
    use_container_width=True,
    num_rows="dynamic",
    column_config=column_configs,
)

# Auto-save logic: detect changes and persist
if not edited_df.equals(df):
    st.session_state.data = edited_df.fillna("").to_dict(orient="records")
    # Save to temp file
    with open(TMP_FILE, "w", encoding="utf-8") as f:
        for item in st.session_state.data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    st.toast("✅ Auto-saved!", icon="💾")
    # Rerun so the table is rebuilt from the data that was just saved.
    st.rerun()
# --- Add New Entry ---
st.markdown("### ➕ Add New Entry")

# One text area per current field. clear_on_submit empties the inputs after a
# successful submit so the same text is not left behind and added twice.
with st.form("new_entry_form", clear_on_submit=True):
    new_record = {
        field: st.text_area(f"{field}", key=f"input_{field}")
        for field in st.session_state.all_fields
    }
    submitted = st.form_submit_button("Add Entry")

if submitted:
    st.session_state.data.append(new_record)
    # Save to temp
    with open(TMP_FILE, "w", encoding="utf-8") as f:
        for item in st.session_state.data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    st.success("✅ New entry added!")
    # Rerun so the editor table above picks up the new row.
    st.rerun()
# Option to add a new field
with st.expander("➕ Add New Field"):
    new_field = st.text_input("New field name", key="new_field_name")
    if st.button("Add Field"):
        # Surrounding whitespace would create a field that looks identical to
        # an existing one (or has an empty-looking name), so strip it first.
        field_name = new_field.strip()
        if not field_name:
            st.warning("Please enter a field name.")
        elif field_name in st.session_state.all_fields:
            st.warning(f"Field '{field_name}' already exists.")
        else:
            st.session_state.all_fields.append(field_name)
            st.success(f"✅ Field '{field_name}' added!")
            # Rerun so the table and the entry form show the new field.
            st.rerun()
# --- Export JSONL ---
st.markdown("### 📤 Export Dataset")

# Let user define a custom export path
# NOTE(review): the path is used as typed, so anyone who can reach the app can
# write wherever the server process has permission — confirm this is intended.
export_path = st.text_input(
    "Custom save path (e.g., ./exports/my_dataset.jsonl)",
    value="./exports/exported_dataset.jsonl",
)

col1, col2 = st.columns(2)

# --- Export Button ---
with col1:
    if st.button("π Export JSONL"):
        target_path = export_path.strip()
        if not target_path:
            st.error("Please enter a file path to export to.")
        else:
            # A bare file name has no directory part; os.makedirs("") would raise.
            export_dir = os.path.dirname(target_path)
            if export_dir:
                os.makedirs(export_dir, exist_ok=True)

            # Write to custom path
            exported_content = "".join(
                json.dumps(row, ensure_ascii=False) + "\n"
                for row in st.session_state.data
            )
            with open(target_path, "w", encoding="utf-8") as f_out:
                f_out.write(exported_content)

            # Reset session and temp
            if os.path.exists(TMP_FILE):
                os.remove(TMP_FILE)
            st.session_state.clear()

            # Anything rendered before st.rerun() is discarded, so the result is
            # stored (after the clear) and shown by the code below on the next run.
            st.session_state.last_export = {
                "path": target_path,
                "content": exported_content,
            }
            st.rerun()

    # Outcome of the most recent export in this session, if there was one.
    last_export = st.session_state.get("last_export")
    if last_export:
        st.success(f"✅ Dataset saved to {last_export['path']}")
        st.download_button(
            "⬇️ Download JSONL",
            last_export["content"],
            file_name=os.path.basename(last_export["path"]),
            mime="application/json",
        )
        st.success("🧹 Temporary session cleared. You're starting fresh!")
# --- Download Temp Only Button ---
# Offers the auto-saved session file as a download without exporting or
# resetting anything.
with col2:
    try:
        # Open directly instead of checking existence first: the file can be
        # removed between the check and the open.
        with open(TMP_FILE, "r", encoding="utf-8") as f_tmp:
            tmp_content = f_tmp.read()
    except FileNotFoundError:
        st.warning("⚠️ No temp file found to download.")
    else:
        st.download_button(
            "⬇️ Download Temp File",
            tmp_content,
            file_name="session_dataset.jsonl",
            mime="application/json",
        )
|