# Streamlit JSONL dataset editor.
import streamlit as st
import json
import pandas as pd
import os
# App-wide Streamlit page configuration; must run before any other st.* call.
st.set_page_config(page_title="Dataset Builder", layout="wide")
st.title("π JSONL Dataset Editor")
# The working copy of the dataset is persisted between reruns in a temp JSONL
# file so edits survive page reloads until exported or cleared.
TMP_DIR = "temp"
TMP_FILE = os.path.join(TMP_DIR, "session_dataset.jsonl")
os.makedirs(TMP_DIR, exist_ok=True)
# --- Helpers ---
def get_all_fields(data):
    """Return a sorted list of every key appearing in any record of *data*."""
    return sorted({key for record in data for key in record})
# --- Session Initialization ---
# Session keys: "data" (list of record dicts), "all_fields" (sorted field
# names), "prev_data" (snapshot used to detect edits in the data editor).
if "data" not in st.session_state:
    st.session_state.data = []
if "all_fields" not in st.session_state:
    st.session_state.all_fields = []
if "prev_data" not in st.session_state:
    st.session_state.prev_data = []
# --- Load from temp if needed ---
# On a fresh run, restore the previous session's dataset from the temp file.
if not st.session_state.data and os.path.exists(TMP_FILE):
    with open(TMP_FILE, "r", encoding="utf-8") as f:
        # Skip blank lines so a trailing newline or empty line in the file
        # does not raise json.JSONDecodeError.
        st.session_state.data = [json.loads(line) for line in f if line.strip()]
    st.session_state.all_fields = get_all_fields(st.session_state.data)
    st.session_state.prev_data = st.session_state.data.copy()
# --- Upload JSONL ---
uploaded_file = st.file_uploader("Upload a JSONL file", type=["jsonl"])
if uploaded_file:
    content = uploaded_file.read().decode("utf-8")
    # Parse one JSON object per line; skip blank lines so trailing newlines
    # or accidental empty lines don't raise json.JSONDecodeError.
    st.session_state.data = [
        json.loads(line) for line in content.splitlines() if line.strip()
    ]
    st.session_state.all_fields = get_all_fields(st.session_state.data)
    st.session_state.prev_data = st.session_state.data.copy()
    # Persist immediately so the dataset survives the rerun below.
    with open(TMP_FILE, "w", encoding="utf-8") as f:
        for item in st.session_state.data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    st.rerun()
# --- Fallback fields if none ---
# Default schema used when no dataset has been loaded or uploaded yet.
st.session_state.all_fields = st.session_state.all_fields or [
    "context",
    "question",
    "answer",
]
# --- Edit Records ---
st.markdown("### βοΈ Edit Records")
df = pd.DataFrame(st.session_state.data)
# Ensure every known field exists as a column (missing ones come back NaN).
df = df.reindex(columns=st.session_state.all_fields)
# Ensure fields are strings for editor. Fill NaN with "" first, otherwise
# astype(str) renders missing values as the literal string "nan" in the grid.
for field in st.session_state.all_fields:
    if field.lower() in ["context", "question", "answer"]:
        df[field] = df[field].fillna("").astype(str)
# TextAreas for longer fields; other columns keep the editor's default config.
column_configs = {
    field: (
        st.column_config.TextColumn(label=field, width="large")
        if field.lower() in ["context", "question", "answer"]
        else None
    )
    for field in st.session_state.all_fields
}
edited_df = st.data_editor(
    df,
    use_container_width=True,
    num_rows="dynamic",
    column_config=column_configs,
    key="editable_table",
)
# Save if changed: compare the edited table against the last-seen snapshot
# and overwrite the temp file whenever the user changed anything.
new_data = edited_df.fillna("").to_dict(orient="records")
if new_data != st.session_state.prev_data:
    st.session_state.data = new_data
    st.session_state.prev_data = new_data.copy()
    with open(TMP_FILE, "w", encoding="utf-8") as f:
        for item in new_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    # Fix: the original literal was split across two lines by a stray
    # newline (a SyntaxError); rejoined into one string.
    st.toast("β Auto-saved!", icon="πΎ")
# --- Add New Entry ---
st.markdown("### β Add New Entry")
with st.form("new_entry_form"):
    # One text area per known field; keys are namespaced to avoid widget
    # collisions with other inputs on the page.
    new_record = {}
    for field in st.session_state.all_fields:
        new_record[field] = st.text_area(f"{field}", key=f"input_{field}")
    submitted = st.form_submit_button("Add Entry")
    if submitted:
        st.session_state.data.append(new_record)
        st.session_state.prev_data = st.session_state.data.copy()
        # Persist the full dataset so the new row survives the rerun.
        with open(TMP_FILE, "w", encoding="utf-8") as f:
            for item in st.session_state.data:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")
        # Fix: the original literal was split across two lines by a stray
        # newline (a SyntaxError); rejoined into one string.
        st.success("β New entry added!")
        st.rerun()
# --- Add New Field ---
# Adds a column to the schema; existing records simply lack the key until
# the user fills it in via the editor.
with st.expander("β Add New Field"):
    new_field = st.text_input("New field name", key="new_field_name")
    clicked = st.button("Add Field")
    known_fields = st.session_state.all_fields
    if clicked and new_field and new_field not in known_fields:
        known_fields.append(new_field)
        st.rerun()
# --- Export Section ---
st.markdown("### π€ Export Dataset")
# Destination for the final JSONL export; the directory is created on demand.
export_path = st.text_input("Save path", value="./exports/exported_dataset.jsonl")
col1, col2, col3 = st.columns(3)
# Export: write the dataset to export_path, offer it for download, then
# discard the working session (temp file + session state).
with col1:
    if st.button("π Export JSONL"):
        # A bare filename has an empty dirname; fall back to "." so
        # os.makedirs doesn't raise FileNotFoundError on makedirs("").
        export_dir = os.path.dirname(export_path) or "."
        os.makedirs(export_dir, exist_ok=True)
        # Serialize once and reuse for both the file on disk and the
        # download button (avoids re-reading the file we just wrote).
        content = "".join(
            json.dumps(row, ensure_ascii=False) + "\n"
            for row in st.session_state.data
        )
        with open(export_path, "w", encoding="utf-8") as f:
            f.write(content)
        st.download_button("β¬οΈ Download JSONL", content, file_name=os.path.basename(export_path))
        if os.path.exists(TMP_FILE):
            os.remove(TMP_FILE)
        st.session_state.clear()
        st.rerun()
# Download temp: offer the in-progress session file, or warn when absent.
with col2:
    if not os.path.exists(TMP_FILE):
        st.warning("β οΈ No temp file found.")
    else:
        with open(TMP_FILE, "r", encoding="utf-8") as fh:
            session_blob = fh.read()
        st.download_button(
            "β¬οΈ Download Temp File",
            session_blob,
            file_name="session_dataset.jsonl",
            mime="application/json",
        )
# Clear session: drop the temp file and all session state, then rerun fresh.
with col3:
    clear_clicked = st.button("ποΈ Clear Session")
    if clear_clicked:
        if os.path.exists(TMP_FILE):
            os.remove(TMP_FILE)
        st.session_state.clear()
        st.rerun()