File size: 5,250 Bytes
28572de
 
 
 
 
 
 
 
 
 
 
 
fc8b17b
 
 
 
 
 
28572de
fc8b17b
28572de
fc8b17b
 
 
 
 
 
 
 
 
 
c973974
 
28572de
fc8b17b
28572de
 
 
 
fc8b17b
e807a06
28572de
 
 
fc8b17b
28572de
fc8b17b
 
28572de
 
fc8b17b
28572de
 
 
 
fc8b17b
28572de
c973974
28572de
 
fc8b17b
28572de
 
 
c973974
28572de
 
 
 
 
fc8b17b
 
 
 
 
 
 
f4fbdf0
fc8b17b
 
 
 
 
 
 
 
 
28572de
 
 
 
 
 
 
 
 
 
e807a06
28572de
 
 
 
 
 
e807a06
28572de
 
 
 
 
 
 
c973974
28572de
fc8b17b
28572de
c973974
28572de
fc8b17b
28572de
 
e807a06
c973974
28572de
c973974
 
 
fc8b17b
28572de
 
 
 
 
fc8b17b
28572de
 
c973974
 
28572de
 
 
 
 
 
 
fc8b17b
c973974
fc8b17b
c973974
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import streamlit as st
import json
import pandas as pd
import os

# Page chrome: wide layout so the record editor can use the full browser width.
st.set_page_config(page_title="Dataset Builder", layout="wide")
# The leading emoji had been stored as mis-decoded UTF-8 bytes; restored here.
st.title("📚 JSONL Dataset Editor")

# Working copy of the dataset. It is rewritten whenever the data changes and
# read back when a session starts with no records. The path is relative to
# the current working directory of the process.
TMP_DIR = "temp"
TMP_FILE = os.path.join(TMP_DIR, "session_dataset.jsonl")
os.makedirs(TMP_DIR, exist_ok=True)

# --- Helpers ---
def get_all_fields(data):
    """Return the sorted union of the keys used by the records in *data*.

    Args:
        data: Iterable of mapping-like records (each must expose ``keys()``).

    Returns:
        A sorted list holding every distinct key; empty when *data* is empty.
    """
    return sorted({key for record in data for key in record.keys()})

# --- Session Initialization ---
# Seed each session-state slot with its own empty list the first time the
# script runs; slots that already exist are left untouched.
for _slot in ("data", "all_fields", "prev_data"):
    if _slot not in st.session_state:
        st.session_state[_slot] = []

# --- Load from temp if needed ---
# Restore the last auto-saved dataset when the session holds no records yet.
if not st.session_state.data and os.path.exists(TMP_FILE):
    try:
        with open(TMP_FILE, "r", encoding="utf-8") as f:
            # Skip blank lines: json.loads() rejects them, and a stray empty
            # line should not make the whole file unreadable.
            restored = [json.loads(line) for line in f if line.strip()]
    except json.JSONDecodeError as exc:
        # A corrupt temp file must not crash every run of the app; report it
        # and continue with the (empty) session data.
        st.error(f"Could not restore {TMP_FILE}: {exc}")
    else:
        st.session_state.data = restored
        st.session_state.all_fields = get_all_fields(restored)
        st.session_state.prev_data = restored.copy()

# --- Upload JSONL ---
uploaded_file = st.file_uploader("Upload a JSONL file", type=["jsonl"])
if uploaded_file is None:
    # Uploader is empty: forget the last import so the same file can be
    # imported again later.
    if "imported_upload" in st.session_state:
        del st.session_state["imported_upload"]
else:
    # The uploader keeps returning the same file on every rerun. Without this
    # marker the import below would run (and call st.rerun()) on every pass,
    # looping forever and overwriting any edits made since the upload.
    upload_id = (uploaded_file.name, uploaded_file.size)
    already_imported = (
        "imported_upload" in st.session_state
        and st.session_state["imported_upload"] == upload_id
    )
    if not already_imported:
        try:
            # "utf-8-sig" drops a leading BOM that would otherwise break the
            # first record.
            content = uploaded_file.read().decode("utf-8-sig")
            # Split on "\n" only: str.splitlines() also breaks on characters
            # such as U+2028 that may legally appear inside JSON strings.
            # Blank lines are skipped; a trailing "\r" is whitespace to
            # json.loads().
            records = [
                json.loads(line) for line in content.split("\n") if line.strip()
            ]
        except (UnicodeDecodeError, json.JSONDecodeError) as exc:
            st.error(f"Could not parse {uploaded_file.name}: {exc}")
        else:
            st.session_state.data = records
            st.session_state.all_fields = get_all_fields(records)
            st.session_state.prev_data = records.copy()
            with open(TMP_FILE, "w", encoding="utf-8") as f:
                for item in records:
                    f.write(json.dumps(item, ensure_ascii=False) + "\n")
            st.session_state["imported_upload"] = upload_id
            st.rerun()

# --- Fallback fields if none ---
# With no known columns yet, start from the default three-field schema.
_DEFAULT_FIELDS = ("context", "question", "answer")
if not st.session_state.all_fields:
    st.session_state.all_fields = list(_DEFAULT_FIELDS)

# --- Edit Records ---
st.markdown("### ✏️ Edit Records")

# Fields that get a wide text column in the editor (matched case-insensitively).
long_text_fields = [
    field
    for field in st.session_state.all_fields
    if field.lower() in ("context", "question", "answer")
]

# One row per record, one column per known field (absent keys become NaN).
df = pd.DataFrame(st.session_state.data)
df = df.reindex(columns=st.session_state.all_fields)

# Text columns must hold strings for the editor. Missing values are filled
# first so they appear as empty cells instead of the literal text "nan"/"None".
for field in long_text_fields:
    df[field] = df[field].fillna("").astype(str)

# Only the long-text fields get an explicit config. Mapping a column to None
# in column_config hides that column, so every other field is left out of the
# mapping and keeps its default rendering.
column_configs = {
    field: st.column_config.TextColumn(label=field, width="large")
    for field in long_text_fields
}

edited_df = st.data_editor(
    df,
    use_container_width=True,
    num_rows="dynamic",
    column_config=column_configs,
    key="editable_table",
)

# Save if changed: compare the editor output with the last persisted snapshot
# and rewrite the temp file only when they differ.
new_data = edited_df.fillna("").to_dict(orient="records")
if new_data != st.session_state.prev_data:
    st.session_state.data = new_data
    st.session_state.prev_data = new_data.copy()
    with open(TMP_FILE, "w", encoding="utf-8") as f:
        for item in new_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    # The message and icon had been stored as mis-decoded UTF-8 bytes;
    # restored so that the icon is a real emoji again.
    st.toast("✅ Auto-saved!", icon="💾")

# --- Add New Entry ---
st.markdown("### ➕ Add New Entry")
# clear_on_submit resets the inputs once the form is submitted, so the text of
# the previous entry is not left behind and accidentally added twice.
with st.form("new_entry_form", clear_on_submit=True):
    # One multi-line input per known field; the values form the new record.
    new_record = {}
    for field in st.session_state.all_fields:
        new_record[field] = st.text_area(f"{field}", key=f"input_{field}")
    submitted = st.form_submit_button("Add Entry")
    if submitted:
        if not any(value.strip() for value in new_record.values()):
            # Do not append a record whose every field is blank.
            st.warning("Nothing to add: every field is empty.")
        else:
            st.session_state.data.append(new_record)
            st.session_state.prev_data = st.session_state.data.copy()
            with open(TMP_FILE, "w", encoding="utf-8") as f:
                for item in st.session_state.data:
                    f.write(json.dumps(item, ensure_ascii=False) + "\n")
            # NOTE(review): the rerun below probably replaces this message
            # before it can be read — confirm whether a toast is preferable.
            st.success("✅ New entry added!")
            # Rerun so the editor table above is rebuilt with the new record.
            st.rerun()

# --- Add New Field ---
with st.expander("➕ Add New Field"):
    new_field = st.text_input("New field name", key="new_field_name")
    if st.button("Add Field"):
        # Strip surrounding whitespace so "name " and "name" cannot become two
        # different columns and a whitespace-only name is rejected.
        field_name = new_field.strip()
        if not field_name:
            st.warning("Enter a field name first.")
        elif field_name in st.session_state.all_fields:
            st.warning(f"Field '{field_name}' already exists.")
        else:
            st.session_state.all_fields.append(field_name)
            # Rerun so the editor and the entry form pick up the new column.
            st.rerun()

# --- Export Section ---
st.markdown("### 📤 Export Dataset")
export_path = st.text_input("Save path", value="./exports/exported_dataset.jsonl")

col1, col2, col3 = st.columns(3)

# Export: write the dataset to `export_path`, drop the temp file, reset the
# session, and offer the exported content for download.
with col1:
    if st.button("📁 Export JSONL"):
        target_path = export_path.strip()
        if not target_path:
            st.error("Please enter a save path.")
        else:
            # A bare file name has an empty dirname, and os.makedirs("")
            # raises, so only create the directory when there is one.
            export_dir = os.path.dirname(target_path)
            if export_dir:
                os.makedirs(export_dir, exist_ok=True)
            # Serialise once and reuse the text for both the file and the
            # download instead of reading the file back.
            content = "".join(
                json.dumps(row, ensure_ascii=False) + "\n"
                for row in st.session_state.data
            )
            with open(target_path, "w", encoding="utf-8") as f:
                f.write(content)
            if os.path.exists(TMP_FILE):
                os.remove(TMP_FILE)
            st.session_state.clear()
            # Stored after the clear so it survives the rerun: a download
            # button created inside this click branch would be discarded by
            # the rerun before the user could press it.
            st.session_state["last_export"] = {
                "content": content,
                "file_name": os.path.basename(target_path),
            }
            st.rerun()

    # Rendered on every run while an export is remembered in the session.
    if "last_export" in st.session_state:
        last_export = st.session_state["last_export"]
        st.download_button(
            "⬇️ Download JSONL",
            last_export["content"],
            file_name=last_export["file_name"],
        )

# Download temp: offer the auto-saved working file when one exists on disk,
# otherwise show a warning in its place.
with col2:
    if not os.path.exists(TMP_FILE):
        st.warning("⚠️ No temp file found.")
    else:
        with open(TMP_FILE, encoding="utf-8") as temp_handle:
            temp_payload = temp_handle.read()
        st.download_button(
            label="⬇️ Download Temp File",
            data=temp_payload,
            file_name="session_dataset.jsonl",
            mime="application/json",
        )

# Clear session: delete the temp file, wipe all session state, and rerun so
# the page is rebuilt from scratch.
with col3:
    # The button label had been stored as mis-decoded UTF-8 bytes; restored.
    if st.button("🗑️ Clear Session"):
        # Remove directly and tolerate a missing file, rather than checking
        # for existence first and racing with a concurrent delete.
        try:
            os.remove(TMP_FILE)
        except FileNotFoundError:
            pass
        st.session_state.clear()
        st.rerun()