File size: 5,666 Bytes
28572de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e807a06
 
 
 
 
 
 
 
28572de
 
 
 
 
 
e807a06
28572de
 
 
e807a06
28572de
 
 
 
 
 
 
 
e807a06
28572de
 
 
 
 
 
 
 
 
e807a06
28572de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e807a06
28572de
 
e807a06
f4fbdf0
e807a06
f4fbdf0
e807a06
b69fe64
f4fbdf0
e807a06
f4fbdf0
 
e807a06
28572de
 
 
 
 
 
 
 
 
 
 
e807a06
28572de
 
 
 
 
 
 
 
e807a06
28572de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e807a06
28572de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import streamlit as st
import json
import pandas as pd
import os

# Scratch location where the working dataset is persisted between reruns.
TMP_DIR = "temp"
TMP_FILE = os.path.join(TMP_DIR, "session_dataset.jsonl")
os.makedirs(TMP_DIR, exist_ok=True)

# Page chrome: wide layout and the main heading. set_page_config stays the
# first Streamlit call in the script.
st.set_page_config(page_title="Dataset Builder", layout="wide")
st.title("πŸ“š JSONL Dataset Editor")


def get_all_fields(data):
    """Return the sorted union of keys found across every record in ``data``.

    ``data`` is an iterable of records that expose ``.keys()`` (e.g. dicts).
    An empty iterable yields an empty list.
    """
    return sorted({key for record in data for key in record.keys()})


# --- Clear session handler ---
# Wipes the in-memory session state and the on-disk working copy, then
# restarts the script so the page is rebuilt from an empty dataset.
clear_requested = st.button("🧹 Clear Session")
if clear_requested:
    st.session_state.clear()
    temp_file_present = os.path.exists(TMP_FILE)
    if temp_file_present:
        os.remove(TMP_FILE)
    st.success("Session and temp file cleared!")
    st.rerun()

# --- Load session data from temp file if exists ---
# Runs only when "data" is not yet in the session state (first run of a
# session, or after the state was cleared). Restores the records persisted in
# TMP_FILE, or starts with an empty dataset when there is no such file.
if "data" not in st.session_state:
    restored_records = []
    if os.path.exists(TMP_FILE):
        with open(TMP_FILE, "r", encoding="utf-8") as f:
            # Skip blank lines so a stray empty line in the temp file does not
            # abort startup with a JSONDecodeError.
            restored_records = [json.loads(line) for line in f if line.strip()]

    st.session_state.data = restored_records
    # get_all_fields returns [] for an empty dataset.
    st.session_state.all_fields = get_all_fields(restored_records)
    # Snapshot used later to detect edits made in the table.
    st.session_state.prev_data = restored_records.copy()

# --- Upload JSONL File ---
uploaded_file = st.file_uploader("Upload a JSONL file", type=["jsonl"])

if uploaded_file is None:
    # Nothing selected (or the file was removed): forget the marker so the
    # same file can be uploaded again later.
    st.session_state.pop("loaded_upload", None)
else:
    upload_marker = (uploaded_file.name, uploaded_file.size)

    # Streamlit re-executes the script on every interaction while the uploader
    # still holds the file. Ingest each upload only once; re-reading it on
    # every rerun would overwrite the edits made since it was loaded.
    if st.session_state.get("loaded_upload") != upload_marker:
        uploaded_records = None
        try:
            # getvalue() does not depend on the stream position; "utf-8-sig"
            # also accepts files that start with a BOM.
            content = uploaded_file.getvalue().decode("utf-8-sig")
            # Split on "\n" only: str.splitlines() also breaks on characters
            # such as U+2028, which may appear unescaped inside JSON strings.
            # A trailing "\r" is plain whitespace to json.loads.
            uploaded_records = [
                json.loads(line) for line in content.split("\n") if line.strip()
            ]
        except (UnicodeDecodeError, json.JSONDecodeError) as exc:
            st.error(f"Could not read the uploaded file as UTF-8 JSONL: {exc}")

        if uploaded_records is not None and not all(
            isinstance(record, dict) for record in uploaded_records
        ):
            st.error("Every line of the uploaded file must be a JSON object.")
            uploaded_records = None

        if uploaded_records is not None:
            st.session_state.data = uploaded_records
            st.session_state.all_fields = get_all_fields(st.session_state.data)
            st.session_state.prev_data = st.session_state.data.copy()
            st.session_state.loaded_upload = upload_marker

            # Persist the freshly loaded dataset as the working copy.
            with open(TMP_FILE, "w", encoding="utf-8") as f:
                for item in st.session_state.data:
                    f.write(json.dumps(item, ensure_ascii=False) + "\n")

            st.success(
                f"Loaded {len(st.session_state.data)} records with fields: {st.session_state.all_fields}"
            )

# Fallback default fields
# With neither records nor known fields, seed the schema with the default
# columns so the editor and the entry form have something to show.
if not (st.session_state.data or st.session_state.all_fields):
    st.session_state.all_fields = ["context", "question", "answer"]

# --- Edit Existing Records ---
st.markdown("### ✏️ Edit Records")

# Build the editor frame with exactly one column per known field; fields that
# no record carries yet become empty columns.
df = pd.DataFrame(st.session_state.data)
df = df.reindex(columns=st.session_state.all_fields)

# Fields shown as wide free-text columns (matched case-insensitively).
text_fields = [
    field
    for field in st.session_state.all_fields
    if field.lower() in ("context", "answer", "question")
]

for field in text_fields:
    # Blank out missing values before casting: astype(str) alone would turn
    # them into the literal strings "nan" / "None".
    df[field] = df[field].fillna("").astype(str)

# Only the text fields get an explicit config. In column_config a value of
# None hides the column, so all other fields are simply left out of the
# mapping and keep their default, visible rendering.
column_configs = {
    field: st.column_config.TextColumn(label=field, width="large")
    for field in text_fields
}

edited_df = st.data_editor(
    df,
    use_container_width=True,
    num_rows="dynamic",
    column_config=column_configs,
    key="editable_table",
)

# --- Auto-save if any changes ---
# Missing cells are normalised to "" and the result is compared with the
# snapshot taken at the last save; any difference is written to the temp file.
new_data = edited_df.fillna("").to_dict(orient="records")
has_changes = new_data != st.session_state.prev_data
if has_changes:
    st.session_state.data = new_data
    st.session_state.prev_data = list(new_data)

    with open(TMP_FILE, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n"
            for item in st.session_state.data
        )

    st.toast("βœ… Changes auto-saved!", icon="πŸ’Ύ")

# --- Add New Entry ---
st.markdown("### βž• Add New Entry")

with st.form("new_entry_form"):
    # One text area per known field; the typed values form the new record.
    new_record = {
        field: st.text_area(f"{field}", key=f"input_{field}")
        for field in st.session_state.all_fields
    }
    submitted = st.form_submit_button("Add Entry")
    if submitted:
        st.session_state.data.append(new_record)
        # Refresh the snapshot so the addition is not seen as a table edit.
        st.session_state.prev_data = list(st.session_state.data)

        with open(TMP_FILE, "w", encoding="utf-8") as f:
            f.writelines(
                json.dumps(item, ensure_ascii=False) + "\n"
                for item in st.session_state.data
            )

        st.success("βœ… New entry added!")
        st.rerun()

# --- Add New Field ---
with st.expander("βž• Add New Field"):
    new_field = st.text_input("New field name", key="new_field_name")
    add_clicked = st.button("Add Field")
    # A field is added only when the name is non-empty and not already known.
    is_new_name = bool(new_field) and new_field not in st.session_state.all_fields
    if add_clicked and is_new_name:
        st.session_state.all_fields.append(new_field)
        st.success(f"βœ… Field '{new_field}' added!")
        st.rerun()

# --- Export JSONL ---
st.markdown("### πŸ“€ Export Dataset")

# NOTE(review): the path is typed by the user and written to as-is on the
# machine running the app; restrict it if the app is ever exposed to others.
export_path = st.text_input(
    "Custom save path (e.g., ./exports/my_dataset.jsonl)",
    value="./exports/exported_dataset.jsonl",
)

col1, col2 = st.columns(2)

with col1:
    if st.button("πŸ“ Export JSONL"):
        if not export_path.strip():
            st.error("Please enter a file path to export to.")
        else:
            # A bare file name has an empty directory part, and
            # os.makedirs("") raises, so only create a directory when the
            # path actually names one.
            export_dir = os.path.dirname(export_path)
            if export_dir:
                os.makedirs(export_dir, exist_ok=True)

            exported_content = "".join(
                json.dumps(row, ensure_ascii=False) + "\n"
                for row in st.session_state.data
            )
            with open(export_path, "w", encoding="utf-8") as f_out:
                f_out.write(exported_content)

            # Exporting ends the working session: drop the temp copy and the
            # in-memory state.
            if os.path.exists(TMP_FILE):
                os.remove(TMP_FILE)
            st.session_state.clear()

            # st.rerun() stops this run immediately, so anything rendered
            # here would never be displayed. Stash the result in the fresh
            # session state and render it after the rerun instead.
            st.session_state.last_export = {
                "path": export_path,
                "content": exported_content,
            }
            st.session_state.export_just_finished = True
            st.rerun()

    # Shown on every run after an export, until the session is cleared or
    # another export replaces it, so the download button stays clickable.
    last_export = st.session_state.get("last_export")
    if last_export:
        saved_path = last_export["path"]
        st.success(f"βœ… Dataset saved to {saved_path}")

        st.download_button(
            "⬇️ Download JSONL",
            last_export["content"],
            file_name=os.path.basename(saved_path),
            mime="application/json",
        )

        # Announce the reset once, right after the export.
        if st.session_state.pop("export_just_finished", False):
            st.success("🧹 Temporary session cleared. You're starting fresh!")

with col2:
    # Offer the on-disk working copy for download when it exists; otherwise
    # tell the user there is nothing to download.
    if not os.path.exists(TMP_FILE):
        st.warning("⚠️ No temp file found to download.")
    else:
        with open(TMP_FILE, "r", encoding="utf-8") as f_tmp:
            tmp_content = f_tmp.read()

        st.download_button(
            "⬇️ Download Temp File",
            tmp_content,
            file_name="session_dataset.jsonl",
            mime="application/json",
        )