File size: 5,741 Bytes
28572de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import streamlit as st
import json
import pandas as pd
import os

# --- Page setup ---
st.set_page_config(page_title="Dataset Builder", layout="wide")
# The title emoji was stored as mis-decoded UTF-8 bytes; use the real character.
st.title("📚 JSONL Dataset Editor")

# Working copy of the dataset; every save in this script rewrites TMP_FILE.
# The path is relative, so it resolves against the process working directory.
TMP_DIR = "temp"
TMP_FILE = os.path.join(TMP_DIR, "session_dataset.jsonl")

# --- Helper: ensure tmp dir exists ---
# exist_ok=True keeps this safe across Streamlit's repeated script runs.
os.makedirs(TMP_DIR, exist_ok=True)


# --- Helper: collect the distinct field names used across records ---
def get_all_fields(data):
    """Return the sorted list of every key that appears in any record.

    Args:
        data: An iterable of dict-like records (each must provide ``keys()``).

    Returns:
        A new list with each distinct key once, in ascending order. An empty
        iterable yields an empty list.
    """
    distinct_keys = {key for entry in data for key in entry.keys()}
    return sorted(distinct_keys)


# --- Load session data from temp file if exists ---
# Runs only when the session has no dataset yet (first run of a session or
# after the session state was cleared).
if "data" not in st.session_state:
    restored_records = []
    if os.path.exists(TMP_FILE):
        try:
            with open(TMP_FILE, "r", encoding="utf-8") as f:
                # Skip blank lines so a stray newline does not abort the restore.
                restored_records = [json.loads(line) for line in f if line.strip()]
        except (OSError, ValueError) as exc:
            # ValueError covers both invalid JSON and invalid UTF-8. Without
            # this fallback a damaged temp file would crash every start-up
            # until someone deleted it by hand.
            restored_records = []
            st.warning(f"Could not restore the previous session from {TMP_FILE}: {exc}")

    st.session_state.data = restored_records
    # get_all_fields([]) is [], matching the empty start state.
    st.session_state.all_fields = get_all_fields(restored_records)

# --- Upload JSONL File ---
uploaded_file = st.file_uploader("Upload a JSONL file", type=["jsonl"])

if uploaded_file:
    # Streamlit re-executes the script on every interaction and the uploader
    # keeps returning the same file. Ingest each upload only once; otherwise
    # every rerun would replace the session data with the file's original
    # contents and discard entries/fields added since the upload.
    upload_token = getattr(uploaded_file, "file_id", None) or (
        uploaded_file.name,
        uploaded_file.size,
    )

    if st.session_state.get("ingested_upload") != upload_token:
        records = None
        try:
            # utf-8-sig drops a leading BOM, which json.loads would reject.
            content = uploaded_file.getvalue().decode("utf-8-sig")
            # Blank lines anywhere in the file are ignored.
            records = [
                json.loads(line) for line in content.splitlines() if line.strip()
            ]
        except ValueError as exc:
            # Covers invalid JSON and invalid UTF-8.
            st.error(f"Could not parse the uploaded file as JSONL: {exc}")

        if records is not None and not all(isinstance(r, dict) for r in records):
            # The rest of the app treats every record as a mapping of fields.
            st.error("Every line of the JSONL file must be a JSON object.")
            records = None

        if records is not None:
            st.session_state.data = records
            st.session_state.all_fields = get_all_fields(records)
            st.session_state.ingested_upload = upload_token

            # Save to temp
            with open(TMP_FILE, "w", encoding="utf-8") as f:
                f.writelines(
                    json.dumps(item, ensure_ascii=False) + "\n" for item in records
                )

            st.success(
                f"Loaded {len(st.session_state.data)} records with fields: {st.session_state.all_fields}"
            )

# With neither records nor known field names, fall back to a default schema
# so the editor and the entry form still have columns to show.
if not (st.session_state.data or st.session_state.all_fields):
    st.session_state.all_fields = ["context", "question", "answer"]

# --- Edit Existing Records ---
st.markdown("### ✏️ Edit Records")

# One column per known field; records lacking a field get a missing value.
df = pd.DataFrame(st.session_state.data).reindex(columns=st.session_state.all_fields)

# Fields that are edited as wide text columns.
text_fields = [
    field
    for field in st.session_state.all_fields
    if field.lower() in ("context", "answer", "question")
]

# Text columns must hold strings to avoid a StreamlitAPIException. Missing
# values are blanked first, because astype(str) alone would turn them into
# the literal text "nan"/"None" and write that into the dataset.
for field in text_fields:
    df[field] = df[field].fillna("").astype(str)

# Only text fields get an explicit config. Mapping a column to None in
# column_config hides it, so other fields are left out and keep the default
# rendering.
column_configs = {
    field: st.column_config.TextColumn(label=field, width="large")
    for field in text_fields
}

# --- Use st.data_editor for editable table ---
edited_df = st.data_editor(
    df,
    use_container_width=True,
    num_rows="dynamic",
    column_config=column_configs,
)

# --- Save updated data ---
if edited_df is not None:
    updated_records = edited_df.fillna("").to_dict(orient="records")

    # Compare the serialized form of the edited table with what the session
    # already holds. The script reruns on every interaction, so without this
    # check the file would be rewritten and the toast shown on every rerun.
    updated_lines = [
        json.dumps(item, ensure_ascii=False) + "\n" for item in updated_records
    ]
    previous_lines = [
        json.dumps(item, ensure_ascii=False) + "\n" for item in st.session_state.data
    ]

    st.session_state.data = updated_records

    if updated_lines != previous_lines:
        # Save to temp file
        with open(TMP_FILE, "w", encoding="utf-8") as f:
            f.writelines(updated_lines)

        # The icon must be a single emoji; the previous mis-decoded text is
        # not one.
        st.toast("✅ Changes auto-saved!", icon="💾")

# --- Add New Entry ---
st.markdown("### ➕ Add New Entry")

# st.rerun() stops the current run immediately, so a message emitted just
# before it is never displayed. The confirmation is therefore flagged in the
# session state and shown once on the following run.
if st.session_state.pop("entry_added", False):
    st.success("✅ New entry added!")

# Show form with current fields. clear_on_submit empties the inputs after a
# submit so the same entry is not added twice by accident.
with st.form("new_entry_form", clear_on_submit=True):
    new_record = {
        field: st.text_area(f"{field}", key=f"input_{field}")
        for field in st.session_state.all_fields
    }
    submitted = st.form_submit_button("Add Entry")

if submitted:
    if not any(value.strip() for value in new_record.values()):
        # An all-blank submission would only add an empty record.
        st.warning("Fill in at least one field before adding an entry.")
    else:
        st.session_state.data.append(new_record)

        # Save to temp
        with open(TMP_FILE, "w", encoding="utf-8") as f:
            f.writelines(
                json.dumps(item, ensure_ascii=False) + "\n"
                for item in st.session_state.data
            )

        st.session_state.entry_added = True
        # Rerun so the table rendered above picks up the new record.
        st.rerun()

# Option to add a new field
# The confirmation is shown on the run after the field was added, because
# st.rerun() below ends the current run before any message could be displayed.
added_field = st.session_state.pop("field_added", None)
if added_field:
    st.success(f"✅ Field '{added_field}' added!")

with st.expander("➕ Add New Field"):
    new_field = st.text_input("New field name", key="new_field_name")
    if st.button("Add Field"):
        # Ignore surrounding whitespace so " answer " cannot become a second,
        # visually identical field.
        field_name = new_field.strip()
        if not field_name:
            st.warning("Enter a field name first.")
        elif field_name in st.session_state.all_fields:
            st.warning(f"Field '{field_name}' already exists.")
        else:
            st.session_state.all_fields.append(field_name)
            st.session_state.field_added = field_name
            # Rerun so the table and the entry form include the new field.
            st.rerun()


# --- Export JSONL ---
st.markdown("### 📤 Export Dataset")

# Let user define a custom export path
# NOTE(review): the file is written wherever this path points on the machine
# running the app; restrict it if the app is ever exposed to untrusted users.
export_path = st.text_input(
    "Custom save path (e.g., ./exports/my_dataset.jsonl)",
    value="./exports/exported_dataset.jsonl",
)

col1, col2 = st.columns(2)

# --- Export Button ---
with col1:
    if st.button("📁 Export JSONL"):
        exported_content = "".join(
            json.dumps(row, ensure_ascii=False) + "\n"
            for row in st.session_state.data
        )

        export_dir = os.path.dirname(export_path)
        try:
            # A bare file name has no directory part; os.makedirs("") raises.
            if export_dir:
                os.makedirs(export_dir, exist_ok=True)

            # Write to custom path
            with open(export_path, "w", encoding="utf-8") as f_out:
                f_out.write(exported_content)
        except OSError as exc:
            # Keep the session intact when nothing could be written.
            st.error(f"Could not write {export_path}: {exc}")
        else:
            # Reset session and temp
            if os.path.exists(TMP_FILE):
                os.remove(TMP_FILE)
            st.session_state.clear()

            # st.rerun() ends this run at once, so anything rendered here
            # would never be shown. Hand the result to the next run instead.
            st.session_state.last_export = {
                "path": export_path,
                "content": exported_content,
                "announce": True,
            }
            st.rerun()

    last_export = st.session_state.get("last_export")
    if last_export:
        # The status messages are shown once; the download stays available
        # until the next export.
        if last_export.pop("announce", False):
            st.success(f"✅ Dataset saved to {last_export['path']}")
            st.success("🧹 Temporary session cleared. You're starting fresh!")

        st.download_button(
            "⬇️ Download JSONL",
            last_export["content"],
            file_name=os.path.basename(last_export["path"]),
            mime="application/json",
        )

# --- Download Temp Only Button ---
# Offers the current working copy (TMP_FILE) as a download, or says so when
# no working copy exists on disk.
with col2:
    if not os.path.exists(TMP_FILE):
        st.warning("⚠️ No temp file found to download.")
    else:
        with open(TMP_FILE, "r", encoding="utf-8") as temp_handle:
            temp_payload = temp_handle.read()

        st.download_button(
            label="⬇️ Download Temp File",
            data=temp_payload,
            file_name="session_dataset.jsonl",
            mime="application/json",
        )