File size: 5,770 Bytes
28572de
 
 
 
 
 
 
 
 
 
e40e1e1
 
28572de
 
f3a8b9c
e40e1e1
fc8b17b
e40e1e1
 
 
 
f3a8b9c
77d363f
e40e1e1
28572de
e40e1e1
 
 
 
 
 
 
 
 
 
 
 
 
 
fc8b17b
28572de
e40e1e1
 
 
 
f3a8b9c
e40e1e1
 
 
f3a8b9c
e40e1e1
 
 
f3a8b9c
e40e1e1
28572de
e40e1e1
28572de
 
 
e40e1e1
28572de
e40e1e1
 
28572de
e40e1e1
28572de
e40e1e1
 
 
 
 
28572de
 
 
e40e1e1
fc8b17b
 
 
 
 
 
f4fbdf0
e40e1e1
 
 
 
 
 
 
 
 
 
 
 
28572de
e40e1e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28572de
 
 
 
 
e40e1e1
 
28572de
 
e40e1e1
 
28572de
e40e1e1
 
 
 
 
 
 
 
 
28572de
 
e40e1e1
 
 
 
 
28572de
e40e1e1
28572de
e40e1e1
c973974
e40e1e1
 
 
 
 
 
 
 
 
 
 
 
c973974
 
 
e40e1e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import streamlit as st
import json
import pandas as pd
import os

# --- Location of the on-disk copy of the current session ---
TMP_DIR = "temp"
TMP_FILE = os.path.join(TMP_DIR, "session_dataset.jsonl")

# Create the temp directory up front so later writes to TMP_FILE do not fail
# on a missing parent directory.
os.makedirs(TMP_DIR, exist_ok=True)

# --- Page chrome (set_page_config stays the first Streamlit call) ---
st.set_page_config(page_title="Dataset Builder", layout="wide")
st.title("πŸ“š JSONL Dataset Editor")


# --- Helper: get all unique fields from records ---
def get_all_fields(data):
    """Return the sorted list of every key that appears in any record.

    Args:
        data: Iterable of mapping-like records (each must provide ``keys()``).

    Returns:
        A sorted list of the distinct keys; an empty list when ``data`` is
        empty.
    """
    key_views = (record.keys() for record in data)
    return sorted(set().union(*key_views))


# --- Load session data from temp file if exists ---
# Runs only while "data" is absent from session state; after that, session
# state holds the working copy of the dataset.
if "data" not in st.session_state:
    if os.path.exists(TMP_FILE):
        with open(TMP_FILE, "r", encoding="utf-8") as f:
            # Skip blank lines (e.g. a stray trailing newline or a hand-edited
            # file); json.loads would raise JSONDecodeError on them.
            st.session_state.data = [json.loads(line) for line in f if line.strip()]
        st.session_state.all_fields = get_all_fields(st.session_state.data)
    else:
        # No previous session on disk: start with an empty dataset.
        st.session_state.data = []
        st.session_state.all_fields = []

# --- Upload JSONL File ---
uploaded_file = st.file_uploader("Upload a JSONL file", type=["jsonl"])

if uploaded_file:
    # The uploader keeps returning the same file on every rerun. Remember which
    # upload has already been loaded so that later reruns (edits, new entries,
    # auto-save) do not overwrite the session data with the original upload.
    upload_signature = (
        getattr(uploaded_file, "file_id", None),
        uploaded_file.name,
        uploaded_file.size,
    )
    if st.session_state.get("_upload_signature") != upload_signature:
        try:
            # getvalue() is independent of the current read position.
            content = uploaded_file.getvalue().decode("utf-8")
            # Ignore blank lines anywhere in the file, not only at the ends.
            records = [
                json.loads(line) for line in content.splitlines() if line.strip()
            ]
        except (UnicodeDecodeError, json.JSONDecodeError) as exc:
            # Leave the current session data untouched on a bad upload.
            st.error(f"Could not parse the uploaded file as UTF-8 JSONL: {exc}")
        else:
            st.session_state.data = records
            st.session_state.all_fields = get_all_fields(st.session_state.data)
            st.session_state["_upload_signature"] = upload_signature

            # Save to temp
            with open(TMP_FILE, "w", encoding="utf-8") as f:
                for item in st.session_state.data:
                    f.write(json.dumps(item, ensure_ascii=False) + "\n")

            st.success(
                f"Loaded {len(st.session_state.data)} records with fields: {st.session_state.all_fields}"
            )
elif "_upload_signature" in st.session_state:
    # The file was removed from the uploader: forget it so that uploading the
    # same file again reloads it.
    del st.session_state["_upload_signature"]

# Nothing loaded and no fields known yet: fall back to a default schema so the
# editor and the entry form still have columns to show.
if not (st.session_state.data or st.session_state.all_fields):
    st.session_state.all_fields = ["context", "question", "answer"]

# --- Edit Existing Records ---
st.markdown("### ✏️ Edit Records")

# Fields that are treated as free text and rendered as wide text columns.
text_fields = [
    field
    for field in st.session_state.all_fields
    if field.lower() in ["context", "answer", "question"]
]

df = pd.DataFrame(st.session_state.data)
# Reindex so every known field is a column, including fields that no record
# has a value for yet.
df = df.reindex(columns=st.session_state.all_fields)

# Fix: Convert likely text fields to string to avoid StreamlitAPIException.
# Missing values are blanked first; a plain astype(str) would turn them into
# the literal text "nan" / "None" in the editor and in the saved file.
for field in text_fields:
    column = df[field].astype(object)
    df[field] = column.where(column.notna(), "").astype(str)

# Only the text fields get an explicit column config. Other fields are left
# out of the mapping on purpose: a value of None in column_config hides the
# column instead of applying the default rendering.
column_configs = {
    field: st.column_config.TextColumn(label=field, width="large")
    for field in text_fields
}

# --- Use st.data_editor for editable table ---
edited_df = st.data_editor(
    df,
    use_container_width=True,
    num_rows="dynamic",
    column_config=column_configs,
)

# Auto-save logic: detect changes and persist
if not edited_df.equals(df):
    st.session_state.data = edited_df.fillna("").to_dict(orient="records")

    # Save to temp file
    with open(TMP_FILE, "w", encoding="utf-8") as f:
        for item in st.session_state.data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    st.toast("βœ… Auto-saved!", icon="πŸ’Ύ")
    # Rerun so the table is rebuilt from the freshly saved data.
    st.rerun()


# --- Add New Entry ---
st.markdown("### βž• Add New Entry")

# Show form with current fields.
# clear_on_submit resets the inputs after a successful add, so the previous
# text does not linger in the form and get submitted a second time by accident.
with st.form("new_entry_form", clear_on_submit=True):
    new_record = {}
    for field in st.session_state.all_fields:
        new_record[field] = st.text_area(f"{field}", key=f"input_{field}")

    submitted = st.form_submit_button("Add Entry")
    if submitted:
        st.session_state.data.append(new_record)

        # Save to temp
        with open(TMP_FILE, "w", encoding="utf-8") as f:
            for item in st.session_state.data:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")

        st.success("βœ… New entry added!")
        # Rerun so the editor table above picks up the appended record.
        st.rerun()

# Let the user extend the schema with an extra field name.
with st.expander("βž• Add New Field"):
    new_field = st.text_input("New field name", key="new_field_name")
    add_clicked = st.button("Add Field")
    # Only non-empty names that are not already known are accepted.
    is_addable = bool(new_field) and new_field not in st.session_state.all_fields
    if add_clicked and is_addable:
        st.session_state.all_fields.append(new_field)
        st.success(f"βœ… Field '{new_field}' added!")
        st.rerun()


# --- Export JSONL ---
st.markdown("### πŸ“€ Export Dataset")

# Let user define a custom export path
export_path = st.text_input(
    "Custom save path (e.g., ./exports/my_dataset.jsonl)",
    value="./exports/exported_dataset.jsonl",
)

col1, col2 = st.columns(2)

# --- Export Button ---
with col1:
    if st.button("πŸ“ Export JSONL"):
        # A bare file name (e.g. "data.jsonl") has an empty dirname, and
        # os.makedirs("") raises; only create the directory when there is one.
        export_dir = os.path.dirname(export_path)
        if export_dir:
            os.makedirs(export_dir, exist_ok=True)

        # Write to custom path
        with open(export_path, "w", encoding="utf-8") as f_out:
            for row in st.session_state.data:
                f_out.write(json.dumps(row, ensure_ascii=False) + "\n")

        st.success(f"βœ… Dataset saved to {export_path}")

        # Load content for download
        with open(export_path, "r", encoding="utf-8") as f_download:
            exported_content = f_download.read()

        st.download_button(
            "⬇️ Download JSONL",
            exported_content,
            file_name=os.path.basename(export_path),
            mime="application/json",
        )

        # Reset session and temp
        if os.path.exists(TMP_FILE):
            os.remove(TMP_FILE)
        st.session_state.clear()
        st.success("🧹 Temporary session cleared. You're starting fresh!")
        # NOTE(review): st.rerun() restarts the script immediately, so the
        # success messages and the download button rendered above are likely
        # replaced before the user can use them -- confirm this is intended.
        st.rerun()

# --- Download Temp Only Button ---
# Offers the current on-disk session copy as a download, or warns when there
# is no such file.
with col2:
    if not os.path.exists(TMP_FILE):
        st.warning("⚠️ No temp file found to download.")
    else:
        with open(TMP_FILE, "r", encoding="utf-8") as tmp_handle:
            tmp_content = tmp_handle.read()

        st.download_button(
            "⬇️ Download Temp File",
            tmp_content,
            file_name="session_dataset.jsonl",
            mime="application/json",
        )