Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files- app.py +336 -0
- test_pairs2.json +0 -0
app.py
ADDED
@@ -0,0 +1,336 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import json
|
3 |
+
from datetime import datetime
|
4 |
+
import os
|
5 |
+
import logging
|
6 |
+
|
7 |
+
def _setup_logger():
    """Configure the root logger to print INFO-and-above records to the console.

    Whatever handlers were attached to the root logger beforehand are replaced
    by a single ``StreamHandler`` using the ``[time level] message`` format.

    Returns:
        logging.Logger: the configured root logger.
    """
    log_format = logging.Formatter("[%(asctime)s %(levelname)s] %(message)s")
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(log_format)
    # Assign the list instead of calling addHandler(): this drops any handler
    # that was already installed, so calling this twice never duplicates output.
    logger.handlers = [console_handler]

    return logger
|
17 |
+
|
18 |
+
logger = _setup_logger()

# Directory that receives one "<prolific_id>_latest.json" progress file per
# participant; created at import time so later writes cannot fail on a
# missing directory.
DATA_DIR = "annotations_data2"
os.makedirs(DATA_DIR, exist_ok=True)

# Items to annotate, loaded once at start-up. The rest of the module indexes
# this as a sequence of dicts and reads the keys "id", "prompt", "responseA"
# and "responseB". The encoding is given explicitly so the file is decoded the
# same way regardless of the host's locale.
with open("test_pairs2.json", "r", encoding="utf-8") as f:
    response_pairs = json.load(f)
|
25 |
+
|
26 |
+
# Stylesheet handed to gr.Blocks(css=...): typography plus the panel,
# button, progress, textarea and instruction-panel styling used by the UI.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap');

body {
    font-family: 'Roboto', sans-serif !important;
    line-height: 1.6;
}

.panel {
    border: 1px solid #e5e7eb !important;
    border-radius: 12px !important;
    padding: 20px !important;
}

button {
    font-weight: 500 !important;
    transition: all 0.2s ease !important;
    font-family: 'Roboto', sans-serif !important;
}

button:hover {
    transform: translateY(-1px);
}

.progress {
    color: #4f46e5;
    font-weight: 500;
}

textarea {
    border-radius: 8px !important;
    padding: 12px !important;
    font-family: 'Roboto', sans-serif !important;
}

.selected-response {
    border: 2px solid #4f46e5 !important;
    background-color: #f5f3ff;
}

.instruction-panel {
    background: #f8f9fa !important;
    border: 1px solid #e0e0e0 !important;
    border-radius: 12px !important;
    padding: 25px !important;
    margin-bottom: 25px !important;
}

.criteria-list {
    margin-left: 20px !important;
    list-style-type: none !important;
}

.criteria-item {
    padding: 8px 0 !important;
}

.highlight {
    color: #4f46e5;
    font-weight: 500;
}
"""
|
88 |
+
|
89 |
+
class State:
    """Mutable progress record for the annotation session.

    NOTE(review): exactly one instance is created at module level and every
    event handler mutates it, so all browser sessions served by this process
    share the same progress. Concurrent participants would overwrite each
    other; per-session state (e.g. ``gr.State``) would avoid that.
    """

    def __init__(self):
        # Index of the pair currently shown; saved and restored with progress.
        self.current_idx: int = 0
        # Empty until a participant submits an ID; saving is skipped while empty.
        self.prolific_id: str = ""
        # One dict per answered pair, stored at the pair's index.
        self.annotations: list = []
        # Reference point for the "duration" field written on save.
        self.start_time: datetime = datetime.now()


state = State()
|
97 |
+
|
98 |
+
def save_annotations():
    """Write the shared ``state`` to ``<DATA_DIR>/<prolific_id>_latest.json``.

    The file holds the Prolific ID, the seconds elapsed since
    ``state.start_time``, the current index and the list of annotations.

    Returns:
        str | None: the path written, or ``None`` when no Prolific ID has been
        set yet (nothing is written in that case).
    """
    if not state.prolific_id:
        return None
    filename = f"{state.prolific_id}_latest.json"
    filepath = os.path.join(DATA_DIR, filename)
    data = {
        "prolific_id": state.prolific_id,
        "duration": (datetime.now() - state.start_time).total_seconds(),
        "current_idx": state.current_idx,
        "annotations": state.annotations,
    }
    # Write to a sibling temp file and rename it over the target. A crash or
    # full disk mid-write then leaves the previous file intact instead of a
    # truncated JSON document that load_latest_data() would reject.
    tmp_path = f"{filepath}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    os.replace(tmp_path, filepath)
    logger.info(f"Saved annotations to {filepath}")
    return filepath
|
113 |
+
|
114 |
+
def load_latest_data(prolific_id):
    """Load a participant's previously saved progress, if there is any.

    Args:
        prolific_id: ID used to build the ``<prolific_id>_latest.json`` file
            name inside ``DATA_DIR``.

    Returns:
        dict | None: the saved data with ``current_idx`` clamped to
        ``[0, len(response_pairs)]``, or ``None`` when the file does not exist
        or cannot be read or parsed (the failure is logged).
    """
    filename = f"{prolific_id}_latest.json"
    filepath = os.path.join(DATA_DIR, filename)
    if os.path.exists(filepath):
        try:
            # Context manager so the handle is closed even if parsing fails.
            with open(filepath, encoding="utf-8") as f:
                data = json.load(f)
            # The upper bound is len(response_pairs), not len - 1: that value
            # is what gets saved once the last pair is answered, and the ID
            # handler tests ``current_idx >= len(response_pairs)`` to send a
            # finished participant straight to the completion page.
            data["current_idx"] = min(max(data["current_idx"], 0), len(response_pairs))
            return data
        except Exception as e:
            # Deliberately best-effort: an unreadable file means "no saved
            # progress", it must not take the app down.
            logger.error(f"Error loading {filepath}: {e}")
    return None
|
125 |
+
|
126 |
+
# Full task briefing (Markdown) shown on the Prolific ID page.
INSTRUCTION = """
### Welcome! 🎉

In this task, you'll act as a judge comparing two AI chatbot responses. Your goal is to determine which response is better based on specific criteria.

### 📋 Task Overview:
- You'll evaluate multiple questions (prompts), each with two responses (Response A and B)
- Select the better response for each question based on the criteria below
- Your progress will be tracked

### 🏅 Evaluation Criteria:
1. **Perceived Usefulness**
   → Does the answer address the question effectively and provide relevant information?
2. **Social Presence**
   → Does the answer create "the feeling of being there with a 'real' person"?


### 🚀 Getting Started:
1. Input your Prolific ID to begin
2. Read the question carefully
3. Compare both responses side-by-side
4. Select the better response using the radio buttons
5. Provide optional feedback and confidence rating
6. Click "Next" to continue or "Previous" to review

**Note:** You must select a response and confidence level before proceeding to the next question.

*Thank you for contributing to our research! Your input is valuable.*
"""

# Short reminder (Markdown) shown above every question.
# NOTE(review): this lists Helpfulness / Clarity / Emotion, while INSTRUCTION
# lists Perceived Usefulness / Social Presence. Confirm which set of criteria
# the study intends; the wording is left untouched here.
MINI_INSTRUCTION = """You’ll compare two AI chatbot answers for different questions and pick the better one. Read the question, then look at Response A and Response B. Choose the one that’s better based on: Helpfulness (answers well, gives useful info), Clarity (clear, logical, on topic), and Emotion (understands feelings, fits the situation).

*Select your choice and rate your confidence. Click "Next" to move on or "Previous" to go back. You must pick a response and confidence level to continue. Thanks for helping with our research!*
"""
|
160 |
+
|
161 |
+
def create_interface():
    """Build the Gradio Blocks app for pairwise response annotation.

    The page consists of three sections whose visibility is toggled by the
    event handlers: Prolific ID entry, the main comparison view, and a
    completion page. All handlers read and mutate the module-level ``state``
    and persist it through ``save_annotations()``.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    # Function-scope import: the module itself only imports ``datetime``.
    from datetime import timedelta

    with gr.Blocks(gr.themes.Ocean(), title="AI Response Evaluation", css=custom_css) as demo:
        # User ID Section
        with gr.Column(visible=True, elem_id="id_section") as id_section:
            with gr.Column(elem_classes="instruction-panel"):
                gr.Markdown(INSTRUCTION)
            gr.Markdown("---")
            gr.Markdown("## Prolific ID Verification")
            prolific_id = gr.Textbox(label="Enter your Prolific ID")
            id_submit_btn = gr.Button("Submit", variant="primary")

        # Main Interface
        with gr.Column(visible=False, elem_id="main_interface") as main_interface:
            progress_md = gr.Markdown("**Progress:** 0% (0/0)", elem_classes="progress")
            gr.HTML('<style>.prompt-highlight { background-color: #e6f7ff; padding: 10px; border: 1px solid #91d5ff; border-radius: 5px; }</style>')
            gr.Markdown(MINI_INSTRUCTION)
            gr.Markdown("---")
            gr.Markdown("### Current Question")
            prompt_box = gr.Markdown(elem_classes="prompt-highlight")
            with gr.Row():
                with gr.Column(variant="panel"):
                    gr.Markdown("### Response A")
                    response_a = gr.Markdown(height='200px')
                with gr.Column(variant="panel"):
                    gr.Markdown("### Response B")
                    response_b = gr.Markdown(height='200px')
            selection_radio = gr.Radio(
                choices=[("Response A", "A"), ("Response B", "B")],
                label="Select the better response",
            )
            feedback = gr.Textbox(label="Additional Feedback (optional)", lines=3)
            confidence = gr.Radio(
                choices=[("1 - Not confident", 1), ("2", 2), ("3", 3), ("4", 4), ("5 - Very confident", 5)],
                label="Confidence Rating",
            )
            with gr.Row():
                prev_btn = gr.Button("Previous", variant="secondary")
                next_btn = gr.Button("Next", variant="primary")

        # Completion Section
        with gr.Column(visible=False, elem_id="completion") as completion_section:
            gr.Markdown("# Thank You!")
            gr.Markdown("### Completion code: `CA7IOI65`")
            completion_md = gr.Markdown("Your annotations have been saved.")
            # NOTE(review): the link targets a ".../researcher/submissions/..."
            # path; confirm this is the URL participants are meant to open.
            gr.HTML("""
            <p>Click <a href="https://app.prolific.com/researcher/submissions/complete?cc=CA7IOI65" target="_blank">here</a> to complete the task.</p>
            """)

        def handle_id_submit(prolific_id_val):
            """Validate the ID, restore saved progress and reveal the right section.

            Raises:
                gr.Error: if the ID is empty or cannot be used as a plain
                    file name.
            """
            participant_id = (prolific_id_val or "").strip()
            # The ID becomes part of a file name under DATA_DIR, so anything
            # carrying a path component (e.g. "../x") or a NUL byte is refused
            # rather than letting it address files outside that directory.
            if (
                not participant_id
                or participant_id != os.path.basename(participant_id)
                or "\x00" in participant_id
            ):
                raise gr.Error("Please enter a valid Prolific ID")
            state.prolific_id = participant_id
            data = load_latest_data(state.prolific_id)
            # Without this reset the saved "duration" would be measured from
            # the moment the process started, not from this participant's start.
            state.start_time = datetime.now()

            if data:
                state.annotations = data.get("annotations", [])
                state.current_idx = data.get("current_idx", 0)
                # Carry over time already spent so a returning participant's
                # duration keeps accumulating instead of restarting at zero.
                saved_duration = data.get("duration")
                if isinstance(saved_duration, (int, float)):
                    try:
                        state.start_time -= timedelta(seconds=saved_duration)
                    except (ValueError, OverflowError):
                        pass  # non-finite or absurd value: start from now
                if state.current_idx >= len(response_pairs):
                    save_annotations()
                    return {
                        id_section: gr.update(visible=False),
                        main_interface: gr.update(visible=False),
                        completion_section: gr.update(visible=True),
                    }
            else:
                state.annotations = []
                state.current_idx = 0

            return {
                id_section: gr.update(visible=False),
                main_interface: gr.update(visible=True),
                completion_section: gr.update(visible=False),
                **update_interface(state.current_idx),
            }

        def update_interface(idx):
            """Return the component updates that display pair ``idx``.

            An index at or past the end shows the last pair. A previously
            stored answer for that pair is put back into the inputs.
            """
            total = len(response_pairs)
            idx = max(0, min(idx, total - 1))
            # The ``total`` guards keep an empty pair list from raising
            # IndexError / ZeroDivisionError.
            current_data = response_pairs[idx] if total else {}
            fraction = idx / total if total else 0
            progress = f"**Progress:** {fraction:.0%} ({idx}/{total})"
            annotation = state.annotations[idx] if idx < len(state.annotations) else None
            return {
                prompt_box: current_data.get("prompt", ""),
                response_a: current_data.get("responseA", ""),
                response_b: current_data.get("responseB", ""),
                progress_md: progress,
                feedback: annotation["feedback"] if annotation else "",
                confidence: annotation["confidence"] if annotation else None,
                selection_radio: annotation["selected"] if annotation else None,
            }

        def handle_navigation(direction, selection, confidence_val, feedback_val):
            """Store the current answer (if complete) and move to the next/previous pair.

            Args:
                direction: ``"next"`` to advance; any other value goes back.
                selection: chosen response (``"A"``/``"B"``) or ``None``.
                confidence_val: chosen confidence (1-5) or ``None``.
                feedback_val: free-text feedback, may be empty.
            """
            error_msg = None
            if direction == "next":
                # Report the first missing field, in the order they appear.
                if not selection:
                    error_msg = "Please select a response before proceeding."
                elif not confidence_val:
                    error_msg = "Please select a confidence level before proceeding."

            if error_msg:
                gr.Warning(error_msg)
                return {
                    main_interface: gr.update(visible=True),
                    completion_section: gr.update(visible=False),
                    **update_interface(state.current_idx),
                }

            # Save current annotation (going back with an incomplete answer
            # simply stores nothing).
            if selection and confidence_val:
                annotation = {
                    "id": response_pairs[state.current_idx]["id"],  # Save unique ID
                    "prompt": response_pairs[state.current_idx]["prompt"],
                    "selected": selection,
                    "confidence": confidence_val,
                    "feedback": feedback_val,
                    "timestamp": datetime.now().isoformat(),
                }
                if state.current_idx < len(state.annotations):
                    state.annotations[state.current_idx] = annotation
                else:
                    state.annotations.append(annotation)

            # Navigation logic
            try:
                new_idx = state.current_idx + 1 if direction == "next" else max(0, state.current_idx - 1)
                state.current_idx = new_idx
                save_annotations()

                if new_idx >= len(response_pairs):
                    return {
                        main_interface: gr.update(visible=False),
                        completion_section: gr.update(visible=True),
                        **update_interface(new_idx),
                    }

                return {
                    main_interface: gr.update(visible=True),
                    completion_section: gr.update(visible=False),
                    **update_interface(new_idx),
                }

            except Exception as e:
                # Best-effort: keep the participant on the annotation view
                # rather than surfacing a raw error.
                logger.error(f"Navigation error: {e}")
                return {
                    main_interface: gr.update(visible=True),
                    completion_section: gr.update(visible=False),
                    **update_interface(state.current_idx),
                }

        # Event bindings
        id_submit_btn.click(
            handle_id_submit,
            inputs=prolific_id,
            outputs=[id_section, main_interface, completion_section, prompt_box,
                     response_a, response_b, progress_md, feedback, confidence, selection_radio],
        )

        prev_btn.click(
            handle_navigation,
            inputs=[gr.State("prev"), selection_radio, confidence, feedback],
            outputs=[main_interface, completion_section, prompt_box, response_a,
                     response_b, progress_md, feedback, confidence, selection_radio],
        )

        next_btn.click(
            handle_navigation,
            inputs=[gr.State("next"), selection_radio, confidence, feedback],
            outputs=[main_interface, completion_section, prompt_box, response_a,
                     response_b, progress_md, feedback, confidence, selection_radio],
        )

    return demo
|
333 |
+
|
334 |
+
if __name__ == "__main__":
    app = create_interface()
    # Listen on all interfaces at port 7861; share=True additionally asks
    # Gradio for a public share link.
    app.launch(server_name="0.0.0.0", server_port=7861, share=True)
|
test_pairs2.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|