yixuantt committed on
Commit
a044e1e
·
verified ·
1 Parent(s): 4088fe7

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +336 -0
  2. test_pairs2.json +0 -0
app.py ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ from datetime import datetime
4
+ import os
5
+ import logging
6
+
7
def _setup_logger():
    """Configure the root logger for console output at INFO level.

    Whatever handlers were attached to the root logger beforehand are
    discarded and replaced by a single stream handler using the
    ``[time level] message`` layout.

    Returns:
        The configured root logger.
    """
    stream = logging.StreamHandler()
    stream.setFormatter(
        logging.Formatter("[%(asctime)s %(levelname)s] %(message)s")
    )

    root = logging.getLogger()
    root.setLevel(logging.INFO)
    # Assign the list (instead of addHandler) so earlier handlers are dropped.
    root.handlers = [stream]
    return root
17
+
18
# Root logger used by every function in this module.
logger = _setup_logger()

# Directory that receives one "<id>_latest.json" file per participant.
DATA_DIR = "annotations_data2"
os.makedirs(DATA_DIR, exist_ok=True)

# Prompt/response pairs to annotate. The UI code further down reads the keys
# "id", "prompt", "responseA" and "responseB" from each entry.
# JSON is UTF-8 by specification, so do not rely on the locale's default
# encoding when reading it.
with open("test_pairs2.json", "r", encoding="utf-8") as f:
    response_pairs = json.load(f)

# Stylesheet handed to gr.Blocks(css=...).
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap');

body {
    font-family: 'Roboto', sans-serif !important;
    line-height: 1.6;
}

.panel {
    border: 1px solid #e5e7eb !important;
    border-radius: 12px !important;
    padding: 20px !important;
}

button {
    font-weight: 500 !important;
    transition: all 0.2s ease !important;
    font-family: 'Roboto', sans-serif !important;
}

button:hover {
    transform: translateY(-1px);
}

.progress {
    color: #4f46e5;
    font-weight: 500;
}

textarea {
    border-radius: 8px !important;
    padding: 12px !important;
    font-family: 'Roboto', sans-serif !important;
}

.selected-response {
    border: 2px solid #4f46e5 !important;
    background-color: #f5f3ff;
}

.instruction-panel {
    background: #f8f9fa !important;
    border: 1px solid #e0e0e0 !important;
    border-radius: 12px !important;
    padding: 25px !important;
    margin-bottom: 25px !important;
}

.criteria-list {
    margin-left: 20px !important;
    list-style-type: none !important;
}

.criteria-item {
    padding: 8px 0 !important;
}

.highlight {
    color: #4f46e5;
    font-weight: 500;
}
"""
88
+
89
class State:
    """Mutable record of one annotation run.

    Holds the participant's ID, the index of the pair currently shown,
    the annotations collected so far, and the moment the record was
    created (used later to compute a duration).
    """

    def __init__(self):
        # Creation time of this record.
        self.start_time = datetime.now()
        # One dict per annotated pair, stored by position.
        self.annotations = []
        # Empty until the participant submits an ID.
        self.prolific_id = ""
        # Index into the list of response pairs.
        self.current_idx = 0


# NOTE(review): this single module-level instance is what all handlers read
# and mutate; concurrent browser sessions would presumably share it. Confirm
# whether per-session state is required.
state = State()
97
+
98
def save_annotations():
    """Write the current contents of ``state`` to the participant's JSON file.

    The file is ``<DATA_DIR>/<prolific_id>_latest.json`` and contains the ID,
    the seconds elapsed since ``state.start_time``, the current index and the
    list of annotations.

    Returns:
        The path that was written, or ``None`` when no Prolific ID is set.

    Raises:
        OSError: if the file cannot be written or renamed into place.
    """
    if not state.prolific_id:
        return None

    # The ID is typed in by the participant; strip path separators so it can
    # never point outside DATA_DIR.
    safe_id = state.prolific_id.replace("/", "_").replace("\\", "_")
    filepath = os.path.join(DATA_DIR, f"{safe_id}_latest.json")

    data = {
        "prolific_id": state.prolific_id,
        "duration": (datetime.now() - state.start_time).total_seconds(),
        "current_idx": state.current_idx,
        "annotations": state.annotations,
    }

    # Write to a sibling temp file and rename it over the target so that an
    # interrupted write cannot leave a truncated JSON file behind (the loader
    # treats an unparsable file as "no saved data").
    tmp_path = f"{filepath}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    os.replace(tmp_path, filepath)

    logger.info(f"Saved annotations to {filepath}")
    return filepath
113
+
114
def load_latest_data(prolific_id):
    """Load a participant's previously saved annotation file, if any.

    Args:
        prolific_id: The participant's ID as entered in the UI.

    Returns:
        The saved dict with ``"current_idx"`` clamped to
        ``[0, len(response_pairs)]``, or ``None`` when there is no file or it
        cannot be read as a JSON object.
    """
    # Strip path separators so a hand-typed ID cannot point outside DATA_DIR.
    safe_id = prolific_id.replace("/", "_").replace("\\", "_")
    filepath = os.path.join(DATA_DIR, f"{safe_id}_latest.json")

    try:
        # Context manager closes the handle even if parsing fails.
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # First visit for this ID: nothing to resume.
        return None
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError.
        logger.error(f"Error loading {filepath}: {e}")
        return None

    if not isinstance(data, dict):
        logger.error(f"Error loading {filepath}: expected a JSON object")
        return None

    # A missing or malformed index restarts at 0 without discarding the
    # annotations that were saved alongside it.
    try:
        saved_idx = int(data.get("current_idx", 0))
    except (TypeError, ValueError):
        saved_idx = 0

    # The upper bound is len(response_pairs), not len - 1: an index equal to
    # the number of pairs means "finished", and the ID-submit handler checks
    # for exactly that to show the completion page.
    data["current_idx"] = min(max(saved_idx, 0), len(response_pairs))
    return data
125
+
126
# Full instructions (Markdown) shown on the Prolific ID page.
INSTRUCTION = """
### Welcome! 🎉

In this task, you'll act as a judge comparing two AI chatbot responses. Your goal is to determine which response is better based on specific criteria.

### 📋 Task Overview:
- You'll evaluate multiple questions (prompts), each with two responses (Response A and B)
- Select the better response for each question based on the criteria below
- Your progress will be tracked

### 🏅 Evaluation Criteria:
1. **Perceived Usefulness**
→ Does the answer address the question effectively and provide relevant information?
2. **Social Presence**
→ Does the answer create "the feeling of being there with a 'real' person"?


### 🚀 Getting Started:
1. Input your Prolific ID to begin
2. Read the question carefully
3. Compare both responses side-by-side
4. Select the better response using the radio buttons
5. Select your confidence rating and, optionally, leave feedback
6. Click "Next" to continue or "Previous" to review

**Note:** You must select a response and confidence level before proceeding to the next question.

*Thank you for contributing to our research! Your input is valuable.*
"""

# Short reminder (Markdown) shown above every question.
# NOTE(review): the criteria named here (Helpfulness, Clarity, Emotion) differ
# from those in INSTRUCTION (Perceived Usefulness, Social Presence). Confirm
# which set participants are meant to apply; the text is left unchanged.
MINI_INSTRUCTION = """You’ll compare two AI chatbot answers for different questions and pick the better one. Read the question, then look at Response A and Response B. Choose the one that’s better based on: Helpfulness (answers well, gives useful info), Clarity (clear, logical, on topic), and Emotion (understands feelings, fits the situation).

*Select your choice and rate your confidence. Click "Next" to move on or "Previous" to go back. You must pick a response and confidence level to continue. Thanks for helping with our research!*
"""
160
+
161
def create_interface():
    """Build the Gradio app for pairwise response annotation.

    The page has three sections of which one is visible at a time: the
    Prolific ID form, the main annotation view, and the completion page.
    The event handlers read and mutate the module-level ``state`` and persist
    it through ``save_annotations``.

    NOTE(review): ``state`` is one module-level object, so concurrent
    sessions would presumably overwrite each other's progress. Confirm
    whether per-session state is needed.

    Returns:
        The assembled ``gr.Blocks`` instance (not launched).
    """
    # Local import: the module itself only imports ``datetime``.
    from datetime import timedelta

    with gr.Blocks(gr.themes.Ocean(), title="AI Response Evaluation", css=custom_css) as demo:
        # User ID Section
        with gr.Column(visible=True, elem_id="id_section") as id_section:
            with gr.Column(elem_classes="instruction-panel"):
                gr.Markdown(INSTRUCTION)
            gr.Markdown("---")
            gr.Markdown("## Prolific ID Verification")
            prolific_id = gr.Textbox(label="Enter your Prolific ID")
            id_submit_btn = gr.Button("Submit", variant="primary")

        # Main Interface
        with gr.Column(visible=False, elem_id="main_interface") as main_interface:
            progress_md = gr.Markdown("**Progress:** 0% (0/0)", elem_classes="progress")
            gr.HTML('<style>.prompt-highlight { background-color: #e6f7ff; padding: 10px; border: 1px solid #91d5ff; border-radius: 5px; }</style>')
            gr.Markdown(MINI_INSTRUCTION)
            gr.Markdown("---")
            gr.Markdown("### Current Question")
            prompt_box = gr.Markdown(elem_classes="prompt-highlight")
            with gr.Row():
                with gr.Column(variant="panel"):
                    gr.Markdown("### Response A")
                    response_a = gr.Markdown(height='200px')
                with gr.Column(variant="panel"):
                    gr.Markdown("### Response B")
                    response_b = gr.Markdown(height='200px')
            selection_radio = gr.Radio(
                choices=[("Response A", "A"), ("Response B", "B")],
                label="Select the better response",
            )
            feedback = gr.Textbox(label="Additional Feedback (optional)", lines=3)
            confidence = gr.Radio(
                choices=[("1 - Not confident", 1), ("2", 2), ("3", 3), ("4", 4), ("5 - Very confident", 5)],
                label="Confidence Rating",
            )
            with gr.Row():
                prev_btn = gr.Button("Previous", variant="secondary")
                next_btn = gr.Button("Next", variant="primary")

        # Completion Section
        with gr.Column(visible=False, elem_id="completion") as completion_section:
            gr.Markdown("# Thank You!")
            gr.Markdown("### Completion code: `CA7IOI65`")
            gr.Markdown("Your annotations have been saved.")
            gr.HTML("""
            <p>Click <a href="https://app.prolific.com/researcher/submissions/complete?cc=CA7IOI65" target="_blank">here</a> to complete the task.</p>
            """)

        def handle_id_submit(prolific_id_val):
            """Register the participant and resume any saved progress.

            Raises:
                gr.Error: if the submitted ID is empty or whitespace.
            """
            participant_id = (prolific_id_val or "").strip()
            if not participant_id:
                raise gr.Error("Please enter a valid Prolific ID")
            state.prolific_id = participant_id
            data = load_latest_data(participant_id)

            # Time this participant from now; ``state`` was created at import,
            # so its initial start_time measures server uptime instead.
            state.start_time = datetime.now()

            if data:
                state.annotations = data.get("annotations", [])
                state.current_idx = data.get("current_idx", 0)
                # Carry over time already spent in earlier visits so the
                # saved duration keeps accumulating across resumes.
                prior_seconds = data.get("duration", 0)
                if isinstance(prior_seconds, (int, float)) and prior_seconds > 0:
                    state.start_time -= timedelta(seconds=prior_seconds)
                if state.current_idx >= len(response_pairs):
                    # Already finished: go straight to the completion page.
                    save_annotations()
                    return {
                        id_section: gr.update(visible=False),
                        main_interface: gr.update(visible=False),
                        completion_section: gr.update(visible=True),
                    }
            else:
                state.annotations = []
                state.current_idx = 0

            return {
                id_section: gr.update(visible=False),
                main_interface: gr.update(visible=True),
                completion_section: gr.update(visible=False),
                **update_interface(state.current_idx),
            }

        def update_interface(idx):
            """Return component values for displaying pair ``idx``.

            ``idx`` is clamped to the valid range; an empty pair list yields
            blank content instead of raising.
            """
            total = len(response_pairs)
            idx = max(0, min(idx, total - 1))
            current_data = response_pairs[idx] if idx < total else {}
            fraction = idx / total if total else 0.0
            progress = f"**Progress:** {fraction:.0%} ({idx}/{total})"
            # Annotations are stored by position, so the same index looks up
            # any answer previously given for this pair.
            annotation = state.annotations[idx] if idx < len(state.annotations) else None
            return {
                prompt_box: current_data.get("prompt", ""),
                response_a: current_data.get("responseA", ""),
                response_b: current_data.get("responseB", ""),
                progress_md: progress,
                feedback: annotation.get("feedback", "") if annotation else "",
                confidence: annotation.get("confidence") if annotation else None,
                selection_radio: annotation.get("selected") if annotation else None,
            }

        def handle_navigation(direction, selection, confidence_val, feedback_val):
            """Store the current answer, move to the next/previous pair, save.

            Args:
                direction: ``"next"`` or ``"prev"``.
                selection: Value of the response radio (``"A"``/``"B"``) or None.
                confidence_val: Value of the confidence radio (1-5) or None.
                feedback_val: Free-text feedback.
            """
            if direction == "next":
                error_msg = None
                if not selection:
                    error_msg = "Please select a response before proceeding."
                elif not confidence_val:
                    error_msg = "Please select a confidence level before proceeding."

                if error_msg:
                    gr.Warning(error_msg)
                    # Return only the visibility updates: components missing
                    # from the dict are left untouched, so what the
                    # participant has already entered is not wiped.
                    return {
                        main_interface: gr.update(visible=True),
                        completion_section: gr.update(visible=False),
                    }

            # Save current annotation
            if selection and confidence_val and state.current_idx < len(response_pairs):
                pair = response_pairs[state.current_idx]
                annotation = {
                    "id": pair["id"],  # Save unique ID
                    "prompt": pair["prompt"],
                    "selected": selection,
                    "confidence": confidence_val,
                    "feedback": feedback_val,
                    "timestamp": datetime.now().isoformat(),
                }
                if state.current_idx < len(state.annotations):
                    state.annotations[state.current_idx] = annotation
                else:
                    state.annotations.append(annotation)

            # Navigation logic
            previous_idx = state.current_idx
            if direction == "next":
                state.current_idx = previous_idx + 1
            else:
                state.current_idx = max(0, previous_idx - 1)

            try:
                save_annotations()
            except Exception:
                # UI boundary: log with traceback, stay on the same question
                # and tell the participant, rather than advancing (possibly
                # to the completion page) with nothing written to disk.
                logger.exception("Navigation error: could not save annotations")
                state.current_idx = previous_idx
                gr.Warning("Your answer could not be saved. Please try again.")
                return {
                    main_interface: gr.update(visible=True),
                    completion_section: gr.update(visible=False),
                }

            finished = state.current_idx >= len(response_pairs)
            return {
                main_interface: gr.update(visible=not finished),
                completion_section: gr.update(visible=finished),
                **update_interface(state.current_idx),
            }

        # Event bindings
        id_submit_btn.click(
            handle_id_submit,
            inputs=prolific_id,
            outputs=[id_section, main_interface, completion_section, prompt_box,
                     response_a, response_b, progress_md, feedback, confidence, selection_radio]
        )

        prev_btn.click(
            handle_navigation,
            inputs=[gr.State("prev"), selection_radio, confidence, feedback],
            outputs=[main_interface, completion_section, prompt_box, response_a,
                     response_b, progress_md, feedback, confidence, selection_radio]
        )

        next_btn.click(
            handle_navigation,
            inputs=[gr.State("next"), selection_radio, confidence, feedback],
            outputs=[main_interface, completion_section, prompt_box, response_a,
                     response_b, progress_md, feedback, confidence, selection_radio]
        )

    return demo
333
+
334
if __name__ == "__main__":
    # Bind to all interfaces on port 7861 and request a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7861,
        "share": True,
    }
    app = create_interface()
    app.launch(**launch_options)
test_pairs2.json ADDED
The diff for this file is too large to render. See raw diff