# File size: 12,727 Bytes
# a044e1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
import gradio as gr
import json
from datetime import datetime
import os
import logging

def _setup_logger():
    """Configure the root logger for console output and return it.

    The root logger is set to INFO and its handler list is replaced
    (not extended) with a single stream handler, so any handlers that
    were installed before this call are dropped.
    """
    root = logging.getLogger()
    root.setLevel(logging.INFO)

    stream = logging.StreamHandler()
    stream.setFormatter(
        logging.Formatter("[%(asctime)s %(levelname)s] %(message)s")
    )
    # Assign rather than addHandler: this discards pre-existing handlers.
    root.handlers = [stream]

    return root

# Root logger, configured once at import time and used by the functions below.
logger = _setup_logger()

# Directory holding one "<prolific_id>_latest.json" progress file per participant.
DATA_DIR = "annotations_data2"
os.makedirs(DATA_DIR, exist_ok=True)

# Items to annotate. The rest of this file reads the keys "id", "prompt",
# "responseA" and "responseB" from each entry.
# The encoding is explicit because open() otherwise falls back to the
# platform locale, which can mis-decode non-ASCII text in the JSON file.
with open("test_pairs2.json", "r", encoding="utf-8") as f:
    response_pairs = json.load(f)

# Stylesheet handed to gr.Blocks(css=...) in create_interface().
# It loads the Roboto web font and styles panels, buttons, textareas and the
# instruction panel. The many `!important` flags are presumably there to win
# over the theme's own rules -- TODO confirm they are all still needed.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap');

body {
    font-family: 'Roboto', sans-serif !important;
    line-height: 1.6;
}

.panel {
    border: 1px solid #e5e7eb !important;
    border-radius: 12px !important;
    padding: 20px !important;
}

button {
    font-weight: 500 !important;
    transition: all 0.2s ease !important;
    font-family: 'Roboto', sans-serif !important;
}

button:hover {
    transform: translateY(-1px);
}

.progress {
    color: #4f46e5;
    font-weight: 500;
}

textarea {
    border-radius: 8px !important;
    padding: 12px !important;
    font-family: 'Roboto', sans-serif !important;
}

.selected-response {
    border: 2px solid #4f46e5 !important;
    background-color: #f5f3ff;
}

.instruction-panel {
    background: #f8f9fa !important;
    border: 1px solid #e0e0e0 !important;
    border-radius: 12px !important;
    padding: 25px !important;
    margin-bottom: 25px !important;
}

.criteria-list {
    margin-left: 20px !important;
    list-style-type: none !important;
}

.criteria-item {
    padding: 8px 0 !important;
}

.highlight {
    color: #4f46e5;
    font-weight: 500;
}
"""

class State:
    """Mutable record of one annotation run.

    Attributes are plain and public; the UI handlers read and write them
    directly.
    """

    def __init__(self):
        # Timestamp taken at construction; save_annotations() subtracts it
        # from "now" to compute the stored duration.
        self.start_time = datetime.now()
        # Participant identifier; empty until an ID has been submitted.
        self.prolific_id = ""
        # One dict per answered question, stored by question position.
        self.annotations = []
        # Position of the question currently shown.
        self.current_idx = 0


# NOTE(review): this single module-level instance is what every handler in this
# file mutates, so simultaneous browser sessions would share one position and
# one annotation list -- confirm the app is only used by one participant at a
# time, or move this into per-session state.
state = State()

def save_annotations():
    """Persist the current run to ``<DATA_DIR>/<prolific_id>_latest.json``.

    The file stores the participant ID, seconds elapsed since
    ``state.start_time``, the current question position and all annotations.

    Returns:
        The path that was written, or ``None`` when no Prolific ID has been
        set yet (nothing is written in that case).
    """
    if not state.prolific_id:
        return None

    filepath = os.path.join(DATA_DIR, f"{state.prolific_id}_latest.json")
    data = {
        "prolific_id": state.prolific_id,
        "duration": (datetime.now() - state.start_time).total_seconds(),
        "current_idx": state.current_idx,
        "annotations": state.annotations
    }

    # Write to a sibling temp file and rename it into place. Writing straight
    # to the target would leave a truncated file if the process died
    # mid-write, and the participant's saved progress would then be unreadable.
    tmp_path = f"{filepath}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    os.replace(tmp_path, filepath)

    logger.info(f"Saved annotations to {filepath}")
    return filepath

def load_latest_data(prolific_id):
    """Load a participant's saved progress, if any.

    Args:
        prolific_id: Participant identifier used to build the file name
            ``<prolific_id>_latest.json`` inside ``DATA_DIR``.

    Returns:
        The decoded dict with ``"current_idx"`` normalised to an int in
        ``[0, len(response_pairs)]``, or ``None`` when the file is missing,
        unreadable, or not a JSON object. A missing ``"current_idx"`` is
        treated as 0.
    """
    filepath = os.path.join(DATA_DIR, f"{prolific_id}_latest.json")
    if not os.path.exists(filepath):
        return None

    try:
        # Context manager so the handle is closed even when decoding fails.
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
        if not isinstance(data, dict):
            raise ValueError("top-level JSON value is not an object")
        # The upper bound is len(response_pairs), not len - 1: an index equal
        # to the number of pairs is how a finished run is stored, and callers
        # test `current_idx >= len(response_pairs)` to detect completion.
        saved_idx = int(data.get("current_idx", 0))
        data["current_idx"] = min(max(saved_idx, 0), len(response_pairs))
        return data
    except (OSError, ValueError, TypeError) as e:
        # json.JSONDecodeError is a ValueError subclass.
        logger.error(f"Error loading {filepath}: {e}")
    return None

# Markdown shown in the instruction panel above the Prolific ID box.
# The emoji and arrows had been stored as mis-decoded UTF-8 byte sequences and
# would have rendered as garbage; they are restored to the intended characters.
INSTRUCTION = """
### Welcome! 🎉

In this task, you'll act as a judge comparing two AI chatbot responses. Your goal is to determine which response is better based on specific criteria.

### 📋 Task Overview:
- You'll evaluate multiple questions (prompts), each with two responses (Response A and B)
- Select the better response for each question based on the criteria below
- Your progress will be tracked

### 🏅 Evaluation Criteria:
1. **Perceived Usefulness**  
   → Does the answer address the question effectively and provide relevant information?
2. **Social Presence**  
   → Does the answer creates "the feeling of being there with a 'real' person"?


### 🚀 Getting Started:
1. Input your Prolific ID to begin
2. Read the question carefully
3. Compare both responses side-by-side
4. Select the better response using the radio buttons
5. Provide optional feedback and confidence rating
6. Click "Next" to continue or "Previous" to review

**Note:** You must select a response and confidence level before proceeding to the next question.

*Thank you for contributing to our research! Your input is valuable.*  
"""

# Short reminder rendered at the top of the main annotation view.
# The apostrophes had been stored as mis-decoded UTF-8 and are restored here.
# NOTE(review): the criteria named here (Helpfulness, Clarity, Emotion) differ
# from those in INSTRUCTION (Perceived Usefulness, Social Presence) -- confirm
# which set participants should apply.
MINI_INSTRUCTION = """You’ll compare two AI chatbot answers for different questions and pick the better one. Read the question, then look at Response A and Response B. Choose the one that’s better based on: Helpfulness (answers well, gives useful info), Clarity (clear, logical, on topic), and Emotion (understands feelings, fits the situation).

*Select your choice and rate your confidence. Click "Next" to move on or "Previous" to go back. You must pick a response and confidence level to continue. Thanks for helping with our research!*
"""

def create_interface():
    """Build the annotation UI and return the ``gr.Blocks`` app.

    The page has three mutually exclusive sections whose visibility the
    handlers toggle:

    * ID section: instructions plus the Prolific ID box.
    * Main interface: prompt, the two responses, choice, feedback, confidence,
      and Previous/Next buttons.
    * Completion section: thank-you text and the completion link.

    All handlers read and mutate the module-level ``state`` and persist it
    through ``save_annotations()``.
    NOTE(review): because ``state`` is a single module-level object,
    simultaneous sessions would overwrite each other's position and answers --
    confirm single-participant use or migrate to per-session ``gr.State``.
    """
    with gr.Blocks(gr.themes.Ocean(), title="AI Response Evaluation", css=custom_css) as demo:
        # User ID Section
        with gr.Column(visible=True, elem_id="id_section") as id_section:
            with gr.Column(elem_classes="instruction-panel"):
                gr.Markdown(INSTRUCTION)
            gr.Markdown("---")
            gr.Markdown("## Prolific ID Verification")
            prolific_id = gr.Textbox(label="Enter your Prolific ID")
            id_submit_btn = gr.Button("Submit", variant="primary")

        # Main Interface
        with gr.Column(visible=False, elem_id="main_interface") as main_interface:
            progress_md = gr.Markdown("**Progress:** 0% (0/0)", elem_classes="progress")
            gr.HTML('<style>.prompt-highlight { background-color: #e6f7ff; padding: 10px; border: 1px solid #91d5ff; border-radius: 5px; }</style>')
            gr.Markdown(MINI_INSTRUCTION)
            gr.Markdown("---")
            gr.Markdown("### Current Question")
            prompt_box = gr.Markdown(elem_classes="prompt-highlight")
            with gr.Row():
                with gr.Column(variant="panel"):
                    gr.Markdown("### Response A")
                    response_a = gr.Markdown(height='200px')
                with gr.Column(variant="panel"):
                    gr.Markdown("### Response B")
                    response_b = gr.Markdown(height='200px')
            selection_radio = gr.Radio(
                choices=[("Response A", "A"), ("Response B", "B")],
                label="Select the better response",
            )
            feedback = gr.Textbox(label="Additional Feedback (optional)", lines=3)
            confidence = gr.Radio(
                choices=[("1 - Not confident", 1), ("2", 2), ("3", 3), ("4", 4), ("5 - Very confident", 5)],
                label="Confidence Rating",
            )
            with gr.Row():
                prev_btn = gr.Button("Previous", variant="secondary")
                next_btn = gr.Button("Next", variant="primary")

        # Completion Section
        with gr.Column(visible=False, elem_id="completion") as completion_section:
            gr.Markdown("# Thank You!")
            gr.Markdown("### Completion code: `CA7IOI65`")
            completion_md = gr.Markdown("Your annotations have been saved.")
            gr.HTML("""
                <p>Click <a href="https://app.prolific.com/researcher/submissions/complete?cc=CA7IOI65" target="_blank">here</a> to complete the task.</p>
            """)

        def update_interface(idx):
            """Return component updates that render question ``idx``.

            ``idx`` is clamped into the valid range, so the "finished" index
            (``len(response_pairs)``) re-renders the last question. Form
            fields are filled from the stored annotation at that position
            when one exists, otherwise cleared.
            """
            total = len(response_pairs)
            idx = max(0, min(idx, total - 1))
            # With no pairs loaded there is nothing to show; render blanks
            # instead of raising IndexError / ZeroDivisionError.
            current_data = response_pairs[idx] if idx < total else {}
            fraction = idx / total if total else 0
            progress = f"**Progress:** {fraction:.0%} ({idx}/{total})"
            annotation = state.annotations[idx] if idx < len(state.annotations) else None
            return {
                prompt_box: current_data.get("prompt", ""),
                response_a: current_data.get("responseA", ""),
                response_b: current_data.get("responseB", ""),
                progress_md: progress,
                feedback: annotation["feedback"] if annotation else "",
                confidence: annotation["confidence"] if annotation else None,
                selection_radio: annotation["selected"] if annotation else None
            }

        def handle_id_submit(prolific_id_val):
            """Start or resume a run for the submitted Prolific ID.

            Raises:
                gr.Error: when the ID is blank or contains a path separator.
            """
            participant_id = (prolific_id_val or "").strip()
            if not participant_id:
                raise gr.Error("Please enter a valid Prolific ID")
            # The ID becomes part of a file name under DATA_DIR. Reject path
            # separators so a crafted ID cannot read or write outside it.
            if any(ch in participant_id for ch in ("/", "\\", "\0")):
                raise gr.Error("Please enter a valid Prolific ID")

            state.prolific_id = participant_id
            # Time the run from the moment the participant starts; `state` is
            # created at import, which would otherwise count server uptime.
            state.start_time = datetime.now()
            data = load_latest_data(state.prolific_id)

            if data:
                state.annotations = data.get("annotations", [])
                state.current_idx = data.get("current_idx", 0)
                if state.current_idx >= len(response_pairs):
                    # Already finished: go straight to the completion screen.
                    save_annotations()
                    return {
                        id_section: gr.update(visible=False),
                        main_interface: gr.update(visible=False),
                        completion_section: gr.update(visible=True)
                    }
            else:
                state.annotations = []
                state.current_idx = 0

            return {
                id_section: gr.update(visible=False),
                main_interface: gr.update(visible=True),
                completion_section: gr.update(visible=False),
                **update_interface(state.current_idx)
            }

        def handle_navigation(direction, selection, confidence_val, feedback_val):
            """Store the current answer (if complete) and move to the next or previous question.

            Args:
                direction: ``"next"`` or ``"prev"``.
                selection: chosen response (``"A"``/``"B"``) or ``None``.
                confidence_val: chosen confidence (1-5) or ``None``.
                feedback_val: free-text feedback.

            "Next" requires both a selection and a confidence value;
            "Previous" does not, and then saves only if both are present.
            """
            if direction == "next":
                if not selection:
                    error_msg = "Please select a response before proceeding."
                elif not confidence_val:
                    error_msg = "Please select a confidence level before proceeding."
                else:
                    error_msg = None

                if error_msg:
                    gr.Warning(error_msg)
                    # Return only the visibility keys. Components absent from
                    # the dict are left untouched, so the participant keeps
                    # whatever they already selected or typed.
                    return {
                        main_interface: gr.update(visible=True),
                        completion_section: gr.update(visible=False)
                    }

            # Save current annotation
            if selection and confidence_val:
                pair = response_pairs[state.current_idx]
                annotation = {
                    "id": pair["id"],  # Save unique ID
                    "prompt": pair["prompt"],
                    "selected": selection,
                    "confidence": confidence_val,
                    "feedback": feedback_val,
                    "timestamp": datetime.now().isoformat()
                }
                # Annotations are stored by question position: overwrite when
                # revisiting, append when answering a new question.
                if state.current_idx < len(state.annotations):
                    state.annotations[state.current_idx] = annotation
                else:
                    state.annotations.append(annotation)

            # Navigation logic
            try:
                new_idx = state.current_idx + 1 if direction == "next" else max(0, state.current_idx - 1)
                state.current_idx = new_idx
                save_annotations()

                if new_idx >= len(response_pairs):
                    # Past the last question: show the completion screen.
                    return {
                        main_interface: gr.update(visible=False),
                        completion_section: gr.update(visible=True),
                        **update_interface(new_idx)
                    }

                return {
                    main_interface: gr.update(visible=True),
                    completion_section: gr.update(visible=False),
                    **update_interface(new_idx)
                }

            except Exception as e:
                # Best effort: keep the UI usable if saving or rendering fails.
                logger.error(f"Navigation error: {e}")
                return {
                    main_interface: gr.update(visible=True),
                    completion_section: gr.update(visible=False),
                    **update_interface(state.current_idx)
                }

        # Event bindings
        id_submit_btn.click(
            handle_id_submit,
            inputs=prolific_id,
            outputs=[id_section, main_interface, completion_section, prompt_box,
                    response_a, response_b, progress_md, feedback, confidence, selection_radio]
        )

        prev_btn.click(
            handle_navigation,
            inputs=[gr.State("prev"), selection_radio, confidence, feedback],
            outputs=[main_interface, completion_section, prompt_box, response_a,
                    response_b, progress_md, feedback, confidence, selection_radio]
        )

        next_btn.click(
            handle_navigation,
            inputs=[gr.State("next"), selection_radio, confidence, feedback],
            outputs=[main_interface, completion_section, prompt_box, response_a,
                    response_b, progress_md, feedback, confidence, selection_radio]
        )

    return demo

if __name__ == "__main__":
    # Script entry point: listen on all interfaces on port 7861 and request a
    # public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7861,
        "share": True,
    }
    app = create_interface()
    app.launch(**launch_options)