# Spaces: Sleeping
# (Hosting-page status banner captured together with the source; not part of the program.)
import gradio as gr | |
import json | |
from datetime import datetime | |
import os | |
import logging | |
def _setup_logger():
    """Configure the root logger for console output and return it.

    The root logger is set to INFO and its handler list is replaced (not
    extended) with a single stream handler that prefixes each record with
    the timestamp and level name.
    """
    root = logging.getLogger()
    stream = logging.StreamHandler()
    stream.setFormatter(
        logging.Formatter("[%(asctime)s %(levelname)s] %(message)s")
    )
    # Assigning the list drops any handlers that were installed earlier.
    root.handlers = [stream]
    root.setLevel(logging.INFO)
    return root


# Module-wide logger used by every function in this file.
logger = _setup_logger()
# Directory that receives one JSON file per participant; created at import
# time if it does not exist yet.
DATA_DIR = "annotations_data2"
os.makedirs(DATA_DIR, exist_ok=True)

# Items to annotate, loaded once at import time. The handlers below index this
# as a list and read the keys "id", "prompt", "responseA" and "responseB".
# The encoding is pinned so decoding does not depend on the host's locale.
with open("test_pairs2.json", "r", encoding="utf-8") as f:
    response_pairs = json.load(f)
# Stylesheet handed to gr.Blocks(css=...) in create_interface().
# It pulls the Roboto font from Google Fonts and styles generic elements
# (body, button, textarea) plus a handful of classes. Of those classes,
# "instruction-panel" and "progress" are attached via elem_classes in
# create_interface(); the remaining ones are not referenced by name elsewhere
# in this file.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap');
body {
font-family: 'Roboto', sans-serif !important;
line-height: 1.6;
}
.panel {
border: 1px solid #e5e7eb !important;
border-radius: 12px !important;
padding: 20px !important;
}
button {
font-weight: 500 !important;
transition: all 0.2s ease !important;
font-family: 'Roboto', sans-serif !important;
}
button:hover {
transform: translateY(-1px);
}
.progress {
color: #4f46e5;
font-weight: 500;
}
textarea {
border-radius: 8px !important;
padding: 12px !important;
font-family: 'Roboto', sans-serif !important;
}
.selected-response {
border: 2px solid #4f46e5 !important;
background-color: #f5f3ff;
}
.instruction-panel {
background: #f8f9fa !important;
border: 1px solid #e0e0e0 !important;
border-radius: 12px !important;
padding: 25px !important;
margin-bottom: 25px !important;
}
.criteria-list {
margin-left: 20px !important;
list-style-type: none !important;
}
.criteria-item {
padding: 8px 0 !important;
}
.highlight {
color: #4f46e5;
font-weight: 500;
}
"""
class State:
    """Mutable record of one annotation run.

    NOTE(review): the file creates exactly one instance at module level and
    every handler reads/writes it, so the record is shared by the whole
    process rather than kept per browser session.
    """

    def __init__(self):
        # Reference point for the "duration" value written on save.
        self.start_time = datetime.now()
        # One dict per answered item, stored at the item's index.
        self.annotations = []
        # Empty until a participant submits an ID.
        self.prolific_id = ""
        # Index into the list of response pairs currently being shown.
        self.current_idx = 0


# The single shared instance used by the save/load helpers and UI handlers.
state = State()
def save_annotations():
    """Persist the current global ``state`` to ``<DATA_DIR>/<id>_latest.json``.

    The file holds the participant ID, the seconds elapsed since
    ``state.start_time``, the current item index and the annotation list.

    Returns:
        The path that was written, or ``None`` when no Prolific ID has been
        set yet (nothing is written in that case).
    """
    if not state.prolific_id:
        return None

    filepath = os.path.join(DATA_DIR, f"{state.prolific_id}_latest.json")
    data = {
        "prolific_id": state.prolific_id,
        "duration": (datetime.now() - state.start_time).total_seconds(),
        "current_idx": state.current_idx,
        "annotations": state.annotations,
    }

    # Write to a sibling temp file and swap it in, so an interrupted write
    # can never leave a truncated JSON file in place of the previous save.
    tmp_path = f"{filepath}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    os.replace(tmp_path, filepath)

    logger.info(f"Saved annotations to {filepath}")
    return filepath
def load_latest_data(prolific_id):
    """Load a participant's most recent save, if one exists.

    Args:
        prolific_id: ID used to build the file name ``<id>_latest.json``
            inside ``DATA_DIR``.

    Returns:
        The saved dict with ``current_idx`` clamped to
        ``0..len(response_pairs)``, or ``None`` when the file is missing or
        cannot be read/parsed (the error is logged).
    """
    filepath = os.path.join(DATA_DIR, f"{prolific_id}_latest.json")
    if not os.path.exists(filepath):
        return None

    try:
        # Context manager so the handle is closed even if parsing fails.
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
        # The upper bound is len(response_pairs), not len - 1: an index equal
        # to the number of pairs means "all items done", and the caller relies
        # on seeing that value to show the completion screen on resume.
        data["current_idx"] = min(
            max(data.get("current_idx", 0), 0), len(response_pairs)
        )
        return data
    except Exception as e:
        # Best effort: a damaged save must not block the participant.
        logger.error(f"Error loading {filepath}: {e}")
    return None
# Markdown shown on the landing section above the Prolific ID field.
INSTRUCTION = """
### Welcome! 🎉
In this task, you'll act as a judge comparing two AI chatbot responses. Your goal is to determine which response is better based on specific criteria.
### 📋 Task Overview:
- You'll evaluate multiple questions (prompts), each with two responses (Response A and B)
- Select the better response for each question based on the criteria below
- Your progress will be tracked
### 🏅 Evaluation Criteria:
1. **Perceived Usefulness**
→ Does the answer address the question effectively and provide relevant information?
2. **Social Presence**
→ Does the answer create "the feeling of being there with a 'real' person"?
### 🚀 Getting Started:
1. Input your Prolific ID to begin
2. Read the question carefully
3. Compare both responses side-by-side
4. Select the better response using the radio buttons
5. Provide optional feedback and confidence rating
6. Click "Next" to continue or "Previous" to review
**Note:** You must select a response and confidence level before proceeding to the next question.
*We do not expect any risks beyond what you’d experience in daily life from joining this study. You’ll just read questions and answers, pick your favorite, and rate your confidence—nothing stressful or harmful. It’s as safe as reading a webpage or filling out a short survey.*
*Thank you for contributing to our research! Your input is valuable.*
"""
# Condensed reminder rendered at the top of the annotation form.
# NOTE(review): this text names Helpfulness, Clarity and Emotion as the
# criteria, while INSTRUCTION lists Perceived Usefulness and Social Presence.
# Confirm which set participants are meant to apply and align the two texts.
MINI_INSTRUCTION = """You’ll compare two AI chatbot answers for different questions and pick the better one. Read the question, then look at Response A and Response B. Choose the one that’s better based on: Helpfulness (answers well, gives useful info), Clarity (clear, logical, on topic), and Emotion (understands feelings, fits the situation).
*Select your choice and rate your confidence. Click "Next" to move on or "Previous" to go back. You must pick a response and confidence level to continue. Thanks for helping with our research!*
"""
def create_interface():
    """Build the Gradio Blocks app for the pairwise response-comparison task.

    The page consists of three sections whose visibility is toggled by the
    handlers: Prolific ID entry, the annotation form, and a completion screen.
    Progress is written to disk through ``save_annotations()`` on every
    navigation step so a participant can resume with the same ID.

    Returns:
        The constructed ``gr.Blocks`` instance (not launched).

    NOTE(review): every handler reads and writes the module-level ``state``
    object, so simultaneous participants share one cursor and annotation
    list. Moving this into per-session ``gr.State`` needs matching changes in
    the save/load helpers.
    """
    # Local import: the module header only imports ``datetime`` itself.
    from datetime import timedelta

    with gr.Blocks(gr.themes.Ocean(), title="AI Response Evaluation", css=custom_css) as demo:
        # User ID Section
        with gr.Column(visible=True, elem_id="id_section") as id_section:
            with gr.Column(elem_classes="instruction-panel"):
                gr.Markdown(INSTRUCTION)
            gr.Markdown("---")
            gr.Markdown("## Prolific ID Verification")
            prolific_id = gr.Textbox(label="Enter your Prolific ID")
            id_submit_btn = gr.Button("Submit", variant="primary")

        # Main Interface
        with gr.Column(visible=False, elem_id="main_interface") as main_interface:
            progress_md = gr.Markdown("**Progress:** 0% (0/0)", elem_classes="progress")
            gr.HTML('<style>.prompt-highlight { background-color: #e6f7ff; padding: 10px; border: 1px solid #91d5ff; border-radius: 5px; }</style>')
            gr.Markdown(MINI_INSTRUCTION)
            gr.Markdown("---")
            gr.Markdown("### Current Question")
            prompt_box = gr.Markdown(elem_classes="prompt-highlight")
            with gr.Row():
                with gr.Column(variant="panel"):
                    gr.Markdown("### Response A")
                    response_a = gr.Markdown(height='200px')
                with gr.Column(variant="panel"):
                    gr.Markdown("### Response B")
                    response_b = gr.Markdown(height='200px')
            selection_radio = gr.Radio(
                choices=[("Response A", "A"), ("Response B", "B")],
                label="Select the better response",
            )
            feedback = gr.Textbox(label="Additional Feedback (optional)", lines=3)
            confidence = gr.Radio(
                choices=[("1 - Not confident", 1), ("2", 2), ("3", 3), ("4", 4), ("5 - Very confident", 5)],
                label="Confidence Rating",
            )
            with gr.Row():
                prev_btn = gr.Button("Previous", variant="secondary")
                next_btn = gr.Button("Next", variant="primary")

        # Completion Section
        with gr.Column(visible=False, elem_id="completion") as completion_section:
            gr.Markdown("# Thank You!")
            gr.Markdown("### Completion code: `CA7IOI65`")
            completion_md = gr.Markdown("Your annotations have been saved.")
            # NOTE(review): the link targets a "/researcher/" path; confirm it
            # is the participant-facing completion URL.
            gr.HTML("""
<p>Click <a href="https://app.prolific.com/researcher/submissions/complete?cc=CA7IOI65" target="_blank">here</a> to complete the task.</p>
""")

        def handle_id_submit(prolific_id_val):
            """Validate the ID, restore any saved progress, and pick the section to show."""
            participant_id = (prolific_id_val or "").strip()
            # The ID becomes part of a file name, so only letters, digits,
            # "-" and "_" are accepted; this keeps path separators and ".."
            # from reaching the file system.
            if not participant_id or not all(
                ch.isalnum() or ch in "-_" for ch in participant_id
            ):
                raise gr.Error("Please enter a valid Prolific ID")

            state.prolific_id = participant_id
            data = load_latest_data(state.prolific_id)

            # Restart the clock for this participant, carrying over time
            # already recorded in a previous save. Without this, "duration"
            # would be measured from when the server process started.
            prior_seconds = data.get("duration", 0) if data else 0
            if not isinstance(prior_seconds, (int, float)):
                prior_seconds = 0
            state.start_time = datetime.now() - timedelta(seconds=prior_seconds)

            if data:
                state.annotations = data.get("annotations", [])
                state.current_idx = data.get("current_idx", 0)
                if state.current_idx >= len(response_pairs):
                    # Already finished in an earlier visit: go straight to the end.
                    save_annotations()
                    return {
                        id_section: gr.update(visible=False),
                        main_interface: gr.update(visible=False),
                        completion_section: gr.update(visible=True),
                    }
            else:
                state.annotations = []
                state.current_idx = 0

            return {
                id_section: gr.update(visible=False),
                main_interface: gr.update(visible=True),
                completion_section: gr.update(visible=False),
                **update_interface(state.current_idx),
            }

        def update_interface(idx):
            """Return component values for item ``idx`` (clamped to the valid range)."""
            total = len(response_pairs)
            idx = max(0, min(idx, total - 1))
            # With no items loaded there is nothing to show and nothing to divide by.
            current_data = response_pairs[idx] if total else {}
            fraction = idx / total if total else 0
            progress = f"**Progress:** {fraction:.0%} ({idx}/{total})"
            annotation = state.annotations[idx] if idx < len(state.annotations) else None
            return {
                prompt_box: current_data.get("prompt", ""),
                response_a: current_data.get("responseA", ""),
                response_b: current_data.get("responseB", ""),
                progress_md: progress,
                feedback: annotation["feedback"] if annotation else "",
                confidence: annotation["confidence"] if annotation else None,
                selection_radio: annotation["selected"] if annotation else None,
            }

        def handle_navigation(direction, selection, confidence_val, feedback_val):
            """Store the current answer (when complete) and move to the next/previous item."""
            if direction == "next":
                problems = []
                if not selection:
                    problems.append("Please select a response before proceeding.")
                if not confidence_val:
                    problems.append("Please select a confidence level before proceeding.")
                if problems:
                    for message in problems:
                        gr.Warning(message)
                    # Only the section visibility is returned: leaving the form
                    # components out keeps whatever the participant has already
                    # entered instead of resetting it to the last saved values.
                    return {
                        main_interface: gr.update(visible=True),
                        completion_section: gr.update(visible=False),
                    }

            # Save current annotation
            if selection and confidence_val and state.current_idx < len(response_pairs):
                annotation = {
                    "id": response_pairs[state.current_idx]["id"],  # Save unique ID
                    "prompt": response_pairs[state.current_idx]["prompt"],
                    "selected": selection,
                    "confidence": confidence_val,
                    "feedback": feedback_val,
                    "timestamp": datetime.now().isoformat(),
                }
                if state.current_idx < len(state.annotations):
                    state.annotations[state.current_idx] = annotation
                else:
                    state.annotations.append(annotation)

            # Navigation logic
            try:
                if direction == "next":
                    new_idx = state.current_idx + 1
                else:
                    new_idx = max(0, state.current_idx - 1)
                state.current_idx = new_idx
                save_annotations()
                finished = new_idx >= len(response_pairs)
                return {
                    main_interface: gr.update(visible=not finished),
                    completion_section: gr.update(visible=finished),
                    **update_interface(new_idx),
                }
            except Exception as e:
                # Keep the form usable even if saving or rendering failed.
                logger.error(f"Navigation error: {e}")
                return {
                    main_interface: gr.update(visible=True),
                    completion_section: gr.update(visible=False),
                    **update_interface(state.current_idx),
                }

        # Event bindings
        id_submit_btn.click(
            handle_id_submit,
            inputs=prolific_id,
            outputs=[id_section, main_interface, completion_section, prompt_box,
                     response_a, response_b, progress_md, feedback, confidence, selection_radio],
        )
        prev_btn.click(
            handle_navigation,
            inputs=[gr.State("prev"), selection_radio, confidence, feedback],
            outputs=[main_interface, completion_section, prompt_box, response_a,
                     response_b, progress_md, feedback, confidence, selection_radio],
        )
        next_btn.click(
            handle_navigation,
            inputs=[gr.State("next"), selection_radio, confidence, feedback],
            outputs=[main_interface, completion_section, prompt_box, response_a,
                     response_b, progress_md, feedback, confidence, selection_radio],
        )
    return demo
# Script entry point: build the UI and start the Gradio server with the
# default launch settings. Nothing is launched when the module is imported.
if __name__ == "__main__":
    app = create_interface()
    app.launch()