Spaces:
Sleeping
Sleeping
import gradio as gr | |
import json | |
from datetime import datetime | |
import os | |
import logging | |
def _setup_logger(): | |
log_format = logging.Formatter("[%(asctime)s %(levelname)s] %(message)s") | |
logger = logging.getLogger() | |
logger.setLevel(logging.INFO) | |
console_handler = logging.StreamHandler() | |
console_handler.setFormatter(log_format) | |
logger.handlers = [console_handler] | |
return logger | |
logger = _setup_logger() | |
# Directory that receives the per-participant annotation files; it is
# created at import time if it does not exist yet.
DATA_DIR = "annotations_data2"
os.makedirs(DATA_DIR, exist_ok=True)

# Evaluation items, loaded once at import time. The file is decoded
# explicitly as UTF-8 so that non-ASCII prompt/response text does not depend
# on the host's locale encoding.
with open("test_pairs2.json", "r", encoding="utf-8") as f:
    response_pairs = json.load(f)
# Stylesheet handed to gr.Blocks(css=...) in create_interface(). It pulls in
# the Roboto web font and styles panels, buttons, the progress line, text
# areas and the instruction panel.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap');
body {
font-family: 'Roboto', sans-serif !important;
line-height: 1.6;
}
.panel {
border: 1px solid #e5e7eb !important;
border-radius: 12px !important;
padding: 20px !important;
}
button {
font-weight: 500 !important;
transition: all 0.2s ease !important;
font-family: 'Roboto', sans-serif !important;
}
button:hover {
transform: translateY(-1px);
}
.progress {
color: #4f46e5;
font-weight: 500;
}
textarea {
border-radius: 8px !important;
padding: 12px !important;
font-family: 'Roboto', sans-serif !important;
}
.selected-response {
border: 2px solid #4f46e5 !important;
background-color: #f5f3ff;
}
.instruction-panel {
background: #f8f9fa !important;
border: 1px solid #e0e0e0 !important;
border-radius: 12px !important;
padding: 25px !important;
margin-bottom: 25px !important;
}
.criteria-list {
margin-left: 20px !important;
list-style-type: none !important;
}
.criteria-item {
padding: 8px 0 !important;
}
.highlight {
color: #4f46e5;
font-weight: 500;
}
"""
class State:
    """Mutable progress record for an annotation session.

    Attributes are plain instance fields:
      * ``start_time``   - ``datetime`` captured when the object is created.
      * ``annotations``  - list of saved annotation dicts, one per item.
      * ``prolific_id``  - participant ID, empty until one is submitted.
      * ``current_idx``  - index of the item currently being shown.
    """

    def __init__(self):
        self.start_time = datetime.now()
        self.annotations = []
        self.prolific_id = ""
        self.current_idx = 0


# Single module-level instance used by save_annotations() and by the UI
# callbacks defined in create_interface().
state = State()
def save_annotations():
    """Persist the current session state to ``<DATA_DIR>/<prolific_id>_latest.json``.

    The file contains the participant ID, the seconds elapsed since
    ``state.start_time``, the current item index and the annotation list.

    Returns:
        The path that was written, or ``None`` when nothing was saved
        (no Prolific ID set yet, or the ID is not usable as a file name).

    Raises:
        OSError: if the file cannot be written.
    """
    if not state.prolific_id:
        return None

    filename = f"{state.prolific_id}_latest.json"
    # The ID is typed in by the participant; refuse anything containing a
    # path separator so it cannot address a file outside DATA_DIR.
    if os.path.basename(filename) != filename:
        logger.error(
            "Refusing to save annotations for unsafe Prolific ID %r",
            state.prolific_id,
        )
        return None
    filepath = os.path.join(DATA_DIR, filename)

    data = {
        "prolific_id": state.prolific_id,
        "duration": (datetime.now() - state.start_time).total_seconds(),
        "current_idx": state.current_idx,
        "annotations": state.annotations,
    }

    # Write to a sibling temp file and rename it into place, so an
    # interrupted write never leaves a truncated progress file behind.
    tmp_path = f"{filepath}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    os.replace(tmp_path, filepath)

    logger.info("Saved annotations to %s", filepath)
    return filepath
def load_latest_data(prolific_id):
    """Load previously saved progress for ``prolific_id``, if any.

    Args:
        prolific_id: Participant ID used to build the file name
            ``<prolific_id>_latest.json`` inside ``DATA_DIR``.

    Returns:
        The saved dict with ``current_idx`` clamped to
        ``[0, len(response_pairs)]``, or ``None`` when no usable file
        exists (missing, unreadable, malformed, or unsafe ID).
    """
    filename = f"{prolific_id}_latest.json"
    # The ID comes from a text box; refuse anything containing a path
    # separator so it cannot address a file outside DATA_DIR.
    if os.path.basename(filename) != filename:
        logger.error(
            "Refusing to load annotations for unsafe Prolific ID %r",
            prolific_id,
        )
        return None
    filepath = os.path.join(DATA_DIR, filename)

    try:
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
        # The upper bound is len(response_pairs), not len - 1: that value is
        # what save_annotations() stores once every item has been answered,
        # and the caller relies on it to show the completion screen instead
        # of reopening the last question.
        data["current_idx"] = min(
            max(data["current_idx"], 0), len(response_pairs)
        )
    except FileNotFoundError:
        # No saved progress for this participant yet.
        return None
    except (OSError, ValueError, KeyError, TypeError) as e:
        logger.error("Error loading %s: %s", filepath, e)
        return None
    return data
# Full task instructions, rendered as Markdown on the Prolific-ID screen.
# NOTE(review): the decorative glyphs that used to sit in the headings and in
# front of the criterion descriptions were mis-encoded and could not be
# recovered; they have been dropped (headings) or replaced by a plain nested
# bullet (criteria). Re-add emoji here if desired, saving the file as UTF-8.
INSTRUCTION = """
### Welcome!
In this task, you'll act as a judge comparing two AI chatbot responses. Your goal is to determine which response is better based on specific criteria.

### Task Overview:
- You'll evaluate multiple questions (prompts), each with two responses (Response A and B)
- Select the better response for each question based on the criteria below
- Your progress will be tracked

### Evaluation Criteria:
1. **Perceived Usefulness**
   - Does the answer address the question effectively and provide relevant information?
2. **Social Presence**
   - Does the answer create "the feeling of being there with a 'real' person"?

### Getting Started:
1. Input your Prolific ID to begin
2. Read the question carefully
3. Compare both responses side-by-side
4. Select the better response using the radio buttons
5. Provide optional feedback and confidence rating
6. Click "Next" to continue or "Previous" to review

**Note:** You must select a response and confidence level before proceeding to the next question.

*Thank you for contributing to our research! Your input is valuable.*
"""
# Short reminder shown above every question in the main interface.
# The apostrophes were mis-encoded typographic quotes; they are restored here
# as plain ASCII apostrophes.
# NOTE(review): the criteria named here (Helpfulness, Clarity, Emotion) differ
# from the two criteria listed in INSTRUCTION - confirm which set is intended.
MINI_INSTRUCTION = """You'll compare two AI chatbot answers for different questions and pick the better one. Read the question, then look at Response A and Response B. Choose the one that's better based on: Helpfulness (answers well, gives useful info), Clarity (clear, logical, on topic), and Emotion (understands feelings, fits the situation).
*Select your choice and rate your confidence. Click "Next" to move on or "Previous" to go back. You must pick a response and confidence level to continue. Thanks for helping with our research!*
"""
def create_interface():
    """Build the Gradio Blocks app for the pairwise response-evaluation task.

    The app has three mutually exclusive sections:
      1. an ID screen showing ``INSTRUCTION`` and a Prolific-ID text box,
      2. the main annotation screen (prompt, Response A/B, choice,
         confidence, optional feedback, Previous/Next), and
      3. a completion screen with the completion code.

    Each entry of ``response_pairs`` is read through the keys ``"id"``,
    ``"prompt"``, ``"responseA"`` and ``"responseB"``.

    NOTE(review): all callbacks read and write the module-level ``state``
    object, so concurrent participants share one progress record. Moving
    this into a per-session ``gr.State`` (and passing it to the save helper)
    would be needed for multi-user deployments - confirm the intended usage.

    Returns:
        The constructed ``gr.Blocks`` app (not launched).
    """
    with gr.Blocks(gr.themes.Ocean(), title="AI Response Evaluation", css=custom_css) as demo:
        # User ID Section
        with gr.Column(visible=True, elem_id="id_section") as id_section:
            with gr.Column(elem_classes="instruction-panel"):
                gr.Markdown(INSTRUCTION)
            gr.Markdown("---")
            gr.Markdown("## Prolific ID Verification")
            prolific_id = gr.Textbox(label="Enter your Prolific ID")
            id_submit_btn = gr.Button("Submit", variant="primary")

        # Main Interface
        with gr.Column(visible=False, elem_id="main_interface") as main_interface:
            progress_md = gr.Markdown("**Progress:** 0% (0/0)", elem_classes="progress")
            gr.HTML('<style>.prompt-highlight { background-color: #e6f7ff; padding: 10px; border: 1px solid #91d5ff; border-radius: 5px; }</style>')
            gr.Markdown(MINI_INSTRUCTION)
            gr.Markdown("---")
            gr.Markdown("### Current Question")
            prompt_box = gr.Markdown(elem_classes="prompt-highlight")
            with gr.Row():
                with gr.Column(variant="panel"):
                    gr.Markdown("### Response A")
                    response_a = gr.Markdown(height='200px')
                with gr.Column(variant="panel"):
                    gr.Markdown("### Response B")
                    response_b = gr.Markdown(height='200px')
            selection_radio = gr.Radio(
                choices=[("Response A", "A"), ("Response B", "B")],
                label="Select the better response",
            )
            feedback = gr.Textbox(label="Additional Feedback (optional)", lines=3)
            confidence = gr.Radio(
                choices=[("1 - Not confident", 1), ("2", 2), ("3", 3), ("4", 4), ("5 - Very confident", 5)],
                label="Confidence Rating",
            )
            with gr.Row():
                prev_btn = gr.Button("Previous", variant="secondary")
                next_btn = gr.Button("Next", variant="primary")

        # Completion Section
        with gr.Column(visible=False, elem_id="completion") as completion_section:
            gr.Markdown("# Thank You!")
            gr.Markdown("### Completion code: `CA7IOI65`")
            completion_md = gr.Markdown("Your annotations have been saved.")
            # NOTE(review): this link points at a ".../researcher/submissions/..."
            # path; verify against Prolific's documented participant
            # completion URL before relying on it.
            gr.HTML("""
<p>Click <a href="https://app.prolific.com/researcher/submissions/complete?cc=CA7IOI65" target="_blank">here</a> to complete the task.</p>
""")

        def handle_id_submit(prolific_id_val):
            """Start or resume a session for the submitted Prolific ID."""
            cleaned_id = (prolific_id_val or "").strip()
            if not cleaned_id:
                raise gr.Error("Please enter a valid Prolific ID")
            state.prolific_id = cleaned_id
            # Measure "duration" from the moment the participant starts,
            # not from when the server process imported this module.
            state.start_time = datetime.now()

            data = load_latest_data(state.prolific_id)
            if data:
                state.annotations = data.get("annotations", [])
                state.current_idx = data.get("current_idx", 0)
                if state.current_idx >= len(response_pairs):
                    # Everything was already answered: go straight to the
                    # completion screen.
                    save_annotations()
                    return {
                        id_section: gr.update(visible=False),
                        main_interface: gr.update(visible=False),
                        completion_section: gr.update(visible=True),
                    }
            else:
                state.annotations = []
                state.current_idx = 0

            return {
                id_section: gr.update(visible=False),
                main_interface: gr.update(visible=True),
                completion_section: gr.update(visible=False),
                **update_interface(state.current_idx),
            }

        def update_interface(idx):
            """Return component updates that render item ``idx`` (clamped)."""
            total = len(response_pairs)
            idx = max(0, min(idx, total - 1))
            current_data = response_pairs[idx] if idx < total else {}
            # Guard the division so an empty item list cannot crash the UI.
            fraction = idx / total if total else 0
            progress = f"**Progress:** {fraction:.0%} ({idx}/{total})"
            annotation = state.annotations[idx] if idx < len(state.annotations) else None
            return {
                prompt_box: current_data.get("prompt", ""),
                response_a: current_data.get("responseA", ""),
                response_b: current_data.get("responseB", ""),
                progress_md: progress,
                feedback: annotation["feedback"] if annotation else "",
                confidence: annotation["confidence"] if annotation else None,
                selection_radio: annotation["selected"] if annotation else None,
            }

        def handle_navigation(direction, selection, confidence_val, feedback_val):
            """Validate, store the current answer and move to the next/previous item."""
            if direction == "next":
                error_msg = None
                if not selection:
                    error_msg = "Please select a response before proceeding."
                if not confidence_val:
                    error_msg = "Please select a confidence level before proceeding."
                if error_msg:
                    gr.Warning(error_msg)
                    # Return only the visibility updates. Leaving the input
                    # components out of the dict keeps whatever the
                    # participant has already entered instead of resetting
                    # it to the last saved annotation.
                    return {
                        main_interface: gr.update(visible=True),
                        completion_section: gr.update(visible=False),
                    }

            # Save current annotation (only when complete and the index is valid)
            if selection and confidence_val and 0 <= state.current_idx < len(response_pairs):
                current_pair = response_pairs[state.current_idx]
                annotation = {
                    "id": current_pair["id"],  # Save unique ID
                    "prompt": current_pair["prompt"],
                    "selected": selection,
                    "confidence": confidence_val,
                    "feedback": feedback_val,
                    "timestamp": datetime.now().isoformat(),
                }
                if state.current_idx < len(state.annotations):
                    state.annotations[state.current_idx] = annotation
                else:
                    state.annotations.append(annotation)

            # Navigation logic
            try:
                if direction == "next":
                    new_idx = state.current_idx + 1
                else:
                    new_idx = max(0, state.current_idx - 1)
                state.current_idx = new_idx
                save_annotations()
                finished = new_idx >= len(response_pairs)
                return {
                    main_interface: gr.update(visible=not finished),
                    completion_section: gr.update(visible=finished),
                    **update_interface(new_idx),
                }
            except Exception:
                # Best effort: keep the participant on the main screen and
                # record the failure (with traceback) instead of crashing.
                logger.exception("Navigation error")
                return {
                    main_interface: gr.update(visible=True),
                    completion_section: gr.update(visible=False),
                    **update_interface(state.current_idx),
                }

        # Event bindings
        nav_outputs = [main_interface, completion_section, prompt_box, response_a,
                       response_b, progress_md, feedback, confidence, selection_radio]
        id_submit_btn.click(
            handle_id_submit,
            inputs=prolific_id,
            outputs=[id_section, *nav_outputs],
        )
        prev_btn.click(
            handle_navigation,
            inputs=[gr.State("prev"), selection_radio, confidence, feedback],
            outputs=nav_outputs,
        )
        next_btn.click(
            handle_navigation,
            inputs=[gr.State("next"), selection_radio, confidence, feedback],
            outputs=nav_outputs,
        )
    return demo
# Script entry point: build the Blocks app and start the Gradio server.
if __name__ == "__main__":
    app = create_interface()
    # Listens on every network interface at port 7861; share=True also asks
    # Gradio to create a public share link.
    app.launch(server_name="0.0.0.0", server_port=7861, share=True)