User-Study / app.py
yixuantt's picture
Upload 2 files
a044e1e verified
raw
history blame
12.7 kB
import gradio as gr
import json
from datetime import datetime
import os
import logging
def _setup_logger():
log_format = logging.Formatter("[%(asctime)s %(levelname)s] %(message)s")
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
console_handler.setFormatter(log_format)
logger.handlers = [console_handler]
return logger
# Root logger used by every function in this module.
logger = _setup_logger()

# Directory that receives one "<prolific_id>_latest.json" file per participant.
DATA_DIR = "annotations_data2"
os.makedirs(DATA_DIR, exist_ok=True)

# Response pairs shown to annotators. Each item is read elsewhere in this file
# through the keys "id", "prompt", "responseA" and "responseB".
# JSON interchange text is UTF-8, so decode it explicitly instead of relying on
# the platform's locale encoding.
with open("test_pairs2.json", "r", encoding="utf-8") as f:
    response_pairs = json.load(f)
# Stylesheet handed to gr.Blocks(css=...) in create_interface(); the
# ".instruction-panel" and ".progress" classes are attached there via elem_classes.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap');
body {
font-family: 'Roboto', sans-serif !important;
line-height: 1.6;
}
.panel {
border: 1px solid #e5e7eb !important;
border-radius: 12px !important;
padding: 20px !important;
}
button {
font-weight: 500 !important;
transition: all 0.2s ease !important;
font-family: 'Roboto', sans-serif !important;
}
button:hover {
transform: translateY(-1px);
}
.progress {
color: #4f46e5;
font-weight: 500;
}
textarea {
border-radius: 8px !important;
padding: 12px !important;
font-family: 'Roboto', sans-serif !important;
}
.selected-response {
border: 2px solid #4f46e5 !important;
background-color: #f5f3ff;
}
.instruction-panel {
background: #f8f9fa !important;
border: 1px solid #e0e0e0 !important;
border-radius: 12px !important;
padding: 25px !important;
margin-bottom: 25px !important;
}
.criteria-list {
margin-left: 20px !important;
list-style-type: none !important;
}
.criteria-item {
padding: 8px 0 !important;
}
.highlight {
color: #4f46e5;
font-weight: 500;
}
"""
class State:
    """Mutable progress record for an annotation run.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # Reference point for the "duration" value written by save_annotations().
        self.start_time = datetime.now()
        # Participant identifier; an empty string means nobody has signed in yet.
        self.prolific_id = ""
        # One dict per answered question, stored at the question's index.
        self.annotations = []
        # Index into response_pairs of the question currently on screen.
        self.current_idx = 0


# Single process-wide instance used by the save/load helpers and UI handlers.
# NOTE(review): being module-level, this object looks shared by every browser
# session served by the process — confirm that concurrent participants are
# not expected, or move to per-session state.
state = State()
def save_annotations():
    """Persist the current ``state`` to ``<DATA_DIR>/<prolific_id>_latest.json``.

    The payload holds the Prolific ID, the seconds elapsed since
    ``state.start_time``, the current question index and the annotation list.

    Returns:
        The path of the written file, or ``None`` when no Prolific ID has
        been set yet (nothing is written in that case).
    """
    if not state.prolific_id:
        return None

    filepath = os.path.join(DATA_DIR, f"{state.prolific_id}_latest.json")
    data = {
        "prolific_id": state.prolific_id,
        "duration": (datetime.now() - state.start_time).total_seconds(),
        "current_idx": state.current_idx,
        "annotations": state.annotations,
    }

    # Write to a sibling temp file and rename it over the target so that an
    # interrupted write can never leave a truncated JSON file behind; a
    # half-written file would be rejected on reload and the participant's
    # progress lost.
    tmp_path = f"{filepath}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    os.replace(tmp_path, filepath)

    logger.info(f"Saved annotations to {filepath}")
    return filepath
def load_latest_data(prolific_id):
    """Load the saved progress record for ``prolific_id``, if one exists.

    Args:
        prolific_id: Participant identifier used to build the file name
            ``<DATA_DIR>/<prolific_id>_latest.json``.

    Returns:
        The decoded record with ``"current_idx"`` clamped to
        ``0..len(response_pairs)``, or ``None`` when the file is missing or
        cannot be read/decoded (the failure is logged).
    """
    filepath = os.path.join(DATA_DIR, f"{prolific_id}_latest.json")
    if not os.path.exists(filepath):
        return None

    try:
        # Context manager so the handle is closed even if decoding fails.
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
        # The upper bound is len(response_pairs), not len - 1: an index equal
        # to the number of pairs means "all questions answered", and the
        # caller relies on seeing that value to show the completion page.
        saved_idx = data.get("current_idx", 0)
        data["current_idx"] = min(max(saved_idx, 0), len(response_pairs))
        return data
    except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
        # Best-effort load: an unreadable or malformed file is treated the
        # same as "no saved progress".
        logger.error(f"Error loading {filepath}: {e}")
        return None
# Full task briefing (Markdown) rendered on the Prolific ID screen.
INSTRUCTION = """
### Welcome! 🎉
In this task, you'll act as a judge comparing two AI chatbot responses. Your goal is to determine which response is better based on specific criteria.
### 📋 Task Overview:
- You'll evaluate multiple questions (prompts), each with two responses (Response A and B)
- Select the better response for each question based on the criteria below
- Your progress will be tracked
### 🏅 Evaluation Criteria:
1. **Perceived Usefulness**
→ Does the answer address the question effectively and provide relevant information?
2. **Social Presence**
→ Does the answer create "the feeling of being there with a 'real' person"?
### 🚀 Getting Started:
1. Input your Prolific ID to begin
2. Read the question carefully
3. Compare both responses side-by-side
4. Select the better response using the radio buttons
5. Provide optional feedback and confidence rating
6. Click "Next" to continue or "Previous" to review
**Note:** You must select a response and confidence level before proceeding to the next question.
*Thank you for contributing to our research! Your input is valuable.*
"""
# Condensed reminder (Markdown) shown above every question.
# NOTE(review): the criteria named here (Helpfulness, Clarity, Emotion) differ
# from the ones in INSTRUCTION (Perceived Usefulness, Social Presence) —
# confirm which set participants are meant to apply.
MINI_INSTRUCTION = """You’ll compare two AI chatbot answers for different questions and pick the better one. Read the question, then look at Response A and Response B. Choose the one that’s better based on: Helpfulness (answers well, gives useful info), Clarity (clear, logical, on topic), and Emotion (understands feelings, fits the situation).
*Select your choice and rate your confidence. Click "Next" to move on or "Previous" to go back. You must pick a response and confidence level to continue. Thanks for helping with our research!*
"""
def create_interface():
    """Build the Gradio Blocks app for the pairwise response-comparison study.

    The page has three sections, only one of which is visible at a time:
    the Prolific ID form, the main annotation view, and the completion page.
    Event handlers are defined as closures so they can refer to the
    components directly when returning partial updates.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).

    NOTE(review): the handlers read and mutate the module-level ``state``
    object, so participants using the app at the same time appear to share a
    single progress record — confirm whether per-session ``gr.State`` is
    required before running with concurrent participants.
    """
    with gr.Blocks(gr.themes.Ocean(), title="AI Response Evaluation", css=custom_css) as demo:
        # User ID Section
        with gr.Column(visible=True, elem_id="id_section") as id_section:
            with gr.Column(elem_classes="instruction-panel"):
                gr.Markdown(INSTRUCTION)
            gr.Markdown("---")
            gr.Markdown("## Prolific ID Verification")
            prolific_id = gr.Textbox(label="Enter your Prolific ID")
            id_submit_btn = gr.Button("Submit", variant="primary")

        # Main Interface
        with gr.Column(visible=False, elem_id="main_interface") as main_interface:
            progress_md = gr.Markdown("**Progress:** 0% (0/0)", elem_classes="progress")
            gr.HTML('<style>.prompt-highlight { background-color: #e6f7ff; padding: 10px; border: 1px solid #91d5ff; border-radius: 5px; }</style>')
            gr.Markdown(MINI_INSTRUCTION)
            gr.Markdown("---")
            gr.Markdown("### Current Question")
            prompt_box = gr.Markdown(elem_classes="prompt-highlight")
            with gr.Row():
                with gr.Column(variant="panel"):
                    gr.Markdown("### Response A")
                    response_a = gr.Markdown(height='200px')
                with gr.Column(variant="panel"):
                    gr.Markdown("### Response B")
                    response_b = gr.Markdown(height='200px')
            selection_radio = gr.Radio(
                choices=[("Response A", "A"), ("Response B", "B")],
                label="Select the better response",
            )
            feedback = gr.Textbox(label="Additional Feedback (optional)", lines=3)
            confidence = gr.Radio(
                choices=[("1 - Not confident", 1), ("2", 2), ("3", 3), ("4", 4), ("5 - Very confident", 5)],
                label="Confidence Rating",
            )
            with gr.Row():
                prev_btn = gr.Button("Previous", variant="secondary")
                next_btn = gr.Button("Next", variant="primary")

        # Completion Section
        with gr.Column(visible=False, elem_id="completion") as completion_section:
            gr.Markdown("# Thank You!")
            gr.Markdown("### Completion code: `CA7IOI65`")
            completion_md = gr.Markdown("Your annotations have been saved.")
            # NOTE(review): the link targets a "/researcher/" path — confirm
            # this is the completion URL participants are supposed to open.
            gr.HTML("""
<p>Click <a href="https://app.prolific.com/researcher/submissions/complete?cc=CA7IOI65" target="_blank">here</a> to complete the task.</p>
""")

        def handle_id_submit(prolific_id_val):
            """Register the participant and show the screen matching their progress.

            Raises:
                gr.Error: if the ID is empty or contains characters other
                    than letters, digits, "-" and "_".
            """
            participant_id = (prolific_id_val or "").strip()
            # The ID becomes part of a file name inside DATA_DIR, so reject
            # anything that could act as a path component ("/", "\\", "..").
            id_is_safe = bool(participant_id) and all(
                ch.isalnum() or ch in "-_" for ch in participant_id
            )
            if not id_is_safe:
                raise gr.Error("Please enter a valid Prolific ID")

            state.prolific_id = participant_id
            # Restart the clock for this participant; otherwise the saved
            # "duration" would be measured from process start-up.
            state.start_time = datetime.now()

            data = load_latest_data(state.prolific_id)
            if data:
                state.annotations = data.get("annotations", [])
                state.current_idx = data.get("current_idx", 0)
                if state.current_idx >= len(response_pairs):
                    # Every question was already answered: go straight to
                    # the completion page.
                    save_annotations()
                    return {
                        id_section: gr.update(visible=False),
                        main_interface: gr.update(visible=False),
                        completion_section: gr.update(visible=True),
                    }
            else:
                state.annotations = []
                state.current_idx = 0

            return {
                id_section: gr.update(visible=False),
                main_interface: gr.update(visible=True),
                completion_section: gr.update(visible=False),
                **update_interface(state.current_idx),
            }

        def update_interface(idx):
            """Return component updates that render question ``idx``.

            ``idx`` is clamped into the valid range; previously saved answers
            for that question are restored into the input widgets.
            """
            total = len(response_pairs)
            idx = max(0, min(idx, total - 1))
            # Guard the empty-dataset case so rendering never raises
            # IndexError or ZeroDivisionError.
            current_data = response_pairs[idx] if total else {}
            fraction = idx / total if total else 0
            progress = f"**Progress:** {fraction:.0%} ({idx}/{total})"
            annotation = state.annotations[idx] if idx < len(state.annotations) else None
            return {
                prompt_box: current_data.get("prompt", ""),
                response_a: current_data.get("responseA", ""),
                response_b: current_data.get("responseB", ""),
                progress_md: progress,
                feedback: annotation["feedback"] if annotation else "",
                confidence: annotation["confidence"] if annotation else None,
                selection_radio: annotation["selected"] if annotation else None,
            }

        def handle_navigation(direction, selection, confidence_val, feedback_val):
            """Store the current answer (if complete) and move to the next/previous question.

            Args:
                direction: ``"next"`` or ``"prev"``.
                selection: Chosen response (``"A"``/``"B"``) or ``None``.
                confidence_val: Confidence rating 1-5 or ``None``.
                feedback_val: Free-text feedback.

            "Next" requires both a selection and a confidence rating;
            "Previous" saves the answer only when both are present.
            """
            if direction == "next":
                error_msg = None
                if not selection:
                    error_msg = "Please select a response before proceeding."
                elif not confidence_val:
                    error_msg = "Please select a confidence level before proceeding."
                if error_msg:
                    gr.Warning(error_msg)
                    # Return visibility updates only. Components left out of
                    # the dict keep their current values, so whatever the
                    # participant already entered is not wiped.
                    return {
                        main_interface: gr.update(visible=True),
                        completion_section: gr.update(visible=False),
                    }

            # Save current annotation
            if selection and confidence_val:
                current_pair = response_pairs[state.current_idx]
                annotation = {
                    "id": current_pair["id"],  # Save unique ID
                    "prompt": current_pair["prompt"],
                    "selected": selection,
                    "confidence": confidence_val,
                    "feedback": feedback_val,
                    "timestamp": datetime.now().isoformat(),
                }
                if state.current_idx < len(state.annotations):
                    state.annotations[state.current_idx] = annotation
                else:
                    state.annotations.append(annotation)

            # Navigation logic
            try:
                if direction == "next":
                    new_idx = state.current_idx + 1
                else:
                    new_idx = max(0, state.current_idx - 1)
                state.current_idx = new_idx
                save_annotations()

                finished = new_idx >= len(response_pairs)
                return {
                    main_interface: gr.update(visible=not finished),
                    completion_section: gr.update(visible=finished),
                    **update_interface(new_idx),
                }
            except Exception as e:
                # Keep the participant on the annotation screen rather than
                # surfacing a server error; log with traceback for debugging.
                logger.exception(f"Navigation error: {e}")
                return {
                    main_interface: gr.update(visible=True),
                    completion_section: gr.update(visible=False),
                    **update_interface(state.current_idx),
                }

        # Event bindings
        id_submit_btn.click(
            handle_id_submit,
            inputs=prolific_id,
            outputs=[id_section, main_interface, completion_section, prompt_box,
                     response_a, response_b, progress_md, feedback, confidence, selection_radio],
        )
        prev_btn.click(
            handle_navigation,
            inputs=[gr.State("prev"), selection_radio, confidence, feedback],
            outputs=[main_interface, completion_section, prompt_box, response_a,
                     response_b, progress_md, feedback, confidence, selection_radio],
        )
        next_btn.click(
            handle_navigation,
            inputs=[gr.State("next"), selection_radio, confidence, feedback],
            outputs=[main_interface, completion_section, prompt_box, response_a,
                     response_b, progress_md, feedback, confidence, selection_radio],
        )
    return demo
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all interfaces at
    # port 7861 with share=True.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7861,
        "share": True,
    }
    app = create_interface()
    app.launch(**launch_options)