evaluation

Sleeping

App Files Files Community

iyosha committed on Apr 9

Commit

38c5e59

verified ·

1 Parent(s): 50df258

Upload 11 files

Browse files

Files changed (11) hide show

app.py +314 -0
backend/__init__.py +1 -0
backend/backend.py +66 -0
backend/helpers.py +20 -0
clients.py +8 -0
configs.py +19 -0
logger/__init__.py +1 -0
logger/json_formatter.py +53 -0
logger/logger.py +66 -0
main.py +9 -0
requirements.txt +9 -0

app.py ADDED Viewed

	@@ -0,0 +1,314 @@

+import gradio as gr
+from uuid import uuid4
+from datasets import load_dataset
+from collections import Counter
+from .configs import configs
+from .clients import backend, logger
+from .backend.helpers import get_random_session_samples
# Test split of the stressBench dataset, loaded once at import time and shared
# by both tabs.
dataset = load_dataset(
    path="iyosha-huji/stressBench",
    token=configs.HF_API_TOKEN,
)["test"]
def human_eval_tab(samples_per_session=30):
    """Build the "Evaluation" tab: a login form followed by the question flow.

    Intended to be called inside an open ``gr.Blocks`` context. After a
    successful login a session id is generated and a list of dataset indices is
    drawn for the session; the user then steps through those samples, and every
    submitted answer is written to the backend (updated if a row for this
    sample and session already exists, appended otherwise).

    Args:
        samples_per_session: Upper bound on the number of samples drawn for one
            login session. Defaults to 30, the previously hard-coded value.
    """
    # Local import: keeps the constant-time comparison available without
    # requiring a change to the module's import block.
    from secrets import compare_digest

    def credentials_match(given, expected):
        """Return True when both strings are equal, compared in constant time."""
        # Encode to bytes because compare_digest rejects non-ASCII str inputs;
        # ``given or ""`` tolerates an empty/None textbox value.
        return compare_digest(
            (given or "").encode("utf-8"), expected.encode("utf-8")
        )

    with gr.Tab(label="Evaluation"):
        # ==== State ====
        i = gr.State(-1)  # position inside the session's sample list; -1 = not started
        selected_answer = gr.State(None)
        answers_dict = gr.State({})  # position -> answer chosen so far
        logged_in = gr.State(False)
        session_id = gr.State(None)
        session_sample_indices = gr.State([])  # dataset indices for this session

        # === Login UI ===
        with gr.Group(visible=True) as login_group:
            gr.Markdown("### 🔐 Login to Continue")
            with gr.Row():
                username = gr.Text(label="Username", placeholder="Enter username")
                password = gr.Text(
                    label="Password", type="password", placeholder="Enter password"
                )
            login_error = gr.Markdown(
                "\u274c Incorrect login, try again.", visible=False
            )
            login_btn = gr.Button("Login")

        def login(u, p):
            """Validate credentials and, on success, start a new session."""
            # Evaluate both checks before branching so a wrong username does
            # not short-circuit the password comparison.
            user_ok = credentials_match(u, configs.USER_NAME)
            password_ok = credentials_match(p, configs.USER_PASSWORD)
            if not (user_ok and password_ok):
                # Keep the login form visible and reveal the error message.
                return False, gr.update(visible=True), gr.update(visible=True), None, []

            new_session_id = str(uuid4())
            current_rows = backend.get_all_rows()
            sample_indices = get_random_session_samples(
                current_rows, dataset, num_samples=samples_per_session
            )
            logger.info(f"Session ID: {new_session_id}")
            return (
                True,
                gr.update(visible=False),
                gr.update(visible=False),
                new_session_id,
                sample_indices,
            )

        login_btn.click(
            fn=login,
            inputs=[username, password],
            outputs=[
                logged_in,
                login_group,
                login_error,
                session_id,
                session_sample_indices,
            ],
        )

        # === UI Elements ===
        next_btn = gr.Button("Start", visible=False)
        prev_btn = gr.Button("Previous Sample", visible=False)
        warning_msg = gr.Markdown(
            "<span style='color:red;'>\u26a0\ufe0f Please select an answer before continuing.</span>",
            visible=False,
        )
        with gr.Group(visible=False) as app_group:
            with gr.Group():
                gr.Markdown("<div align='center'><big><b>Instructions</b></big></div>")
                gr.Markdown(
                    "<div align='center'>You are given an audio sample and a question with 2 answer options.\n\nListen to the audio and select the correct answer from the options below.</div>"
                )
            with gr.Group(visible=False) as question_group:
                with gr.Row(show_progress=True):
                    with gr.Column(variant="compact"):
                        sample_info = gr.Markdown()
                        gr.Markdown("**Question:**")
                        question_md = gr.Markdown()
                        radio = gr.Radio(label="Answer:", interactive=True)
                    with gr.Column(variant="compact"):
                        audio_output = gr.Audio()
        with gr.Group(
            visible=False, elem_id="final_page"
        ) as final_group:  # Final page, not visible until the end
            gr.Markdown(
                """
            # 🎉 Thanks for your help!
            You helped moving science forward 🤓
            Your responses have been recorded.
            You may now close this tab.
            """
            )

        # === Logic ===
        def update_ui(i, answers, session_sample_indices):
            """Render the sample at position ``i`` (or hide the question at -1)."""
            if i == -1:  # We haven't started yet (or we just finished)
                return (
                    gr.update(visible=False),
                    "",
                    "",
                    gr.update(visible=False),
                    gr.update(visible=False),
                    None,
                )
            # show the question
            true_index = session_sample_indices[i]
            sample = dataset[true_index]
            audio_data = (sample["audio"]["sampling_rate"], sample["audio"]["array"])
            # Restore the earlier choice when the user navigates back.
            previous_answer = answers.get(i, None)
            return (
                gr.update(visible=True),
                f"<div align='center'>Sample <b>{i+1}</b> out of <b>{len(session_sample_indices)}</b></div>",
                "Out of the following answers, according to the speaker's stressed words, what is most likely the underlying intention of the speaker?",
                gr.Audio(value=audio_data, label="Audio:"),
                gr.Radio(
                    choices=sample["possible_answers"],
                    value=previous_answer,
                    label="Answer:",
                ),
                previous_answer,
            )

        def update_next_index(i, answer, answers, session_id, session_sample_indices):
            """Persist the current answer and advance (or finish the session)."""
            if answer is None and i != -1:  # if no answer is selected
                # show warning message and stay on the current question
                return (
                    gr.update(),
                    gr.update(visible=True),
                    gr.update(),
                    answers,
                    gr.update(visible=False),
                    gr.update(visible=True),
                )
            # Only save while a question is actually displayed: with i == -1 the
            # lookup below would otherwise address the *last* session sample.
            if answer and i != -1:
                # save the answer to the backend
                answers[i] = answer
                true_index = session_sample_indices[i]
                sample = dataset[true_index]
                interp_id = sample["interpretation_id"]
                trans_id = sample["transcription_id"]
                user_id = session_id  # the session id doubles as the user id
                logger.info(
                    "saving answer to backend",
                    context={
                        "i": true_index,
                        "interp_id": interp_id,
                        "answer": answer,
                        "user_id": user_id,
                    },
                )
                # update_row returns False when no matching row exists yet.
                if not backend.update_row(true_index, interp_id, user_id, answer):
                    backend.add_row(true_index, interp_id, trans_id, user_id, answer)
            if i + 1 == len(session_sample_indices):  # Last question just answered
                return (
                    -1,  # reset i to stop showing question
                    gr.update(visible=False),
                    gr.update(visible=False),
                    answers,
                    gr.update(visible=True),  # show final page
                    gr.update(visible=False),  # hide previous button
                )
            # go to the next question (the end-of-session case returned above,
            # so i + 1 is always a valid position here)
            return (
                i + 1,
                gr.update(visible=False),
                gr.update(value="Submit answer and go to Next"),
                answers,
                gr.update(visible=False),
                gr.update(visible=True),
            )

        def update_prev_index(i):
            """Step back one question; never go before the first one."""
            # prevent going back from the first question or the start page
            if i <= 0:
                return i, gr.update(visible=False)
            # go back to the previous question
            return i - 1, gr.update(visible=False)

        def answer_change_callback(answer, i, answers):
            """Mirror the radio selection into the per-session state."""
            answers[i] = answer
            return answer, answers

        def login_callback(logged_in):
            """Reveal the app and the Start button once logged in."""
            return (
                (
                    gr.update(visible=True),
                    gr.update(visible=True),
                    gr.update(visible=False),
                    gr.update(visible=False),
                )
                if logged_in
                else (
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                )
            )

        # === Events ===
        next_btn.click(
            update_next_index,
            [i, selected_answer, answers_dict, session_id, session_sample_indices],
            [i, warning_msg, next_btn, answers_dict, final_group, prev_btn],
        )
        prev_btn.click(update_prev_index, i, [i, warning_msg])
        # Any change of position re-renders the question area.
        i.change(
            update_ui,
            [i, answers_dict, session_sample_indices],
            [
                question_group,
                sample_info,
                question_md,
                audio_output,
                radio,
                selected_answer,
            ],
        )
        radio.change(
            answer_change_callback,
            [radio, i, answers_dict],
            [selected_answer, answers_dict],
        )
        logged_in.change(
            login_callback, logged_in, [app_group, next_btn, prev_btn, warning_msg]
        )
# Password required to open the admin console (taken from the settings object).
ADMIN_PASSWORD = configs.ADMIN_PASSWORD


def get_admin_tab(answers_per_sample=3):
    """Build the "Admin Console" tab showing collection progress and accuracy.

    Intended to be called inside an open ``gr.Blocks`` context. After the admin
    password is entered, the tab reports the majority-vote accuracy over the
    answered samples, the number of answers collected, the number still
    missing, and the number of distinct users.

    Args:
        answers_per_sample: Number of answers targeted per dataset sample, used
            for the "answers to go" figure. Defaults to 3, the previously
            hard-coded value.
    """
    # Local import: constant-time comparison without touching module imports.
    from secrets import compare_digest

    with gr.Tab("Admin Console"):
        admin_password = gr.Text(label="Enter Admin Password", type="password")
        check_btn = gr.Button("Enter")
        error_box = gr.Markdown("", visible=False)
        output_box = gr.Markdown("", visible=False)

        def calculate_majority_vote_accuracy(pw):
            """Return updates for (error_box, output_box) for the given password."""
            # Bytes + compare_digest: constant-time, and safe for non-ASCII or
            # empty/None input.
            supplied = (pw or "").encode("utf-8")
            if not compare_digest(supplied, ADMIN_PASSWORD.encode("utf-8")):
                return gr.update(
                    visible=True, value="\u274c Incorrect password."
                ), gr.update(visible=False)

            df = backend.get_all_rows()
            if df.empty:
                return gr.update(visible=True, value="No data available."), gr.update(
                    visible=False
                )

            # Most frequent answer per interpretation id.
            majority_answers = {}
            for interp_id, group in df.groupby("interpretation_id"):
                answer_counts = Counter(group["answer"])
                if answer_counts:
                    majority_answers[interp_id] = answer_counts.most_common(1)[0][0]

            # Score the majority answers against the dataset labels, skipping
            # samples nobody has answered yet.
            total = 0
            correct = 0
            for sample in dataset:
                interp_id = sample["interpretation_id"]
                if interp_id not in majority_answers:
                    continue
                correct_answer_text = sample["possible_answers"][sample["label"]]
                total += 1
                if majority_answers[interp_id] == correct_answer_text:
                    correct += 1
            acc = correct / total if total > 0 else 0

            # calculate total answers submitted
            total_answers = len(df)
            # Clamp at zero so over-collection never shows a negative remainder.
            answers_to_go = max(0, (answers_per_sample * len(dataset)) - total_answers)
            users_count = df["user_id"].nunique()

            # update the admin console
            return gr.update(visible=False), gr.update(
                visible=True,
                value=f"""**Accuracy over answered samples:** {acc:.2%} ({correct}/{total})
                **Total answers submitted:** {total_answers}
                **Answers to go:** {answers_to_go}
                **Users count:** {users_count}""",
            )

        check_btn.click(
            fn=calculate_majority_vote_accuracy,
            inputs=admin_password,
            outputs=[error_box, output_box],
        )
# App UI: a single Blocks container hosting the evaluation and admin tabs.
demo = gr.Blocks()
with demo:
    human_eval_tab()
    get_admin_tab()

backend/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .backend import Backend

backend/backend.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import json
+import gspread
+import pandas as pd
+from datetime import datetime
+from oauth2client.service_account import ServiceAccountCredentials
class Backend:
    """Answer store backed by the first worksheet of a Google Sheet.

    The code reads the columns ``index_in_dataset``, ``interpretation_id``,
    ``user_id``, ``answer`` and ``timestamp`` by header name, and appends rows
    in the order index, interpretation id, transcription id, user id, answer,
    timestamp.
    """

    def __init__(self, sheet_name: str, credentials: str):
        """Open the sheet.

        Args:
            sheet_name: Name of the Google Sheet to open.
            credentials: Path to the service-account JSON key file.
        """
        # Context manager so the key file is closed right after parsing.
        with open(credentials, encoding="utf-8") as creds_file:
            creds_dict = json.load(creds_file)
        scope = [
            "https://spreadsheets.google.com/feeds",
            "https://www.googleapis.com/auth/drive",
        ]
        service_credentials = ServiceAccountCredentials.from_json_keyfile_dict(
            creds_dict, scope
        )
        client = gspread.authorize(service_credentials)
        self.sheet = client.open(sheet_name).sheet1
        # Header row, used to locate columns by name when updating cells.
        self.header = self.sheet.row_values(1)

    def get_all_rows(self) -> pd.DataFrame:
        """Return every data row of the sheet as a DataFrame."""
        records = self.sheet.get_all_records()
        return pd.DataFrame.from_records(records)

    def add_row(
        self, index_in_dataset, interpretation_id, transcription_id, user_id, answer
    ):
        """Append one answer row, stamped with the current UTC time."""
        timestamp = datetime.utcnow().isoformat()
        self.sheet.append_row(
            [
                index_in_dataset,
                interpretation_id,
                transcription_id,
                user_id,
                answer,
                timestamp,
            ]
        )

    def update_row(self, index_in_dataset, interpretation_id, user_id, new_answer):
        """Update the answer of an existing row for this sample and user.

        Returns:
            True if a matching row exists (its answer and timestamp are only
            rewritten when the answer actually changed), False if there is no
            such row.
        """
        records = self.get_all_rows().to_dict("records")
        for idx, row in enumerate(records):
            is_match = (
                row["interpretation_id"] == interpretation_id
                and row["index_in_dataset"] == index_in_dataset
                and row["user_id"] == user_id
            )
            if not is_match:
                continue
            # +2 because sheet rows are 1-indexed and header is row 1
            sheet_row = idx + 2
            if row["answer"] != new_answer:
                self.sheet.update_cell(
                    sheet_row, self.header.index("answer") + 1, new_answer
                )
                self.sheet.update_cell(
                    sheet_row,
                    self.header.index("timestamp") + 1,
                    datetime.utcnow().isoformat(),
                )
            return True
        return False

    def get_answer_count(self, interpretation_id):
        """Return the number of distinct users who answered ``interpretation_id``."""
        df = self.get_all_rows()
        # An empty sheet yields a DataFrame without columns; indexing it by
        # column name would raise KeyError, so report zero answers instead.
        if df.empty or "interpretation_id" not in df.columns:
            return 0
        matching = df[df["interpretation_id"] == interpretation_id]
        return int(matching["user_id"].nunique())

backend/helpers.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import random
+import pandas as pd
def get_random_session_samples(
    df: pd.DataFrame, dataset, num_samples=30, max_answers=3
):
    """Pick random dataset indices that still need answers.

    Args:
        df: Answers collected so far; when non-empty it must provide the
            ``interpretation_id`` and ``user_id`` columns.
        dataset: Sized, indexable collection of samples, each exposing
            ``sample["interpretation_id"]``.
        num_samples: Maximum number of indices to return.
        max_answers: A sample stays eligible while fewer than this many
            distinct users have answered it. Defaults to 3, the previously
            hard-coded threshold.

    Returns:
        A list of distinct dataset indices in random order; shorter than
        ``num_samples`` when fewer samples are eligible.
    """
    if df.empty:
        # No answers exist yet, so every sample is eligible.
        candidates = range(len(dataset))
    else:
        # Distinct users per interpretation id.
        counts = df.groupby("interpretation_id")["user_id"].nunique().to_dict()
        # NOTE(review): this iterates full samples; if the dataset decodes
        # heavy fields (e.g. audio) on access this may be slow — confirm.
        candidates = [
            index
            for index, sample in enumerate(dataset)
            if counts.get(sample["interpretation_id"], 0) < max_answers
        ]
    # Clamp to [0, len(candidates)]: random.sample raises on negative sizes.
    how_many = max(0, min(num_samples, len(candidates)))
    return random.sample(candidates, how_many)

clients.py ADDED Viewed

	@@ -0,0 +1,8 @@

+from .configs import configs
+from .logger import Logger
+from .backend import Backend
# Shared singletons, created once at import time and reused across the app.
logger = Logger(use_context_var=False, context={"service": "Human Evaluation"})
backend = Backend(
    credentials=configs.GOOGLE_SHEETS_CREDENTIALS,
    sheet_name=configs.GOOGLE_SHEET_NAME,
)

configs.py ADDED Viewed

	@@ -0,0 +1,19 @@

+from pydantic import Field
+from pydantic_settings import BaseSettings
+from pathlib import Path
class Settings(BaseSettings):
    """Application settings; each field falls back to the placeholder below."""

    # Hugging Face token passed to ``load_dataset``.
    HF_API_TOKEN: str = "your_hf_api_token"
    # Replace with your actual Google Sheet name
    GOOGLE_SHEET_NAME: str = "sheet name"
    # Replace with your actual Google Sheets credentials
    GOOGLE_SHEETS_CREDENTIALS: str = "path_to_creds"
    # Credentials checked by the admin console and the evaluation login form.
    ADMIN_PASSWORD: str = "admin_password"
    USER_PASSWORD: str = "user_password"
    USER_NAME: str = "user_name"


# Module-level settings instance imported by the rest of the app.
configs = Settings()

logger/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .logger import Logger

logger/json_formatter.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import json
+import logging
+from time import strftime, gmtime
class JsonFormatter(logging.Formatter):
    """Format log records as indented, ANSI-coloured JSON.

    Each record becomes a JSON object with ``message``, ``level`` and
    ``logged_at`` (UTC), merged with the record's ``context`` mapping and, for
    ERROR records carrying exception info, an ``error`` traceback string.
    """

    # ANSI escape sequences used to colour the output per level.
    grey = "\x1b[38;20m"
    green = "\x1b[33;32m"
    yellow = "\x1b[33;20m"
    red = "\x1b[31;20m"
    bold_red = "\x1b[31;1m"
    reset = "\x1b[0m"
    FORMATS = {
        logging.DEBUG: grey,
        logging.INFO: green,
        logging.WARNING: yellow,
        logging.ERROR: red,
        logging.CRITICAL: bold_red,
    }

    def __init__(self):
        super().__init__()

    @staticmethod
    def serialize_to_json(data):
        """Return ``data`` as indented JSON, or a diagnostic string on failure."""
        try:
            return json.dumps(data, indent=2)
        except Exception as e:
            return f"Failed to serialize data to JSON: {str(data)}\nError: {str(e)}"

    def format(self, record):
        """Render ``record`` as a coloured JSON string."""
        error_json = (
            {"error": self.formatException(record.exc_info)}
            if record.levelno == logging.ERROR and record.exc_info
            else {}
        )
        # Records logged without ``extra={"context": ...}`` have no such
        # attribute; fall back to an empty mapping instead of raising KeyError.
        context = getattr(record, "context", None) or {}
        if not isinstance(context, dict):
            # Keep formatting alive even if something non-mapping was attached.
            context = {"context": str(context)}
        json_record = {
            "message": record.getMessage(),
            "level": record.levelname,
            "logged_at": strftime("%Y-%m-%d %H:%M:%S", gmtime(record.created)),
            **context,
            **error_json,
        }
        try:
            # Unknown (custom) levels get no colour rather than a literal "None".
            color = self.FORMATS.get(record.levelno, self.reset)
            json_log = f"{color}{json.dumps(json_record, indent=2)}{self.reset}"
            # Turn JSON escapes (\n, \", \uXXXX) back into real characters so
            # tracebacks and non-ASCII text are readable on the console.
            unescaped = json_log.encode("utf-8").decode("unicode_escape")
            # Characters outside the BMP come out of the step above as two lone
            # surrogates, which cannot be written to a UTF-8 stream; a UTF-16
            # round-trip joins each pair back into a single character.
            return unescaped.encode("utf-16", "surrogatepass").decode("utf-16")
        except Exception as e:
            return (
                f"Failed to serialize data to JSON: {str(json_record)}\nError: {str(e)}"
            )

logger/logger.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import sys
+import logging
+import contextvars
+from typing import Dict, Any
+from .json_formatter import JsonFormatter
# Create a context variable to store request-specific information
context_var: contextvars.ContextVar[Dict[str, Any]] = contextvars.ContextVar(
    "context_dict"
)


class Logger:
    """Thin wrapper around the ``json_logger`` logger that attaches a context dict.

    The context is sent with every record as ``extra={"context": ...}``.
    Context passed to an individual log call is merged into the stored context
    and stays there for later calls until ``reset_context`` is invoked.

    Args:
        context: Base context included in every record. It is copied, so the
            caller's dict is never modified.
        use_context_var: When True the live context is kept in the module-level
            ``context_var`` instead of on the instance.
    """

    def __init__(self, context=None, use_context_var=False):
        self.logger = logging.getLogger("json_logger")
        # Separate copies: the base must stay pristine so reset_context works,
        # and neither may alias the dict owned by the caller.
        self.base_context = dict(context or {})
        self.context = dict(self.base_context)
        self.use_context_var = use_context_var
        self._setup()

    def _setup(self):
        """Configure level, initial context and (once) the console handler."""
        self.logger.setLevel(logging.DEBUG)
        if self.use_context_var:
            # ContextVar.set returns a Token, not the value, so it must not be
            # stored as the context.
            context_var.set(dict(self.base_context))
        # Attach the handler only once, even if several Logger objects exist.
        if not self.logger.handlers:
            console_handler = logging.StreamHandler(sys.stdout)
            console_handler.setFormatter(JsonFormatter())
            self.logger.addHandler(console_handler)

    def debug(self, data, context=None):
        """Log ``data`` at DEBUG level, merging ``context`` into the context."""
        self.log(logging.DEBUG, data, context)

    def info(self, data, context=None):
        """Log ``data`` at INFO level, merging ``context`` into the context."""
        self.log(logging.INFO, data, context)

    def warning(self, data, context=None):
        """Log ``data`` at WARNING level, merging ``context`` into the context."""
        self.log(logging.WARNING, data, context)

    def error(self, data, error=None, context=None):
        """Log ``data`` at ERROR level; ``error`` is forwarded as ``exc_info``."""
        self.log(logging.ERROR, data, context, error)

    def log(self, level, data, context=None, error=None):
        """Merge ``context`` into the stored context and emit one record."""
        self.update_context(context=context or {})
        self.logger.log(
            level,
            msg=data,
            # Snapshot, so later context updates cannot alter this record.
            extra={"context": dict(self._get_context())},
            exc_info=error,
        )

    def _get_context(self):
        """Return the live context mapping."""
        if self.use_context_var:
            # The default covers execution contexts in which the variable was
            # never set; a bare get() would raise LookupError there.
            return context_var.get(self.base_context)
        return self.context

    def reset_context(self):
        """Drop per-call additions and return to the base context."""
        if self.use_context_var:
            context_var.set(dict(self.base_context))
        else:
            self.context = dict(self.base_context)

    def update_context(self, context):
        """Merge ``context`` into the live context (later keys win)."""
        if self.use_context_var:
            context_var.set({**context_var.get(self.base_context), **context})
        else:
            self.context.update(context)

main.py ADDED Viewed

	@@ -0,0 +1,9 @@

+from .app import demo
def launch(server_name="0.0.0.0", server_port=7860):
    """Start the Gradio server for the evaluation app.

    Args:
        server_name: Interface to bind to; the default listens on all
            interfaces.
        server_port: TCP port to serve on.
    """
    demo.launch(server_name=server_name, server_port=server_port)


if __name__ == "__main__":
    launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+gradio==5.16.2
+pydantic==2.8.2
+pydantic-settings==2.0.3
+librosa==0.10.2.post1
+soundfile==0.12.1
+datasets==2.21.0
+gspread==6.2.0
+oauth2client==4.1.3
+pandas==2.2.3