Spaces:

alisartazkhan
/

tempo_control2

Sleeping

App Files Files Community

alisartazkhan committed on Apr 19

Commit

f35f09f

verified ·

1 Parent(s): cc8489d

Update talk_arena/audio_collection.py

Browse files

Files changed (1) hide show

talk_arena/audio_collection.py +179 -469

talk_arena/audio_collection.py CHANGED Viewed

@@ -1,22 +1,13 @@
-import argparse
-import asyncio
 import os
-import random
-import textwrap
-import time
 import uuid
-import gradio as gr
 import numpy as np
 import soundfile as sf
 import xxhash
-from datasets import Audio
-from dotenv import load_dotenv
-from openai import OpenAI
 from huggingface_hub import upload_file, HfApi
-import talk_arena.streaming_helpers as sh
-from talk_arena.db_utils import TinyThreadSafeDB
 # Load environment variables
 load_dotenv()
@@ -27,44 +18,71 @@ os.makedirs("outputs", exist_ok=True)
 # Initialize Hugging Face API client
 hf_api = HfApi(token=os.getenv("HF_TOKEN"))
 DATASET_REPO = "alisartazkhan/audioLLM_judge"
-CATEGORY = "pilot_tempo_control_2"
-COUNTER = 3
-CODE = "C1BDJUET"
-CAT_DESC = "An interactive study that tests how well audio models follow voice prompts with changing tempo. Create your own prompts and compare model responses!"
 resampler = Audio(sampling_rate=16_000)
-def parse_args():
-    parser = argparse.ArgumentParser(description="Talk Arena Demo")
-    parser.add_argument("--free_only", action="store_true", help="Only use free models")
-    return parser.parse_args()
-args = parse_args()
-if gr.NO_RELOAD:  # Prevents Re-init during hot reloading
-    # Transcription Disabled for Public Interface
-    # asr_pipe = pipeline(
-    #    task="automatic-speech-recognition",
-    #    model="openai/whisper-large-v3-turbo",
-    #    chunk_length_s=30,
-    #    device="cuda:1",
-    # )
-    anonymous = True
-    gpt4o_audio, gpt4o_model = sh.gpt4o_streaming("models/gpt4o")
-    gemini2_audio, gemini2_model = sh.gemini_streaming("models/gemini-2.0-flash-exp")
-    competitor_info = [
-        (sh.gradio_gen_factory(gpt4o_audio, "GPT4o", anonymous), "gpt4o", "GPT-4o"),
-        (sh.gradio_gen_factory(gemini2_audio, "Gemini 2 Flash", anonymous), "gemini_2f", "Gemini 2 Flash"),
-    ]
-    resp_generators = [generator for generator, _, _ in competitor_info]
-    model_shorthand = [shorthand for _, shorthand, _ in competitor_info]
-    model_name = [full_name for _, _, full_name in competitor_info]
-    all_models = list(range(len(model_shorthand)))
-# Function to upload file to HF dataset repository
 def upload_to_hf(local_path, repo_path):
     try:
         upload_file(
             path_or_fileobj=local_path,
@@ -73,456 +91,148 @@ def upload_to_hf(local_path, repo_path):
             repo_type="dataset",
             token=os.getenv("HF_TOKEN")
         )
-        print(f"Uploaded file: {local_path} to Hugging Face repository at {repo_path}")
         return True
     except Exception as e:
         print(f"Error uploading file to HF: {e}")
         return False
-async def pairwise_response_async(audio_input, state, model_order):
-    if audio_input == None:
-        raise StopAsyncIteration(
-            "",
-            "",
-            gr.Button(visible=False),
-            gr.Button(visible=False),
-            gr.Button(visible=False),
-            state,
-            audio_input,
-            None,
-            None,
-            None,
         )
-    spinner_id = 0
-    spinners = ["◐ ", "◓ ", "◑", "◒"]
-    spinner = spinners[0]
-    gen_pair = [resp_generators[model_order[0]], resp_generators[model_order[1]]]
-    latencies = [{}, {}]  # Store timing info for each model
-    resps = [gr.Textbox(value="", info="", visible=False), gr.Textbox(value="", info="", visible=False)]
-    tts_resps = [gr.Audio(), gr.Audio()]
-    error_in_model = False
-    # Get a unique hash for this audio input
     sr, y = audio_input
-    x = xxhash.xxh32(bytes(y)).hexdigest()
-    for order, generator in enumerate(gen_pair):
-        start_time = time.time()
-        first_token = True
-        total_length = 0
-        try:
-            async for local_resp in generator(audio_input, order):
-                total_length += 1
-                if first_token:
-                    latencies[order]["time_to_first_token"] = time.time() - start_time
-                    first_token = False
-                resps[order] = local_resp
-                spinner = spinners[spinner_id]
-                spinner_id = (spinner_id + 1) % 4
-                yield (
-                    gr.Button(
-                        value=spinner + " Generating Responses " + spinner,
-                        interactive=False,
-                        variant="primary",
-                    ),
-                    resps[0],
-                    resps[1],
-                    tts_resps[0],
-                    tts_resps[1],
-                    gr.Button(visible=False),
-                    gr.Button(visible=False),
-                    gr.Button(visible=False),
-                    state,
-                    audio_input,
-                    None,
-                    None,
-                    latencies,
-                )
-            latencies[order]["total_time"] = time.time() - start_time
-            latencies[order]["response_length"] = total_length
-        except Exception as e:
-            print(f"Error in model {order+1}: {e}")
-            error_in_model = True
-            resps[order] = gr.Textbox(
-                info=f"<strong>Error thrown by Model {order+1} API</strong>",
-                value="" if first_token else resps[order]._constructor_args[0]["value"],
-                visible=True,
-                label=f"Model {order+1}",
-            )
-            yield (
-                gr.Button(
-                    value=spinner + " Generating Responses " + spinner,
-                    interactive=False,
-                    variant="primary",
-                ),
-                resps[0],
-                resps[1],
-                tts_resps[0],
-                tts_resps[1],
-                gr.Button(visible=False),
-                gr.Button(visible=False),
-                gr.Button(visible=False),
-                state,
-                audio_input,
-                None,
-                None,
-                latencies,
-            )
-        # Process and save audio
-        y = y.astype(np.float32)
-        y /= np.max(np.abs(y))
-        a = resampler.decode_example(resampler.encode_example({"array": y, "sampling_rate": sr}))
-        # Create a unique identifier
-        unique_id = str(uuid.uuid4())[:8]
-        local_filename = f"outputs/{x}_resp{order}_{unique_id}.wav"
-        # Save locally first
-        sf.write(local_filename, a["array"], a["sampling_rate"], format="wav")
-        # Upload to HF dataset
-        upload_to_hf(
-            local_filename,
-            f"{CATEGORY}/{x}_resp{order}_{unique_id}.wav"
-        )
-        # Generate TTS response
-        try:
-            tts_options = {
-                "model": "gpt-4o-mini-tts",
-                "voice": "alloy",
-                "input": resps[order].__dict__["_constructor_args"][0]["value"],
-                "response_format": "wav",
-            }
-            abytes = OpenAI(api_key=os.environ["OPENAI_API_KEY"]).audio.speech.create(**tts_options).content
-            tts_resps[order] = gr.Audio(
-                value=abytes,
-                visible=True,
-            )
-        except Exception as e:
-            print(f"Error generating TTS: {e}")
-            tts_resps[order] = gr.Audio(visible=False)
-        latencies[order]["total_time"] = time.time() - start_time
-        latencies[order]["response_length"] = total_length
-    print("Latency data:", latencies)
-    yield (
-        gr.Button(value="Vote for which model is better!", interactive=False, variant="primary", visible=False),
-        resps[0],
-        resps[1],
-        tts_resps[0],
-        tts_resps[1],
-        gr.Button(visible=not error_in_model),
-        gr.Button(visible=not error_in_model),
-        gr.Button(visible=not error_in_model),
-        responses_complete(state),
-        audio_input,
-        gr.Textbox(visible=False),
-        gr.Audio(visible=False),
-        latencies,
-    )
-def on_page_load(state, model_order):
-    if state == 0:
-        # gr.Info(
-        #    "Record something you'd say to an AI Assistant! Think about what you usually use Siri, Google Assistant,"
-        #    " or ChatGPT for."
-        # )
-        state = 1
-        model_order = random.sample(all_models, 2) if anonymous else model_order
-    return state, model_order
-def recording_complete(state):
-    if state == 1:
-        # gr.Info(
-        #    "Once you submit your recording, you'll receive responses from different models. This might take a second."
-        # )
-        state = 2
     return (
-        gr.Button(value="Starting Generation", interactive=False, variant="primary"),
-        state,
     )
-def responses_complete(state):
-    if state == 2:
-        gr.Info(
-            "Give us your feedback! Mark which model gave you the best response so we can understand the quality of"
-            " these different voice assistant models."
-        )
-        state = 3
-    return state
-class UploadableDB(TinyThreadSafeDB):
-    def __init__(self, filename):
-        super().__init__(filename)
-        self.filename = filename
-    async def upload_db(self):
-        try:
-            # Upload the JSON database file to HF
-            upload_to_hf(
-                self.filename,
-                f"{CATEGORY}/{self.filename}"
-            )
-            print(f"Successfully uploaded DB file {self.filename} to HF dataset")
-            return True
-        except Exception as e:
-            print(f"Error uploading DB file to HF: {e}")
-            return False
-def clear_factory(button_id):
-    async def clear(audio_input, model_order, pref_counter, reasoning, latency):
-        textbox1 = gr.Textbox(visible=False)
-        textbox2 = gr.Textbox(visible=False)
-        if button_id != None:
-            sr, y = audio_input
-            x = xxhash.xxh32(bytes(y)).hexdigest()
-            await db.insert(
-                {
-                    "audio_hash": x,
-                    "outcome": button_id,
-                    "model_a": model_shorthand[model_order[0]],
-                    "model_b": model_shorthand[model_order[1]],
-                    "why": reasoning,
-                    "model_a_latency": latency[0],
-                    "model_b_latency": latency[1],
-                }
-            )
-            # Upload the updated database to HF after each insertion
-            await db.upload_db()
-            pref_counter += 1
-            model_a = model_name[model_order[0]]
-            model_b = model_name[model_order[1]]
-        counter_text = f"# {pref_counter}/{COUNTER} Preferences Submitted"
-        if pref_counter >= COUNTER:
-            counter_text = f"# Completed! Completion Code: {CODE}"
-        if anonymous:
-            model_order = random.sample(all_models, 2)
         return (
-            model_order,
-            gr.Button(
-                value="Record Audio to Submit Again!",
-                interactive=False,
-                visible=True,
-            ),
-            gr.Button(visible=False),
             gr.Button(visible=False),
             gr.Button(visible=False),
-            None,
-            textbox1,
-            textbox2,
-            gr.Audio(visible=False),
-            gr.Audio(visible=False),
-            pref_counter,
-            counter_text,
-            gr.Textbox(visible=False),
-            gr.Audio(visible=False),
         )
-    return clear
-def transcribe(transc, voice_reason):
-    if transc is None:
-        transc = ""
-    transc += " " + asr_pipe(voice_reason, generate_kwargs={"task": "transcribe"}, return_timestamps=False)["text"]
-    return transc, gr.Audio(value=None)
 theme = gr.themes.Soft(
-    primary_hue=gr.themes.Color(
-        c100="#82000019",
-        c200="#82000033",
-        c300="#8200004c",
-        c400="#82000066",
-        c50="#8200007f",
-        c500="#8200007f",
-        c600="#82000099",
-        c700="#820000b2",
-        c800="#820000cc",
-        c900="#820000e5",
-        c950="#820000f2",
-    ),
-    secondary_hue="rose",
-    neutral_hue="stone",
 )
-import os
-css_path = os.path.join(os.path.dirname(__file__), "styles.css")
-with open(css_path, "r") as css_file:
-    custom_css = css_file.read()
-# Initialize our custom database class instead of the original one
-db = UploadableDB("audio_out_votes.json")
-with gr.Blocks(theme=theme, fill_height=True, css=custom_css) as demo:
-    submitted_preferences = gr.State(0)
-    state = gr.State(0)
-    model_order = gr.State([])
-    latency = gr.State([])
-    with gr.Row():
-        counter_text = gr.Markdown(
-            f"# 0/{COUNTER} Preferences Submitted.\n Follow the pop-up tips to submit your first preference."
-        )
-        category_description_text = gr.Markdown(CAT_DESC)
-    with gr.Row():
-        audio_input = gr.Audio(sources=["microphone"], streaming=False, label="Audio Input")
-    with gr.Row(equal_height=True):
-        with gr.Column(scale=1):
-            out1 = gr.Textbox(visible=False, lines=5, autoscroll=True)
-            audio_out1 = gr.Audio(visible=False)
-        with gr.Column(scale=1):
-            out2 = gr.Textbox(visible=False, lines=5, autoscroll=True)
-            audio_out2 = gr.Audio(visible=False)
-    with gr.Row():
-        btn = gr.Button(value="Record Audio to Submit!", interactive=False)
-    with gr.Row(equal_height=True):
-        reason = gr.Textbox(label="[Optional] Explain Your Preferences", visible=False, scale=4)
-        reason_record = gr.Audio(
-            sources=["microphone"],
-            interactive=True,
-            streaming=False,
-            label="Speak to transcribe!",
-            visible=False,
-            type="filepath",
-            # waveform_options={"show_recording_waveform": False},
-            scale=1,
-        )
-    with gr.Row():
-        best1 = gr.Button(value="Model 1 is better", visible=False)
-        tie = gr.Button(value="Tie", visible=False)
-        best2 = gr.Button(value="Model 2 is better", visible=False)
-    with gr.Row():
-        contact = gr.Markdown("")
-    # reason_record.stop_recording(transcribe, inputs=[reason, reason_record], outputs=[reason, reason_record])
-    audio_input.stop_recording(
-        recording_complete,
-        [state],
-        [btn, state],
-    ).then(
-        fn=pairwise_response_async,
-        inputs=[audio_input, state, model_order],
-        outputs=[
-            btn,
-            out1,
-            out2,
-            audio_out1,
-            audio_out2,
-            best1,
-            best2,
-            tie,
-            state,
-            audio_input,
-            reason,
-            reason_record,
-            latency,
-        ],
-    )
-    audio_input.start_recording(
-        lambda: gr.Button(value="Uploading Audio to Cloud", interactive=False, variant="primary"),
-        None,
-        btn,
-    )
-    best1.click(
-        fn=clear_factory(0),
-        inputs=[audio_input, model_order, submitted_preferences, reason, latency],
-        outputs=[
-            model_order,
-            btn,
-            best1,
-            best2,
-            tie,
-            audio_input,
-            out1,
-            out2,
-            audio_out1,
-            audio_out2,
-            submitted_preferences,
-            counter_text,
-            reason,
-            reason_record,
-        ],
     )
-    tie.click(
-        fn=clear_factory(0.5),
-        inputs=[audio_input, model_order, submitted_preferences, reason, latency],
-        outputs=[
-            model_order,
-            btn,
-            best1,
-            best2,
-            tie,
-            audio_input,
-            out1,
-            out2,
-            audio_out1,
-            audio_out2,
-            submitted_preferences,
-            counter_text,
-            reason,
-            reason_record,
-        ],
     )
-    best2.click(
-        fn=clear_factory(1),
-        inputs=[audio_input, model_order, submitted_preferences, reason, latency],
-        outputs=[
-            model_order,
-            btn,
-            best1,
-            best2,
-            tie,
-            audio_input,
-            out1,
-            out2,
-            audio_out1,
-            audio_out2,
-            submitted_preferences,
-            counter_text,
-            reason,
-            reason_record,
-        ],
     )
-    audio_input.clear(
-        clear_factory(None),
-        [audio_input, model_order, submitted_preferences, reason, latency],
-        [
-            model_order,
-            btn,
-            best1,
-            best2,
-            tie,
-            audio_input,
-            out1,
-            out2,
-            audio_out1,
-            audio_out2,
-            submitted_preferences,
-            counter_text,
-            reason,
-            reason_record,
-        ],
     )
-    demo.load(fn=on_page_load, inputs=[state, model_order], outputs=[state, model_order])
 if __name__ == "__main__":
-    demo.queue(default_concurrency_limit=40, api_open=False).launch(share=True, ssr_mode=False)

 import os
 import uuid
+import json
 import numpy as np
+import gradio as gr
 import soundfile as sf
 import xxhash
 from huggingface_hub import upload_file, HfApi
+from dotenv import load_dotenv
+from datasets import Audio
 # Load environment variables
 load_dotenv()
 # Initialize Hugging Face API client
 hf_api = HfApi(token=os.getenv("HF_TOKEN"))
 DATASET_REPO = "alisartazkhan/audioLLM_judge"
+CATEGORY = "pilot_tempo_control_3"
+MAX_RECORDINGS = 10  # Number of prompts to record
 resampler = Audio(sampling_rate=16_000)
+# Load the prompts from the prompts.json file that sits next to this module.
+# JSON text is UTF-8, so decode it explicitly instead of relying on the
+# platform's default encoding.
+prompt_path = os.path.join(os.path.dirname(__file__), "prompts.json")
+with open(prompt_path, "r", encoding="utf-8") as f:
+    prompts_data = json.load(f)
+PROMPTS = prompts_data["prompts"]
+# Create a JSON database to track uploads
+class UploadTracker:
+    """JSON-backed log of uploaded recordings.
+
+    Records are kept in memory in ``self.data`` and persisted to
+    ``self.filename``; after every addition the file is also pushed to the
+    Hugging Face dataset repo.
+    """
+
+    def __init__(self, filename="recording_tracker.json"):
+        """Load the records stored in ``filename``, creating an empty log if absent."""
+        self.filename = filename
+        self.data = []
+        if os.path.exists(filename):
+            # Resume from whatever was recorded previously
+            with open(filename, "r") as f:
+                self.data = json.load(f)
+        else:
+            # First run: start with an empty log on disk
+            with open(filename, "w") as f:
+                json.dump([], f)
+
+    def add_recording(self, prompt_index, audio_hash, filename):
+        """Append one recording to the log, persist it, and sync it to HF.
+
+        Args:
+            prompt_index: Index of the prompt that was recorded.
+            audio_hash: Hash string identifying the audio content.
+            filename: Path of the uploaded audio file.
+
+        Returns:
+            The record dict that was appended.
+        """
+        # Imported here so the class does not rely on a module-level import
+        from datetime import datetime, timezone
+        record = {
+            "prompt_index": prompt_index,
+            "audio_hash": audio_hash,
+            "filename": filename,
+            # Actual UTC time in ISO-8601 form (a random UUID carries no time information)
+            "timestamp": datetime.now(timezone.utc).isoformat(),
+        }
+        self.data.append(record)
+        # Save to file
+        with open(self.filename, "w") as f:
+            json.dump(self.data, f, indent=2)
+        # Upload tracker file to HF
+        self.upload_tracker()
+        return record
+
+    def upload_tracker(self):
+        """Upload the tracker JSON to Hugging Face.
+
+        Best effort: any failure is printed and reported as ``False`` rather
+        than raised.
+        """
+        try:
+            upload_file(
+                path_or_fileobj=self.filename,
+                path_in_repo=f"{CATEGORY}/{self.filename}",
+                repo_id=DATASET_REPO,
+                repo_type="dataset",
+                token=os.getenv("HF_TOKEN")
+            )
+            print("Uploaded tracker file to Hugging Face")
+            return True
+        except Exception as e:
+            print(f"Error uploading tracker file: {e}")
+            return False
+# Initialize the tracker
+tracker = UploadTracker()
 def upload_to_hf(local_path, repo_path):
+    """Upload a file to the Hugging Face dataset repository"""
     try:
         upload_file(
             path_or_fileobj=local_path,
             repo_type="dataset",
             token=os.getenv("HF_TOKEN")
         )
+        print(f"Uploaded file: {local_path} to Hugging Face at {repo_path}")
         return True
     except Exception as e:
         print(f"Error uploading file to HF: {e}")
         return False
+def on_submit(audio_input, prompt_index):
+    """Save, upload, and log one recorded prompt, then refresh the UI.
+
+    Args:
+        audio_input: ``(sample_rate, samples)`` pair, or ``None`` when nothing
+            has been recorded.
+        prompt_index: Index into ``PROMPTS`` of the prompt being recorded.
+
+    Returns:
+        A 7-tuple: title, instructions, prompt text, audio widget, submit
+        button, next button, and the (unchanged) prompt index.
+    """
+    if audio_input is None:
+        return (
+            gr.Markdown(f"# Recording {prompt_index + 1}/{MAX_RECORDINGS}"),
+            gr.Markdown("## Please record the following prompt:"),
+            gr.Markdown(f"### {PROMPTS[prompt_index]}"),
+            gr.Audio(value=None, label="Record your response"),
+            gr.Button("Submit Recording", interactive=False),
+            gr.Button("Next Prompt", visible=False),
+            prompt_index
         )
+    # Process the audio
     sr, y = audio_input
+    # Hash the raw samples before any processing
+    audio_hash = xxhash.xxh32(bytes(y)).hexdigest()
+    # Peak-normalize; an all-zero clip is left untouched to avoid dividing by zero
+    y = y.astype(np.float32)
+    peak = np.max(np.abs(y))
+    if peak > 0:
+        y /= peak
+    # Resample to 16kHz
+    a = resampler.decode_example(resampler.encode_example({"array": y, "sampling_rate": sr}))
+    # Create unique filename (shared by the local copy and the repo copy)
+    unique_id = str(uuid.uuid4())[:8]
+    wav_name = f"prompt{prompt_index}_{audio_hash}_{unique_id}.wav"
+    local_filename = f"outputs/{wav_name}"
+    # Save locally
+    sf.write(local_filename, a["array"], a["sampling_rate"], format="wav")
+    # Upload to HF dataset; upload_to_hf signals failure by returning False
+    hf_path = f"{CATEGORY}/{wav_name}"
+    uploaded = upload_to_hf(local_filename, hf_path)
+    if uploaded:
+        # Only log recordings that actually reached the dataset repo
+        tracker.add_recording(prompt_index, audio_hash, hf_path)
+        status = "## Recording successfully uploaded!"
+    else:
+        # Keep the participant on this prompt so the recording is not silently lost
+        status = "## Upload failed. Please record this prompt again."
     return (
+        gr.Markdown(f"# Recording {prompt_index + 1}/{MAX_RECORDINGS}"),
+        gr.Markdown(status),
+        gr.Markdown(f"### {PROMPTS[prompt_index]}"),
+        gr.Audio(value=None, label="Record your response"),
+        gr.Button("Submit Recording", interactive=False),
+        gr.Button("Next Prompt", visible=uploaded),
+        prompt_index
     )
+def next_prompt(prompt_index):
+    """Advance to the following prompt, or show the completion screen.
+
+    Both outcomes return the same 7-slot tuple, ending with the incremented
+    prompt index.
+    """
+    prompt_index = prompt_index + 1
+    total_prompts = min(len(PROMPTS), MAX_RECORDINGS)
+    # Past the last prompt: replace the recorder with a thank-you screen
+    if not prompt_index < total_prompts:
         return (
+            gr.Markdown("# All recordings complete!"),
+            gr.Markdown("## Thank you for your participation."),
+            gr.Markdown("### You have completed all prompts."),
+            gr.Audio(visible=False),
             gr.Button(visible=False),
             gr.Button(visible=False),
+            prompt_index
         )
+    # Otherwise present the next prompt with an empty recorder
+    heading = gr.Markdown(f"# Recording {prompt_index + 1}/{MAX_RECORDINGS}")
+    guidance = gr.Markdown("## Please record the following prompt:")
+    prompt_md = gr.Markdown(f"### {PROMPTS[prompt_index]}")
+    recorder = gr.Audio(value=None, label="Record your response", sources=["microphone"])
+    submit = gr.Button("Submit Recording", interactive=False)
+    advance = gr.Button("Next Prompt", visible=False)
+    return (heading, guidance, prompt_md, recorder, submit, advance, prompt_index)
+def enable_submit_button(audio_input):
+    """Return the submit button, clickable only when a recording is present."""
+    has_audio = audio_input is not None
+    return gr.Button("Submit Recording", interactive=has_audio)
+# Create a theme
 theme = gr.themes.Soft(
+    primary_hue="blue",
+    secondary_hue="indigo",
+    neutral_hue="slate",
 )
+# Create Gradio interface
+with gr.Blocks(theme=theme, css="footer {visibility: hidden}") as demo:
+    # Index of the prompt currently shown; starts at the first prompt
+    prompt_index = gr.State(0)
+    title = gr.Markdown(f"# Recording 1/{MAX_RECORDINGS}")
+    instructions = gr.Markdown("## Please record the following prompt:")
+    prompt_text = gr.Markdown(f"### {PROMPTS[0]}")
+    audio_input = gr.Audio(
+        label="Record your response",
+        sources=["microphone"],
+        streaming=False
     )
+    with gr.Row():
+        # Submit starts disabled and Next starts hidden; the handlers below
+        # return updated buttons to change that
+        submit_btn = gr.Button("Submit Recording", interactive=False)
+        next_btn = gr.Button("Next Prompt", visible=False)
+    # Enable submit button when audio is recorded
+    audio_input.change(
+        fn=enable_submit_button,
+        inputs=[audio_input],
+        outputs=[submit_btn]
     )
+    # Handle submission
+    # (on_submit and next_prompt both return one value per entry in `outputs`, in this order)
+    submit_btn.click(
+        fn=on_submit,
+        inputs=[audio_input, prompt_index],
+        outputs=[title, instructions, prompt_text, audio_input, submit_btn, next_btn, prompt_index]
     )
+    # Handle next button
+    next_btn.click(
+        fn=next_prompt,
+        inputs=[prompt_index],
+        outputs=[title, instructions, prompt_text, audio_input, submit_btn, next_btn, prompt_index]
     )
+# Launch the app
 if __name__ == "__main__":
+    # PROMPTS is read from the prompts.json next to this module at import
+    # time, so that file must already exist before this point. Dumping PROMPTS
+    # to "talkarena/prompts.json" (a different, cwd-relative path) could not
+    # create the file that is read and fails when that directory is missing,
+    # so the app is simply launched.
+    demo.launch(share=True)