Ben Wiley committed
Commit 4739174 · 1 Parent(s): a7ab6a9

Adding HF Auth attempt

Files changed (1): app.py (+61 -33)
app.py CHANGED
@@ -4,14 +4,26 @@ from pyannote.audio import Pipeline
 from pyannote.audio.pipelines.utils.hook import ProgressHook
 import scipy.io.wavfile
 import os
+from huggingface_hub import HfApi
+
+# Global variable to store the user's token
+HUGGINGFACE_ACCESS_TOKEN = None
 
 
 def perform_separation(audio_file_path: str):
+    global HUGGINGFACE_ACCESS_TOKEN
+
+    if not HUGGINGFACE_ACCESS_TOKEN:
+        return [], "Please log in with your HuggingFace account first."
+
     # Instantiate the pipeline
-    pipeline = Pipeline.from_pretrained(
-        "pyannote/speech-separation-ami-1.0",
-        use_auth_token=HUGGINGFACE_ACCESS_TOKEN,
-    )
+    try:
+        pipeline = Pipeline.from_pretrained(
+            "pyannote/speech-separation-ami-1.0",
+            use_auth_token=HUGGINGFACE_ACCESS_TOKEN,
+        )
+    except Exception as e:
+        return [], f"Error loading pipeline: {str(e)}"
 
     waveform, sample_rate = torchaudio.load(audio_file_path)
 
@@ -40,33 +52,49 @@ def perform_separation(audio_file_path: str):
     return output_file_paths, rttm_content
 
 
-def gradio_wrapper(audio_file_path: str):
+def gradio_wrapper(audio_file_path: str, request: gr.Request):
+    global HUGGINGFACE_ACCESS_TOKEN
+
+    if not HUGGINGFACE_ACCESS_TOKEN:
+        return [""] * 10 + ["Please log in with your HuggingFace account first."]
+
     output_file_paths, rttm_content = perform_separation(audio_file_path)
-    return output_file_paths + [rttm_content]
-
-
-inputs = gr.inputs.Audio(label="Input Audio", type="filepath")
-
-# Dynamic output for audio files
-outputs = []
-max_speakers = 10  # Set a reasonable maximum number of speakers
-for i in range(max_speakers):
-    outputs.append(gr.outputs.Audio(label=f"Speaker {i+1}", type="filepath"))
-
-# Add RTTM output
-outputs.append(gr.outputs.Textbox(label="RTTM Output"))
-
-title = "Speech Separation and Diarization"
-description = "Gradio demo for Speech Separation and Diarization using Pyannote's pyannote/speech-separation-ami-1.0. To use it, simply upload your audio, or click one of the examples to load them. The app will output separated audio for each speaker and the RTTM file content."
-article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2403.02288' target='_blank'>PixIT: Joint Training of Speaker Diarization and Speech Separation from Real-world Multi-speaker Recordings</a> | <a href='https://huggingface.co/pyannote/speech-separation-ami-1.0' target='_blank'>HuggingFace Pipeline</a></p>"
-examples = [["samples_audio_samples_test_mixture.wav"]]
-
-gr.Interface(
-    gradio_wrapper,
-    inputs,
-    outputs,
-    title=title,
-    description=description,
-    article=article,
-    examples=examples,
-).launch()
+    return output_file_paths + [""] * (10 - len(output_file_paths)) + [rttm_content]
+
+
+def login(request: gr.Request):
+    global HUGGINGFACE_ACCESS_TOKEN
+
+    if request.username:
+        # User is authenticated
+        HUGGINGFACE_ACCESS_TOKEN = request.auth
+        return f"Welcome, {request.username}! You are now logged in."
+    else:
+        return "Please log in with your HuggingFace account to use this app."
+
+
+with gr.Blocks() as demo:
+    gr.Markdown("## Speech Separation and Diarization")
+    gr.Markdown("Please log in with your HuggingFace account to use this app.")
+
+    login_status = gr.Markdown()
+
+    with gr.Row():
+        input_audio = gr.Audio(label="Input Audio", type="filepath")
+
+    with gr.Row():
+        submit_button = gr.Button("Process Audio")
+
+    outputs = []
+    max_speakers = 10
+    for i in range(max_speakers):
+        outputs.append(gr.Audio(label=f"Speaker {i+1}", type="filepath"))
+
+    rttm_output = gr.Textbox(label="RTTM Output")
+
+    demo.load(login, inputs=None, outputs=login_status)
+    submit_button.click(
+        gradio_wrapper, inputs=[input_audio], outputs=outputs + [rttm_output]
+    )
+
+demo.launch(auth={"hf_oauth": True})
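
A note on the final line: in the Gradio releases I'm aware of, launch(auth=...) accepts only a (username, password) tuple, a list of such tuples, or a callable, so auth={"hf_oauth": True} would raise an error rather than enable OAuth. Hugging Face OAuth on a Space is instead switched on by adding hf_oauth: true to the Space's README.md metadata, after which Gradio injects gr.OAuthToken / gr.OAuthProfile arguments into event handlers and gr.LoginButton renders the sign-in flow. A minimal sketch of that pattern, for contrast; the handler name, messages, and return value are illustrative, not the committed code:

import gradio as gr
from pyannote.audio import Pipeline

def run(audio_file_path: str, oauth_token: gr.OAuthToken | None):
    # Gradio fills this parameter automatically because of the
    # gr.OAuthToken annotation; it is None when nobody is logged in.
    if oauth_token is None:
        return "Please log in with your Hugging Face account first."
    pipeline = Pipeline.from_pretrained(
        "pyannote/speech-separation-ami-1.0",
        use_auth_token=oauth_token.token,  # the raw access token string
    )
    return "Pipeline loaded; separation would run here."

with gr.Blocks() as demo:
    gr.LoginButton()  # renders the "Sign in with Hugging Face" button
    input_audio = gr.Audio(label="Input Audio", type="filepath")
    status = gr.Markdown()
    gr.Button("Process Audio").click(run, inputs=[input_audio], outputs=status)

demo.launch()  # no auth= argument; OAuth comes from the Space metadata

Whether the injected token can actually load a gated model depends on the scopes requested via hf_oauth_scopes in the same metadata block. Per-request injection also sidesteps a pitfall of the module-level HUGGINGFACE_ACCESS_TOKEN above: a single global token is shared by all concurrent users, so each request would run with whichever token was stored last.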