Spaces:

mangoesai
/

Pyannote_diarization

Running on T4

App Files Community

Y-Mangoes committed 9 days ago

Commit

00e1f93

verified ·

1 Parent(s): 4ddfa68

Update app.py

Browse files

Files changed (1) hide show

app.py +105 -36

app.py CHANGED Viewed

@@ -5,58 +5,127 @@ from pyannote.core import Segment, Annotation
 import os
 from huggingface_hub import login
 import tempfile
 # Authenticate with Hugging Face
-HF_TOKEN = os.getenv("HF_TOKEN")
-if HF_TOKEN:
-    login(token=HF_TOKEN)
-else:
-    raise ValueError("HF_TOKEN environment variable not set. Please set it in Hugging Face Space settings.")
-# Initialize the pyannote pipeline with GPU support
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 pipeline = Pipeline.from_pretrained(
     "pyannote/speaker-diarization-3.1",
-    use_auth_token=HF_TOKEN
-).to(device)
-def diarize_audio(audio_file):
-    try:
-        # Verify audio file format
-        if not audio_file.endswith('.wav'):
-            return "Error: Please upload a WAV file."
-        # Process the audio file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
-            temp_file.write(open(audio_file, 'rb').read())
             temp_file_path = temp_file.name
-        # Perform diarization
-        diarization = pipeline(temp_file_path)
-        # Format the output
-        output = []
         for turn, _, speaker in diarization.itertracks(yield_label=True):
             start = turn.start
             end = turn.end
-            output.append(f"Speaker {speaker}: {start:.1f}s - {end:.1f}s")
-        # Clean up temporary file
-        os.unlink(temp_file_path)
-        # Return formatted results
-        return "\n".join(output) if output else "No speakers detected."
     except Exception as e:
-        return f"Error processing audio: {str(e)}"
 # Create Gradio interface
 iface = gr.Interface(
-    fn=diarize_audio,
-    inputs=gr.Audio(type="filepath", label="Upload WAV Audio File"),
-    outputs=gr.Textbox(label="Diarization Results"),
-    title="Speaker Diarization with pyannote.audio 3.1",
-    description="Upload a WAV audio file to perform speaker diarization. Results show speaker segments with timestamps."
 )
 # Launch the interface

 import os
 from huggingface_hub import login
 import tempfile
+import librosa
+import soundfile as sf
+import numpy as np
+import warnings
+# Suppress torchaudio backend warning
+warnings.filterwarnings("ignore", category=UserWarning, module="pyannote.audio.core.io")
 # Authenticate with Hugging Face
+os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")  # Set in Hugging Face Space secrets
+login(token=os.environ["HF_TOKEN"])
+# Initialize the pyannote pipeline with pre-trained model
 pipeline = Pipeline.from_pretrained(
     "pyannote/speaker-diarization-3.1",
+    use_auth_token=True
+)
+# Optimize for GPU if available
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+pipeline.to(device)
+def process_audio(audio_file):
+    """
+    Process the input audio file and return diarization results.
+    Args:
+        audio_file: Path to the input audio file
+    Returns:
+        Tuple containing:
+        - Diarization text output
+        - Path to visualization plot
+        - Number of speakers detected
+    """
+    try:
+        # Load and preprocess audio
+        audio, sr = librosa.load(audio_file, sr=16000, mono=True)
+        # Save temporary audio file in WAV format (pyannote requirement)
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+            sf.write(temp_file.name, audio, sr)
             temp_file_path = temp_file.name
+        # Perform speaker diarization
+        diarization = pipeline({"uri": "audio", "audio": temp_file_path})
+        # Clean up temporary file
+        os.unlink(temp_file_path)
+        # Process diarization results
+        output_text = []
+        speakers = set()
         for turn, _, speaker in diarization.itertracks(yield_label=True):
             start = turn.start
             end = turn.end
+            output_text.append(
+                f"Speaker {speaker}: {start:.2f}s - {end:.2f}s"
+            )
+            speakers.add(speaker)
+        # Generate visualization
+        plot_path = visualize_diarization(diarization, audio, sr)
+        return (
+            "\n".join(output_text),
+            plot_path,
+            len(speakers)
+        )
     except Exception as e:
+        return f"Error processing audio: {str(e)}", None, 0
+def visualize_diarization(diarization, audio, sr):
+    """
+    Create a visualization of the diarization results.
+    Args:
+        diarization: Pyannote diarization object
+        audio: Audio waveform
+        sr: Sample rate
+    Returns:
+        Path to saved visualization plot
+    """
+    import matplotlib.pyplot as plt
+    plt.figure(figsize=(12, 4))
+    # Plot waveform
+    time = np.linspace(0, len(audio)/sr, num=len(audio))
+    plt.plot(time, audio, alpha=0.3, color='gray')
+    # Plot diarization segments
+    for turn, _, speaker in diarization.itertracks(yield_label=True):
+        plt.axvspan(turn.start, turn.end, alpha=0.2, label=f'Speaker {speaker}')
+    plt.xlabel('Time (s)')
+    plt.ylabel('Amplitude')
+    plt.title('Speaker Diarization')
+    plt.legend()
+    # Save plot
+    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_plot:
+        plt.savefig(temp_plot.name)
+        plot_path = temp_plot.name
+    plt.close()
+    return plot_path
 # Create Gradio interface
 iface = gr.Interface(
+    fn=process_audio,
+    inputs=gr.Audio(type="filepath", label="Upload Audio File"),
+    outputs=[
+        gr.Textbox(label="Diarization Results"),
+        gr.Image(label="Visualization"),
+        gr.Number(label="Number of Speakers")
+    ],
+    title="Speaker Diarization with Pyannote 3.1",
+    description="Upload an audio file to perform speaker diarization. Results show speaker segments and a visualization."
 )
 # Launch the interface