Spaces:

mangoesai
/

Pyannote_diarization

Running on T4

App Files Community

Y-Mangoes committed 10 days ago

Commit

84035c8

verified ·

1 Parent(s): 4873c49

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -115

app.py CHANGED Viewed

@@ -1,133 +1,101 @@
 import gradio as gr
 import torch
 from pyannote.audio import Pipeline
-from pyannote.core import Segment, Annotation
-import os
 from huggingface_hub import login
-import tempfile
-import librosa
-import soundfile as sf
 import numpy as np
-import warnings
-# Suppress torchaudio backend warning
-warnings.filterwarnings("ignore", category=UserWarning, module="pyannote.audio.core.io")
-# Authenticate with Hugging Face
-os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")  # Set in Hugging Face Space secrets
-login(token=os.environ["HF_TOKEN"])
-# Initialize the pyannote pipeline with pre-trained model
-pipeline = Pipeline.from_pretrained(
-    "pyannote/speaker-diarization-3.1",
-    use_auth_token=True
-)
-# Optimize for GPU if available
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-pipeline.to(device)
-def process_audio(audio_file):
-    """
-    Process the input audio file and return diarization results.
-    Args:
-        audio_file: Path to the input audio file
-    Returns:
-        Tuple containing:
-        - Diarization text output
-        - Path to visualization plot
-        - Number of speakers detected
-    """
     try:
-        # Load and preprocess audio
-        audio, sr = librosa.load(audio_file, sr=16000, mono=True)
-        # Save temporary audio file in WAV format (pyannote requirement)
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
-            sf.write(temp_file.name, audio, sr)
-            temp_file_path = temp_file.name
-        # Perform speaker diarization
-        diarization = pipeline({"uri": "audio", "audio": temp_file_path})
-        # Clean up temporary file
-        os.unlink(temp_file_path)
-        # Process diarization results
-        output_text = []
-        speakers = set()
         for turn, _, speaker in diarization.itertracks(yield_label=True):
-            start = turn.start
-            end = turn.end
-            output_text.append(
-                f"Speaker {speaker}: {start:.2f}s - {end:.2f}s"
-            )
-            speakers.add(speaker)
-        # Generate visualization
-        plot_path = visualize_diarization(diarization, audio, sr)
-        return (
-            "\n".join(output_text),
-            plot_path,
-            len(speakers)
-        )
     except Exception as e:
-        return f"Error processing audio: {str(e)}", None, 0
-def visualize_diarization(diarization, audio, sr):
-    """
-    Create a visualization of the diarization results.
-    Args:
-        diarization: Pyannote diarization object
-        audio: Audio waveform
-        sr: Sample rate
-    Returns:
-        Path to saved visualization plot
-    """
-    import matplotlib.pyplot as plt
-    plt.figure(figsize=(12, 4))
-    # Plot waveform
-    time = np.linspace(0, len(audio)/sr, num=len(audio))
-    plt.plot(time, audio, alpha=0.3, color='gray')
-    # Plot diarization segments
-    for turn, _, speaker in diarization.itertracks(yield_label=True):
-        plt.axvspan(turn.start, turn.end, alpha=0.2, label=f'Speaker {speaker}')
-    plt.xlabel('Time (s)')
-    plt.ylabel('Amplitude')
-    plt.title('Speaker Diarization')
-    plt.legend()
-    # Save plot
-    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_plot:
-        plt.savefig(temp_plot.name)
-        plot_path = temp_plot.name
-    plt.close()
-    return plot_path
-# Create Gradio interface
-iface = gr.Interface(
-    fn=process_audio,
-    inputs=gr.Audio(type="filepath", label="Upload Audio File"),
-    outputs=[
-        gr.Textbox(label="Diarization Results"),
-        gr.Image(label="Visualization"),
-        gr.Number(label="Number of Speakers")
-    ],
-    title="Speaker Diarization with Pyannote 3.1",
-    description="Upload an audio file to perform speaker diarization. Results show speaker segments and a visualization."
-)
-# Launch the interface
-if __name__ == "__main__":
-    iface.launch()

+import os
 import gradio as gr
 import torch
+import torchaudio
+from pydub import AudioSegment
 from pyannote.audio import Pipeline
 from huggingface_hub import login
 import numpy as np
+import json
+# Authenticate with Huggingface
+HF_TOKEN = os.getenv("HF_TOKEN")
+if HF_TOKEN:
+    login(HF_TOKEN)
+else:
+    raise ValueError("Huggingface token not found. Set HF_TOKEN environment variable.")
+# Load the diarization pipeline
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.0").to(device)
+def preprocess_audio(audio_path):
+    """Convert audio to mono, 16kHz WAV format suitable for pyannote."""
     try:
+        # Load audio with pydub
+        audio = AudioSegment.from_file(audio_path)
+        # Convert to mono and set sample rate to 16kHz
+        audio = audio.set_channels(1).set_frame_rate(16000)
+        # Export to temporary WAV file
+        temp_wav = "temp_audio.wav"
+        audio.export(temp_wav, format="wav")
+        return temp_wav
+    except Exception as e:
+        raise ValueError(f"Error preprocessing audio: {str(e)}")
+def diarize_audio(audio_path, num_speakers):
+    """Perform speaker diarization and return formatted results."""
+    try:
+        # Validate inputs
+        if not os.path.exists(audio_path):
+            raise ValueError("Audio file not found.")
+        if not isinstance(num_speakers, int) or num_speakers < 1:
+            raise ValueError("Number of speakers must be a positive integer.")
+        # Preprocess audio
+        wav_path = preprocess_audio(audio_path)
+        # Load audio for pyannote
+        waveform, sample_rate = torchaudio.load(wav_path)
+        audio_dict = {"waveform": waveform.to(device), "sample_rate": sample_rate}
+        # Configure pipeline with number of speakers
+        pipeline_params = {"num_speakers": num_speakers}
+        diarization = pipeline(audio_dict, **pipeline_params)
+        # Format results
+        results = []
+        text_output = ""
         for turn, _, speaker in diarization.itertracks(yield_label=True):
+            result = {
+                "start": round(turn.start, 3),
+                "end": round(turn.end, 3),
+                "speaker_id": speaker
+            }
+            results.append(result)
+            text_output += f"Speaker {speaker}: {result['start']}s - {result['end']}s\n"
+        # Clean up temporary file
+        if os.path.exists(wav_path):
+            os.remove(wav_path)
+        # Return text and JSON results
+        json_output = json.dumps(results, indent=2)
+        return text_output, json_output
     except Exception as e:
+        return f"Error: {str(e)}", ""
+# Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("# Speaker Diarization with Pyannote 3.0")
+    gr.Markdown("Upload an audio file and specify the number of speakers to diarize the audio.")
+    with gr.Row():
+        audio_input = gr.Audio(label="Upload Audio File", type="filepath")
+        num_speakers = gr.Slider(minimum=1, maximum=10, step=1, label="Number of Speakers", value=2)
+    submit_btn = gr.Button("Diarize")
+    with gr.Row():
+        text_output = gr.Textbox(label="Diarization Results (Text)")
+        json_output = gr.Textbox(label="Diarization Results (JSON)")
+    submit_btn.click(
+        fn=diarize_audio,
+        inputs=[audio_input, num_speakers],
+        outputs=[text_output, json_output]
+    )
+# Launch the Gradio app
+demo.launch()