Y-Mangoes committed
Commit 00e1f93 · verified · 1 Parent(s): 4ddfa68

Update app.py

Files changed (1)
  1. app.py +105 -36
app.py CHANGED
@@ -5,58 +5,127 @@ from pyannote.core import Segment, Annotation
  import os
  from huggingface_hub import login
  import tempfile
+ import librosa
+ import soundfile as sf
+ import numpy as np
+ import warnings
+
+ # Suppress torchaudio backend warning
+ warnings.filterwarnings("ignore", category=UserWarning, module="pyannote.audio.core.io")

  # Authenticate with Hugging Face
- HF_TOKEN = os.getenv("HF_TOKEN")
- if HF_TOKEN:
-     login(token=HF_TOKEN)
- else:
-     raise ValueError("HF_TOKEN environment variable not set. Please set it in Hugging Face Space settings.")
+ os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")  # Set in Hugging Face Space secrets
+ login(token=os.environ["HF_TOKEN"])

- # Initialize the pyannote pipeline with GPU support
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ # Initialize the pyannote pipeline with pre-trained model
  pipeline = Pipeline.from_pretrained(
      "pyannote/speaker-diarization-3.1",
-     use_auth_token=HF_TOKEN
- ).to(device)
+     use_auth_token=True
+ )
+
+ # Optimize for GPU if available
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ pipeline.to(device)

- def diarize_audio(audio_file):
-     try:
-         # Verify audio file format
-         if not audio_file.endswith('.wav'):
-             return "Error: Please upload a WAV file."
-
-         # Process the audio file
+ def process_audio(audio_file):
+     """
+     Process the input audio file and return diarization results.
+
+     Args:
+         audio_file: Path to the input audio file
+
+     Returns:
+         Tuple containing:
+         - Diarization text output
+         - Path to visualization plot
+         - Number of speakers detected
+     """
+     try:
+         # Load and preprocess audio
+         audio, sr = librosa.load(audio_file, sr=16000, mono=True)
+
+         # Save temporary audio file in WAV format (pyannote requirement)
          with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
-             temp_file.write(open(audio_file, 'rb').read())
+             sf.write(temp_file.name, audio, sr)
              temp_file_path = temp_file.name
-
-         # Perform diarization
-         diarization = pipeline(temp_file_path)
-
-         # Format the output
-         output = []
+
+         # Perform speaker diarization
+         diarization = pipeline({"uri": "audio", "audio": temp_file_path})
+
+         # Clean up temporary file
+         os.unlink(temp_file_path)
+
+         # Process diarization results
+         output_text = []
+         speakers = set()
          for turn, _, speaker in diarization.itertracks(yield_label=True):
              start = turn.start
              end = turn.end
-             output.append(f"Speaker {speaker}: {start:.1f}s - {end:.1f}s")
-
-         # Clean up temporary file
-         os.unlink(temp_file_path)
-
-         # Return formatted results
-         return "\n".join(output) if output else "No speakers detected."
-
+             output_text.append(
+                 f"Speaker {speaker}: {start:.2f}s - {end:.2f}s"
+             )
+             speakers.add(speaker)
+
+         # Generate visualization
+         plot_path = visualize_diarization(diarization, audio, sr)
+
+         return (
+             "\n".join(output_text),
+             plot_path,
+             len(speakers)
+         )
+
      except Exception as e:
-         return f"Error processing audio: {str(e)}"
+         return f"Error processing audio: {str(e)}", None, 0
+
+ def visualize_diarization(diarization, audio, sr):
+     """
+     Create a visualization of the diarization results.
+
+     Args:
+         diarization: Pyannote diarization object
+         audio: Audio waveform
+         sr: Sample rate
+
+     Returns:
+         Path to saved visualization plot
+     """
+     import matplotlib.pyplot as plt
+
+     plt.figure(figsize=(12, 4))
+
+     # Plot waveform
+     time = np.linspace(0, len(audio)/sr, num=len(audio))
+     plt.plot(time, audio, alpha=0.3, color='gray')
+
+     # Plot diarization segments
+     for turn, _, speaker in diarization.itertracks(yield_label=True):
+         plt.axvspan(turn.start, turn.end, alpha=0.2, label=f'Speaker {speaker}')
+
+     plt.xlabel('Time (s)')
+     plt.ylabel('Amplitude')
+     plt.title('Speaker Diarization')
+     plt.legend()
+
+     # Save plot
+     with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_plot:
+         plt.savefig(temp_plot.name)
+         plot_path = temp_plot.name
+
+     plt.close()
+     return plot_path

  # Create Gradio interface
  iface = gr.Interface(
-     fn=diarize_audio,
-     inputs=gr.Audio(type="filepath", label="Upload WAV Audio File"),
-     outputs=gr.Textbox(label="Diarization Results"),
-     title="Speaker Diarization with pyannote.audio 3.1",
-     description="Upload a WAV audio file to perform speaker diarization. Results show speaker segments with timestamps."
+     fn=process_audio,
+     inputs=gr.Audio(type="filepath", label="Upload Audio File"),
+     outputs=[
+         gr.Textbox(label="Diarization Results"),
+         gr.Image(label="Visualization"),
+         gr.Number(label="Number of Speakers")
+     ],
+     title="Speaker Diarization with Pyannote 3.1",
+     description="Upload an audio file to perform speaker diarization. Results show speaker segments and a visualization."
  )

  # Launch the interface
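
For reviewers who want to exercise the updated diarization step outside the Space, here is a minimal local sketch (illustration only, not part of this commit). It assumes pyannote.audio 3.1 and torch are installed, that HF_TOKEN is exported in the environment, and that sample.wav is a placeholder path to a local audio file.

# Minimal local sketch, not part of this commit.
# Assumptions: pyannote.audio 3.1 + torch installed, HF_TOKEN exported,
# and "sample.wav" replaced with a real local audio file.
import os
import torch
from pyannote.audio import Pipeline

pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.environ["HF_TOKEN"],
)
pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# A plain file path works here; the dict form used in app.py
# ({"uri": ..., "audio": path}) is an equivalent way to pass the input.
diarization = pipeline("sample.wav")

for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"Speaker {speaker}: {turn.start:.2f}s - {turn.end:.2f}s")

As a side note, pyannote.audio also accepts an in-memory mapping such as {"waveform": tensor, "sample_rate": sr}, which would avoid the temporary WAV round-trip that process_audio does via librosa and soundfile; this commit keeps the file-based approach.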