Ben Wiley committed on
Commit
a7ab6a9
·
1 Parent(s): 73bfca0

adding init

Browse files
Files changed (2) hide show
  1. app.py +72 -0
  2. requirement.txt +3 -0
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torchaudio
2
+ import gradio as gr
3
+ from pyannote.audio import Pipeline
4
+ from pyannote.audio.pipelines.utils.hook import ProgressHook
5
+ import scipy.io.wavfile
6
+ import os
7
+
8
+
9
def perform_separation(audio_file_path: str):
    """Run speaker separation + diarization on an audio file.

    Parameters
    ----------
    audio_file_path : str
        Path to the input audio file (any format torchaudio can load).

    Returns
    -------
    tuple[list[str], str]
        Paths of the per-speaker WAV files written to the working
        directory, and the diarization result serialized as RTTM text.
    """
    # Instantiate the pipeline.
    # Fix: the original referenced an undefined bare name
    # HUGGINGFACE_ACCESS_TOKEN (NameError on first call); read the token
    # from the environment instead — `os` was imported for this purpose.
    pipeline = Pipeline.from_pretrained(
        "pyannote/speech-separation-ami-1.0",
        use_auth_token=os.environ.get("HUGGINGFACE_ACCESS_TOKEN"),
    )

    waveform, sample_rate = torchaudio.load(audio_file_path)

    # Run the pipeline
    with ProgressHook() as hook:
        diarization, sources = pipeline(
            {"waveform": waveform, "sample_rate": sample_rate}, hook=hook
        )

    # Save separated sources to disk as SPEAKER_XX.wav files.
    # There may be fewer separated source channels than diarized speakers,
    # so stop once the source channels are exhausted.
    output_file_paths = []
    number_of_separated_sources = sources.data.shape[1]  # loop-invariant, hoisted
    for s, speaker in enumerate(diarization.labels()):
        if s >= number_of_separated_sources:
            break

        output_file_path = f"{speaker}.wav"
        scipy.io.wavfile.write(
            output_file_path, sample_rate, sources.data[:, s].numpy()
        )
        output_file_paths.append(output_file_path)

    # Generate RTTM content
    rttm_content = diarization.to_rttm()

    return output_file_paths, rttm_content
41
+
42
+
43
def gradio_wrapper(audio_file_path: str):
    """Adapt perform_separation() to Gradio's fixed-length output list.

    The Interface below declares exactly max_speakers audio outputs plus
    one textbox, and Gradio requires one return value per declared output.
    Fix: pad the variable-length list of speaker files with None (empty
    audio slot) up to max_speakers before appending the RTTM text, so
    recordings with fewer speakers no longer trigger an output-count error.
    """
    output_file_paths, rttm_content = perform_separation(audio_file_path)
    padding = [None] * (max_speakers - len(output_file_paths))
    return output_file_paths + padding + [rttm_content]
46
+
47
+
48
# Gradio UI wiring.
# Fix: the gr.inputs / gr.outputs namespaces were removed in Gradio 3.x+;
# requirement.txt does not pin a Gradio version, so a fresh install would
# crash here. Use the top-level component classes instead.
inputs = gr.Audio(label="Input Audio", type="filepath")

# Fixed number of output slots for audio files — Gradio needs the output
# count known up front, so allocate one slot per potential speaker.
max_speakers = 10  # Set a reasonable maximum number of speakers
outputs = [
    gr.Audio(label=f"Speaker {i + 1}", type="filepath")
    for i in range(max_speakers)
]

# Add RTTM output
outputs.append(gr.Textbox(label="RTTM Output"))

title = "Speech Separation and Diarization"
description = "Gradio demo for Speech Separation and Diarization using Pyannote's pyannote/speech-separation-ami-1.0. To use it, simply upload your audio, or click one of the examples to load them. The app will output separated audio for each speaker and the RTTM file content."
# Fix: the second anchor was malformed ("'_blank'" with no target= attribute).
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2403.02288' target='_blank'>PixIT: Joint Training of Speaker Diarization and Speech Separation from Real-world Multi-speaker Recordings</a> | <a href='https://huggingface.co/pyannote/speech-separation-ami-1.0' target='_blank'>HuggingFace Pipeline</a></p>"
examples = [["samples_audio_samples_test_mixture.wav"]]

gr.Interface(
    gradio_wrapper,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()
requirement.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ pyannote.audio[separation]==3.3.0
2
+ torchaudio
3
+ scipy
4
+ gradio