morbiwalaq committed
Commit 03ec144 · verified · 1 Parent(s): 0de4764

Create app.py

Files changed (1): app.py +71 -0
app.py ADDED
@@ -0,0 +1,71 @@
+ import os
+ import torch
+ import torchaudio
+ import gradio as gr
+ import numpy as np
+ from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
+ import torchaudio.transforms as transforms
+
+
+ MODEL_NAME = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech"
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
+ model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME).to(device)
+
+ label2id = {"female": 0, "male": 1}
+ id2label = {0: "Female", 1: "Male"}
+
+
+ def preprocess_audio(audio):
+     """Convert stereo to mono, normalize, resample, and pad audio if needed."""
+     # Gradio passes None when nothing was recorded or uploaded.
+     if audio is None:
+         return None
+     sr, audio_data = audio
+     if audio_data is None:
+         return None
+
+     # gr.Audio(type="numpy") yields arrays of shape (samples,) or (samples, channels);
+     # average across channels to get mono.
+     if audio_data.ndim > 1:
+         audio_data = np.mean(audio_data, axis=1)
+
+     # Microphone input arrives as int16 PCM; scale to float32 in [-1, 1].
+     if np.issubdtype(audio_data.dtype, np.integer):
+         audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
+     else:
+         audio_data = audio_data.astype(np.float32)
+
+     # Resample to the 16 kHz rate the model expects.
+     audio_tensor = torch.tensor(audio_data, dtype=torch.float32)
+     resampler = torchaudio.transforms.Resample(sr, 16000)
+     audio_data_resampled = resampler(audio_tensor).numpy()
+
+     # Pad clips shorter than one second (16000 samples) with silence.
+     min_length = 16000
+     if audio_data_resampled.shape[0] < min_length:
+         padding = np.zeros(min_length - audio_data_resampled.shape[0], dtype=audio_data_resampled.dtype)
+         audio_data_resampled = np.concatenate([audio_data_resampled, padding])
+
+     return audio_data_resampled
+
+
+ def predict_gender(audio):
+     if audio is None:
+         return {"Error": "No audio provided."}
+     audio_data = preprocess_audio(audio)
+     if audio_data is None:
+         return {"Error": "Invalid audio input."}
+
+     inputs = feature_extractor(audio_data, sampling_rate=16000, return_tensors="pt", padding=True)
+     # Move each tensor in the inputs dictionary to the model's device.
+     inputs = {k: v.to(device) for k, v in inputs.items()}
+
+     with torch.no_grad():
+         logits = model(**inputs).logits
+     scores = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()
+
+     return {id2label[0]: scores[0], id2label[1]: scores[1]}
+
+
+ demo = gr.Interface(
+     fn=predict_gender,
+     inputs=gr.Audio(type="numpy"),
+     outputs=gr.Label(num_top_classes=2),
+     title="Voice Gender Detection",
+     description="Use the microphone option and speak to get a real-time gender prediction from your voice.",
+ )
+
+ demo.launch(debug=False, share=True)
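
For a quick local check, predict_gender can be called directly with the same (sample_rate, numpy array) tuple that gr.Audio(type="numpy") passes to it. This is only a sketch and not part of the commit; the 440 Hz sine tone and 3-second duration are arbitrary assumptions, and it needs the definitions above in scope.

    import numpy as np
    sr = 16000
    t = np.linspace(0, 3, 3 * sr, endpoint=False)
    tone = (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
    # Returns a dict mapping "Female" and "Male" to softmax probabilities.
    print(predict_gender((sr, tone)))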