# VoiceGenX / app.py
import os
import torch
import torchaudio
import gradio as gr
import numpy as np
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
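# Pretrained wav2vec2 checkpoint fine-tuned for speaker-gender classification on LibriSpeech.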
MODEL_NAME = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
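# Load the feature extractor and classifier once at startup and move the model to the selected device.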
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME).to(device)
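# Class-index mappings used by the model's classification head (0 = female, 1 = male).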
label2id = {"female": 0, "male": 1}
id2label = {0: "Female", 1: "Male"}
def preprocess_audio(audio):
    """Convert stereo to mono, normalize, resample to 16 kHz, and pad audio if needed."""
    # Gradio's numpy audio input arrives as a (sample_rate, np.ndarray) tuple.
    if audio is None:
        return None
    sr, audio_data = audio
    if audio_data is None:
        return None
    # Stereo recordings are shaped (samples, channels); average the channels to get mono.
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)
    # Convert to float32 and peak-normalize so integer microphone samples land in [-1, 1].
    audio_data = audio_data.astype(np.float32)
    peak = np.max(np.abs(audio_data))
    if peak > 0:
        audio_data = audio_data / peak
    # Resample to the 16 kHz rate expected by the wav2vec2 feature extractor.
    audio_tensor = torch.tensor(audio_data, dtype=torch.float32)
    resampler = torchaudio.transforms.Resample(sr, 16000)
    audio_data_resampled = resampler(audio_tensor).numpy()
    # Pad very short clips to at least one second (16000 samples).
    min_length = 16000
    if audio_data_resampled.shape[0] < min_length:
        padding = np.zeros(min_length - audio_data_resampled.shape[0], dtype=audio_data_resampled.dtype)
        audio_data_resampled = np.concatenate([audio_data_resampled, padding])
    return audio_data_resampled
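# Run the classifier on the preprocessed audio and return a probability for each class.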
def predict_gender(audio):
    if audio is None:
        return {"Error": "No audio provided."}
    audio_data = preprocess_audio(audio)
    if audio_data is None:
        return {"Error": "Invalid audio input."}
    inputs = feature_extractor(audio_data, sampling_rate=16000, return_tensors="pt", padding=True)
    # Move each tensor in the inputs dictionary to the model's device.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    # Softmax over the two logits gives class probabilities.
    scores = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()
    return {id2label[0]: scores[0], id2label[1]: scores[1]}
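# Gradio UI: raw numpy audio in, a two-class label with probabilities out; share=True below also creates a temporary public link.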
demo = gr.Interface(
    fn=predict_gender,
    inputs=gr.Audio(type="numpy"),
    outputs=gr.Label(num_top_classes=2),
    title="Voice Gender Detection",
    description="Use the microphone option and speak into it to predict the speaker's gender from voice in real time."
)

demo.launch(debug=False, share=True)