import os

import gradio as gr
from pydub import AudioSegment
from transformers import pipeline

# Load a Whisper-based audio-classification pipeline fine-tuned for
# speech emotion recognition. Note: this model predicts emotion labels,
# not transcripts, so it is a classifier rather than an ASR pipeline.
classifier = pipeline(
    "audio-classification",
    model="firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3",
)
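
# A minimal sketch of what a single call returns (the labels shown here are
# illustrative; the actual label set depends on the fine-tuned model):
#
#   classifier("sample.wav")
#   # -> [{"label": "happy", "score": 0.91}, {"label": "sad", "score": 0.04}, ...]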


def convert_audio_to_wav(audio_path):
    """Convert any pydub-supported audio file (.m4a, .mp3, ...) to WAV."""
    audio = AudioSegment.from_file(audio_path)
    wav_path = audio_path + ".wav"
    audio.export(wav_path, format="wav")
    return wav_path


def classify_emotion(audio_path):
    """Classify the emotion expressed in the uploaded audio."""
    wav_path = convert_audio_to_wav(audio_path)
    try:
        result = classifier(wav_path)  # list of {"label", "score"} dicts, best first
    finally:
        os.remove(wav_path)  # always clean up the temporary WAV file
    top = result[0]
    return f"{top['label']} ({top['score']:.2%})"
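
# Quick local check, assuming a file named "sample.wav" exists
# (hypothetical path, for illustration only):
#
#   print(classify_emotion("sample.wav"))
#   # e.g. "happy (91.00%)"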


demo = gr.Interface(
    fn=classify_emotion,
    inputs=gr.Audio(type="filepath", label="Upload Audio (.m4a, .mp3, .wav...)"),
    outputs=gr.Textbox(label="Detected Emotion"),
    title="Whisper Speech Emotion Recognition",
    description="Detects the emotion expressed in most audio formats using a Whisper-based classifier.",
)


demo.launch()