# audio-to-text / app.py
# Hugging Face Space file view: uploaded by pratikshahp, commit bc310a6
# ("Update app.py", verified), 1.8 kB.
import io

import streamlit as st
import torch
import torchaudio
from audio_recorder_streamlit import audio_recorder
from torchaudio.transforms import Resample
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
def preprocess_audio(audio_bytes, sample_rate=16000):
    """Decode audio into a mono waveform sampled at ``sample_rate`` Hz.

    Args:
        audio_bytes: Encoded audio (e.g. a WAV recording) as ``bytes`` /
            ``bytearray`` / ``memoryview``. Any other object is handed to
            ``torchaudio.load`` unchanged, so a path or file-like object
            also works.
        sample_rate: Target sampling rate in Hz.

    Returns:
        The waveform tensor returned by ``torchaudio.load``, averaged down
        to a single channel (leading dimension of size 1) and resampled to
        ``sample_rate`` when the source rate differs.
    """
    # torchaudio.load needs a path or a file-like object; raw bytes must be
    # wrapped in an in-memory buffer first.
    if isinstance(audio_bytes, (bytes, bytearray, memoryview)):
        source = io.BytesIO(audio_bytes)
    else:
        source = audio_bytes
    waveform, orig_rate = torchaudio.load(source, normalize=True)

    # Down-mix multi-channel audio to mono by averaging the channels.
    if waveform.size(0) > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Compare *sampling rates*: waveform.shape[1] is the number of samples,
    # not a rate, so it must not be used to decide or configure resampling.
    if orig_rate != sample_rate:
        resampler = Resample(orig_freq=orig_rate, new_freq=sample_rate)
        waveform = resampler(waveform)
    return waveform
def transcribe_audio(audio_bytes):
    """Transcribe recorded speech with the Speech2Text LibriSpeech model.

    Args:
        audio_bytes: Encoded audio, forwarded to ``preprocess_audio``.

    Returns:
        The list of decoded strings produced by ``processor.batch_decode``
        (one entry per generated sequence).
    """
    model_name = "facebook/s2t-small-librispeech-asr"
    # The waveform is resampled to this rate and the same value is declared
    # to the processor so it can validate it against its own configuration.
    target_rate = 16000

    # NOTE(review): model and processor are re-loaded on every call; consider
    # caching them (e.g. via Streamlit's resource cache) if latency matters.
    model = Speech2TextForConditionalGeneration.from_pretrained(model_name)
    processor = Speech2TextProcessor.from_pretrained(model_name)

    # Decode, down-mix and resample the recording.
    waveform = preprocess_audio(audio_bytes, sample_rate=target_rate)

    # Extract model input features from the 1-D mono signal.
    inputs = processor(
        waveform.squeeze(0).numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
    )

    # Forward the attention mask (when the processor provides one) so padded
    # frames are ignored during generation.
    generated_ids = model.generate(
        inputs["input_features"],
        attention_mask=inputs.get("attention_mask"),
    )
    transcriptions = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return transcriptions
# Page header.
st.title("Audio to Text Transcription with Recording")

# Render the audio_recorder widget; it yields the recorded audio once the
# user has made a recording, and a falsy value otherwise.
audio_bytes = audio_recorder()

if not audio_bytes:
    # Nothing recorded yet: prompt the user.
    st.write("Please record an audio.")
else:
    # Play the recording back, then show its transcription.
    st.audio(audio_bytes, format="audio/wav")
    transcription = transcribe_audio(audio_bytes)
    if not transcription:
        st.write("Error: Failed to transcribe audio.")
    else:
        st.write("Transcription:")
        st.write(transcription[0])