# KASOTI_GAME / app.py
# NOTE: the lines below were Hugging Face Hub page chrome captured in the
# paste ("Update app.py", commit 54d37c3, raw/history/blame links, 2.27 kB);
# kept here as comments so they no longer break the Python file.
import streamlit as st
from transformers import pipeline
import numpy as np
import torchaudio
from audio_recorder_streamlit import audio_recorder
import torch
from io import BytesIO
# Whisper ASR pipeline, constructed once and cached by Streamlit.
@st.cache_resource
def load_model():
    """Return the cached openai/whisper-base speech-recognition pipeline."""
    return pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-base",
    )
# Audio preprocessing for Whisper.
def process_audio(audio_bytes):
    """Decode recorded audio bytes into Whisper's expected input format.

    Returns a dict {"raw": 1-D float numpy array, "sampling_rate": 16000}.
    """
    waveform, sample_rate = torchaudio.load(BytesIO(audio_bytes))
    # Downmix multi-channel recordings to mono by averaging the channels.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Whisper expects 16 kHz audio; resample when the recording differs.
    if sample_rate != 16000:
        resample = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=16000
        )
        waveform = resample(waveform)
    return {"raw": waveform.numpy().squeeze(), "sampling_rate": 16000}
# Streamlit App
st.title("Real-Time Voice Typing")
st.write("Type or speak - text will appear instantly!")

# Accumulated transcript lives in session state so it survives reruns.
if 'text_input' not in st.session_state:
    st.session_state.text_input = ""
# Remember the last recording we transcribed: the recorder component returns
# the same bytes on every rerun, and without this guard the same clip was
# re-transcribed and appended again on each rerun.
if 'last_audio' not in st.session_state:
    st.session_state.last_audio = None

# Main text area (auto-updates from session state)
text_input = st.text_area(
    "Your text will appear here:",
    value=st.session_state.text_input,
    height=300,
    key="text_area"
)

# Audio recorder component
audio_bytes = audio_recorder(
    pause_threshold=2.0,  # Stop after 2 seconds of silence
    text="Speak to type",
    recording_color="#e8b62c",
    neutral_color="#6aa36f",
)

# Transcribe only when a *new* recording arrived (see last_audio guard above).
if audio_bytes and audio_bytes != st.session_state.last_audio:
    st.session_state.last_audio = audio_bytes
    try:
        audio_input = process_audio(audio_bytes)
        whisper = load_model()
        transcribed_text = whisper(audio_input)["text"]
        # Join with a space only when text already exists, so a fresh
        # transcript does not begin with a leading space.
        if st.session_state.text_input:
            st.session_state.text_input += " " + transcribed_text
        else:
            st.session_state.text_input = transcribed_text
        # NOTE(review): st.experimental_rerun() is deprecated (removed in
        # newer Streamlit); switch to st.rerun() when on >= 1.27.
        st.experimental_rerun()  # Refresh to update text area
    except Exception as e:
        st.error(f"Error: {str(e)}")

# Control buttons
col1, col2 = st.columns(2)
with col1:
    if st.button("Clear Text"):
        st.session_state.text_input = ""
        st.experimental_rerun()
with col2:
    st.download_button(
        "Download Text",
        data=st.session_state.text_input,
        file_name="voice_typed.txt",
        mime="text/plain"
    )