Spaces:
Running
Running
import streamlit as st | |
from transformers import pipeline | |
import numpy as np | |
import torchaudio | |
from audio_recorder_streamlit import audio_recorder | |
import torch | |
from io import BytesIO | |
import hashlib | |
# Load Whisper model (cached)
@st.cache_resource
def load_model():
    """Return the Whisper speech-recognition pipeline, building it only once.

    Streamlit re-executes the whole script on every interaction, so the
    pipeline is wrapped in ``st.cache_resource``: the first call constructs
    it and later calls reuse that same object instead of loading the model
    again for every recording.

    Returns:
        The ``transformers`` pipeline for the "automatic-speech-recognition"
        task backed by the "openai/whisper-base" checkpoint.
    """
    return pipeline("automatic-speech-recognition", model="openai/whisper-base")
# Audio processing function
def process_audio(audio_bytes, *, target_sample_rate=16000):
    """Decode recorded audio into the mono sample dict passed to the pipeline.

    Args:
        audio_bytes: Encoded audio file contents; they are wrapped in a
            ``BytesIO`` and decoded with ``torchaudio.load``.
        target_sample_rate: Sampling rate, in Hz, of the returned samples.
            Defaults to 16000, the rate this function always produced
            before the value became a parameter.

    Returns:
        A dict with two keys: ``"raw"``, a 1-D NumPy array of samples, and
        ``"sampling_rate"``, equal to ``target_sample_rate``.
    """
    waveform, sample_rate = torchaudio.load(BytesIO(audio_bytes))

    # Axis 0 is the channel axis: average the channels to get mono while
    # keeping that axis so the resampler still sees a 2-D tensor.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_sample_rate
        )
        waveform = resampler(waveform)

    # Drop only the channel axis.  A bare ``squeeze()`` would also collapse
    # a one-sample clip to a 0-d array, which is no longer a 1-D signal.
    samples = waveform.squeeze(0).numpy()
    return {"raw": samples, "sampling_rate": target_sample_rate}
# Streamlit App
st.title("Real-Time Voice Typing")
st.write("Type or speak - text will appear instantly!")

# Initialize session state.  ``text_input`` is also the text area's widget
# key, so typed edits and appended transcriptions share a single value
# instead of drifting apart in two separate session entries.
if "text_input" not in st.session_state:
    st.session_state.text_input = ""
if "last_audio_hash" not in st.session_state:
    st.session_state.last_audio_hash = ""


def _clear_text():
    """Button callback: empty the text box.

    Runs as an ``on_click`` callback because a widget's session value may
    not be reassigned later in a script run that already created the widget.
    ``last_audio_hash`` is deliberately left alone: the recorder still hands
    back its most recent clip on the next run, and forgetting the hash would
    transcribe that clip again and put the text straight back.
    """
    st.session_state.text_input = ""


# Reserve the text area's position above the recorder.  The widget itself is
# created further down, after any new transcription has been written into
# session state, so the freshly appended text is visible in the same run.
text_area_slot = st.container()

# Audio recorder component
audio_bytes = audio_recorder(
    pause_threshold=0.8,  # Shorter pause threshold
    text="Speak to type",
    recording_color="#e8b62c",
    neutral_color="#6aa36f",
)

# Process new audio only if it's different from last time
# NOTE(review): the leading characters of the three status messages below
# look like a mis-decoded emoji -- confirm the intended glyphs.
if audio_bytes:
    st.info("π€ Audio received!")
    current_hash = hashlib.md5(audio_bytes).hexdigest()
    if current_hash != st.session_state.last_audio_hash:
        # Remember the clip before transcribing so a failing recording is
        # reported once rather than retried on every rerun.
        st.session_state.last_audio_hash = current_hash
        try:
            audio_input = process_audio(audio_bytes)
            whisper = load_model()
            transcribed_text = whisper(audio_input)["text"]
        except Exception as e:
            st.error(f"Error: {str(e)}")
        else:
            st.info(f"π Transcribed: {transcribed_text}")
            # Append new transcription only if different
            new_text = transcribed_text.strip()
            existing_text = st.session_state.text_input
            if new_text and not existing_text.endswith(new_text):
                # Join with one space, and only when the box already holds
                # text that does not itself end in whitespace.
                needs_space = bool(existing_text) and not existing_text[-1].isspace()
                separator = " " if needs_space else ""
                st.session_state.text_input = existing_text + separator + new_text
                st.success("β Text added to box!")

# Main text area, rendered into the slot reserved above.  No ``value=`` is
# passed: the widget reads and writes ``st.session_state.text_input`` via
# its key.
with text_area_slot:
    st.text_area(
        "Your text will appear here:",
        height=300,
        key="text_input",
    )

# Control buttons
col1, col2 = st.columns(2)
with col1:
    st.button("Clear Text", on_click=_clear_text)
with col2:
    st.download_button(
        "Download Text",
        data=st.session_state.text_input,
        file_name="voice_typed.txt",
        mime="text/plain",
    )