"""Streamlit voice-typing app.

Shows an editable text box and a microphone recorder. Each new recording is
transcribed with the ``openai/whisper-base`` pipeline and appended to the
text, which can be cleared or downloaded as a ``.txt`` file.
"""

import hashlib
from io import BytesIO

import numpy as np
import streamlit as st
import torch
import torchaudio
from audio_recorder_streamlit import audio_recorder
from transformers import pipeline

# Sample rate every recording is converted to before transcription.
TARGET_SAMPLE_RATE = 16000


# Load Whisper model (cached)
@st.cache_resource
def load_model():
    """Return the speech-recognition pipeline for ``openai/whisper-base``.

    Decorated with ``st.cache_resource`` so the pipeline is built once and
    reused on later script reruns instead of being reloaded every time.
    """
    return pipeline("automatic-speech-recognition", model="openai/whisper-base")


# Audio processing function
def process_audio(audio_bytes: bytes) -> dict:
    """Decode recorded audio into the dict handed to the ASR pipeline.

    Args:
        audio_bytes: Encoded audio as returned by the recorder component.

    Returns:
        ``{"raw": <1-D numpy array>, "sampling_rate": 16000}`` holding a mono
        waveform resampled to ``TARGET_SAMPLE_RATE``.

    Raises:
        Whatever ``torchaudio.load`` raises for data it cannot decode.
    """
    waveform, sample_rate = torchaudio.load(BytesIO(audio_bytes))

    if waveform.shape[0] > 1:  # Convert stereo to mono
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    if sample_rate != TARGET_SAMPLE_RATE:  # Resample to 16kHz if needed
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE
        )
        waveform = resampler(waveform)

    # Drop only the channel axis; a bare squeeze() would also collapse a
    # one-sample clip down to a 0-d array.
    return {
        "raw": waveform.squeeze(0).numpy(),
        "sampling_rate": TARGET_SAMPLE_RATE,
    }


def merge_transcription(existing: str, spoken: str) -> str:
    """Return ``existing`` with the transcription ``spoken`` appended.

    ``spoken`` is stripped first. ``existing`` is returned unchanged when the
    transcription is empty or the text already ends with it. Otherwise the
    two are joined with exactly one space, and with none when ``existing`` is
    empty or already ends in whitespace.
    """
    spoken = spoken.strip()
    if not spoken or existing.rstrip().endswith(spoken):
        return existing
    if not existing or existing[-1].isspace():
        return existing + spoken
    return f"{existing} {spoken}"


def clear_text() -> None:
    """Button callback: empty the text box.

    ``last_audio_hash`` is deliberately left alone. The hash check further
    down exists because the recorder's last recording can still be present on
    a rerun; resetting the hash here would make that old recording look new
    and put its text straight back into the box that was just cleared.
    """
    st.session_state.text_input = ""


# Streamlit App
st.title("Real-Time Voice Typing")
st.write("Type or speak - text will appear instantly!")

# Initialize session state
if "text_input" not in st.session_state:
    st.session_state.text_input = ""
if "last_audio_hash" not in st.session_state:
    st.session_state.last_audio_hash = ""

# Reserve the text box's position above the recorder. The widget itself is
# created further down, after any new audio has been merged into
# st.session_state.text_input, because the state behind a widget key may only
# be assigned before that widget is instantiated in the current run.
text_slot = st.container()

# Audio recorder component
audio_bytes = audio_recorder(
    pause_threshold=0.8,  # Shorter pause threshold
    text="Speak to type",
    recording_color="#e8b62c",
    neutral_color="#6aa36f",
)

# Process new audio only if it's different from last time
if audio_bytes:
    st.info("🎤 Audio received!")
    current_hash = hashlib.md5(audio_bytes).hexdigest()

    if current_hash != st.session_state.last_audio_hash:
        # Remember the recording before transcribing so a failing clip is
        # not retried on every rerun.
        st.session_state.last_audio_hash = current_hash
        try:
            audio_input = process_audio(audio_bytes)
            transcribed_text = load_model()(audio_input)["text"]
        except Exception as e:  # UI boundary: report the failure, keep the app up
            st.error(f"Error: {str(e)}")
        else:
            st.info(f"📝 Transcribed: {transcribed_text}")
            updated_text = merge_transcription(
                st.session_state.text_input, transcribed_text
            )
            if updated_text != st.session_state.text_input:
                st.session_state.text_input = updated_text
                # No st.rerun() here: the text box is rendered below with the
                # updated value, so this message stays on screen.
                st.success("✅ Text added to box!")

# Main text area. It is bound to the "text_input" session key, so typed edits
# and dictated text share one source of truth and both reach the download.
with text_slot:
    st.text_area(
        "Your text will appear here:",
        height=300,
        key="text_input",
    )

# Control buttons
col1, col2 = st.columns(2)
with col1:
    # A callback runs before the next script run starts, which is when a
    # widget-bound session key may still be reassigned.
    st.button("Clear Text", on_click=clear_text)
with col2:
    st.download_button(
        "Download Text",
        data=st.session_state.text_input,
        file_name="voice_typed.txt",
        mime="text/plain",
    )