import streamlit as st
from transformers import pipeline
import numpy as np
import torchaudio
from audio_recorder_streamlit import audio_recorder
import torch
from io import BytesIO
import hashlib
# Load Whisper model (cached)
@st.cache_resource
def load_model():
    return pipeline("automatic-speech-recognition", model="openai/whisper-base")
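# Note: pipeline() defaults to CPU; if a GPU is available you could pass
# device=0 here to run inference on it (an optional tweak, not required).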
# Audio processing function
def process_audio(audio_bytes):
    waveform, sample_rate = torchaudio.load(BytesIO(audio_bytes))
    if waveform.shape[0] > 1:  # Convert stereo to mono
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    if sample_rate != 16000:  # Resample to 16kHz if needed
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
    return {"raw": waveform.numpy().squeeze(), "sampling_rate": 16000}
# Streamlit App
st.title("Real-Time Voice Typing")
st.write("Type or speak - text will appear instantly!")
# Initialize session state
if 'text_input' not in st.session_state:
    st.session_state.text_input = ""
if 'last_audio_hash' not in st.session_state:
    st.session_state.last_audio_hash = ""
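# st.session_state persists these values across Streamlit reruns, which
# happen on every user interaction (including each recorded audio clip).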
# Main text area
text_input = st.text_area(
    "Your text will appear here:",
    value=st.session_state.text_input,
    height=300,
    key="text_area"
)
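# Caveat: once a keyed widget exists, Streamlit keeps its state under
# st.session_state["text_area"] and may ignore later changes to `value`;
# if appended transcriptions do not show up, one option is to process the
# audio (and update st.session_state.text_input) before this widget renders.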
# Audio recorder component
audio_bytes = audio_recorder(
    pause_threshold=0.8,  # Shorter pause threshold
    text="Speak to type",
    recording_color="#e8b62c",
    neutral_color="#6aa36f",
)
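# pause_threshold controls how long a silence (in seconds) the recorder
# waits before stopping, so 0.8 should make it hand audio back quickly
# once the speaker pauses.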
# Process new audio only if it's different from last time
if audio_bytes:
    st.info("🎤 Audio received!")
    current_hash = hashlib.md5(audio_bytes).hexdigest()
    if current_hash != st.session_state.last_audio_hash:
        st.session_state.last_audio_hash = current_hash
        try:
            audio_input = process_audio(audio_bytes)
            whisper = load_model()
            transcribed_text = whisper(audio_input)["text"]
            st.info(f"📝 Transcribed: {transcribed_text}")
            # Append new transcription only if different
            if (not st.session_state.text_input.endswith(transcribed_text.strip()) and
                    len(transcribed_text.strip()) > 0):
                st.session_state.text_input += " " + transcribed_text
                st.success("✅ Text added to box!")
                st.rerun()
        except Exception as e:
            st.error(f"Error: {str(e)}")
# Control buttons
col1, col2 = st.columns(2)
with col1:
    if st.button("Clear Text"):
        st.session_state.text_input = ""
        st.session_state.last_audio_hash = ""
        st.rerun()
with col2:
    st.download_button(
        "Download Text",
        data=st.session_state.text_input,
        file_name="voice_typed.txt",
        mime="text/plain"
    )
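# To try this locally (assuming the file is saved as app.py; the filename
# is not shown above):
#   pip install streamlit transformers torch torchaudio audio-recorder-streamlit
#   streamlit run app.py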