File size: 2,266 Bytes
c4035be
8b91795
 
6ddfbf8
6dc3295
6ddfbf8
54d37c3
8b91795
54d37c3
8b91795
 
6dc3295
8b91795
54d37c3
 
 
 
 
 
 
 
 
8b91795
54d37c3
 
 
8b91795
54d37c3
 
 
8b91795
54d37c3
 
 
 
 
 
 
8b91795
54d37c3
 
 
 
 
 
 
8b91795
54d37c3
 
c4035be
54d37c3
 
 
6ddfbf8
54d37c3
 
 
8b91795
 
54d37c3
8b91795
54d37c3
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import streamlit as st
from transformers import pipeline
import numpy as np
import torchaudio
from audio_recorder_streamlit import audio_recorder
import torch
from io import BytesIO

# Load Whisper model (cached across reruns by Streamlit)
@st.cache_resource
def load_model():
    """Build and cache the Whisper ASR pipeline (openai/whisper-base)."""
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-base",
    )
    return asr_pipeline

# Audio processing function
def process_audio(audio_bytes):
    """Decode recorded audio bytes into the dict format Whisper expects.

    Returns {"raw": 1-D float array, "sampling_rate": 16000}: the recording
    downmixed to mono and resampled to 16 kHz.
    """
    target_rate = 16000
    signal, source_rate = torchaudio.load(BytesIO(audio_bytes))
    # Downmix multi-channel recordings to a single channel by averaging.
    if signal.shape[0] > 1:
        signal = torch.mean(signal, dim=0, keepdim=True)
    # Whisper expects 16 kHz input; resample when the recorder used another rate.
    if source_rate != target_rate:
        signal = torchaudio.transforms.Resample(
            orig_freq=source_rate, new_freq=target_rate
        )(signal)
    return {"raw": signal.numpy().squeeze(), "sampling_rate": target_rate}

# Streamlit App
st.title("Real-Time Voice Typing")
st.write("Type or speak - text will appear instantly!")

# Initialize text in session state (survives script reruns)
if "text_input" not in st.session_state:
    st.session_state["text_input"] = ""

# Main text area (auto-updates from session state)
text_input = st.text_area(
    "Your text will appear here:",
    value=st.session_state["text_input"],
    height=300,
    key="text_area",
)

# Audio recorder component
recorder_options = {
    "pause_threshold": 2.0,  # Stop after 2 seconds of silence
    "text": "Speak to type",
    "recording_color": "#e8b62c",
    "neutral_color": "#6aa36f",
}
audio_bytes = audio_recorder(**recorder_options)

# Process audio in real-time
if audio_bytes:
    # BUG FIX 1: audio_recorder returns the SAME bytes on every script rerun,
    # so without de-duplication the same clip gets transcribed and appended
    # over and over in an endless rerun loop. Remember a fingerprint of the
    # last processed clip and skip it on subsequent reruns. (Built-in hash()
    # is salted per process, but Streamlit reruns stay in one process, so it
    # is stable for the lifetime of the session.)
    fingerprint = hash(audio_bytes)
    if st.session_state.get("last_audio_fingerprint") != fingerprint:
        transcribed_text = None
        try:
            audio_input = process_audio(audio_bytes)
            whisper = load_model()
            transcribed_text = whisper(audio_input)["text"]
        except Exception as e:
            st.error(f"Error: {str(e)}")

        if transcribed_text is not None:
            # Append new transcription to existing text
            st.session_state.text_input = st.session_state.text_input + " " + transcribed_text
            st.session_state["last_audio_fingerprint"] = fingerprint
            # BUG FIX 2: Streamlit's rerun works by raising a control-flow
            # exception that inherits from Exception, so calling it INSIDE the
            # try above would be swallowed by `except Exception` and displayed
            # as an error. Trigger the rerun only after the try block.
            # st.experimental_rerun was removed in Streamlit >= 1.30; prefer
            # st.rerun when it exists, fall back for older versions.
            rerun = getattr(st, "rerun", None) or st.experimental_rerun
            rerun()  # Refresh to update text area

# Control buttons: clear the transcript / download it as a text file
col1, col2 = st.columns(2)
with col1:
    # Reset the accumulated transcript and refresh the UI so the
    # text area picks up the cleared session state.
    if st.button("Clear Text"):
        st.session_state.text_input = ""
        # st.experimental_rerun was removed in Streamlit >= 1.30; use
        # st.rerun when available and fall back on older versions.
        rerun = getattr(st, "rerun", None) or st.experimental_rerun
        rerun()
with col2:
    # Offer the current transcript as a plain-text download.
    st.download_button(
        "Download Text",
        data=st.session_state.text_input,
        file_name="voice_typed.txt",
        mime="text/plain",
    )