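# Real-Time Voice Typing — a Streamlit app that records microphone audio in the
# browser, transcribes it with the openai/whisper-base ASR pipeline, and appends
# each transcription to an editable text box.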
import streamlit as st
from transformers import pipeline
import torchaudio
from audio_recorder_streamlit import audio_recorder
import torch
from io import BytesIO
import hashlib

# Load Whisper model (cached)
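# st.cache_resource memoizes the pipeline, so the model is loaded once per
# process instead of on every Streamlit rerun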
@st.cache_resource
def load_model():
    return pipeline("automatic-speech-recognition", model="openai/whisper-base")

# Audio processing function
def process_audio(audio_bytes):
    waveform, sample_rate = torchaudio.load(BytesIO(audio_bytes))
    if waveform.shape[0] > 1:  # Convert stereo to mono
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    if sample_rate != 16000:  # Resample to 16kHz if needed
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
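    # Return the dict form the HF ASR pipeline accepts: a 1-D float array plus its sampling rate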
    return {"raw": waveform.numpy().squeeze(), "sampling_rate": 16000}

# Streamlit App
st.title("Real-Time Voice Typing")
st.write("Type or speak - text will appear instantly!")

# Initialize session state
if 'text_input' not in st.session_state:
    st.session_state.text_input = ""
if 'last_audio_hash' not in st.session_state:
    st.session_state.last_audio_hash = ""

# Main text area
text_input = st.text_area(
    "Your text will appear here:",
    value=st.session_state.text_input,
    height=300,
    key="text_area"
)
# Sync manual edits back to session state so appended transcriptions
# build on whatever the user has typed
st.session_state.text_input = text_input

# Audio recorder component
audio_bytes = audio_recorder(
    pause_threshold=0.8,  # Stop recording after 0.8 s of silence
    text="Speak to type",
    recording_color="#e8b62c",
    neutral_color="#6aa36f",
)
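# audio_recorder returns the finished clip as WAV-encoded bytes,
# or None while nothing has been recorded yet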

# Process new audio only if it's different from last time
if audio_bytes:
    st.info("🎀 Audio received!")
    current_hash = hashlib.md5(audio_bytes).hexdigest()
    if current_hash != st.session_state.last_audio_hash:
        st.session_state.last_audio_hash = current_hash
        try:
            audio_input = process_audio(audio_bytes)
            whisper = load_model()
            transcribed_text = whisper(audio_input)["text"]
            st.info(f"πŸ“ Transcribed: {transcribed_text}")
            
            # Append the transcription only if it is non-empty and not a repeat
            cleaned = transcribed_text.strip()
            if cleaned and not st.session_state.text_input.endswith(cleaned):
                st.session_state.text_input = (st.session_state.text_input + " " + cleaned).strip()
                st.success("βœ… Text added to box!")
                st.rerun()  # Re-render so the text area shows the updated text
                
        except Exception as e:
            st.error(f"Error: {str(e)}")

# Control buttons
col1, col2 = st.columns(2)
with col1:
    if st.button("Clear Text"):
        st.session_state.text_input = ""
        st.session_state.last_audio_hash = ""
        st.rerun()
with col2:
    st.download_button(
        "Download Text",
        data=st.session_state.text_input,
        file_name="voice_typed.txt",
        mime="text/plain"
    )