iisadia committed on
Commit 54d37c3 · verified · 1 Parent(s): 3964945

Update app.py

Files changed (1)
  1. app.py +53 -69
app.py CHANGED
@@ -2,89 +2,73 @@ import streamlit as st
 from transformers import pipeline
 import numpy as np
 import torchaudio
-from io import BytesIO
 from audio_recorder_streamlit import audio_recorder
 import torch
+from io import BytesIO
 
-# Load Whisper model
+# Load Whisper model (cached)
 @st.cache_resource
 def load_model():
     return pipeline("automatic-speech-recognition", model="openai/whisper-base")
 
-st.title("Text Entry with Voice Input")
-st.write("Enter text manually or use voice input:")
+# Audio processing function
+def process_audio(audio_bytes):
+    waveform, sample_rate = torchaudio.load(BytesIO(audio_bytes))
+    if waveform.shape[0] > 1:  # Convert stereo to mono
+        waveform = torch.mean(waveform, dim=0, keepdim=True)
+    if sample_rate != 16000:  # Resample to 16kHz if needed
+        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+        waveform = resampler(waveform)
+    return {"raw": waveform.numpy().squeeze(), "sampling_rate": 16000}
 
-# Initialize session state
-if 'combined_text' not in st.session_state:
-    st.session_state.combined_text = ""
+# Streamlit App
+st.title("Real-Time Voice Typing")
+st.write("Type or speak - text will appear instantly!")
 
-# Create columns layout
-col1, col2 = st.columns(2)
+# Initialize text in session state
+if 'text_input' not in st.session_state:
+    st.session_state.text_input = ""
 
-with col1:
-    # Text input
-    text_input = st.text_area("Type your text here:", height=200)
+# Main text area (auto-updates from session state)
+text_input = st.text_area(
+    "Your text will appear here:",
+    value=st.session_state.text_input,
+    height=300,
+    key="text_area"
+)
 
-with col2:
-    # Audio input
-    st.write("Record your voice:")
-    audio_bytes = audio_recorder()
-    if audio_bytes:
-        st.audio(audio_bytes, format="audio/wav")
+# Audio recorder component
+audio_bytes = audio_recorder(
+    pause_threshold=2.0,  # Stop after 2 seconds of silence
+    text="Speak to type",
+    recording_color="#e8b62c",
+    neutral_color="#6aa36f",
+)
 
-def process_audio(audio_bytes):
+# Process audio in real-time
+if audio_bytes:
     try:
-        # Convert bytes to numpy array
-        waveform, sample_rate = torchaudio.load(BytesIO(audio_bytes))
+        audio_input = process_audio(audio_bytes)
+        whisper = load_model()
+        transcribed_text = whisper(audio_input)["text"]
 
-        # Convert stereo to mono if needed
-        if waveform.shape[0] > 1:
-            waveform = torch.mean(waveform, dim=0, keepdim=True)
-
-        # Resample to 16kHz if needed (Whisper's expected sample rate)
-        if sample_rate != 16000:
-            resampler = torchaudio.transforms.Resample(
-                orig_freq=sample_rate,
-                new_freq=16000
-            )
-            waveform = resampler(waveform)
-            sample_rate = 16000
-
-        # Convert to numpy array
-        audio_np = waveform.numpy().squeeze()
+        # Append new transcription to existing text
+        st.session_state.text_input = st.session_state.text_input + " " + transcribed_text
+        st.experimental_rerun()  # Refresh to update text area
 
-        return {"raw": audio_np, "sampling_rate": sample_rate}
     except Exception as e:
-        st.error(f"Audio processing error: {str(e)}")
-        return None
-
-# Process audio when recording is available
-if audio_bytes:
-    audio_input = process_audio(audio_bytes)
-    if audio_input:
-        try:
-            # Transcribe audio
-            whisper = load_model()
-            transcribed_text = whisper(audio_input)["text"]
-
-            # Update session state
-            st.session_state.combined_text = f"{text_input}\n{transcribed_text}".strip()
-        except Exception as e:
-            st.error(f"Transcription error: {str(e)}")
-
-# Combine inputs when button is clicked
-if st.button("Submit"):
-    if not text_input and not audio_bytes:
-        st.warning("Please enter text or record audio")
-    else:
-        # Display combined text
-        st.subheader("Combined Input:")
-        st.write(st.session_state.combined_text)
+        st.error(f"Error: {str(e)}")
 
-        # Add download button
-        st.download_button(
-            label="Download Text",
-            data=st.session_state.combined_text,
-            file_name="combined_input.txt",
-            mime="text/plain"
-        )
+# Control buttons
+col1, col2 = st.columns(2)
+with col1:
+    if st.button("Clear Text"):
+        st.session_state.text_input = ""
+        st.experimental_rerun()
+with col2:
+    st.download_button(
+        "Download Text",
+        data=st.session_state.text_input,
+        file_name="voice_typed.txt",
+        mime="text/plain"
+    )
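
Two caveats on the new transcription block are worth noting. First, st.experimental_rerun() is deprecated in recent Streamlit releases in favor of st.rerun(). Second, audio_recorder() returns the same bytes on every script rerun, so the committed "if audio_bytes:" block can append the same transcription more than once. Below is a minimal sketch of a guarded variant, assuming Streamlit >= 1.27 and reusing st, process_audio, and load_model from app.py above; the last_audio_digest session key is an illustrative name, not part of the committed code.

    import hashlib

    # Prefer st.rerun (Streamlit >= 1.27); fall back to the deprecated
    # st.experimental_rerun on older versions.
    _rerun = getattr(st, "rerun", None) or st.experimental_rerun

    if audio_bytes:
        # Fingerprint the recording so a rerun with the same audio is a no-op.
        digest = hashlib.sha256(audio_bytes).hexdigest()
        if st.session_state.get("last_audio_digest") != digest:
            try:
                audio_input = process_audio(audio_bytes)
                whisper = load_model()
                transcribed_text = whisper(audio_input)["text"]
                st.session_state.last_audio_digest = digest
                st.session_state.text_input += " " + transcribed_text
                _rerun()
            except Exception as e:
                st.error(f"Error: {str(e)}")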
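
To try the updated Space locally, the imports imply roughly the following setup. This is an untested sketch: the package names are the usual PyPI ones, and torchaudio may additionally need an audio backend such as soundfile to decode the recorded WAV bytes.

    pip install streamlit transformers torch torchaudio numpy audio-recorder-streamlit
    streamlit run app.py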