iisadia committed on
Commit 656e90b · verified · 1 Parent(s): c5e11de

Update app.py

Files changed (1): app.py (+40 -38)
app.py CHANGED
@@ -1,81 +1,83 @@
  import streamlit as st
  from transformers import pipeline
+ import numpy as np
  import torchaudio
+ from audio_recorder_streamlit import audio_recorder
  import torch
  from io import BytesIO
  import hashlib

- # Load Whisper model
+ # Load Whisper model (cached)
  @st.cache_resource
  def load_model():
      return pipeline("automatic-speech-recognition", model="openai/whisper-base")

+ # Audio processing function
  def process_audio(audio_bytes):
      waveform, sample_rate = torchaudio.load(BytesIO(audio_bytes))
-     if waveform.shape[0] > 1:  # Stereo to mono
+     if waveform.shape[0] > 1:  # Convert stereo to mono
          waveform = torch.mean(waveform, dim=0, keepdim=True)
-     if sample_rate != 16000:  # Resample if needed
+     if sample_rate != 16000:  # Resample to 16kHz if needed
          resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
          waveform = resampler(waveform)
      return {"raw": waveform.numpy().squeeze(), "sampling_rate": 16000}

- # Custom audio recorder component
- def audio_recorder_component():
-     return st.audio(
-         "microphone",
-         format="audio/wav",
-         start_recording=True,
-         pause_threshold=1.0,
-         sample_rate=16000,
-         key="audio_recorder"
-     )
-
- # App Interface
+ # Streamlit App
  st.title("Real-Time Voice Typing")
- st.write("Speak and your words will appear immediately")
+ st.write("Type or speak - text will appear instantly!")

  # Initialize session state
  if 'text_input' not in st.session_state:
      st.session_state.text_input = ""
- if 'last_audio' not in st.session_state:
-     st.session_state.last_audio = None
+ if 'last_audio_hash' not in st.session_state:
+     st.session_state.last_audio_hash = ""

- # Text display
+ # Main text area
  text_input = st.text_area(
-     "Dictation Result:",
+     "Your text will appear here:",
      value=st.session_state.text_input,
-     height=300
+     height=300,
+     key="text_area"
  )

- # Audio recording
- audio_bytes = audio_recorder_component()
+ # Audio recorder component
+ audio_bytes = audio_recorder(
+     pause_threshold=1.5,  # Shorter pause threshold
+     text="Speak to type",
+     recording_color="#e8b62c",
+     neutral_color="#6aa36f",
+ )

- # Process audio when available
- if audio_bytes and audio_bytes != st.session_state.last_audio:
-     st.session_state.last_audio = audio_bytes
-     try:
-         audio_input = process_audio(audio_bytes)
-         whisper = load_model()
-         transcribed_text = whisper(audio_input)["text"].strip()
-
-         if transcribed_text:
-             st.session_state.text_input += " " + transcribed_text
-             st.rerun()
-
-     except Exception as e:
-         st.error(f"Error: {str(e)}")
+ # Process new audio only if it's different from last time
+ if audio_bytes:
+     current_hash = hashlib.md5(audio_bytes).hexdigest()
+     if current_hash != st.session_state.last_audio_hash:
+         st.session_state.last_audio_hash = current_hash
+         try:
+             audio_input = process_audio(audio_bytes)
+             whisper = load_model()
+             transcribed_text = whisper(audio_input)["text"]
+
+             # Append new transcription only if different
+             if (not st.session_state.text_input.endswith(transcribed_text.strip()) and
+                 len(transcribed_text.strip()) > 0):
+                 st.session_state.text_input += " " + transcribed_text
+                 st.rerun()
+
+         except Exception as e:
+             st.error(f"Error: {str(e)}")

  # Control buttons
  col1, col2 = st.columns(2)
  with col1:
      if st.button("Clear Text"):
          st.session_state.text_input = ""
-         st.session_state.last_audio = None
+         st.session_state.last_audio_hash = ""
          st.rerun()
  with col2:
      st.download_button(
          "Download Text",
          data=st.session_state.text_input,
-         file_name="dictation.txt",
+         file_name="voice_typed.txt",
          mime="text/plain"
      )
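
The substantive fix in this commit is the recorder itself: the removed audio_recorder_component() passed recording options (start_recording, pause_threshold, sample_rate) to st.audio, which in Streamlit's API is a playback-only widget, so the old code could never capture microphone input. The replacement pulls in the community audio-recorder-streamlit component, and the raw-bytes comparison in session state gives way to an MD5 digest, so each rerun compares a 32-character hash instead of keeping the whole WAV buffer around. A minimal, self-contained sketch of that flow follows; the pip package names are inferred from the imports, not stated in the commit:

# Minimal local check of the new recording path, assuming these packages
# (names inferred from the imports above, not confirmed by the commit):
#   pip install streamlit transformers torch torchaudio audio-recorder-streamlit
import hashlib

import streamlit as st
from audio_recorder_streamlit import audio_recorder

# The component returns WAV bytes once a recording finishes, None before that.
audio_bytes = audio_recorder(text="Speak to type", pause_threshold=1.5)

if audio_bytes:
    # Same dedup idea as the commit: hash the clip so Streamlit's reruns
    # do not re-transcribe identical audio.
    digest = hashlib.md5(audio_bytes).hexdigest()
    if digest != st.session_state.get("last_audio_hash", ""):
        st.session_state["last_audio_hash"] = digest
        st.audio(audio_bytes, format="audio/wav")  # st.audio only plays audio back

The endswith() guard in the new processing block serves the same purpose one level up: if the component re-emits the last clip, the transcription that already ends the text area is not appended a second time.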