iisadia committed on
Commit f909060 · verified · 1 Parent(s): dacb1ad

Update app.py

Files changed (1)
  1. app.py +39 -32
app.py CHANGED
@@ -1,83 +1,90 @@
  import streamlit as st
  from transformers import pipeline
- import numpy as np
  import torchaudio
  from audio_recorder_streamlit import audio_recorder
  import torch
  from io import BytesIO
  import hashlib

- # Load Whisper model (cached)
+ # Load Whisper model
  @st.cache_resource
  def load_model():
      return pipeline("automatic-speech-recognition", model="openai/whisper-base")

- # Audio processing function
  def process_audio(audio_bytes):
      waveform, sample_rate = torchaudio.load(BytesIO(audio_bytes))
-     if waveform.shape[0] > 1:  # Convert stereo to mono
+     if waveform.shape[0] > 1:  # Stereo to mono
          waveform = torch.mean(waveform, dim=0, keepdim=True)
-     if sample_rate != 16000:  # Resample to 16kHz if needed
+     if sample_rate != 16000:  # Resample if needed
          resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
          waveform = resampler(waveform)
      return {"raw": waveform.numpy().squeeze(), "sampling_rate": 16000}

- # Streamlit App
+ # App Interface
  st.title("Real-Time Voice Typing")
- st.write("Type or speak - text will appear instantly!")
+ st.write("Speak and your words will appear immediately")

  # Initialize session state
  if 'text_input' not in st.session_state:
      st.session_state.text_input = ""
- if 'last_audio_hash' not in st.session_state:
-     st.session_state.last_audio_hash = ""
+ if 'current_audio' not in st.session_state:
+     st.session_state.current_audio = None
+ if 'is_recording' not in st.session_state:
+     st.session_state.is_recording = False

- # Main text area
+ # Text display
  text_input = st.text_area(
-     "Your text will appear here:",
+     "Dictation Result:",
      value=st.session_state.text_input,
-     height=300,
-     key="text_area"
+     height=300
  )

- # Audio recorder component
- audio_bytes = audio_recorder(
-     pause_threshold=1.5,  # Shorter pause threshold
-     text="Speak to type",
-     recording_color="#e8b62c",
-     neutral_color="#6aa36f",
- )
+ # Audio recorder with callback
+ def handle_recording(audio_bytes):
+     if audio_bytes:
+         st.session_state.current_audio = audio_bytes
+         process_current_audio()

- # Process new audio only if it's different from last time
- if audio_bytes:
-     current_hash = hashlib.md5(audio_bytes).hexdigest()
-     if current_hash != st.session_state.last_audio_hash:
-         st.session_state.last_audio_hash = current_hash
+ def process_current_audio():
+     if st.session_state.current_audio:
          try:
-             audio_input = process_audio(audio_bytes)
+             audio_input = process_audio(st.session_state.current_audio)
              whisper = load_model()
-             transcribed_text = whisper(audio_input)["text"]
+             transcribed_text = whisper(audio_input)["text"].strip()

-             # Append new transcription only if different
-             if (not st.session_state.text_input.endswith(transcribed_text.strip()) and
-                 len(transcribed_text.strip()) > 0):
+             if transcribed_text:
                  st.session_state.text_input += " " + transcribed_text
+                 st.session_state.current_audio = None
                  st.rerun()

          except Exception as e:
              st.error(f"Error: {str(e)}")

+ # Audio recorder component
+ audio_bytes = audio_recorder(
+     pause_threshold=1.0,  # Faster response
+     text="Click to speak",
+     recording_color="#e8b62c",
+     neutral_color="#6aa36f",
+     callback=handle_recording,
+     key="audio_recorder"
+ )
+
+ # Process any pending audio
+ if st.session_state.current_audio:
+     process_current_audio()
+
  # Control buttons
  col1, col2 = st.columns(2)
  with col1:
      if st.button("Clear Text"):
          st.session_state.text_input = ""
-         st.session_state.last_audio_hash = ""
+         st.session_state.current_audio = None
          st.rerun()
  with col2:
      st.download_button(
          "Download Text",
          data=st.session_state.text_input,
-         file_name="voice_typed.txt",
+         file_name="dictation.txt",
          mime="text/plain"
      )
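
Not part of the commit: a minimal sketch for sanity-checking the process_audio helper outside of Streamlit, assuming torchaudio is installed with a WAV-capable backend. The make_test_wav_bytes helper and the 44.1 kHz stereo test signal are illustrative choices, not from the repo.

# Standalone check: synthesize stereo noise at 44.1 kHz, round-trip it
# through an in-memory WAV file, and verify the mono downmix and resample.
from io import BytesIO

import torch
import torchaudio

def process_audio(audio_bytes):
    # Copied unchanged from app.py above so the sketch runs on its own.
    waveform, sample_rate = torchaudio.load(BytesIO(audio_bytes))
    if waveform.shape[0] > 1:  # Stereo to mono
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    if sample_rate != 16000:  # Resample if needed
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
    return {"raw": waveform.numpy().squeeze(), "sampling_rate": 16000}

def make_test_wav_bytes(sample_rate=44100, channels=2, seconds=1.0):
    # Hypothetical test scaffold, not part of the repo.
    frames = int(sample_rate * seconds)
    waveform = torch.rand(channels, frames) * 2 - 1  # float audio in [-1, 1]
    buffer = BytesIO()
    torchaudio.save(buffer, waveform, sample_rate, format="wav")
    return buffer.getvalue()

result = process_audio(make_test_wav_bytes())
assert result["sampling_rate"] == 16000
assert result["raw"].ndim == 1   # mono after the torch.mean downmix
print(len(result["raw"]))        # ~16000 samples for one second of input

If torchaudio.load cannot infer the container from a raw byte stream on a given backend, passing format="wav" to torchaudio.load as well is the usual workaround.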