Update app.py
app.py CHANGED
@@ -1,83 +1,90 @@
 import streamlit as st
 from transformers import pipeline
-import numpy as np
 import torchaudio
 from audio_recorder_streamlit import audio_recorder
 import torch
 from io import BytesIO
 import hashlib
 
 # Load Whisper model
 @st.cache_resource
 def load_model():
     return pipeline("automatic-speech-recognition", model="openai/whisper-base")
 
-# Audio processing function
 def process_audio(audio_bytes):
     waveform, sample_rate = torchaudio.load(BytesIO(audio_bytes))
-    if waveform.shape[0] > 1:  #
+    if waveform.shape[0] > 1:  # Stereo to mono
         waveform = torch.mean(waveform, dim=0, keepdim=True)
-    if sample_rate != 16000:  # Resample
+    if sample_rate != 16000:  # Resample if needed
         resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
         waveform = resampler(waveform)
     return {"raw": waveform.numpy().squeeze(), "sampling_rate": 16000}
 
-#
+# App Interface
 st.title("Real-Time Voice Typing")
-st.write("
+st.write("Speak and your words will appear immediately")
 
 # Initialize session state
 if 'text_input' not in st.session_state:
     st.session_state.text_input = ""
-if 'last_audio_hash' not in st.session_state:
-    st.session_state.last_audio_hash = None
+if 'current_audio' not in st.session_state:
+    st.session_state.current_audio = None
+if 'is_recording' not in st.session_state:
+    st.session_state.is_recording = False
 
-#
+# Text display
 text_input = st.text_area(
-    "
+    "Dictation Result:",
     value=st.session_state.text_input,
-    height=300,
-    key="text_area"
+    height=300
 )
 
-# Audio recorder
-audio_bytes = audio_recorder(
-    …
-    neutral_color="#6aa36f",
-)
+# Audio recorder with callback
+def handle_recording(audio_bytes):
+    if audio_bytes:
+        st.session_state.current_audio = audio_bytes
+        process_current_audio()
 
-if audio_bytes:
-    current_hash = hashlib.md5(audio_bytes).hexdigest()
-    if current_hash != st.session_state.last_audio_hash:
-        st.session_state.last_audio_hash = current_hash
+def process_current_audio():
+    if st.session_state.current_audio:
         try:
-            audio_input = process_audio(audio_bytes)
+            audio_input = process_audio(st.session_state.current_audio)
             whisper = load_model()
-            transcribed_text = whisper(audio_input)["text"]
+            transcribed_text = whisper(audio_input)["text"].strip()
 
-            if (not st.session_state.text_input.endswith(transcribed_text.strip()) and
-                    len(transcribed_text.strip()) > 0):
+            if transcribed_text:
                 st.session_state.text_input += " " + transcribed_text
+            st.session_state.current_audio = None
             st.rerun()
 
         except Exception as e:
             st.error(f"Error: {str(e)}")
 
+# Audio recorder component
+audio_bytes = audio_recorder(
+    pause_threshold=1.0,  # Faster response
+    text="Click to speak",
+    recording_color="#e8b62c",
+    neutral_color="#6aa36f",
+    callback=handle_recording,
+    key="audio_recorder"
+)
+
+# Process any pending audio
+if st.session_state.current_audio:
+    process_current_audio()
+
 # Control buttons
 col1, col2 = st.columns(2)
 with col1:
     if st.button("Clear Text"):
         st.session_state.text_input = ""
-        st.session_state.last_audio_hash = None
+        st.session_state.current_audio = None
         st.rerun()
 with col2:
     st.download_button(
         "Download Text",
         data=st.session_state.text_input,
-        file_name="
+        file_name="dictation.txt",
         mime="text/plain"
     )
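A note on the new recording flow: audio_recorder_streamlit's documented usage is return-value based (audio_bytes = audio_recorder(...) yields the recorded WAV bytes), and a callback keyword does not appear in its documented signature, so handle_recording may never fire and the unknown kwarg could raise a TypeError. Below is a minimal sketch of the same flow driven by the return value instead. The recorder arguments are copied from the diff above; "last_audio" is a hypothetical session key standing in for the md5-hash dedup the previous version used.

# Drop-in replacement for the recorder block in app.py above;
# process_current_audio() is the function defined earlier in the file.
audio_bytes = audio_recorder(
    pause_threshold=1.0,
    text="Click to speak",
    recording_color="#e8b62c",
    neutral_color="#6aa36f",
    key="audio_recorder",
)

# The component returns the same bytes again on every Streamlit rerun,
# so guard against re-transcribing the same clip (the job the md5 hash
# did in the previous version). "last_audio" is a hypothetical key.
if audio_bytes and audio_bytes != st.session_state.get("last_audio"):
    st.session_state.last_audio = audio_bytes
    st.session_state.current_audio = audio_bytes
    process_current_audio()

Separately, the is_recording flag introduced in this commit is initialized but never read, so it could be dropped without changing behavior.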