Update app.py
app.py
CHANGED
@@ -1,81 +1,83 @@
 import streamlit as st
 from transformers import pipeline
+import numpy as np
 import torchaudio
+from audio_recorder_streamlit import audio_recorder
 import torch
 from io import BytesIO
 import hashlib
 
-# Load Whisper model
+# Load Whisper model (cached)
 @st.cache_resource
 def load_model():
     return pipeline("automatic-speech-recognition", model="openai/whisper-base")
 
+# Audio processing function
 def process_audio(audio_bytes):
     waveform, sample_rate = torchaudio.load(BytesIO(audio_bytes))
-    if waveform.shape[0] > 1: #
+    if waveform.shape[0] > 1: # Convert stereo to mono
         waveform = torch.mean(waveform, dim=0, keepdim=True)
-    if sample_rate != 16000: # Resample if needed
+    if sample_rate != 16000: # Resample to 16kHz if needed
         resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
         waveform = resampler(waveform)
     return {"raw": waveform.numpy().squeeze(), "sampling_rate": 16000}
 
-#
-def audio_recorder_component():
-    return st.audio(
-        "microphone",
-        format="audio/wav",
-        start_recording=True,
-        pause_threshold=1.0,
-        sample_rate=16000,
-        key="audio_recorder"
-    )
-
-# App Interface
+# Streamlit App
 st.title("Real-Time Voice Typing")
-st.write("
+st.write("Type or speak - text will appear instantly!")
 
 # Initialize session state
 if 'text_input' not in st.session_state:
     st.session_state.text_input = ""
-if '
-    st.session_state.
+if 'last_audio_hash' not in st.session_state:
+    st.session_state.last_audio_hash = ""
 
-#
+# Main text area
 text_input = st.text_area(
-    "
+    "Your text will appear here:",
     value=st.session_state.text_input,
-    height=300
+    height=300,
+    key="text_area"
 )
 
-# Audio
-audio_bytes =
+# Audio recorder component
+audio_bytes = audio_recorder(
+    pause_threshold=1.5, # Shorter pause threshold
+    text="Speak to type",
+    recording_color="#e8b62c",
+    neutral_color="#6aa36f",
+)
 
-# Process audio
-if audio_bytes
-
-
-
-
-
-
-
-    st.session_state.text_input += " " + transcribed_text
-    st.rerun()
+# Process new audio only if it's different from last time
+if audio_bytes:
+    current_hash = hashlib.md5(audio_bytes).hexdigest()
+    if current_hash != st.session_state.last_audio_hash:
+        st.session_state.last_audio_hash = current_hash
+        try:
+            audio_input = process_audio(audio_bytes)
+            whisper = load_model()
+            transcribed_text = whisper(audio_input)["text"]
 
-
-
+            # Append new transcription only if different
+            if (not st.session_state.text_input.endswith(transcribed_text.strip()) and
+                len(transcribed_text.strip()) > 0):
+                st.session_state.text_input += " " + transcribed_text
+                st.rerun()
+
+        except Exception as e:
+            st.error(f"Error: {str(e)}")
 
 # Control buttons
 col1, col2 = st.columns(2)
 with col1:
     if st.button("Clear Text"):
         st.session_state.text_input = ""
-        st.session_state.
+        st.session_state.last_audio_hash = ""
         st.rerun()
 with col2:
     st.download_button(
         "Download Text",
         data=st.session_state.text_input,
-        file_name="
+        file_name="voice_typed.txt",
         mime="text/plain"
     )
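Note: the removed audio_recorder_component was built on st.audio, which is a playback widget and has no recording parameters such as start_recording or pause_threshold, so the old code could not actually capture microphone input; the replacement uses the audio_recorder component from the third-party audio_recorder_streamlit package. The transcription path itself can be exercised outside Streamlit. A minimal sketch, assuming a local test file at the placeholder path sample.wav (not a file from this Space):

# Standalone sketch of the transcription path in app.py (no Streamlit).
# "sample.wav" is a placeholder path, not part of this Space.
from io import BytesIO

import torch
import torchaudio
from transformers import pipeline

def process_audio(audio_bytes):
    # Same preprocessing as app.py: mono, 16 kHz, raw numpy array.
    waveform, sample_rate = torchaudio.load(BytesIO(audio_bytes))
    if waveform.shape[0] > 1:  # Convert stereo to mono
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    if sample_rate != 16000:  # Resample to 16 kHz for Whisper
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    return {"raw": waveform.numpy().squeeze(), "sampling_rate": 16000}

whisper = pipeline("automatic-speech-recognition", model="openai/whisper-base")
with open("sample.wav", "rb") as f:
    print(whisper(process_audio(f.read()))["text"])

The dict form {"raw": <numpy array>, "sampling_rate": <int>} is the input format the transformers ASR pipeline accepts for already-decoded audio, which is why app.py resamples everything to 16 kHz before calling the model.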
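The md5 guard mirrors the diff's own comment ("Process new audio only if it's different from last time"): Streamlit reruns the whole script on every interaction and the recorder hands back the same bytes on each rerun, so hashing the clip keeps the app from transcribing and appending the same recording repeatedly. To try the Space locally, the imports imply dependencies roughly as follows; the PyPI name audio-recorder-streamlit is inferred from the import statement and is not pinned anywhere in this diff:

pip install streamlit transformers torch torchaudio numpy audio-recorder-streamlit
streamlit run app.py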