import streamlit as st
from transformers import pipeline
import numpy as np
import torchaudio
from audio_recorder_streamlit import audio_recorder
import torch
from io import BytesIO
import hashlib

# Whisper speech-recognition pipeline, built once and cached by Streamlit
@st.cache_resource
def load_model():
    """Return the ``openai/whisper-base`` speech-recognition pipeline.

    The function is decorated with ``st.cache_resource``, so the pipeline
    object is meant to be created once and reused rather than rebuilt on
    every script rerun.
    """
    asr_pipeline = pipeline(
        task="automatic-speech-recognition",
        model="openai/whisper-base",
    )
    return asr_pipeline

# Audio processing function
def process_audio(audio_bytes: bytes, target_sample_rate: int = 16000) -> dict:
    """Decode recorded audio into the dict passed to the speech pipeline.

    The audio is decoded with ``torchaudio.load``, mixed down to a single
    channel, and resampled to ``target_sample_rate`` when its native rate
    differs.

    Args:
        audio_bytes: Contents of an encoded audio file, in any format that
            ``torchaudio.load`` can decode from a file-like object.
        target_sample_rate: Sampling rate (Hz) of the returned samples.
            Defaults to 16000, the rate this function previously hard-coded.

    Returns:
        A dict with two keys: ``"raw"``, a 1-D NumPy array of mono samples,
        and ``"sampling_rate"``, equal to ``target_sample_rate``.

    Raises:
        Whatever ``torchaudio.load`` raises when the bytes cannot be decoded.
    """
    waveform, sample_rate = torchaudio.load(BytesIO(audio_bytes))

    # Average the channels so multi-channel recordings become mono.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate,
            new_freq=target_sample_rate,
        )
        waveform = resampler(waveform)

    # squeeze(0) removes only the channel axis. A bare squeeze() would also
    # collapse the time axis of a one-sample clip and return a 0-d array.
    samples = waveform.squeeze(0).numpy()
    return {"raw": samples, "sampling_rate": target_sample_rate}

# Streamlit App
st.title("Real-Time Voice Typing")
st.write("Type or speak - text will appear instantly!")

# Initialize session state.
# "text_input" is also the key of the text area widget below, so it always
# holds the current contents of the box, whether typed or transcribed.
if "text_input" not in st.session_state:
    st.session_state.text_input = ""
if "last_audio_hash" not in st.session_state:
    st.session_state.last_audio_hash = ""


def _append_transcript(existing, transcript):
    """Return ``existing`` with ``transcript`` appended after a single space.

    No separator is inserted when ``existing`` is empty or already ends in
    whitespace, and a blank transcript leaves ``existing`` unchanged.
    """
    transcript = transcript.strip()
    if not transcript:
        return existing
    if not existing or existing[-1].isspace():
        return existing + transcript
    return f"{existing} {transcript}"


def _clear_text():
    """Empty the text box; used as the ``on_click`` callback of "Clear Text".

    ``last_audio_hash`` is deliberately left untouched: the recorder keeps
    returning its last clip on later reruns, and forgetting the hash would
    make that clip be transcribed again into the freshly cleared box.
    """
    st.session_state.text_input = ""


# Reserve the text area's place at the top of the page. The widget itself is
# created further down, after any new audio has been handled, because
# Streamlit does not allow a widget's session-state key to be modified once
# the widget has been instantiated in the current run.
text_area_slot = st.container()

# Audio recorder component
audio_bytes = audio_recorder(
    pause_threshold=0.8,  # Shorter pause threshold
    text="Speak to type",
    recording_color="#e8b62c",
    neutral_color="#6aa36f",
)

# Process new audio only if it's different from last time
if audio_bytes:
    st.info("🎤 Audio received!")
    current_hash = hashlib.md5(audio_bytes).hexdigest()
    if current_hash != st.session_state.last_audio_hash:
        # Record the hash first so a clip that fails is not retried on
        # every subsequent rerun.
        st.session_state.last_audio_hash = current_hash
        try:
            audio_input = process_audio(audio_bytes)
            whisper = load_model()
            transcribed_text = whisper(audio_input)["text"].strip()
        except Exception as e:
            # Top-level UI boundary: show the failure instead of crashing.
            st.error(f"Error: {str(e)}")
        else:
            st.info(f"📝 Transcribed: {transcribed_text}")
            # The hash check above already guarantees each clip is handled
            # once, so the transcript is appended unconditionally; this
            # lets the user dictate the same phrase twice in a row.
            if transcribed_text:
                st.session_state.text_input = _append_transcript(
                    st.session_state.text_input, transcribed_text
                )
                st.success("✅ Text added to box!")

# Main text area. It is bound to st.session_state.text_input through its key,
# so typed edits and appended transcripts share a single source of truth.
with text_area_slot:
    st.text_area(
        "Your text will appear here:",
        height=300,
        key="text_input",
    )

# Control buttons
col1, col2 = st.columns(2)
with col1:
    # A callback runs before the next script run starts, which is when the
    # widget-bound state may still be changed.
    st.button("Clear Text", on_click=_clear_text)
with col2:
    st.download_button(
        "Download Text",
        data=st.session_state.text_input,
        file_name="voice_typed.txt",
        mime="text/plain",
    )