Spaces:

tahirsher
/

ASR_Model_for_Transcription_into_Text

Sleeping

File size: 4,547 Bytes

cd7aa15
 
 
2791b7d
cd7aa15
723513d
2791b7d
f6dc6c7
f0a5b40
cd7aa15
76c5c38
723513d
76c5c38
 
 
 
 
 
f6dc6c7
76c5c38
723513d
 
2791b7d
cd7aa15
76c5c38
 
2791b7d
76c5c38
2791b7d
76c5c38
f0a5b40
76c5c38
098a61e
cd7aa15
2791b7d
3a9d859
e3021fc
f6dc6c7
e3021fc
 
 
941924a
e3021fc
3a9d859
e3021fc
 
f0a5b40
cd7aa15
2791b7d
cd7aa15
76c5c38
 
e3021fc
76c5c38
e3021fc
 
76c5c38
e3021fc
 
cd7aa15
2791b7d
cd7aa15
2791b7d
f0a5b40
 
 
 
cd7aa15
 
f0a5b40
 
2791b7d
 
 
e3021fc
76c5c38
e3021fc
2791b7d
 
e3021fc
76c5c38
 
e3021fc
 
76c5c38
2791b7d
76c5c38
a4a32f2
2791b7d
 
a4a32f2
 
 
 
2791b7d
 
76c5c38
2791b7d
 
76c5c38
e3021fc
76c5c38
e3021fc
 
76c5c38
e3021fc
 
2791b7d
76c5c38
e3021fc
 
76c5c38
e3021fc
 
 
76c5c38
e3021fc
 
2791b7d

import os
import tempfile

import torch
import torchaudio
import librosa
import streamlit as st
from huggingface_hub import login
from transformers import AutoProcessor, AutoModelForCTC
from cryptography.fernet import Fernet

# ================================
# 1️⃣ Authenticate with Hugging Face Hub (Cache to prevent re-authentication)
# ================================
@st.cache_resource
def authenticate_hf():
    """Log in to the Hugging Face Hub with the token from the environment.

    The token is read from the ``hf_token`` environment variable and handed
    to ``huggingface_hub.login``. The function is wrapped in
    ``st.cache_resource`` so the login is not repeated on every script run.

    Raises:
        ValueError: If ``hf_token`` is unset, empty, or only whitespace.
    """
    hf_token = (os.getenv("hf_token") or "").strip()
    # An empty or whitespace-only secret is as unusable as a missing one;
    # reject it here so the user sees the configuration hint rather than an
    # opaque failure from inside the login call.
    if not hf_token:
        raise ValueError("❌ Hugging Face API token not found. Please set it in Secrets.")
    login(token=hf_token)


authenticate_hf()

# ================================
# 2️⃣ Load Conformer Model & Processor (Cached)
# ================================
@st.cache_resource
def load_model():
    """Load the CTC model and its processor from the Hugging Face Hub.

    The model is moved to CUDA when ``torch.cuda.is_available()`` reports a
    GPU, otherwise it stays on the CPU. The result is cached through
    ``st.cache_resource``.

    Returns:
        tuple: ``(processor, model)`` for the configured checkpoint.
    """
    checkpoint = "deepl-project/conformer-finetunning"

    asr_processor = AutoProcessor.from_pretrained(checkpoint)
    asr_model = AutoModelForCTC.from_pretrained(checkpoint)

    # Pick the target device only after the weights are loaded.
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    return asr_processor, asr_model.to(target_device)


processor, model = load_model()

# ================================
# 3️⃣ Streamlit Sidebar for Fine-Tuning & Security
# ================================
st.sidebar.title("🔧 Fine-Tuning & Security Settings")

# Training-style hyper-parameters selected by the user in the sidebar.
num_epochs = st.sidebar.slider(
    "Epochs",
    min_value=1,
    max_value=10,
    value=3,
)
learning_rate = st.sidebar.select_slider(
    "Learning Rate",
    options=[5e-4, 1e-4, 5e-5, 1e-5],
    value=5e-5,
)
batch_size = st.sidebar.select_slider(
    "Batch Size",
    options=[2, 4, 8, 16],
    value=8,
)

# Scale factor for the random noise added to the uploaded waveform
# (range 0.1-0.9, default 0.3).
attack_strength = st.sidebar.slider(
    "Adversarial Attack Strength",
    min_value=0.1,
    max_value=0.9,
    value=0.3,
)

# Output toggles: encrypt the transcription, and whether to display it.
enable_encryption = st.sidebar.checkbox("🔒 Encrypt Transcription", value=True)
show_transcription = st.sidebar.checkbox("📖 Show Transcription", value=False)

# ================================
# 4️⃣ Encryption Handling (fresh key generated on every script run)
# ================================
# Generate a new random Fernet key and build the cipher object used by
# encrypt_text() and decrypt_text(). This runs at module level, so a new key
# is created each time the script executes; a token encrypted under an
# earlier key cannot be decrypted with the current `fernet` instance.
encryption_key = Fernet.generate_key()
fernet = Fernet(encryption_key)

def encrypt_text(text):
    """Encrypt a string with the module-level ``fernet`` cipher.

    Args:
        text: The string to protect; it is converted to bytes with
            ``str.encode()`` before encryption.

    Returns:
        The token produced by ``fernet.encrypt``.
    """
    plaintext_bytes = text.encode()
    token = fernet.encrypt(plaintext_bytes)
    return token

def decrypt_text(encrypted_text):
    """Decrypt a token with the module-level ``fernet`` cipher.

    Args:
        encrypted_text: A token previously produced by ``encrypt_text``
            under the same key.

    Returns:
        The decrypted payload decoded back to a string with
        ``bytes.decode()``.
    """
    plaintext_bytes = fernet.decrypt(encrypted_text)
    recovered = plaintext_bytes.decode()
    return recovered

# ================================
# 5️⃣ Optimized ASR Web App
# ================================
st.title("🎙️ Speech-to-Text ASR Model using Conformer with Security Features")

audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "flac"])

if audio_file:
    # Persist the upload to a uniquely named temporary file. A fixed path
    # would be shared by concurrent sessions (one upload could overwrite
    # another) and would label mp3/flac data as ".wav". The file is removed
    # as soon as it has been decoded.
    upload_suffix = os.path.splitext(audio_file.name)[1] or ".wav"
    with tempfile.NamedTemporaryFile(suffix=upload_suffix, delete=False) as tmp_file:
        tmp_file.write(audio_file.read())
        audio_path = tmp_file.name

    try:
        # Load the audio with librosa at a 16 kHz sampling rate.
        speech, sr = librosa.load(audio_path, sr=16000)
    finally:
        os.remove(audio_path)

    if speech.size == 0:
        # Nothing to transcribe; stop this run instead of failing deeper in
        # the signal-processing or model code.
        st.error("❌ The uploaded audio file contains no samples.")
        st.stop()

    # ================================
    # ✅ Simulated adversarial perturbation
    # ================================
    # Add Gaussian noise scaled by the sidebar setting, then clamp back to
    # the [-1, 1] range.
    clean_waveform = torch.tensor(speech)
    noise = attack_strength * torch.randn_like(clean_waveform)
    adversarial_waveform = torch.clamp(clean_waveform + noise, -1.0, 1.0)

    # Voice-activity detection pass over the perturbed signal.
    # NOTE(review): torchaudio's `vad` is documented as trimming
    # silence/quiet background from the front of a recording, not as a
    # general denoiser — confirm this is the intended preprocessing.
    denoised_waveform = torchaudio.functional.vad(adversarial_waveform, sample_rate=sr)
    if denoised_waveform.numel() == 0:
        # VAD found no speech and trimmed everything; fall back to the
        # untrimmed signal so the model still receives a non-empty input.
        denoised_waveform = adversarial_waveform

    # ================================
    # ✅ Transcription with the CTC model
    # ================================
    # Build model inputs and place them on the same device as the model.
    inputs = processor(
        denoised_waveform.numpy(),
        sampling_rate=sr,
        return_tensors="pt",
        padding=True,
    ).to(model.device)

    # Make sure the input has a batch dimension (even for one example).
    # Assign through the mapping interface so the change is what
    # `model(**inputs)` actually receives.
    if inputs["input_values"].dim() == 1:
        inputs["input_values"] = inputs["input_values"].unsqueeze(0)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy decoding: pick the highest-scoring token id at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)

    # This warning is driven purely by the slider value; no detection is
    # performed on the audio itself.
    if attack_strength > 0.3:
        st.warning("⚠️ Adversarial attack detected! Denoising applied.")

    # ================================
    # ✅ Encryption handling
    # ================================
    if enable_encryption:
        encrypted_transcription = encrypt_text(transcription[0])
        st.info("🔒 Transcription is encrypted. Enable 'Show Transcription' to view.")

        if show_transcription:
            decrypted_text = decrypt_text(encrypted_transcription)
            st.success("📄 Secure Transcription:")
            st.write(decrypted_text)
        else:
            st.write("🔒 [Encrypted] Transcription hidden. Enable 'Show Transcription' to view.")
    else:
        st.success("📄 Transcription:")
        st.write(transcription[0])