File size: 2,691 Bytes
cd7aa15
 
 
6027573
cd7aa15
723513d
6027573
32d1b7b
cd7aa15
15b7647
723513d
15b7647
f6dc6c7
15b7647
 
 
 
723513d
 
15b7647
cd7aa15
6027573
15b7647
6027573
f0a5b40
15b7647
 
 
098a61e
cd7aa15
15b7647
3a9d859
15b7647
e3021fc
 
 
15b7647
f0a5b40
cd7aa15
15b7647
cd7aa15
e1b64e4
f0a5b40
 
 
 
cd7aa15
 
f0a5b40
 
6027573
8d19597
15b7647
6027573
 
15b7647
6027573
 
15b7647
6027573
 
 
 
 
15b7647
e1b64e4
15b7647
 
 
6027573
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import tempfile

import librosa
import numpy as np
import streamlit as st
import torch
import torchaudio
from huggingface_hub import login
from transformers import AutoProcessor, AutoModelForCTC
# ================================
# 1️⃣ Authenticate with Hugging Face Hub (Securely)
# ================================
# Read the access token from the environment. The app's own secret name
# ("hf_token") wins; the upper-case "HF_TOKEN" name is accepted as a fallback.
# The value is stripped so that a secret pasted with a trailing newline or
# made only of whitespace is treated the same as a missing one.
HF_TOKEN = (os.getenv("hf_token") or os.getenv("HF_TOKEN") or "").strip()

# Fail early with an actionable message instead of letting login() reject
# an empty token later.
if not HF_TOKEN:
    raise ValueError("❌ Hugging Face API token not found. Please set it in Secrets.")

login(token=HF_TOKEN)

# ================================
# 2️⃣ Load Conformer Model & Processor
# ================================
MODEL_NAME = "deepl-project/conformer-finetunning"


@st.cache_resource
def _load_asr_components(model_name):
    """Load the processor and CTC model once and reuse them across reruns.

    Streamlit re-executes this script from the top on every widget
    interaction; without caching, the checkpoint would be re-instantiated
    each time a slider moves or a file is uploaded.

    Args:
        model_name: Hub repository id passed to ``from_pretrained``.

    Returns:
        A ``(processor, model, device)`` tuple, where ``device`` is ``"cuda"``
        when ``torch.cuda.is_available()`` is true and ``"cpu"`` otherwise,
        and the model has already been moved to that device.
    """
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    loaded_processor = AutoProcessor.from_pretrained(model_name)
    loaded_model = AutoModelForCTC.from_pretrained(model_name)
    loaded_model.to(target_device)
    # Runs only on a cache miss, so the log line appears once per load.
    print(f"✅ Conformer Model loaded on {target_device}")
    return loaded_processor, loaded_model, target_device


# Module-level names are kept so the rest of the script is unaffected.
processor, model, device = _load_asr_components(MODEL_NAME)

# ================================
# 3️⃣ Streamlit UI: Fine-Tuning Hyperparameter Selection
# ================================
# Sidebar controls. Each widget call returns the value currently selected in
# the UI. NOTE(review): num_epochs, learning_rate and batch_size are not
# referenced anywhere else in the visible script; only attack_strength is
# consumed by the transcription section.
with st.sidebar:
    st.title("🔧 Fine-Tuning Hyperparameters")
    num_epochs = st.slider("Epochs", min_value=1, max_value=10, value=3)
    learning_rate = st.select_slider(
        "Learning Rate",
        options=[5e-4, 1e-4, 5e-5, 1e-5],
        value=5e-5,
    )
    batch_size = st.select_slider(
        "Batch Size",
        options=[2, 4, 8, 16],
        value=8,
    )
    # Scale factor applied to the random noise added to the waveform.
    attack_strength = st.slider(
        "Attack Strength",
        min_value=0.0,
        max_value=0.9,
        value=0.1,
    )

# ================================
# 4️⃣ Streamlit ASR Web App (Fast Decoding & Security Features)
# ================================
st.title("🎙️ Speech-to-Text ASR Conformer Model Finetunned on Libri Speech with Security Features 🎶")

audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "flac"])


def _decode_upload(uploaded, target_sr=16000):
    """Decode an uploaded audio file into a waveform sampled at ``target_sr``.

    The bytes are spooled to a uniquely named temporary file so that
    concurrent sessions cannot overwrite each other's audio, and so the file
    keeps its real extension (wav/mp3/flac). The temporary file is always
    removed, including when decoding fails.

    Args:
        uploaded: The object returned by ``st.file_uploader``; must expose
            ``name`` and ``getvalue()``.
        target_sr: Sampling rate requested from ``librosa.load``.

    Returns:
        The ``(waveform, sampling_rate)`` pair returned by ``librosa.load``.
    """
    suffix = os.path.splitext(uploaded.name)[1] or ".wav"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        # getvalue() returns the whole buffer regardless of the current
        # read position, unlike read().
        tmp.write(uploaded.getvalue())
        tmp_path = tmp.name
    try:
        return librosa.load(tmp_path, sr=target_sr)
    finally:
        os.remove(tmp_path)


if audio_file:
    speech, sr = _decode_upload(audio_file)

    if speech.size == 0:
        # Nothing to transcribe; avoid feeding a zero-length input to the model.
        st.error("❌ The uploaded file contains no audio samples.")
    else:
        # Simulate an adversarial attack by injecting random Gaussian noise.
        # The noise is cast to the waveform's dtype so the sum is not
        # silently promoted to float64.
        noise = np.random.randn(*speech.shape).astype(speech.dtype)
        adversarial_speech = np.clip(speech + attack_strength * noise, -1.0, 1.0)

        inputs = processor(adversarial_speech, sampling_rate=sr, return_tensors="pt", padding=True)
        input_values = inputs.input_values.to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            logits = model(input_values).logits

        # Greedy CTC decoding: take the highest-scoring token at each frame.
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        # The warning is driven purely by the slider value, not by any
        # analysis of the audio itself.
        if attack_strength > 0.2:
            st.warning("⚠️ Adversarial attack detected! Transcription may be affected.")

        st.success("📄 Secure Transcription:")
        st.write(transcription[0])