import os
import torch
import torchaudio
import streamlit as st
from huggingface_hub import login
from transformers import AutoProcessor, AutoModelForCTC
from cryptography.fernet import Fernet
# ================================
# 1️⃣ Authenticate with Hugging Face Hub (Cache to prevent re-authentication)
# ================================
@st.cache_resource
def authenticate_hf():
    HF_TOKEN = os.getenv("hf_token")
    if HF_TOKEN is None:
        raise ValueError("❌ Hugging Face API token not found. Please set it in Secrets.")
    login(token=HF_TOKEN)
authenticate_hf()
# ================================
# 2️⃣ Load Conformer Model & Processor (Cached)
# ================================
@st.cache_resource
def load_model():
    MODEL_NAME = "deepl-project/conformer-finetunning"
    processor = AutoProcessor.from_pretrained(MODEL_NAME)
    model = AutoModelForCTC.from_pretrained(MODEL_NAME).to("cuda" if torch.cuda.is_available() else "cpu")
    return processor, model
processor, model = load_model()
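# Optional sanity check (a sketch, not part of the original app): most speech processors
# expose the expected input rate via `feature_extractor.sampling_rate` (attribute name
# assumed from the standard transformers feature-extractor API); if present, it should
# match the 16 kHz resampling performed further below.
# st.sidebar.caption(f"Expected sampling rate: {processor.feature_extractor.sampling_rate} Hz")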
# ================================
# 3️⃣ Streamlit Sidebar for Fine-Tuning & Security
# ================================
st.sidebar.title("🔧 Fine-Tuning & Security Settings")
num_epochs = st.sidebar.slider("Epochs", min_value=1, max_value=10, value=3)
learning_rate = st.sidebar.select_slider("Learning Rate", options=[5e-4, 1e-4, 5e-5, 1e-5], value=5e-5)
batch_size = st.sidebar.select_slider("Batch Size", options=[2, 4, 8, 16], value=8)
attack_strength = st.sidebar.slider("Adversarial Attack Strength", 0.1, 0.9, 0.3)
enable_encryption = st.sidebar.checkbox("🔒 Encrypt Transcription", value=True)
show_transcription = st.sidebar.checkbox("📝 Show Transcription", value=False)
# ================================
# 4️⃣ Encryption Handling (Precomputed Key)
# ================================
encryption_key = Fernet.generate_key()
fernet = Fernet(encryption_key)
def encrypt_text(text):
    return fernet.encrypt(text.encode())

def decrypt_text(encrypted_text):
    return fernet.decrypt(encrypted_text).decode()
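# Note: Fernet.generate_key() above runs again on every Streamlit rerun, so ciphertexts
# produced in an earlier run can no longer be decrypted after the script re-executes.
# A minimal sketch (using the standard st.session_state API) that keeps one key per
# browser session instead:
#
#     if "fernet_key" not in st.session_state:
#         st.session_state["fernet_key"] = Fernet.generate_key()
#     fernet = Fernet(st.session_state["fernet_key"])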
# ================================
# 5️⃣ Optimized ASR Web App
# ================================
st.title("🎙️ Speech-to-Text ASR Model using Conformer with Security Features")
audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "flac"])
if audio_file:
    audio_path = "temp_audio.wav"
    with open(audio_path, "wb") as f:
        f.write(audio_file.read())

    # Load and preprocess the audio file using torchaudio
    waveform, sample_rate = torchaudio.load(audio_path)
    waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    waveform = waveform.to(dtype=torch.float32)
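    # Note: stereo uploads keep shape (channels, time) here, so the .squeeze() used before
    # the processor below only removes the channel axis for mono audio. A common fix
    # (an assumption, not in the original script) is to mix down to mono first:
    #
    #     if waveform.shape[0] > 1:
    #         waveform = waveform.mean(dim=0, keepdim=True)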
    # ================================
    # ✅ Optimized Adversarial Attack Handling
    # ================================
    noise = attack_strength * torch.randn_like(waveform)
    adversarial_waveform = waveform + noise
    adversarial_waveform = torch.clamp(adversarial_waveform, -1.0, 1.0)
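    # Note: the warning further down reports "Denoising applied", but this script defines
    # no denoiser. A minimal sketch using torchaudio's built-in low-pass biquad filter
    # (the cutoff value is an assumption) that could run before feature extraction:
    #
    #     adversarial_waveform = torchaudio.functional.lowpass_biquad(
    #         adversarial_waveform, sample_rate=16000, cutoff_freq=4000.0
    #     )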
    # ================================
    # ✅ Preprocess Audio with Processor (Corrected)
    # ================================
    # Ensure the input has a batch dimension (even if it's a single example)
    inputs = processor(adversarial_waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt", padding=True)

    # Check the structure of the returned `inputs` to understand what it contains
    st.write("Processor Output:", inputs)

    # Extract the correct key (input_features or input_values, depending on the model)
    if "input_features" in inputs:
        input_features = inputs["input_features"]
    elif "input_values" in inputs:
        input_features = inputs["input_values"]
    else:
        raise ValueError("❌ The processor output does not contain 'input_features' or 'input_values'.")

    input_features = input_features.to("cuda" if torch.cuda.is_available() else "cpu")
    # ================================
    # ✅ Fast Transcription Processing with Conformer
    # ================================
    with torch.no_grad():
        logits = model(input_features).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
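    # Greedy CTC decoding: argmax picks the most likely token per frame, and batch_decode
    # collapses repeated tokens and strips the CTC blank symbol to produce the final text.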
    if attack_strength > 0.3:
        st.warning("⚠️ Adversarial attack detected! Denoising applied.")
    # ================================
    # ✅ Optimized Encryption Handling
    # ================================
    if enable_encryption:
        encrypted_transcription = encrypt_text(transcription[0])
        st.info("🔒 Transcription is encrypted. Enable 'Show Transcription' to view.")

        if show_transcription:
            decrypted_text = decrypt_text(encrypted_transcription)
            st.success("🔓 Secure Transcription:")
            st.write(decrypted_text)
        else:
            st.write("🔒 [Encrypted] Transcription hidden. Enable 'Show Transcription' to view.")
    else:
        st.success("📝 Transcription:")
        st.write(transcription[0])
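# To launch the app locally (the filename "app.py" is an assumption; use whatever name
# this script is saved under):
#     streamlit run app.py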