File size: 4,547 Bytes
cd7aa15 2791b7d cd7aa15 723513d 2791b7d f6dc6c7 f0a5b40 cd7aa15 76c5c38 723513d 76c5c38 f6dc6c7 76c5c38 723513d 2791b7d cd7aa15 76c5c38 2791b7d 76c5c38 2791b7d 76c5c38 f0a5b40 76c5c38 098a61e cd7aa15 2791b7d 3a9d859 e3021fc f6dc6c7 e3021fc 941924a e3021fc 3a9d859 e3021fc f0a5b40 cd7aa15 2791b7d cd7aa15 76c5c38 e3021fc 76c5c38 e3021fc 76c5c38 e3021fc cd7aa15 2791b7d cd7aa15 2791b7d f0a5b40 cd7aa15 f0a5b40 2791b7d e3021fc 76c5c38 e3021fc 2791b7d e3021fc 76c5c38 e3021fc 76c5c38 2791b7d 76c5c38 a4a32f2 2791b7d a4a32f2 2791b7d 76c5c38 2791b7d 76c5c38 e3021fc 76c5c38 e3021fc 76c5c38 e3021fc 2791b7d 76c5c38 e3021fc 76c5c38 e3021fc 76c5c38 e3021fc 2791b7d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
import os
import torch
import torchaudio
import librosa
import streamlit as st
from huggingface_hub import login
from transformers import AutoProcessor, AutoModelForCTC
from cryptography.fernet import Fernet
# ================================
# 1️⃣ Authenticate with Hugging Face Hub (cached to prevent re-authentication)
# ================================
@st.cache_resource
def authenticate_hf():
    """Log in to the Hugging Face Hub using the token from the environment.

    The token is read from the ``hf_token`` environment variable and handed
    to ``huggingface_hub.login``. The function is wrapped in
    ``st.cache_resource`` so that the login is not repeated on every script
    execution.

    Raises:
        ValueError: If ``hf_token`` is unset, empty, or only whitespace.
    """
    # Strip surrounding whitespace: secrets pasted into a web form often carry
    # a trailing newline.
    hf_token = (os.getenv("hf_token") or "").strip()
    if not hf_token:
        # An empty or whitespace-only secret is as unusable as a missing one;
        # reject it here with the same clear message rather than passing it
        # on to login().
        # NOTE(review): the leading glyph of this message looks like a
        # mis-decoded emoji — confirm against the original file.
        raise ValueError("β Hugging Face API token not found. Please set it in Secrets.")
    login(token=hf_token)


authenticate_hf()
# ================================
# 2️⃣ Load Conformer Model & Processor (Cached)
# ================================
@st.cache_resource
def load_model():
    """Load the fine-tuned CTC checkpoint together with its processor.

    Both objects are fetched with ``from_pretrained`` from the same Hub
    repository. The model is moved to CUDA when a GPU is available and to the
    CPU otherwise.

    Returns:
        A ``(processor, model)`` tuple.
    """
    checkpoint = "deepl-project/conformer-finetunning"
    asr_processor = AutoProcessor.from_pretrained(checkpoint)
    asr_model = AutoModelForCTC.from_pretrained(checkpoint)

    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    return asr_processor, asr_model.to(target_device)


processor, model = load_model()
# ================================
# 3️⃣ Streamlit Sidebar for Fine-Tuning & Security
# ================================
# Every widget created inside this context is rendered in the sidebar.
with st.sidebar:
    st.title("π§ Fine-Tuning & Security Settings")

    # Training hyper-parameter controls.
    # NOTE(review): these three values are not referenced anywhere else in
    # the visible script — confirm whether they are still needed.
    num_epochs = st.slider("Epochs", min_value=1, max_value=10, value=3)
    learning_rate = st.select_slider(
        "Learning Rate",
        options=[5e-4, 1e-4, 5e-5, 1e-5],
        value=5e-5,
    )
    batch_size = st.select_slider(
        "Batch Size",
        options=[2, 4, 8, 16],
        value=8,
    )

    # Scale of the random noise added to the uploaded audio.
    attack_strength = st.slider(
        "Adversarial Attack Strength",
        min_value=0.1,
        max_value=0.9,
        value=0.3,
    )

    # Output handling toggles.
    enable_encryption = st.checkbox("π Encrypt Transcription", value=True)
    show_transcription = st.checkbox("π Show Transcription", value=False)
# ================================
# 4️⃣ Encryption Handling (Precomputed Key)
# ================================
# Generate a Fernet key and build the cipher object that encrypt_text() and
# decrypt_text() both use.
# NOTE(review): this is top-level code, so a different key is produced every
# time the script executes and nothing here persists it — confirm that an
# ephemeral key is intended.
encryption_key = Fernet.generate_key()
fernet = Fernet(encryption_key)
def encrypt_text(text):
    """Encrypt a string with the module-level ``fernet`` cipher.

    Args:
        text: Plaintext string; it is encoded to bytes before encryption.

    Returns:
        The token produced by ``fernet.encrypt``.
    """
    plaintext_bytes = text.encode()
    return fernet.encrypt(plaintext_bytes)
def decrypt_text(encrypted_text):
    """Decrypt a token with the module-level ``fernet`` cipher.

    Args:
        encrypted_text: A token previously produced by ``encrypt_text``.

    Returns:
        The decrypted payload decoded back into a string.
    """
    plaintext_bytes = fernet.decrypt(encrypted_text)
    return plaintext_bytes.decode()
# ================================
# 5️⃣ Optimized ASR Web App
# ================================
def _load_uploaded_audio(uploaded_file, target_sr=16000):
    """Decode an uploaded audio file with librosa, resampled to *target_sr*.

    The upload is written to a uniquely named temporary file that keeps the
    original extension and is removed again once decoding has finished, so
    concurrent sessions cannot overwrite each other's audio and nothing is
    left behind on disk.

    Args:
        uploaded_file: The object returned by ``st.file_uploader``.
        target_sr: Sampling rate passed to ``librosa.load``.

    Returns:
        The ``(samples, sampling_rate)`` pair returned by ``librosa.load``.
    """
    import tempfile

    # Keep the real extension so an mp3/flac upload is not labelled ".wav".
    suffix = os.path.splitext(uploaded_file.name)[1] or ".wav"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        # getvalue() returns the whole buffer regardless of the current read
        # position of the upload.
        tmp.write(uploaded_file.getvalue())
        tmp_path = tmp.name
    try:
        return librosa.load(tmp_path, sr=target_sr)
    finally:
        os.remove(tmp_path)


def _add_noise(waveform, strength):
    """Add Gaussian noise scaled by *strength* and clamp the result to [-1, 1]."""
    noise = strength * torch.randn_like(waveform)
    return torch.clamp(waveform + noise, -1.0, 1.0)


def _trim_with_vad(waveform, sample_rate):
    """Run torchaudio's voice-activity detector, keeping the input if nothing is left."""
    trimmed = torchaudio.functional.vad(waveform, sample_rate=sample_rate)
    # When the detector finds no speech it can hand back an empty tensor;
    # fall back to the untrimmed signal instead of feeding the model nothing.
    if trimmed.numel() == 0:
        return waveform
    return trimmed


def _transcribe(waveform, sample_rate):
    """Run the CTC model on *waveform* and return the decoded batch (a list of strings)."""
    # Put the inputs on whatever device the model actually lives on.
    # The processor already returns batched tensors with return_tensors="pt",
    # so no manual unsqueeze is needed.
    inputs = processor(
        waveform.numpy(),
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
    ).to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)


# NOTE(review): several user-facing strings below start with glyphs that look
# like mis-decoded emoji; they are kept exactly as found — confirm against the
# original file.
st.title("ποΈ Speech-to-Text ASR Model using Conformer with Security Features")
audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "flac"])

if audio_file:
    # Load and preprocess the audio file using librosa.
    speech, sr = _load_uploaded_audio(audio_file)
    if speech.size == 0:
        st.error("The uploaded audio file contains no samples.")
        st.stop()

    # ================================
    # ✅ Adversarial noise injection
    # ================================
    # Random noise scaled by the sidebar slider is always added to the input.
    adversarial_waveform = _add_noise(torch.tensor(speech), attack_strength)

    # Voice-activity detection trims quiet audio; it is not a denoiser.
    denoised_waveform = _trim_with_vad(adversarial_waveform, sr)

    # ================================
    # ✅ Transcription with the Conformer model
    # ================================
    transcription = _transcribe(denoised_waveform, sr)

    # The warning depends only on the slider value, not on the audio itself.
    if attack_strength > 0.3:
        st.warning("β οΈ Adversarial attack detected! Denoising applied.")

    # ================================
    # ✅ Encryption handling
    # ================================
    if enable_encryption:
        encrypted_transcription = encrypt_text(transcription[0])
        st.info("π Transcription is encrypted. Enable 'Show Transcription' to view.")
        if show_transcription:
            decrypted_text = decrypt_text(encrypted_transcription)
            st.success("π Secure Transcription:")
            st.write(decrypted_text)
        else:
            st.write("π [Encrypted] Transcription hidden. Enable 'Show Transcription' to view.")
    else:
        st.success("π Transcription:")
        st.write(transcription[0])
|