|
import os
import tempfile

import streamlit as st
import torch
import torchaudio
from cryptography.fernet import Fernet
from huggingface_hub import login
from transformers import AutoProcessor, AutoModelForCTC
|
|
|
|
|
|
|
|
|
@st.cache_resource
def authenticate_hf():
    """Log in to the Hugging Face Hub with the token from the environment.

    The token is read from the ``hf_token`` environment variable and handed
    to ``huggingface_hub.login``.

    Raises:
        ValueError: If the ``hf_token`` environment variable is not set.
    """
    token = os.environ.get("hf_token")
    if token is not None:
        login(token=token)
    else:
        raise ValueError("β Hugging Face API token not found. Please set it in Secrets.")


# Authenticate before any model artefacts are requested from the Hub.
authenticate_hf()
|
|
|
|
|
|
|
|
|
@st.cache_resource
def load_model():
    """Load the ASR processor and CTC model from the Hugging Face Hub.

    The model is moved to CUDA when ``torch.cuda.is_available()`` reports a
    device, otherwise it stays on the CPU.

    Returns:
        A ``(processor, model)`` tuple.
    """
    checkpoint = "deepl-project/conformer-finetunning"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    asr_processor = AutoProcessor.from_pretrained(checkpoint)
    asr_model = AutoModelForCTC.from_pretrained(checkpoint)
    return asr_processor, asr_model.to(device)


# Module-level handles used by the inference code further down the script.
processor, model = load_model()
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Sidebar controls
# ---------------------------------------------------------------------------
_sidebar = st.sidebar
_sidebar.title("π§ Fine-Tuning & Security Settings")

# Training hyper-parameter widgets. Nothing in the visible script consumes
# these three values after they are read.
num_epochs = _sidebar.slider(
    "Epochs",
    min_value=1,
    max_value=10,
    value=3,
)
learning_rate = _sidebar.select_slider(
    "Learning Rate",
    options=[5e-4, 1e-4, 5e-5, 1e-5],
    value=5e-5,
)
batch_size = _sidebar.select_slider(
    "Batch Size",
    options=[2, 4, 8, 16],
    value=8,
)

# Scale factor for the random noise that is added to the uploaded waveform.
attack_strength = _sidebar.slider(
    "Adversarial Attack Strength",
    min_value=0.1,
    max_value=0.9,
    value=0.3,
)

# Output handling toggles.
enable_encryption = _sidebar.checkbox("π Encrypt Transcription", value=True)
show_transcription = _sidebar.checkbox("π Show Transcription", value=False)
|
|
|
|
|
|
|
|
|
# A fresh Fernet key is generated whenever this statement executes; the
# visible code does not persist it anywhere, so ciphertexts are only
# decryptable by the same `fernet` object that produced them.
encryption_key = Fernet.generate_key()
fernet = Fernet(encryption_key)


def encrypt_text(text: str) -> bytes:
    """Encrypt *text* with the module-level Fernet instance.

    Args:
        text: Plain-text string; it is encoded with ``str.encode()`` defaults
            before encryption.

    Returns:
        The Fernet token produced by ``fernet.encrypt``.
    """
    plaintext_bytes = text.encode()
    return fernet.encrypt(plaintext_bytes)


def decrypt_text(encrypted_text: bytes) -> str:
    """Decrypt a token produced by :func:`encrypt_text` back into a string.

    Args:
        encrypted_text: Fernet token to decrypt.

    Returns:
        The decrypted payload decoded with ``bytes.decode()`` defaults.
    """
    plaintext_bytes = fernet.decrypt(encrypted_text)
    return plaintext_bytes.decode()
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Main page: upload audio -> perturb -> transcribe -> (optionally) encrypt
# ---------------------------------------------------------------------------
st.title("ποΈ Speech-to-Text ASR Model using Conformer with Security Features")

audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "flac"])

if audio_file:
    # Persist the upload to a *unique* temporary file that keeps the real
    # extension. A fixed "temp_audio.wav" in the working directory would be
    # shared by concurrent sessions, would mislabel mp3/flac uploads, and
    # would never be cleaned up.
    suffix = os.path.splitext(audio_file.name)[1] or ".wav"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(audio_file.read())
        audio_path = tmp.name

    try:
        waveform, sample_rate = torchaudio.load(audio_path)
    finally:
        # The decoded tensor is all we need; always remove the temp file.
        os.remove(audio_path)

    # Downmix multi-channel audio to mono so that exactly one utterance is
    # handed to the processor (a (channels, T) array is not a single
    # waveform).
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Bring the audio to the 16 kHz rate that is passed to the processor below.
    waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    waveform = waveform.to(dtype=torch.float32)

    # Simulated adversarial perturbation: Gaussian noise scaled by the
    # sidebar slider, clamped back into [-1, 1].
    noise = attack_strength * torch.randn_like(waveform)
    adversarial_waveform = torch.clamp(waveform + noise, -1.0, 1.0)

    # squeeze(0) drops only the channel axis; a bare squeeze() would also
    # collapse a one-sample clip to a scalar.
    inputs = processor(
        adversarial_waveform.squeeze(0).numpy(),
        sampling_rate=16000,
        return_tensors="pt",
        padding=True,
    )

    # NOTE(review): this assumes the checkpoint's processor emits
    # "input_features"; some CTC processors emit "input_values" instead.
    # Confirm against the deployed model.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    input_features = inputs["input_features"].to(device)

    # Greedy CTC decoding: arg-max over the vocabulary axis, then decode.
    with torch.no_grad():
        logits = model(input_features).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)

    # NOTE(review): the message says denoising was applied, but no denoising
    # step exists in this script; the warning is driven by the slider alone.
    if attack_strength > 0.3:
        st.warning("β οΈ Adversarial attack detected! Denoising applied.")

    if enable_encryption:
        encrypted_transcription = encrypt_text(transcription[0])
        st.info("π Transcription is encrypted. Enable 'Show Transcription' to view.")

        if show_transcription:
            decrypted_text = decrypt_text(encrypted_transcription)
            st.success("π Secure Transcription:")
            st.write(decrypted_text)
        else:
            st.write("π [Encrypted] Transcription hidden. Enable 'Show Transcription' to view.")
    else:
        st.success("π Transcription:")
        st.write(transcription[0])
|
|