Spaces:

tahirsher
/

ASR_Model_for_Transcription_into_Text

Sleeping

App Files Files Community

ASR_Model_for_Transcription_into_Text / app.py

tahirsher

Update app.py

a4a32f2 verified 2 months ago

raw

history blame

4.55 kB

	import os
	import torch
	import torchaudio
	import librosa
	import streamlit as st
	from huggingface_hub import login
	from transformers import AutoProcessor, AutoModelForCTC
	from cryptography.fernet import Fernet

	# ================================
	# 1️⃣ Authenticate with Hugging Face Hub (Cache to prevent re-authentication)
	# ================================
	@st.cache_resource
	def authenticate_hf():
	HF_TOKEN = os.getenv("hf_token")
	if HF_TOKEN is None:
	raise ValueError("❌ Hugging Face API token not found. Please set it in Secrets.")
	login(token=HF_TOKEN)

	authenticate_hf()

	# ================================
	# 2️⃣ Load Conformer Model & Processor (Cached)
	# ================================
	@st.cache_resource
	def load_model():
	MODEL_NAME = "deepl-project/conformer-finetunning"
	processor = AutoProcessor.from_pretrained(MODEL_NAME)
	model = AutoModelForCTC.from_pretrained(MODEL_NAME).to("cuda" if torch.cuda.is_available() else "cpu")
	return processor, model

	processor, model = load_model()

	# ================================
	# 3️⃣ Streamlit Sidebar for Fine-Tuning & Security
	# ================================
	st.sidebar.title("🔧 Fine-Tuning & Security Settings")

	num_epochs = st.sidebar.slider("Epochs", min_value=1, max_value=10, value=3)
	learning_rate = st.sidebar.select_slider("Learning Rate", options=[5e-4, 1e-4, 5e-5, 1e-5], value=5e-5)
	batch_size = st.sidebar.select_slider("Batch Size", options=[2, 4, 8, 16], value=8)

	attack_strength = st.sidebar.slider("Adversarial Attack Strength", 0.1, 0.9, 0.3)

	enable_encryption = st.sidebar.checkbox("🔒 Encrypt Transcription", value=True)
	show_transcription = st.sidebar.checkbox("📖 Show Transcription", value=False)

	# ================================
	# 4️⃣ Encryption Handling (Precomputed Key)
	# ================================
	encryption_key = Fernet.generate_key()
	fernet = Fernet(encryption_key)

	def encrypt_text(text):
	return fernet.encrypt(text.encode())

	def decrypt_text(encrypted_text):
	return fernet.decrypt(encrypted_text).decode()

	# ================================
	# 5️⃣ Optimized ASR Web App
	# ================================
	st.title("🎙️ Speech-to-Text ASR Model using Conformer with Security Features")

	audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "flac"])

	if audio_file:
	audio_path = "temp_audio.wav"
	with open(audio_path, "wb") as f:
	f.write(audio_file.read())

	# Load and preprocess the audio file using librosa
	speech, sr = librosa.load(audio_path, sr=16000)

	# ================================
	# ✅ Optimized Adversarial Attack Handling
	# ================================
	noise = attack_strength * torch.randn_like(torch.tensor(speech))
	adversarial_waveform = torch.tensor(speech) + noise
	adversarial_waveform = torch.clamp(adversarial_waveform, -1.0, 1.0)

	# Remove background noise for speed & accuracy
	denoised_waveform = torchaudio.functional.vad(adversarial_waveform, sample_rate=16000)

	# ================================
	# ✅ Fast Transcription Processing with Conformer
	# ================================
	# Convert waveform into the required format
	inputs = processor(denoised_waveform.numpy(), sampling_rate=sr, return_tensors="pt", padding=True).to("cuda" if torch.cuda.is_available() else "cpu")

	# Make sure the input has batch dimension (even if it's one example)
	if len(inputs.input_values.shape) == 1:
	inputs.input_values = inputs.input_values.unsqueeze(0)

	with torch.no_grad():
	logits = model(**inputs).logits

	predicted_ids = torch.argmax(logits, dim=-1)
	transcription = processor.batch_decode(predicted_ids)

	if attack_strength > 0.3:
	st.warning("⚠️ Adversarial attack detected! Denoising applied.")

	# ================================
	# ✅ Optimized Encryption Handling
	# ================================
	if enable_encryption:
	encrypted_transcription = encrypt_text(transcription[0])
	st.info("🔒 Transcription is encrypted. Enable 'Show Transcription' to view.")

	if show_transcription:
	decrypted_text = decrypt_text(encrypted_transcription)
	st.success("📄 Secure Transcription:")
	st.write(decrypted_text)
	else:
	st.write("🔒 [Encrypted] Transcription hidden. Enable 'Show Transcription' to view.")
	else:
	st.success("📄 Transcription:")
	st.write(transcription[0])