Spaces:

tahirsher
/

ASR_Model_for_Transcription_into_Text

Sleeping

App Files Community

tahirsher committed on Mar 10

Commit

15b7647

verified ·

1 Parent(s): eda3536

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -94

app.py CHANGED Viewed

@@ -3,63 +3,42 @@ import torch
 import torchaudio
 import streamlit as st
 from huggingface_hub import login
-from transformers import AutoProcessor, AutoModelForCTC
-from cryptography.fernet import Fernet
 # ================================
-# 1️⃣ Authenticate with Hugging Face Hub (Cache to prevent re-authentication)
 # ================================
-@st.cache_resource
-def authenticate_hf():
-    HF_TOKEN = os.getenv("hf_token")
-    if HF_TOKEN is None:
-        raise ValueError("❌ Hugging Face API token not found. Please set it in Secrets.")
-    login(token=HF_TOKEN)
-authenticate_hf()
 # ================================
-# 2️⃣ Load Conformer Model & Processor (Cached)
 # ================================
-@st.cache_resource
-def load_model():
-    MODEL_NAME = "deepl-project/conformer-finetunning"
-    processor = AutoProcessor.from_pretrained(MODEL_NAME)
-    model = AutoModelForCTC.from_pretrained(MODEL_NAME).to("cuda" if torch.cuda.is_available() else "cpu")
-    return processor, model
-processor, model = load_model()
 # ================================
-# 3️⃣ Streamlit Sidebar for Fine-Tuning & Security
 # ================================
-st.sidebar.title("🔧 Fine-Tuning & Security Settings")
 num_epochs = st.sidebar.slider("Epochs", min_value=1, max_value=10, value=3)
 learning_rate = st.sidebar.select_slider("Learning Rate", options=[5e-4, 1e-4, 5e-5, 1e-5], value=5e-5)
 batch_size = st.sidebar.select_slider("Batch Size", options=[2, 4, 8, 16], value=8)
-attack_strength = st.sidebar.slider("Adversarial Attack Strength", 0.1, 0.9, 0.3)
-enable_encryption = st.sidebar.checkbox("🔒 Encrypt Transcription", value=True)
-show_transcription = st.sidebar.checkbox("📖 Show Transcription", value=False)
 # ================================
-# 4️⃣ Encryption Handling (Precomputed Key)
 # ================================
-encryption_key = Fernet.generate_key()
-fernet = Fernet(encryption_key)
-def encrypt_text(text):
-    return fernet.encrypt(text.encode())
-def decrypt_text(encrypted_text):
-    return fernet.decrypt(encrypted_text).decode()
-# ================================
-# 5️⃣ Optimized ASR Web App
-# ================================
-st.title("🎙️ Speech-to-Text ASR Model using Conformer with Security Features")
 audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "flac"])
@@ -68,62 +47,25 @@ if audio_file:
     with open(audio_path, "wb") as f:
         f.write(audio_file.read())
-    # Load and preprocess the audio file using torchaudio
     waveform, sample_rate = torchaudio.load(audio_path)
     waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
     waveform = waveform.to(dtype=torch.float32)
-    # ================================
-    # ✅ Optimized Adversarial Attack Handling
-    # ================================
-    noise = attack_strength * torch.randn_like(waveform)
-    adversarial_waveform = waveform + noise
     adversarial_waveform = torch.clamp(adversarial_waveform, -1.0, 1.0)
-    # ================================
-    # ✅ Preprocess Audio with Processor (Corrected)
-    # ================================
-    # Ensure the input has batch dimension (even if it's one example)
-    inputs = processor(adversarial_waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
-    # Check the structure of the returned `inputs` to understand what it contains
-    st.write("Processor Output:", inputs)
-    # Extract the correct key (input_features or input_values depending on the model)
-    if "input_features" in inputs:
-        input_features = inputs["input_features"]
-    elif "input_values" in inputs:
-        input_features = inputs["input_values"]
-    else:
-        raise ValueError("❌ The processor output does not contain 'input_features' or 'input_values'.")
-    input_features = input_features.to("cuda" if torch.cuda.is_available() else "cpu")
-    # ================================
-    # ✅ Fast Transcription Processing with Conformer
-    # ================================
-    with torch.no_grad():
-        logits = model(input_features).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = processor.batch_decode(predicted_ids)
-    if attack_strength > 0.3:
-        st.warning("⚠️ Adversarial attack detected! Denoising applied.")
-    # ================================
-    # ✅ Optimized Encryption Handling
-    # ================================
-    if enable_encryption:
-        encrypted_transcription = encrypt_text(transcription[0])
-        st.info("🔒 Transcription is encrypted. Enable 'Show Transcription' to view.")
-        if show_transcription:
-            decrypted_text = decrypt_text(encrypted_transcription)
-            st.success("📄 Secure Transcription:")
-            st.write(decrypted_text)
-        else:
-            st.write("🔒 [Encrypted] Transcription hidden. Enable 'Show Transcription' to view.")
-    else:
-        st.success("📄 Transcription:")
-        st.write(transcription[0])

 import torchaudio
 import streamlit as st
 from huggingface_hub import login
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 # ================================
+# 1️⃣ Authenticate with Hugging Face Hub (Securely)
 # ================================
+HF_TOKEN = os.getenv("hf_token")
+if HF_TOKEN is None:
+    raise ValueError("❌ Hugging Face API token not found. Please set it in Secrets.")
+login(token=HF_TOKEN)
 # ================================
+# 2️⃣ Load Conformer Model & Processor
 # ================================
+MODEL_NAME = "facebook/wav2vec2-conformer-rel-pos-large"
+processor = AutoProcessor.from_pretrained(MODEL_NAME)
+model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME)
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model.to(device)
+print(f"✅ Conformer Model loaded on {device}")
 # ================================
+# 3️⃣ Streamlit UI: Fine-Tuning Hyperparameter Selection
 # ================================
+st.sidebar.title("🔧 Fine-Tuning Hyperparameters")
 num_epochs = st.sidebar.slider("Epochs", min_value=1, max_value=10, value=3)
 learning_rate = st.sidebar.select_slider("Learning Rate", options=[5e-4, 1e-4, 5e-5, 1e-5], value=5e-5)
 batch_size = st.sidebar.select_slider("Batch Size", options=[2, 4, 8, 16], value=8)
+attack_strength = st.sidebar.slider("Attack Strength", 0.0, 0.9, 0.1)
 # ================================
+# 4️⃣ Streamlit ASR Web App (Fast Decoding & Security Features)
 # ================================
+st.title("🎙️ Speech-to-Text ASR Model with Security Features 🎶")
 audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "flac"])
     with open(audio_path, "wb") as f:
         f.write(audio_file.read())
     waveform, sample_rate = torchaudio.load(audio_path)
     waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
     waveform = waveform.to(dtype=torch.float32)
+    # Simulate an adversarial attack by injecting random noise
+    adversarial_waveform = waveform + (attack_strength * torch.randn_like(waveform))
     adversarial_waveform = torch.clamp(adversarial_waveform, -1.0, 1.0)
+    inputs = processor(adversarial_waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt")
+    input_features = inputs.input_values.to(device)
+    attention_mask = inputs.attention_mask.to(device) if "attention_mask" in inputs else None
+    with torch.inference_mode():
+        generated_ids = model.generate(input_features, max_length=200, num_beams=2, do_sample=False, use_cache=True,
+                                       attention_mask=attention_mask)
+        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    if attack_strength > 0.1:
+        st.warning("⚠️ Adversarial attack detected! Transcription may be affected.")
+    st.success("📄 Secure Transcription:")
+    st.write(transcription)