Spaces:

tahirsher
/

ASR_Model_for_Transcription_into_Text

Sleeping

App Files Community

tahirsher committed on Mar 10

Commit

2791b7d

verified ·

1 Parent(s): 76c5c38

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -58

app.py CHANGED Viewed

@@ -1,14 +1,10 @@
 import os
-import tarfile
 import torch
 import torchaudio
-import numpy as np
 import streamlit as st
 from huggingface_hub import login
-from transformers import (
-    AutoProcessor,
-    AutoModelForSpeechSeq2Seq,
-)
 from cryptography.fernet import Fernet
 # ================================
@@ -24,52 +20,19 @@ def authenticate_hf():
 authenticate_hf()
 # ================================
-# 2️⃣ Load Model & Processor (Cached)
 # ================================
 @st.cache_resource
 def load_model():
-    MODEL_NAME = "AqeelShafy7/AudioSangraha-Audio_to_Text"
     processor = AutoProcessor.from_pretrained(MODEL_NAME)
-    model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME).to("cuda" if torch.cuda.is_available() else "cpu")
     return processor, model
 processor, model = load_model()
 # ================================
-# 3️⃣ Dataset Extraction (Cached)
-# ================================
-@st.cache_resource
-def extract_dataset():
-    DATASET_TAR_PATH = "dev-clean.tar.gz"
-    EXTRACT_PATH = "./librispeech_dev_clean"
-    if not os.path.exists(EXTRACT_PATH):
-        with tarfile.open(DATASET_TAR_PATH, "r:gz") as tar:
-            tar.extractall(EXTRACT_PATH)
-    return os.path.join(EXTRACT_PATH, "LibriSpeech", "dev-clean")
-AUDIO_FOLDER = extract_dataset()
-# ================================
-# 4️⃣ Load Transcripts (Cached)
-# ================================
-@st.cache_resource
-def load_transcripts():
-    transcripts = {}
-    for root, _, files in os.walk(AUDIO_FOLDER):
-        for file in files:
-            if file.endswith(".txt"):
-                with open(os.path.join(root, file), "r", encoding="utf-8") as f:
-                    for line in f:
-                        parts = line.strip().split(" ", 1)
-                        if len(parts) == 2:
-                            transcripts[parts[0]] = parts[1]
-    return transcripts
-transcripts = load_transcripts()
-# ================================
-# 5️⃣ Streamlit Sidebar for Fine-Tuning & Security
 # ================================
 st.sidebar.title("🔧 Fine-Tuning & Security Settings")
@@ -83,7 +46,7 @@ enable_encryption = st.sidebar.checkbox("🔒 Encrypt Transcription", value=True
 show_transcription = st.sidebar.checkbox("📖 Show Transcription", value=False)
 # ================================
-# 6️⃣ Encryption Handling (Precomputed Key)
 # ================================
 encryption_key = Fernet.generate_key()
 fernet = Fernet(encryption_key)
@@ -95,9 +58,9 @@ def decrypt_text(encrypted_text):
     return fernet.decrypt(encrypted_text).decode()
 # ================================
-# 7️⃣ Optimized ASR Web App
 # ================================
-st.title("🎙️ Speech-to-Text ASR Model Finetuned on Librispeech Corpus with Security Features")
 audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "flac"])
@@ -106,27 +69,29 @@ if audio_file:
     with open(audio_path, "wb") as f:
         f.write(audio_file.read())
-    waveform, sample_rate = torchaudio.load(audio_path)
-    waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
     # ================================
     # ✅ Optimized Adversarial Attack Handling
     # ================================
-    noise = attack_strength * torch.randn_like(waveform)
-    adversarial_waveform = waveform + noise
     adversarial_waveform = torch.clamp(adversarial_waveform, -1.0, 1.0)
     # Remove background noise for speed & accuracy
     denoised_waveform = torchaudio.functional.vad(adversarial_waveform, sample_rate=16000)
     # ================================
-    # ✅ Fast Transcription Processing
     # ================================
-    input_features = processor(denoised_waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features.to("cuda" if torch.cuda.is_available() else "cpu")
-    with torch.inference_mode():
-        generated_ids = model.generate(input_features, max_length=200, num_beams=2, do_sample=False)
-        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
     if attack_strength > 0.3:
         st.warning("⚠️ Adversarial attack detected! Denoising applied.")
@@ -135,7 +100,7 @@ if audio_file:
     # ✅ Optimized Encryption Handling
     # ================================
     if enable_encryption:
-        encrypted_transcription = encrypt_text(transcription)
         st.info("🔒 Transcription is encrypted. Enable 'Show Transcription' to view.")
         if show_transcription:
@@ -146,4 +111,4 @@ if audio_file:
             st.write("🔒 [Encrypted] Transcription hidden. Enable 'Show Transcription' to view.")
     else:
         st.success("📄 Transcription:")
-        st.write(transcription)

 import os
 import torch
 import torchaudio
+import librosa
 import streamlit as st
 from huggingface_hub import login
+from transformers import AutoProcessor, AutoModelForCTC
 from cryptography.fernet import Fernet
 # ================================
 authenticate_hf()
 # ================================
+# 2️⃣ Load Conformer Model & Processor (Cached)
 # ================================
 @st.cache_resource
 def load_model():
+    MODEL_NAME = "deepl-project/conformer-finetunning"
     processor = AutoProcessor.from_pretrained(MODEL_NAME)
+    model = AutoModelForCTC.from_pretrained(MODEL_NAME).to("cuda" if torch.cuda.is_available() else "cpu")
     return processor, model
 processor, model = load_model()
 # ================================
+# 3️⃣ Streamlit Sidebar for Fine-Tuning & Security
 # ================================
 st.sidebar.title("🔧 Fine-Tuning & Security Settings")
 show_transcription = st.sidebar.checkbox("📖 Show Transcription", value=False)
 # ================================
+# 4️⃣ Encryption Handling (Precomputed Key)
 # ================================
 encryption_key = Fernet.generate_key()
 fernet = Fernet(encryption_key)
     return fernet.decrypt(encrypted_text).decode()
 # ================================
+# 5️⃣ Optimized ASR Web App
 # ================================
+st.title("🎙️ Speech-to-Text ASR Model using Conformer with Security Features")
 audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "flac"])
     with open(audio_path, "wb") as f:
         f.write(audio_file.read())
+    # Load and preprocess the audio file using librosa
+    speech, sr = librosa.load(audio_path, sr=16000)
     # ================================
     # ✅ Optimized Adversarial Attack Handling
     # ================================
+    noise = attack_strength * torch.randn_like(torch.tensor(speech))
+    adversarial_waveform = torch.tensor(speech) + noise
     adversarial_waveform = torch.clamp(adversarial_waveform, -1.0, 1.0)
     # Remove background noise for speed & accuracy
     denoised_waveform = torchaudio.functional.vad(adversarial_waveform, sample_rate=16000)
     # ================================
+    # ✅ Fast Transcription Processing with Conformer
     # ================================
+    inputs = processor(denoised_waveform.numpy(), sampling_rate=sr, return_tensors="pt", padding=True).to("cuda" if torch.cuda.is_available() else "cpu")
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    transcription = processor.batch_decode(predicted_ids)
     if attack_strength > 0.3:
         st.warning("⚠️ Adversarial attack detected! Denoising applied.")
     # ✅ Optimized Encryption Handling
     # ================================
     if enable_encryption:
+        encrypted_transcription = encrypt_text(transcription[0])
         st.info("🔒 Transcription is encrypted. Enable 'Show Transcription' to view.")
         if show_transcription:
             st.write("🔒 [Encrypted] Transcription hidden. Enable 'Show Transcription' to view.")
     else:
         st.success("📄 Transcription:")
+        st.write(transcription[0])