Update app.py
app.py
CHANGED
@@ -5,6 +5,7 @@ import torchaudio
 import numpy as np
 import streamlit as st
 import matplotlib.pyplot as plt
+from cryptography.fernet import Fernet  # Encryption
 from huggingface_hub import login
 from transformers import (
     AutoProcessor,
@@ -18,10 +19,8 @@ from transformers import (
 # 1️⃣ Authenticate with Hugging Face Hub (Securely)
 # ================================
 HF_TOKEN = os.getenv("hf_token")
-
 if HF_TOKEN is None:
     raise ValueError("❌ Hugging Face API token not found. Please set it in Secrets.")
-
 login(token=HF_TOKEN)
 
 # ================================
@@ -33,7 +32,6 @@ model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME)
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
-print(f"✅ Model loaded on {device}")
 
 # ================================
 # 3️⃣ Load Dataset (From Extracted Folder)
@@ -42,89 +40,53 @@ DATASET_TAR_PATH = "dev-clean.tar.gz"
 EXTRACT_PATH = "./librispeech_dev_clean"
 
 if not os.path.exists(EXTRACT_PATH):
-    print("📦 Extracting dataset...")
     with tarfile.open(DATASET_TAR_PATH, "r:gz") as tar:
         tar.extractall(EXTRACT_PATH)
-    print("✅ Extraction complete.")
-else:
-    print("✅ Dataset already extracted.")
 
 AUDIO_FOLDER = os.path.join(EXTRACT_PATH, "LibriSpeech", "dev-clean")
 
 def find_audio_files(base_folder):
-
-    audio_files = []
-    for root, _, files in os.walk(base_folder):
-        for file in files:
-            if file.endswith(".flac"):
-                audio_files.append(os.path.join(root, file))
-    return audio_files
+    return [os.path.join(root, file) for root, _, files in os.walk(base_folder) for file in files if file.endswith(".flac")]
 
 audio_files = find_audio_files(AUDIO_FOLDER)
 
-if not audio_files:
-    raise FileNotFoundError(f"❌ No .flac files found in {AUDIO_FOLDER}. Check dataset structure!")
-
-print(f"✅ Found {len(audio_files)} audio files in dataset!")
-
 # ================================
 # 4️⃣ Load Transcripts
 # ================================
 def load_transcripts():
-    """Loads transcript text files and maps them to audio files."""
     transcript_dict = {}
     for root, _, files in os.walk(AUDIO_FOLDER):
         for file in files:
-            if file.endswith(".txt"):
+            if file.endswith(".txt"):
                 with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                     for line in f:
                         parts = line.strip().split(" ", 1)
                         if len(parts) == 2:
-                            file_id, text = parts
-                            transcript_dict[file_id] = text
+                            transcript_dict[parts[0]] = parts[1]
     return transcript_dict
 
 transcripts = load_transcripts()
-if not transcripts:
-    raise FileNotFoundError("❌ No transcripts found! Check dataset structure.")
-
-print(f"✅ Loaded {len(transcripts)} transcripts.")
 
 # ================================
-# 5️⃣
+# 5️⃣ Adversarial Attack Simulation (Modifying Transcripts)
 # ================================
-def load_and_process_audio(audio_file):
-
-
-
-
-    input_features = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features[0]
-    return input_features
-
-dataset = []
-for audio_file in audio_files[:100]:  # Limit to 100 for faster processing
-    file_id = os.path.basename(audio_file).replace(".flac", "")
-
-    if file_id in transcripts:
-        input_features = load_and_process_audio(audio_file)
-        labels = processor.tokenizer(transcripts[file_id], padding="max_length", truncation=True, return_tensors="pt").input_ids[0]
-
-        dataset.append({"input_features": input_features, "labels": labels})
-
-train_size = int(0.8 * len(dataset))
-train_dataset = dataset[:train_size]
-eval_dataset = dataset[train_size:]
-
-print(f"✅ Dataset Prepared! Training: {len(train_dataset)}, Evaluation: {len(eval_dataset)}")
+def generate_adversarial_text(text):
+    words = text.split()
+    if len(words) > 3:
+        words[2] = "[REPLACED]"
+    return " ".join(words)
 
 # ================================
-# 6️⃣
+# 6️⃣ Encrypt & Decrypt Transcriptions
 # ================================
-
+key = Fernet.generate_key()
+cipher = Fernet(key)
+
+def encrypt_transcription(text):
+    return cipher.encrypt(text.encode()).decode()
 
-
-
-    batch_size = st.sidebar.select_slider("Batch Size", options=[2, 4, 8, 16], value=8)
+def decrypt_transcription(encrypted_text):
+    return cipher.decrypt(encrypted_text.encode()).decode()
 
 # ================================
 # 7️⃣ Training Arguments & Trainer
@@ -133,10 +95,10 @@ training_args = TrainingArguments(
     output_dir="./asr_model_finetuned",
     eval_strategy="epoch",
     save_strategy="epoch",
-    learning_rate=
-    per_device_train_batch_size=
-    per_device_eval_batch_size=
-    num_train_epochs=
+    learning_rate=5e-5,
+    per_device_train_batch_size=8,
+    per_device_eval_batch_size=8,
+    num_train_epochs=3,
     weight_decay=0.01,
     logging_dir="./logs",
     logging_steps=500,
@@ -146,20 +108,14 @@ training_args = TrainingArguments(
     hub_token=HF_TOKEN,
 )
 
-data_collator = DataCollatorForSeq2Seq(tokenizer=processor.tokenizer, model=model, return_tensors="pt")
-
-trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=train_dataset,
-    eval_dataset=eval_dataset,
-    data_collator=data_collator,
-)
-
 # ================================
-# 8️⃣ Streamlit ASR Web App (
+# 8️⃣ Streamlit ASR Web App (Enhanced UI)
 # ================================
-st.title("🎙️ Speech-to-Text ASR Model with
+st.title("🎙️ Speech-to-Text ASR Model with Security & Attack Detection")
+
+st.sidebar.title("⚙️ Settings")
+attack_mode = st.sidebar.checkbox("Enable Adversarial Attack Simulation")
+encryption_mode = st.sidebar.checkbox("Enable Encryption")
 
 audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "flac"])
 
@@ -176,16 +132,19 @@ if audio_file:
     ).input_features.to(device)
 
     with torch.inference_mode():
-        generated_ids = model.generate(
-            input_features,
-            max_length=200,
-            num_beams=2,
-            do_sample=False,
-            use_cache=True,
-            language="en",
-            attention_mask=torch.ones(input_features.shape, dtype=torch.long).to(device),
-        )
+        generated_ids = model.generate(input_features, max_length=200, num_beams=2)
        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-
-
+    if attack_mode:
+        transcription = generate_adversarial_text(transcription)
+        st.warning("⚠️ Adversarial attack detected: Modified transcription!")
+
+    if encryption_mode:
+        encrypted_text = encrypt_transcription(transcription)
+        st.success("🔒 Encrypted Transcription:")
+        st.write(encrypted_text)
+        st.text("🔓 Decrypted Transcription:")
+        st.write(decrypt_transcription(encrypted_text))
+    else:
+        st.success("📝 Transcription:")
+        st.write(transcription)
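When the new "Enable Adversarial Attack Simulation" toggle is on, `generate_adversarial_text` simulates tampering by overwriting the third word of any transcription longer than three words. A quick self-contained check (the input string is hypothetical):

```python
def generate_adversarial_text(text):
    # Overwrite the third word to simulate a tampered transcription.
    words = text.split()
    if len(words) > 3:
        words[2] = "[REPLACED]"
    return " ".join(words)

print(generate_adversarial_text("the quick brown fox"))
# -> "the quick [REPLACED] fox"
```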
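The encryption toggle wraps a standard `cryptography.fernet` round trip. A minimal sketch of what `encrypt_transcription`/`decrypt_transcription` do, assuming the `cryptography` package is installed; note that the key is regenerated on every app start, so ciphertexts are only decryptable within the same session:

```python
from cryptography.fernet import Fernet

key = Fernet.generate_key()  # fresh symmetric key per app run
cipher = Fernet(key)

# Fernet operates on bytes, so encode before encrypting and decode for display.
token = cipher.encrypt("sample transcription".encode())
assert cipher.decrypt(token).decode() == "sample transcription"
```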