Update app.py
app.py CHANGED
@@ -4,6 +4,7 @@ import torch
 import torchaudio
 import numpy as np
 import streamlit as st
+from huggingface_hub import login
 from transformers import (
     AutoProcessor,
     AutoModelForSpeechSeq2Seq,
@@ -13,7 +14,13 @@ from transformers import (
 )

 # ================================
-# 1️⃣ Load Model & Processor
+# 1️⃣ Authenticate with Hugging Face Hub
+# ================================
+HF_TOKEN = "hf_xxxxxxxxxxxxxxxxxxxxxxx"  # Replace with your Hugging Face token
+login(token=HF_TOKEN)  # Ensure authentication
+
+# ================================
+# 2️⃣ Load Model & Processor
 # ================================
 MODEL_NAME = "AqeelShafy7/AudioSangraha-Audio_to_Text"

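Review note: committing a literal `hf_` token, even a placeholder, invites an accidental leak once a real value is pasted in. A minimal sketch of the same authentication step with the token read from the environment instead (assumes an `HF_TOKEN` secret is configured for the Space; `login` is the same `huggingface_hub` call the diff adds):

```python
import os

from huggingface_hub import login

# Read the token from the environment (e.g. a Space secret) instead of the source file.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)  # same authentication call as in the commit
else:
    print("⚠️ HF_TOKEN not set; Hub pushes will fail until it is configured")
```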
@@ -27,7 +34,7 @@ model.to(device)
 print(f"✅ Model loaded on {device}")

 # ================================
-# 2️⃣ Load Dataset (Recursively from Extracted Path)
+# 3️⃣ Load Dataset (Recursively from Extracted Path)
 # ================================
 DATASET_TAR_PATH = "dev-clean.tar.gz"
 EXTRACT_PATH = "./librispeech_dev_clean"
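The extraction and recursive scan that produce `audio_files` sit between this hunk and the next; one plausible shape for that step, assuming the standard LibriSpeech `.flac` layout (a sketch, not the file's actual code):

```python
import os
import tarfile

DATASET_TAR_PATH = "dev-clean.tar.gz"
EXTRACT_PATH = "./librispeech_dev_clean"

# Extract the archive once; later runs reuse the extracted tree.
if not os.path.exists(EXTRACT_PATH):
    with tarfile.open(DATASET_TAR_PATH, "r:gz") as tar:
        tar.extractall(EXTRACT_PATH)

# LibriSpeech nests utterances several directories deep, hence the recursive walk.
audio_files = [
    os.path.join(root, name)
    for root, _, names in os.walk(EXTRACT_PATH)
    for name in names
    if name.endswith(".flac")
]
```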
@@ -63,7 +70,7 @@ if not audio_files:
 print(f"✅ Found {len(audio_files)} audio files in dataset!")

 # ================================
-# 3️⃣ Preprocess Dataset (Fixed input_features)
+# 4️⃣ Preprocess Dataset (Fixed input_features)
 # ================================
 def load_and_process_audio(audio_path):
     """Loads and processes a single audio file into model format."""
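The body of `load_and_process_audio` is elided by the hunk. For a Whisper-style seq2seq checkpoint, the "fixed input_features" step usually means resampling to 16 kHz and letting the processor compute log-mel features; a sketch under that assumption (reuses the `processor` loaded earlier in app.py):

```python
import torchaudio

def load_and_process_audio(audio_path):
    """Loads and processes a single audio file into model format."""
    waveform, sample_rate = torchaudio.load(audio_path)
    waveform = waveform.mean(dim=0)  # collapse to mono
    if sample_rate != 16_000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16_000)
    # The processor converts raw samples into the model's input features.
    features = processor(waveform.numpy(), sampling_rate=16_000, return_tensors="pt")
    return features.input_features[0]
```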
@@ -80,7 +87,7 @@ def load_and_process_audio(audio_path):
 # Manually create dataset structure
 dataset = [{"input_features": load_and_process_audio(f), "labels": []} for f in audio_files[:100]]

-# Split dataset into train and eval
+# Split dataset into train and eval
 train_size = int(0.9 * len(dataset))
 train_dataset = dataset[:train_size]
 eval_dataset = dataset[train_size:]
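With `labels` left as empty lists, the seq2seq loss has no target to learn from, so the training run below cannot actually improve the model. If reference transcripts are available (LibriSpeech ships them in `*.trans.txt` files alongside the audio), each example would normally carry tokenized label ids; a hypothetical helper:

```python
def make_example(audio_path, transcript):
    """Pair processed audio with tokenized target text for seq2seq training."""
    return {
        "input_features": load_and_process_audio(audio_path),
        # Token ids of the reference transcript are the supervision signal.
        "labels": processor.tokenizer(transcript).input_ids,
    }
```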
@@ -88,11 +95,11 @@ eval_dataset = dataset[train_size:]
 print(f"✅ Dataset Loaded! Training: {len(train_dataset)}, Evaluation: {len(eval_dataset)}")

 # ================================
-# 4️⃣ Training Arguments & Trainer
+# 5️⃣ Training Arguments & Trainer
 # ================================
 training_args = TrainingArguments(
     output_dir="./asr_model_finetuned",
-    eval_strategy="epoch",  #
+    eval_strategy="epoch",  # Fixed deprecated evaluation_strategy
     save_strategy="epoch",
     learning_rate=5e-5,
     per_device_train_batch_size=8,
@@ -102,7 +109,9 @@ training_args = TrainingArguments(
     logging_dir="./logs",
     logging_steps=500,
     save_total_limit=2,
-    push_to_hub=True,
+    push_to_hub=True,  # Fix: Properly authenticate Hugging Face Hub
+    hub_model_id="tahirsher/ASR_Model",  # Replace with your Hugging Face repo
+    hub_token=HF_TOKEN,
 )

 # Data collator (for dynamic padding)
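The `data_collator` definition falls between hunks. For variable-length speech features, a minimal dynamic-padding collator might look like this sketch (illustrative, not the file's actual implementation):

```python
import torch

def data_collator(features):
    """Pad input_features along time; pad labels with -100 (ignored by the loss)."""
    inputs = [f["input_features"] for f in features]
    max_len = max(x.shape[-1] for x in inputs)
    batch = torch.stack(
        [torch.nn.functional.pad(x, (0, max_len - x.shape[-1])) for x in inputs]
    )
    labels = [torch.tensor(f["labels"], dtype=torch.long) for f in features]
    padded = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)
    return {"input_features": batch, "labels": padded}
```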
@@ -113,13 +122,13 @@ trainer = Trainer(
     model=model,
     args=training_args,
     train_dataset=train_dataset,
-    eval_dataset=eval_dataset,
+    eval_dataset=eval_dataset,
     processing_class=processor,  # Fix: Replacing deprecated `tokenizer`
     data_collator=data_collator,
 )

 # ================================
-# 5️⃣ Fine-Tuning Execution
+# 6️⃣ Fine-Tuning Execution
 # ================================
 if st.button("Start Fine-Tuning"):
     with st.spinner("Fine-tuning in progress... Please wait!"):
|
|
127 |
st.success("✅ Fine-Tuning Completed! Model updated.")
|
128 |
|
129 |
# ================================
|
130 |
-
#
|
131 |
# ================================
|
132 |
st.title("🎙️ Speech-to-Text ASR with Fine-Tuning 🎶")
|
133 |
|
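The upload-and-transcribe block sits between this hunk and the next. For a Whisper-style checkpoint it usually takes roughly this shape (a sketch reusing `processor`, `model`, and `device` from earlier in app.py; widget labels are illustrative):

```python
audio_file = st.file_uploader("Upload audio", type=["wav", "flac", "mp3"])
if audio_file:
    waveform, sample_rate = torchaudio.load(audio_file)
    waveform = torchaudio.functional.resample(waveform.mean(dim=0), sample_rate, 16_000)
    inputs = processor(waveform.numpy(), sampling_rate=16_000, return_tensors="pt")
    with torch.no_grad():
        ids = model.generate(inputs.input_features.to(device))  # greedy decoding
    transcription = processor.batch_decode(ids, skip_special_tokens=True)[0]
    st.write(transcription)
```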
@@ -159,7 +168,7 @@ if audio_file:
     st.write(transcription)

 # ================================
-# 7️⃣ Fine-Tune Model with User Correction
+# 8️⃣ Fine-Tune Model with User Correction
 # ================================
 user_correction = st.text_area("🔧 Correct the transcription (if needed):", transcription)

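Section 8️⃣'s body is not shown in the diff. One hedged reading of "fine-tune with user correction" is a single-example update built from the corrected transcript; everything below, including `saved_audio_path`, is hypothetical:

```python
if st.button("Fine-Tune with Correction") and user_correction.strip():
    example = {
        "input_features": load_and_process_audio(saved_audio_path),  # hypothetical path of the saved upload
        "labels": processor.tokenizer(user_correction).input_ids,
    }
    trainer.train_dataset = [example]  # one-example dataset
    trainer.train()
    st.success("✅ Model updated with your correction!")
```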