Update app.py
app.py CHANGED
@@ -4,6 +4,7 @@ import torch
 import torchaudio
 import numpy as np
 import streamlit as st
+from huggingface_hub import login
 from transformers import (
     AutoProcessor,
     AutoModelForSpeechSeq2Seq,
@@ -13,7 +14,13 @@ from transformers import (
 )

 # ================================
-# 1️⃣ Load Model & Processor
+# 1️⃣ Authenticate with Hugging Face Hub
+# ================================
+HF_TOKEN = "hf_xxxxxxxxxxxxxxxxxxxxxxx"  # Replace with your Hugging Face token
+login(token=HF_TOKEN)  # Ensure authentication
+
+# ================================
+# 2️⃣ Load Model & Processor
 # ================================
 MODEL_NAME = "AqeelShafy7/AudioSangraha-Audio_to_Text"

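Review note: committing a literal `hf_` token, even a placeholder, invites an accidental leak once a real value is pasted in. A minimal sketch of the same authentication step with the token read from the environment instead (assumes an `HF_TOKEN` secret is configured for the Space; `login` is the same `huggingface_hub` call the diff adds):

```python
import os

from huggingface_hub import login

# Read the token from the environment (e.g. a Space secret) instead of the source file.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)  # same authentication call as in the commit
else:
    print("⚠️ HF_TOKEN not set; Hub pushes will fail until it is configured")
```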
@@ -27,7 +34,7 @@ model.to(device)
 print(f"✅ Model loaded on {device}")

 # ================================
-# 2️⃣ Load Dataset (Recursively from Extracted Path)
+# 3️⃣ Load Dataset (Recursively from Extracted Path)
 # ================================
 DATASET_TAR_PATH = "dev-clean.tar.gz"
 EXTRACT_PATH = "./librispeech_dev_clean"
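The extraction and recursive scan that produce `audio_files` sit between this hunk and the next; one plausible shape for that step, assuming the standard LibriSpeech `.flac` layout (a sketch, not the file's actual code):

```python
import os
import tarfile

DATASET_TAR_PATH = "dev-clean.tar.gz"
EXTRACT_PATH = "./librispeech_dev_clean"

# Extract the archive once; later runs reuse the extracted tree.
if not os.path.exists(EXTRACT_PATH):
    with tarfile.open(DATASET_TAR_PATH, "r:gz") as tar:
        tar.extractall(EXTRACT_PATH)

# LibriSpeech nests utterances several directories deep, hence the recursive walk.
audio_files = [
    os.path.join(root, name)
    for root, _, names in os.walk(EXTRACT_PATH)
    for name in names
    if name.endswith(".flac")
]
```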
@@ -63,7 +70,7 @@ if not audio_files:
 print(f"✅ Found {len(audio_files)} audio files in dataset!")

 # ================================
-# 3️⃣ Preprocess Dataset (Fixed input_features)
+# 4️⃣ Preprocess Dataset (Fixed input_features)
 # ================================
 def load_and_process_audio(audio_path):
     """Loads and processes a single audio file into model format."""
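The body of `load_and_process_audio` is elided by the hunk. For a Whisper-style seq2seq checkpoint, the "fixed input_features" step usually means resampling to 16 kHz and letting the processor compute log-mel features; a sketch under that assumption (reuses the `processor` loaded earlier in app.py):

```python
import torchaudio

def load_and_process_audio(audio_path):
    """Loads and processes a single audio file into model format."""
    waveform, sample_rate = torchaudio.load(audio_path)
    waveform = waveform.mean(dim=0)  # collapse to mono
    if sample_rate != 16_000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16_000)
    # The processor converts raw samples into the model's input features.
    features = processor(waveform.numpy(), sampling_rate=16_000, return_tensors="pt")
    return features.input_features[0]
```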
@@ -80,7 +87,7 @@ def load_and_process_audio(audio_path):
 # Manually create dataset structure
 dataset = [{"input_features": load_and_process_audio(f), "labels": []} for f in audio_files[:100]]

-# Split dataset into train and eval
+# Split dataset into train and eval
 train_size = int(0.9 * len(dataset))
 train_dataset = dataset[:train_size]
 eval_dataset = dataset[train_size:]
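With `labels` left as empty lists, the seq2seq loss has no target to learn from, so the training run below cannot actually improve the model. If reference transcripts are available (LibriSpeech ships them in `*.trans.txt` files alongside the audio), each example would normally carry tokenized label ids; a hypothetical helper:

```python
def make_example(audio_path, transcript):
    """Pair processed audio with tokenized target text for seq2seq training."""
    return {
        "input_features": load_and_process_audio(audio_path),
        # Token ids of the reference transcript are the supervision signal.
        "labels": processor.tokenizer(transcript).input_ids,
    }
```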
@@ -88,11 +95,11 @@ eval_dataset = dataset[train_size:]
 print(f"✅ Dataset Loaded! Training: {len(train_dataset)}, Evaluation: {len(eval_dataset)}")

 # ================================
-# 4️⃣ Training Arguments & Trainer
+# 5️⃣ Training Arguments & Trainer
 # ================================
 training_args = TrainingArguments(
     output_dir="./asr_model_finetuned",
-    eval_strategy="epoch",  #
+    eval_strategy="epoch",  # Fixed deprecated evaluation_strategy
     save_strategy="epoch",
     learning_rate=5e-5,
     per_device_train_batch_size=8,
@@ -102,7 +109,9 @@ training_args = TrainingArguments(
     logging_dir="./logs",
     logging_steps=500,
     save_total_limit=2,
-    push_to_hub=True,
+    push_to_hub=True,  # Fix: Properly authenticate Hugging Face Hub
+    hub_model_id="tahirsher/ASR_Model",  # Replace with your Hugging Face repo
+    hub_token=HF_TOKEN,
 )

 # Data collator (for dynamic padding)
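The `data_collator` definition falls between hunks. For variable-length speech features, a minimal dynamic-padding collator might look like this sketch (illustrative, not the file's actual implementation):

```python
import torch

def data_collator(features):
    """Pad input_features along time; pad labels with -100 (ignored by the loss)."""
    inputs = [f["input_features"] for f in features]
    max_len = max(x.shape[-1] for x in inputs)
    batch = torch.stack(
        [torch.nn.functional.pad(x, (0, max_len - x.shape[-1])) for x in inputs]
    )
    labels = [torch.tensor(f["labels"], dtype=torch.long) for f in features]
    padded = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)
    return {"input_features": batch, "labels": padded}
```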
@@ -113,13 +122,13 @@ trainer = Trainer(
     model=model,
     args=training_args,
     train_dataset=train_dataset,
-    eval_dataset=eval_dataset,
+    eval_dataset=eval_dataset,
     processing_class=processor,  # Fix: Replacing deprecated `tokenizer`
     data_collator=data_collator,
 )

 # ================================
-# 5️⃣ Fine-Tuning Execution
+# 6️⃣ Fine-Tuning Execution
 # ================================
 if st.button("Start Fine-Tuning"):
     with st.spinner("Fine-tuning in progress... Please wait!"):
|
|
127 |
st.success("✅ Fine-Tuning Completed! Model updated.")
|
128 |
|
129 |
# ================================
|
130 |
-
#
|
131 |
# ================================
|
132 |
st.title("🎙️ Speech-to-Text ASR with Fine-Tuning 🎶")
|
133 |
|
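The upload-and-transcribe block sits between this hunk and the next. For a Whisper-style checkpoint it usually takes roughly this shape (a sketch reusing `processor`, `model`, and `device` from earlier in app.py; widget labels are illustrative):

```python
audio_file = st.file_uploader("Upload audio", type=["wav", "flac", "mp3"])
if audio_file:
    waveform, sample_rate = torchaudio.load(audio_file)
    waveform = torchaudio.functional.resample(waveform.mean(dim=0), sample_rate, 16_000)
    inputs = processor(waveform.numpy(), sampling_rate=16_000, return_tensors="pt")
    with torch.no_grad():
        ids = model.generate(inputs.input_features.to(device))  # greedy decoding
    transcription = processor.batch_decode(ids, skip_special_tokens=True)[0]
    st.write(transcription)
```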
@@ -159,7 +168,7 @@ if audio_file:
     st.write(transcription)

 # ================================
-# 7️⃣ Fine-Tune Model with User Correction
+# 8️⃣ Fine-Tune Model with User Correction
 # ================================
 user_correction = st.text_area("🔧 Correct the transcription (if needed):", transcription)

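Section 8️⃣'s body is not shown in the diff. One hedged reading of "fine-tune with user correction" is a single-example update built from the corrected transcript; everything below, including `saved_audio_path`, is hypothetical:

```python
if st.button("Fine-Tune with Correction") and user_correction.strip():
    example = {
        "input_features": load_and_process_audio(saved_audio_path),  # hypothetical path of the saved upload
        "labels": processor.tokenizer(user_correction).input_ids,
    }
    trainer.train_dataset = [example]  # one-example dataset
    trainer.train()
    st.success("✅ Model updated with your correction!")
```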