Update app.py
app.py
CHANGED
@@ -37,7 +37,7 @@ model.to(device)
 print(f"✅ Model loaded on {device}")
 
 # ================================
-# 3️⃣ Load Dataset
+# 3️⃣ Load Dataset (Recursively from Extracted Path)
 # ================================
 DATASET_TAR_PATH = "dev-clean.tar.gz"
 EXTRACT_PATH = "./librispeech_dev_clean"
@@ -50,35 +50,41 @@ if not os.path.exists(EXTRACT_PATH):
 else:
     print("✅ Dataset already extracted.")
 
-dataset = load_dataset("librispeech_asr", "clean", split="train", trust_remote_code=True)
-
-def preprocess_data(batch):
+AUDIO_FOLDER = os.path.join(EXTRACT_PATH, "LibriSpeech", "dev-clean")
+
+def find_audio_files(base_folder):
+    """Recursively search for all .flac files in subdirectories."""
+    audio_files = []
+    for root, _, files in os.walk(base_folder):
+        for file in files:
+            if file.endswith(".flac"):
+                audio_files.append(os.path.join(root, file))
+    return audio_files
+
+audio_files = find_audio_files(AUDIO_FOLDER)
+
+if not audio_files:
+    raise FileNotFoundError(f"❌ No .flac files found in {AUDIO_FOLDER}. Check dataset structure!")
+
+print(f"✅ Found {len(audio_files)} audio files in dataset!")
+
+# ================================
+# 4️⃣ Preprocess Dataset
+# ================================
+def load_and_process_audio(audio_path):
+    """Loads and processes a single audio file into model format."""
+    waveform, sample_rate = torchaudio.load(audio_path)
     waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
-    # Tokenize transcript text
-    batch["labels"] = processor.tokenizer(batch["text"], padding="max_length", truncation=True, return_tensors="pt").input_ids[0]
-    return batch
-
-dataset = dataset.map(preprocess_data, remove_columns=["file", "audio", "text"])
-
-# Split into train & eval
+    input_features = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features[0]
+    return input_features
+
+dataset = [{"input_features": load_and_process_audio(f), "labels": []} for f in audio_files[:100]]
+
 train_size = int(0.8 * len(dataset))
-train_dataset = dataset
-eval_dataset = dataset
+train_dataset = dataset[:train_size]
+eval_dataset = dataset[train_size:]
 
-print(f"✅ Dataset
+print(f"✅ Dataset Loaded! Training: {len(train_dataset)}, Evaluation: {len(eval_dataset)}")
 
 # ================================
 # 4️⃣ Training Arguments & Trainer