Spaces:

tahirsher
/

ASR_Model_for_Transcription_into_Text

Sleeping

tahirsher committed on Mar 9

Commit

1bb8243

verified ·

1 Parent(s): d2d38cf

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -27,7 +27,7 @@ model.to(device)
 print(f"✅ Model loaded on {device}")
 # ================================
-# 2️⃣ Load Dataset (Manually from Extracted Path)
 # ================================
 DATASET_TAR_PATH = "dev-clean.tar.gz"  # Dataset stored in Hugging Face Space
 EXTRACT_PATH = "./librispeech_dev_clean"  # Extracted dataset folder
@@ -41,9 +41,27 @@ if not os.path.exists(EXTRACT_PATH):
 else:
     print("✅ Dataset already extracted.")
-# Load audio files manually
-AUDIO_FOLDER = os.path.join(EXTRACT_PATH, "LibriSpeech", "train-clean-100")  # Adjust as per structure
-audio_files = [os.path.join(AUDIO_FOLDER, f) for f in os.listdir(AUDIO_FOLDER) if f.endswith(".flac")]
 # ================================
 # 3️⃣ Preprocess Dataset (Manually)

 print(f"✅ Model loaded on {device}")
 # ================================
+# 2️⃣ Load Dataset (Recursively from Extracted Path)
 # ================================
 DATASET_TAR_PATH = "dev-clean.tar.gz"  # Dataset stored in Hugging Face Space
 EXTRACT_PATH = "./librispeech_dev_clean"  # Extracted dataset folder
 else:
     print("✅ Dataset already extracted.")
+# Define the base directory where audio files are stored
+AUDIO_FOLDER = os.path.join(EXTRACT_PATH, "LibriSpeech", "dev-clean")
+# Recursively find all `.flac` files inside the dataset directory
+def find_audio_files(base_folder):
+    """Recursively search for all .flac files in subdirectories."""
+    audio_files = []
+    for root, _, files in os.walk(base_folder):
+        for file in files:
+            if file.endswith(".flac"):
+                audio_files.append(os.path.join(root, file))
+    return audio_files
+# Get all audio files
+audio_files = find_audio_files(AUDIO_FOLDER)
+# Check if audio files were found
+if not audio_files:
+    raise FileNotFoundError(f"❌ No .flac files found in {AUDIO_FOLDER}. Check dataset structure!")
+print(f"✅ Found {len(audio_files)} audio files in dataset!")
 # ================================
 # 3️⃣ Preprocess Dataset (Manually)