tahirsher commited on
Commit
771c2e9
·
verified ·
1 Parent(s): aa42e50

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -23
app.py CHANGED
@@ -37,7 +37,7 @@ model.to(device)
37
  print(f"✅ Model loaded on {device}")
38
 
39
  # ================================
40
- # 3️⃣ Load and Prepare Dataset
41
  # ================================
42
  DATASET_TAR_PATH = "dev-clean.tar.gz"
43
  EXTRACT_PATH = "./librispeech_dev_clean"
@@ -50,35 +50,41 @@ if not os.path.exists(EXTRACT_PATH):
50
  else:
51
  print("✅ Dataset already extracted.")
52
 
53
- # Load dataset with transcripts
54
- dataset = load_dataset("librispeech_asr", "clean", split="train", trust_remote_code=True)
55
 
56
- # Ensure dataset has transcripts
57
- if "text" not in dataset.column_names:
58
- raise ValueError("❌ Dataset is missing transcription text!")
 
 
 
 
 
59
 
60
- # Preprocessing Function
61
- def preprocess_data(batch):
62
- # Process audio
63
- waveform, sample_rate = torchaudio.load(batch["file"])
 
 
 
 
 
 
 
 
 
64
  waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
65
-
66
- batch["input_features"] = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features[0]
67
-
68
- # Tokenize transcript text
69
- batch["labels"] = processor.tokenizer(batch["text"], padding="max_length", truncation=True, return_tensors="pt").input_ids[0]
70
-
71
- return batch
72
 
73
- # Apply preprocessing
74
- dataset = dataset.map(preprocess_data, remove_columns=["file", "audio", "text"])
75
 
76
- # Split into train & eval
77
  train_size = int(0.8 * len(dataset))
78
- train_dataset = dataset.select(range(train_size))
79
- eval_dataset = dataset.select(range(train_size, len(dataset)))
80
 
81
- print(f"✅ Dataset Prepared! Training: {len(train_dataset)}, Evaluation: {len(eval_dataset)}")
82
 
83
  # ================================
84
  # 4️⃣ Training Arguments & Trainer
 
37
  print(f"✅ Model loaded on {device}")
38
 
39
  # ================================
40
+ # 3️⃣ Load Dataset (Recursively from Extracted Path)
41
  # ================================
42
  DATASET_TAR_PATH = "dev-clean.tar.gz"
43
  EXTRACT_PATH = "./librispeech_dev_clean"
 
50
  else:
51
  print("✅ Dataset already extracted.")
52
 
53
+ AUDIO_FOLDER = os.path.join(EXTRACT_PATH, "LibriSpeech", "dev-clean")
 
54
 
55
+ def find_audio_files(base_folder):
56
+ """Recursively search for all .flac files in subdirectories."""
57
+ audio_files = []
58
+ for root, _, files in os.walk(base_folder):
59
+ for file in files:
60
+ if file.endswith(".flac"):
61
+ audio_files.append(os.path.join(root, file))
62
+ return audio_files
63
 
64
+ audio_files = find_audio_files(AUDIO_FOLDER)
65
+
66
+ if not audio_files:
67
+ raise FileNotFoundError(f"❌ No .flac files found in {AUDIO_FOLDER}. Check dataset structure!")
68
+
69
+ print(f"✅ Found {len(audio_files)} audio files in dataset!")
70
+
71
+ # ================================
72
+ # 4️⃣ Preprocess Dataset
73
+ # ================================
74
+ def load_and_process_audio(audio_path):
75
+ """Loads and processes a single audio file into model format."""
76
+ waveform, sample_rate = torchaudio.load(audio_path)
77
  waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
78
+ input_features = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features[0]
79
+ return input_features
 
 
 
 
 
80
 
81
+ dataset = [{"input_features": load_and_process_audio(f), "labels": []} for f in audio_files[:100]]
 
82
 
 
83
  train_size = int(0.8 * len(dataset))
84
+ train_dataset = dataset[:train_size]
85
+ eval_dataset = dataset[train_size:]
86
 
87
+ print(f"✅ Dataset Loaded! Training: {len(train_dataset)}, Evaluation: {len(eval_dataset)}")
88
 
89
  # ================================
90
  # 4️⃣ Training Arguments & Trainer