tahirsher committed on
Commit
8d19597
·
verified ·
1 Parent(s): a4a32f2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -15
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
  import torch
3
  import torchaudio
4
- import librosa
5
  import streamlit as st
6
  from huggingface_hub import login
7
  from transformers import AutoProcessor, AutoModelForCTC
@@ -69,31 +68,29 @@ if audio_file:
69
  with open(audio_path, "wb") as f:
70
  f.write(audio_file.read())
71
 
72
- # Load and preprocess the audio file using librosa
73
- speech, sr = librosa.load(audio_path, sr=16000)
74
-
 
 
75
  # ================================
76
  # ✅ Optimized Adversarial Attack Handling
77
  # ================================
78
- noise = attack_strength * torch.randn_like(torch.tensor(speech))
79
- adversarial_waveform = torch.tensor(speech) + noise
80
  adversarial_waveform = torch.clamp(adversarial_waveform, -1.0, 1.0)
81
 
82
- # Remove background noise for speed & accuracy
83
- denoised_waveform = torchaudio.functional.vad(adversarial_waveform, sample_rate=16000)
84
-
85
  # ================================
86
  # ✅ Fast Transcription Processing with Conformer
87
  # ================================
88
- # Convert waveform into the required format
89
- inputs = processor(denoised_waveform.numpy(), sampling_rate=sr, return_tensors="pt", padding=True).to("cuda" if torch.cuda.is_available() else "cpu")
90
 
91
- # Make sure the input has batch dimension (even if it's one example)
92
- if len(inputs.input_values.shape) == 1:
93
- inputs.input_values = inputs.input_values.unsqueeze(0)
94
 
95
  with torch.no_grad():
96
- logits = model(**inputs).logits
97
 
98
  predicted_ids = torch.argmax(logits, dim=-1)
99
  transcription = processor.batch_decode(predicted_ids)
 
1
  import os
2
  import torch
3
  import torchaudio
 
4
  import streamlit as st
5
  from huggingface_hub import login
6
  from transformers import AutoProcessor, AutoModelForCTC
 
68
  with open(audio_path, "wb") as f:
69
  f.write(audio_file.read())
70
 
71
+ # Load and preprocess the audio file using torchaudio
72
+ waveform, sample_rate = torchaudio.load(audio_path)
73
+ waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
74
+ waveform = waveform.to(dtype=torch.float32)
75
+
76
  # ================================
77
  # ✅ Optimized Adversarial Attack Handling
78
  # ================================
79
+ noise = attack_strength * torch.randn_like(waveform)
80
+ adversarial_waveform = waveform + noise
81
  adversarial_waveform = torch.clamp(adversarial_waveform, -1.0, 1.0)
82
 
 
 
 
83
  # ================================
84
  # ✅ Fast Transcription Processing with Conformer
85
  # ================================
86
+ input_features = processor(adversarial_waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features.to("cuda" if torch.cuda.is_available() else "cpu")
 
87
 
88
+ # Ensure the input has batch dimension (even if it's one example)
89
+ if len(input_features.shape) == 1:
90
+ input_features = input_features.unsqueeze(0)
91
 
92
  with torch.no_grad():
93
+ logits = model(input_features).logits
94
 
95
  predicted_ids = torch.argmax(logits, dim=-1)
96
  transcription = processor.batch_decode(predicted_ids)