Spaces:

hriteshMaikap
/

marathi-asr-wav2vec2bert

Sleeping

App Files Community

hriteshMaikap committed on Apr 16

Commit

ee75be0

verified ·

1 Parent(s): a36ff2a

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -31

app.py CHANGED Viewed

@@ -10,41 +10,46 @@ processor = Wav2Vec2BertProcessor.from_pretrained(repo_id)
 model = Wav2Vec2BertForCTC.from_pretrained(repo_id)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model = model.to(device)
-def transcribe(audio):
-    # Process audio
-    waveform, sample_rate = torchaudio.load(audio)
-    # Resample if needed
-    if sample_rate != 16000:
-        resampler = torchaudio.transforms.Resample(sample_rate, 16000)
-        waveform = resampler(waveform)
-    # Convert to mono if needed
-    if waveform.shape[0] > 1:
-        waveform = torch.mean(waveform, dim=0, keepdim=True)
-    # Convert to numpy
-    speech_array = waveform.squeeze().numpy()
-    # Process and run inference
-    with torch.no_grad():
-        inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt").to(device)
-        logits = model(inputs.input_features).logits
-        predicted_ids = torch.argmax(logits, dim=-1)
-    # Decode the predicted IDs
-    transcription = processor.decode(predicted_ids[0])
-    return transcription
-# Create Gradio interface
-iface = gr.Interface(
     fn=transcribe,
-    inputs=gr.Audio(source="microphone", type="filepath"),
     outputs="text",
     title="Marathi Speech Recognition",
-    description="Record your voice in Marathi and get a transcription."
 )
-iface.launch()

 model = Wav2Vec2BertForCTC.from_pretrained(repo_id)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model = model.to(device)
+model.eval()  # Set to evaluation mode
+def transcribe(audio_file):
+    try:
+        # Process audio
+        waveform, sample_rate = torchaudio.load(audio_file)
+        # Resample if needed
+        if sample_rate != 16000:
+            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
+            waveform = resampler(waveform)
+            sample_rate = 16000
+        # Convert to mono if needed
+        if waveform.shape[0] > 1:
+            waveform = torch.mean(waveform, dim=0, keepdim=True)
+        # Convert to numpy
+        speech_array = waveform.squeeze().numpy()
+        # Process and run inference
+        with torch.no_grad():
+            inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt").to(device)
+            logits = model(inputs.input_features).logits
+            predicted_ids = torch.argmax(logits, dim=-1)
+        # Decode the predicted IDs
+        transcription = processor.decode(predicted_ids[0])
+        return transcription
+    except Exception as e:
+        return f"Error processing audio: {str(e)}"
+# Create Gradio interface with updated syntax
+demo = gr.Interface(
     fn=transcribe,
+    inputs=gr.Audio(type="filepath"),  # Removed 'source' parameter
     outputs="text",
     title="Marathi Speech Recognition",
+    description="Record your voice in Marathi and get a transcription. Click the microphone icon to start recording, then submit to transcribe."
 )
+demo.launch()