root committed on
Commit
da2eec2
·
1 Parent(s): 5570489

Add application file

Browse files
Files changed (1) hide show
  1. app.py +46 -4
app.py CHANGED
@@ -1,7 +1,49 @@
1
  import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
5
 
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
 
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ import torchaudio
7
+ from transformers import AutoConfig, Wav2Vec2FeatureExtractor, Wav2Vec2ForSpeechClassification
8
 
9
+ import librosa
10
+ import IPython.display as ipd
11
+ import numpy as np
12
+ import pandas as pd
13
+
14
+
15
+
16
# Hub checkpoint that provides the config, the feature extractor and the weights.
model_name_or_path = "m3hrdadfi/wav2vec2-base-100k-voxpopuli-gtzan-music"

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The feature extractor also dictates the sampling rate the audio is prepared at.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
sampling_rate = feature_extractor.sampling_rate

# The config supplies id2label, used to name each class in the predictions.
config = AutoConfig.from_pretrained(model_name_or_path)

model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path)
model = model.to(device)
22
+
23
+
24
def speech_file_to_array_fn(path, sampling_rate):
    """Load an audio file as a mono 1-D NumPy array resampled to ``sampling_rate``.

    Args:
        path: Path of the audio file, handed to ``torchaudio.load``.
        sampling_rate: Target sampling rate in Hz for the returned samples.

    Returns:
        A 1-D NumPy array holding the resampled waveform.
    """
    speech_array, source_rate = torchaudio.load(path)

    # Down-mix multi-channel audio. A bare squeeze() would leave stereo as a
    # (2, n) array, which is no longer a single waveform.
    if speech_array.dim() > 1 and speech_array.size(0) > 1:
        speech_array = speech_array.mean(dim=0, keepdim=True)

    # Pass the target rate explicitly; previously the ``sampling_rate``
    # argument was ignored and Resample's default target rate was used.
    resampler = torchaudio.transforms.Resample(orig_freq=source_rate, new_freq=sampling_rate)

    # squeeze(0) drops only the channel axis, so a one-sample clip stays 1-D.
    return resampler(speech_array).squeeze(0).numpy()
29
+
30
+
31
def predict(path, sampling_rate=None):
    """Classify the audio file at ``path`` and return a score for every label.

    Args:
        path: Path of the audio file to classify.
        sampling_rate: Rate in Hz the audio is resampled to and that is reported
            to the feature extractor. Defaults to
            ``feature_extractor.sampling_rate`` so the function can be called
            with the file path alone.

    Returns:
        A list with one ``{"Label": ..., "Score": ...}`` dict per class index,
        in index order. ``Label`` comes from ``config.id2label`` and ``Score``
        is the softmax probability formatted as a percentage with one decimal.
    """
    if sampling_rate is None:
        sampling_rate = feature_extractor.sampling_rate

    speech = speech_file_to_array_fn(path, sampling_rate)
    features = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    features = {name: tensor.to(device) for name, tensor in features.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**features).logits

    # Keep the first row of the batch, as before.
    scores = F.softmax(logits, dim=1).cpu().numpy()[0]

    # Format once; rounding to 3 places and then to 1 could round twice.
    return [
        {"Label": config.id2label[index], "Score": f"{float(score) * 100:.1f}%"}
        for index, score in enumerate(scores)
    ]
42
+
43
+
44
import os  # local to the app wiring: only used to probe for the sample clip

# Sample clip offered as a clickable example when it sits next to the app.
path = "La Campanella.mp3"


def classify(audio_path):
    """Gradio callback: classify the submitted audio file.

    Args:
        audio_path: Path of the submitted audio file, or ``None`` when the
            form is submitted without a file.

    Returns:
        The list of label/score dicts produced by ``predict``; an empty list
        when no file was submitted.
    """
    if audio_path is None:
        return []
    return predict(audio_path, sampling_rate)


# ``inputs``/``outputs`` must describe UI components. The previous code passed
# a file name and an already-computed prediction, and ran the model at import
# time, which fails outright when the sample file is absent.
iface = gr.Interface(
    fn=classify,
    inputs=gr.Audio(type="filepath"),
    outputs="json",
    examples=[[path]] if os.path.exists(path) else None,
)
iface.launch()