usamadv committed on
Commit
2214cbc
·
verified ·
1 Parent(s): 8dee154

Update mai.py

Browse files
Files changed (1) hide show
  1. mai.py +44 -0
mai.py CHANGED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ import torch
3
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
4
+ from datasets import load_dataset
5
+ from IPython.display import Audio
6
# Select GPU when available; every model loaded below is placed on this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
from transformers import pipeline  # NOTE(review): duplicate of the earlier import — harmless but redundant

# Whisper-base automatic-speech-recognition pipeline; translate() below calls it
# with task="translate" to produce English text from the input audio.
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-base", device=device
)

# Stream the Spanish VoxPopuli validation split (streaming=True avoids a full
# download) and pull a single example to run the demo on.
dataset = load_dataset("facebook/voxpopuli", "es", split="validation", streaming=True)
sample = next(iter(dataset))
15
def translate(audio):
    """Speech-to-English-text via the module-level Whisper ``pipe``.

    Parameters
    ----------
    audio : any input accepted by the ASR pipeline (e.g. a dict with
        ``array``/``sampling_rate`` keys, a file path, or raw samples).

    Returns
    -------
    str
        The translated transcript.
    """
    generation_options = {"task": "translate", "max_new_tokens": 255}
    transcription = pipe(audio, generate_kwargs=generation_options)
    return transcription["text"]
18
# Load SpeechT5 Processor and Model
# Three components: text processor (tokeniser), acoustic model, and the
# HiFi-GAN vocoder that turns spectrograms into waveforms.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Load Speaker Embeddings
# A single CMU ARCTIC x-vector, batched with unsqueeze(0); it conditions the
# voice of SpeechT5's output. Index 0 is an arbitrary speaker choice.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0).to(device)
26
# Define Speech Synthesis Function
def synthesise(text):
    """Render *text* as speech with the module-level SpeechT5 components.

    Tokenises the text, generates a waveform conditioned on the module-level
    ``speaker_embeddings``, and decodes it through the HiFi-GAN vocoder.

    Returns
    -------
    torch.Tensor
        The synthesised waveform, moved to the CPU.
    """
    token_ids = processor(text=text, return_tensors="pt")["input_ids"]
    waveform = model.generate_speech(
        token_ids.to(device),
        speaker_embeddings,
        vocoder=vocoder,
    )
    return waveform.cpu()
35
import numpy as np

# Constants for converting float audio in [-1.0, 1.0] to 16-bit PCM:
# multiplying by max_range (32767) maps samples onto the int16 range,
# which is what gradio/soundfile-style consumers expect.
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max
39
def speech_to_speech_translation(audio):
    """Translate spoken audio into English speech.

    Pipeline: Whisper translation (``translate``) -> SpeechT5 synthesis
    (``synthesise``) -> 16-bit PCM conversion.

    Parameters
    ----------
    audio : any input accepted by the ASR pipeline (e.g. a dict with
        ``array``/``sampling_rate`` keys, a file path, or raw samples).

    Returns
    -------
    tuple[int, numpy.ndarray]
        Output sampling rate (16000 Hz, SpeechT5's native rate) and the
        synthesised waveform as int16 samples.
    """
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    # Scale [-1, 1] float audio into the int16 range. Use the module-level
    # target_dtype/max_range constants instead of re-hard-coding np.int16,
    # so the dtype is defined in exactly one place.
    synthesised_speech = (synthesised_speech.numpy() * max_range).astype(target_dtype)
    return 16000, synthesised_speech
44
# Run the full speech-to-speech pipeline once on the streamed sample's audio.
sampling_rate, synthesised_speech = speech_to_speech_translation(sample["audio"])