import spaces
import torch
import torchaudio
import gradio as gr
import soundfile as sf
from transformers import WhisperForConditionalGeneration, WhisperProcessor, AutoModelForCausalLM, AutoTokenizer
# Note: OpenVoiceV2 is not part of the transformers library; these imports assume a
# custom wrapper that exposes a processor/model pair with a from_pretrained interface.
from transformers import OpenVoiceV2Processor, OpenVoiceV2

# Load ASR model and processor (Whisper is an encoder-decoder model)
processor_asr = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
model_asr = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")

# Load text-generation model and tokenizer (Llama 3 is a decoder-only causal LM)
text_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

# Load TTS model
tts_processor = OpenVoiceV2Processor.from_pretrained("myshell-ai/OpenVoiceV2")
tts_model = OpenVoiceV2.from_pretrained("myshell-ai/OpenVoiceV2")

# ASR function
@spaces.GPU()
def transcribe(audio):
    waveform, sample_rate = torchaudio.load(audio)
    # Whisper expects 16 kHz mono input
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
        sample_rate = 16000
    waveform = waveform.mean(dim=0)  # downmix to mono
    inputs = processor_asr(waveform.numpy(), sampling_rate=sample_rate, return_tensors="pt")
    with torch.no_grad():
        # Whisper transcribes via generate(), not CTC-style argmax decoding
        predicted_ids = model_asr.generate(inputs.input_features)
    transcription = processor_asr.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]
    
# Text-to-text function
@spaces.GPU()
def generate_response(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    outputs = text_model.generate(**inputs, max_new_tokens=256)
    # A causal LM returns the prompt followed by the completion, so strip the prompt tokens
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
    return response
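
# Note (assumption, not in the original app): the base Meta-Llama-3-8B checkpoint is a
# plain completion model. For conversational replies, the Instruct variant with its chat
# template is the usual choice; a minimal sketch of that alternative:
#
#   chat_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
#   chat_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
#   prompt_ids = chat_tokenizer.apply_chat_template(
#       [{"role": "user", "content": "Hello!"}], add_generation_prompt=True, return_tensors="pt"
#   )
#   reply_ids = chat_model.generate(prompt_ids, max_new_tokens=256)
#   reply = chat_tokenizer.decode(reply_ids[0][prompt_ids.shape[-1]:], skip_special_tokens=True)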
    
# TTS function
@spaces.GPU()
def synthesize_speech(text):
    inputs = tts_processor(text, return_tensors="pt")
    with torch.no_grad():
        # The exact call depends on the OpenVoiceV2 wrapper in use; here the model is
        # assumed to return mel spectrograms that are then vocoded into a waveform
        mel_outputs, mel_outputs_postnet, _, alignments = tts_model.inference(inputs.input_ids)
        audio = tts_model.infer(mel_outputs_postnet)
    return audio
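
# Illustrative helper (an assumption, not part of the original pipeline): the capture
# steps below are only simulated with time.sleep(), so a real deployment needs actual
# microphone input. One option is the `sounddevice` package (pip install sounddevice):
def record_chunk(path, seconds=2, sample_rate=16000):
    import sounddevice as sd
    # Record `seconds` of mono audio from the default input device
    recording = sd.rec(int(seconds * sample_rate), samplerate=sample_rate, channels=1)
    sd.wait()  # block until the recording has finished
    sf.write(path, recording, sample_rate)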
    
# Real-time processing function
@spaces.GPU()
def real_time_pipeline():
    # Adjust this part to handle live recording using soundfile and play back using simpleaudio
    import simpleaudio as sa
    import tempfile
    import time

    wake_word = "hello mate"
    wake_word_detected = False

    print("Listening for wake word...")

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav_file:
        tmp_wav_path = tmp_wav_file.name

    try:
        while True:
            # Capture ~2 seconds of microphone audio and save it to tmp_wav_path for ASR
            # (e.g. with the record_chunk() sketch above); time.sleep() only stands in
            # for the capture interval in this simplified example
            time.sleep(2)

            # Step 1: Transcribe audio to text
            transcription = transcribe(tmp_wav_path).lower()

            if wake_word in transcription:
                wake_word_detected = True
                print("Wake word detected. Processing audio...")

                while wake_word_detected:
                    # Capture the next ~2 seconds of microphone audio and save it to
                    # tmp_wav_path for ASR, as above; time.sleep() again simulates capture
                    time.sleep(2)

                    # Step 1: Transcribe audio to text
                    transcription = transcribe(tmp_wav_path)

                    # Step 2: Generate response using text-to-text model
                    response = generate_response(transcription)

                    # Step 3: Synthesize speech from text
                    synthesized_audio = synthesize_speech(response)

                    # Save the synthesized audio to a file; torchaudio.save expects a
                    # (channels, samples) tensor, and 22050 Hz is assumed to be the TTS output rate
                    output_path = "output.wav"
                    torchaudio.save(output_path, synthesized_audio.squeeze(1), 22050)

                    # Play the synthesized audio using simpleaudio
                    wave_obj = sa.WaveObject.from_wave_file(output_path)
                    play_obj = wave_obj.play()
                    play_obj.wait_done()
    except KeyboardInterrupt:
        print("Stopping...")

# Gradio interface
gr_interface = gr.Interface(
    fn=real_time_pipeline, 
    inputs=None, 
    outputs=None,
    live=True,
    title="Real-Time Audio-to-Audio Model",
    description="ASR + Text-to-Text Model + TTS with Human-like Voice and Emotions"
)


gr_interface.launch(inline=False)
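
# The packages imported above would need to be listed in the Space's requirements.txt,
# e.g. (versions omitted): torch, torchaudio, transformers, gradio, soundfile,
# simpleaudio, spaces, and sounddevice if the record_chunk() sketch is used.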