RealTime / app.py
import spaces
import torch
import torchaudio
import gradio as gr
import soundfile as sf
import numpy as np
from transformers import WhisperForConditionalGeneration, WhisperProcessor, AutoModelForCausalLM, AutoTokenizer
from transformers import OpenVoiceV2Processor, OpenVoiceV2
# Load ASR model and processor
processor_asr = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
model_asr = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
# Load text-to-text model and tokenizer
text_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
# Load TTS model
tts_processor = OpenVoiceV2Processor.from_pretrained("myshell-ai/OpenVoiceV2")
tts_model = OpenVoiceV2.from_pretrained("myshell-ai/OpenVoiceV2")
# ASR function
@spaces.GPU()
def transcribe(audio):
    waveform, sample_rate = torchaudio.load(audio)
    # Whisper expects 16 kHz mono audio
    waveform = waveform.mean(dim=0)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    inputs = processor_asr(waveform.numpy(), sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        predicted_ids = model_asr.generate(inputs.input_features)
    transcription = processor_asr.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]
# Text-to-text function
@spaces.GPU()
def generate_response(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    outputs = text_model.generate(**inputs, max_new_tokens=256)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response
# TTS function
@spaces.GPU()
def synthesize_speech(text):
    inputs = tts_processor(text, return_tensors="pt")
    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, _, alignments = tts_model.inference(inputs.input_ids)
        audio = tts_model.infer(mel_outputs_postnet)
    return audio
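
# The real-time loop below needs actual microphone capture. This helper is a minimal
# sketch of one way to provide it, using the sounddevice package (an assumption, not
# part of the original script); any other capture mechanism can be substituted.
def record_audio(path, duration=2, sample_rate=16000):
    import sounddevice as sd
    # Record `duration` seconds of mono audio from the default input device
    recording = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
    sd.wait()  # block until the recording is finished
    sf.write(path, recording, sample_rate)
    return path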
# Real-time processing function
@spaces.GPU()
def real_time_pipeline():
    # Live audio is captured into a temp WAV file; playback uses simpleaudio
    import simpleaudio as sa
    import tempfile

    wake_word = "hello mate"
    wake_word_detected = False

    print("Listening for wake word...")

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_wav_file:
        tmp_wav_path = tmp_wav_file.name

    try:
        while True:
            # Capture ~2 seconds of audio into the temp file
            # (see the record_audio sketch above for one way to do this)
            record_audio(tmp_wav_path, duration=2)

            # Step 1: Transcribe audio to text and check for the wake word
            transcription = transcribe(tmp_wav_path).lower()

            if wake_word in transcription:
                wake_word_detected = True
                print("Wake word detected. Processing audio...")

                while wake_word_detected:
                    # Capture the next utterance into the temp file
                    record_audio(tmp_wav_path, duration=2)

                    # Step 1: Transcribe audio to text
                    transcription = transcribe(tmp_wav_path)

                    # Step 2: Generate response using text-to-text model
                    response = generate_response(transcription)

                    # Step 3: Synthesize speech from text
                    synthesized_audio = synthesize_speech(response)

                    # Save the synthesized audio to a WAV file
                    output_path = "output.wav"
                    torchaudio.save(output_path, synthesized_audio.squeeze(1), 22050)

                    # Play the synthesized audio using simpleaudio
                    wave_obj = sa.WaveObject.from_wave_file(output_path)
                    play_obj = wave_obj.play()
                    play_obj.wait_done()
    except KeyboardInterrupt:
        print("Stopping...")
# Gradio interface
gr_interface = gr.Interface(
    fn=real_time_pipeline,
    inputs=None,
    outputs=None,
    live=True,
    title="Real-Time Audio-to-Audio Model",
    description="ASR + Text-to-Text Model + TTS with Human-like Voice and Emotions"
)

gr_interface.launch(inline=False)