RealTime / app.py
import spaces
import torch
import torchaudio
import gradio as gr
import soundfile as sf
import numpy as np
from transformers import WhisperForConditionalGeneration, WhisperProcessor, AutoModelForCausalLM, AutoTokenizer
from transformers import OpenVoiceV2Processor, OpenVoiceV2
# Load ASR model and processor
processor_asr = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
model_asr = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
# Load text-to-text model and tokenizer
text_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
# Load TTS model
tts_processor = OpenVoiceV2Processor.from_pretrained("myshell-ai/OpenVoiceV2")
tts_model = OpenVoiceV2.from_pretrained("myshell-ai/OpenVoiceV2")
# ASR function
@spaces.GPU()
def transcribe(audio):
    waveform, sample_rate = torchaudio.load(audio)
    # Whisper expects 16 kHz mono audio
    waveform = waveform.mean(dim=0)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    inputs = processor_asr(waveform.numpy(), sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        predicted_ids = model_asr.generate(inputs.input_features)
    transcription = processor_asr.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]
# Text-to-text function
@spaces.GPU()
def generate_response(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    outputs = text_model.generate(**inputs, max_new_tokens=256)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response
# TTS function
@spaces.GPU()
def synthesize_speech(text):
    inputs = tts_processor(text, return_tensors="pt")
    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, _, alignments = tts_model.inference(inputs.input_ids)
        audio = tts_model.infer(mel_outputs_postnet)
    return audio
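
# The real-time loop below needs actual microphone capture. This helper is a minimal
# sketch of one way to provide it, using the sounddevice package (an assumption, not
# part of the original script); any other capture mechanism can be substituted.
def record_audio(path, duration=2, sample_rate=16000):
    import sounddevice as sd
    # Record `duration` seconds of mono audio from the default input device
    recording = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
    sd.wait()  # block until the recording is finished
    sf.write(path, recording, sample_rate)
    return path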
# Real-time processing function
@spaces.GPU()
def real_time_pipeline():
    # Live audio is captured into a temp WAV file; playback uses simpleaudio
    import simpleaudio as sa
    import tempfile

    wake_word = "hello mate"
    wake_word_detected = False

    print("Listening for wake word...")

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_wav_file:
        tmp_wav_path = tmp_wav_file.name

    try:
        while True:
            # Capture ~2 seconds of audio into the temp file
            # (see the record_audio sketch above for one way to do this)
            record_audio(tmp_wav_path, duration=2)

            # Step 1: Transcribe audio to text and check for the wake word
            transcription = transcribe(tmp_wav_path).lower()

            if wake_word in transcription:
                wake_word_detected = True
                print("Wake word detected. Processing audio...")

                while wake_word_detected:
                    # Capture the next utterance into the temp file
                    record_audio(tmp_wav_path, duration=2)

                    # Step 1: Transcribe audio to text
                    transcription = transcribe(tmp_wav_path)

                    # Step 2: Generate response using text-to-text model
                    response = generate_response(transcription)

                    # Step 3: Synthesize speech from text
                    synthesized_audio = synthesize_speech(response)

                    # Save the synthesized audio to a WAV file
                    output_path = "output.wav"
                    torchaudio.save(output_path, synthesized_audio.squeeze(1), 22050)

                    # Play the synthesized audio using simpleaudio
                    wave_obj = sa.WaveObject.from_wave_file(output_path)
                    play_obj = wave_obj.play()
                    play_obj.wait_done()
    except KeyboardInterrupt:
        print("Stopping...")
# Gradio interface
gr_interface = gr.Interface(
    fn=real_time_pipeline,
    inputs=None,
    outputs=None,
    live=True,
    title="Real-Time Audio-to-Audio Model",
    description="ASR + Text-to-Text Model + TTS with Human-like Voice and Emotions"
)

gr_interface.launch(inline=False)