import os

import gradio as gr
from fastrtc import Stream, ReplyOnPause, AdditionalOutputs

# Import your custom models
from tts import tortoise_tts, TortoiseOptions
from stt import whisper_stt
import cohereAPI

# Import HumAware-VAD
from humaware_vad import HumAwareVADModel

# Environment variables
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
system_message = "You respond concisely, in about 15 words or less"

# Initialize conversation history
conversation_history = []

# Initialize the HumAware-VAD model
vad_model = HumAwareVADModel()


# Create a handler function that uses both your custom models
def response(audio):
    global conversation_history

    # Convert speech to text using your Whisper model
    user_message = whisper_stt.stt(audio)

    # Yield the transcription
    yield AdditionalOutputs(user_message)

    # Send the text to the Cohere API
    response_text, updated_history = cohereAPI.send_message(
        system_message, user_message, conversation_history, COHERE_API_KEY
    )

    # Update conversation history
    conversation_history = updated_history

    # Print the response for logging
    print(f"Assistant: {response_text}")

    # Use your TTS model to generate audio
    tts_options = TortoiseOptions(voice_preset="random")

    # Stream the audio response in chunks
    for chunk in tortoise_tts.stream_tts_sync(response_text, tts_options):
        yield chunk


# Create the FastRTC stream with HumAware-VAD for better pause detection
stream = Stream(
    handler=ReplyOnPause(response, model=vad_model),  # Use the HumAware-VAD model
    modality="audio",
    mode="send-receive",
    additional_outputs=[gr.Textbox(label="Transcription")],
    additional_outputs_handler=lambda old, new: new if old is None else f"{old}\nUser: {new}",
)

# Launch the Gradio UI
if __name__ == "__main__":
    # Update your requirements.txt to include humaware-vad
    stream.ui.launch(
        server_name="0.0.0.0",
        share=False,
        show_error=True,
    )
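
# ----------------------------------------------------------------------
# For reference, a minimal sketch of the cohereAPI helper imported above.
# The send_message name and signature are taken from the call in the main
# script; the body is an assumption based on Cohere's v1 chat API
# (cohere.Client.chat) and may need adapting to your SDK version and the
# history format your app actually uses.

# cohereAPI.py
import cohere


def send_message(system_message, user_message, conversation_history, api_key):
    """Send one user turn to Cohere and return (reply_text, updated_history)."""
    co = cohere.Client(api_key)
    response = co.chat(
        message=user_message,
        preamble=system_message,            # system prompt
        chat_history=conversation_history,  # prior turns, if any
    )
    # Append this exchange so the next call carries the full context
    updated_history = conversation_history + [
        {"role": "USER", "message": user_message},
        {"role": "CHATBOT", "message": response.text},
    ]
    return response.text, updated_history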