"""Real-time Speech Interface

This module provides a real-time speech interface using Google's Gemini model.
It handles bidirectional audio streaming with automatic speech recognition and
synthesis.

Important: Use headphones to prevent audio feedback and echo issues.
"""

import argparse
import asyncio
import json
import logging
import os
import traceback
from typing import Optional

from helpers.loop import AudioLoop, TextLoop
from helpers.session import Session
from models import AudioConfig, ModelConfig
from tools import TOOLS

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


def main(
    modality: str = "text",
    system_prompt: Optional[str] = None,
    instruction_audio: Optional[str] = None,
) -> None:
    """Build the model configuration and run the matching interaction loop.

    Args:
        modality: ``"audio"`` runs an ``AudioLoop``; ``"text"`` runs a
            ``TextLoop``. The upper-cased value is also passed to the model
            as ``response_modalities``.
        system_prompt: Passed to the model configuration as its
            ``system_instruction``.
        instruction_audio: Forwarded to ``AudioLoop`` only; ignored when
            ``modality`` is ``"text"``.

    Errors are not propagated to the caller. A ``KeyboardInterrupt`` is logged
    at INFO level, and any other exception -- including the ``ValueError``
    raised for an unsupported ``modality`` -- is logged with its traceback.
    """
    try:
        model_config = ModelConfig(
            # The key is read from the environment; it is None when unset.
            api_key=os.environ.get("GOOGLE_API_KEY"),
            name="models/gemini-2.0-flash-exp",
            system_instruction=system_prompt,
            tools=TOOLS,
            generation_config={
                "response_modalities": modality.upper(),
            },
        )

        if modality == "audio":
            loop_instance = AudioLoop(
                audio_config=AudioConfig(),
                model_config=model_config,
                instruction_audio=instruction_audio,
            )
        elif modality == "text":
            loop_instance = TextLoop(model_config=model_config)
        else:
            raise ValueError("Invalid modality")

        asyncio.run(loop_instance.run(), debug=True)
    except KeyboardInterrupt:
        logger.info("Application terminated by user")
    except Exception as exc:
        # logger.exception logs at ERROR level and appends the traceback.
        # The traceback used to be logged at DEBUG, which the INFO-level
        # configuration above filters out, so it was never shown.
        logger.exception("Application error: %s", exc)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Real-time Speech Interface")
    parser.add_argument(
        "-m",
        "--modality",
        choices=["text", "audio"],
        help="Response modality",
        required=True,
    )
    parser.add_argument(
        "--instruction-audio",
        type=str,
        help="Path to audio instructions (.wav file)",
        required=False,
    )
    parser.add_argument(
        "-q",
        "--questions",
        type=str,
        help="Path to JSON file containing questions",
        required=True,
    )
    args = parser.parse_args()

    # Read the questions as UTF-8 explicitly so the result does not depend on
    # the platform's locale encoding.
    with open(args.questions, "r", encoding="utf-8") as f:
        questions_dict = json.load(f)

    # Render the system prompt from the loaded questions and echo it to stdout
    # before starting the loop.
    session = Session(questions=questions_dict)
    system_prompt = session.zero_shot_prompt("src/prompts/default_prompt.jinja2")
    print(system_prompt)

    main(
        modality=args.modality,
        system_prompt=system_prompt,
        instruction_audio=args.instruction_audio,
    )