File size: 5,100 Bytes
819225f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
import gradio as gr
import tempfile
import torchaudio
from dotenv import load_dotenv
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice
from openai import OpenAI

# === Debug and environment setup ===
# The working directory is printed to help diagnose a .env file that is not
# being picked up.
print(f"Current working directory: {os.getcwd()}")
# NOTE(review): python-dotenv's default lookup may start from this script's
# directory rather than the working directory printed above — confirm against
# the installed python-dotenv version.
load_dotenv()

# === Initialize OpenAI client with better error handling ===
# Strip surrounding whitespace so a blank or newline-padded value is treated
# as missing instead of being sent to the API as a bogus key.
api_key = (os.getenv("OPENAI_API_KEY") or "").strip()
if not api_key:
    raise ValueError("❌ OPENAI_API_KEY not found in your .env file.")
# Report only that a key was loaded and how long it is; no part of the secret
# itself is ever written to the logs.
print("API key found: Yes")
print(f"API key loaded: {len(api_key)} characters (value hidden)")

# Initialize with timeout to prevent hanging requests
client = OpenAI(
    api_key=api_key,
    timeout=60.0,  # 60 second timeout
)

# === Initialize Tortoise TTS ===
# Both the TTS engine and the voice data are created once at import time and
# reused by synthesize() for every request.
print("Initializing Tortoise TTS...")
tts = TextToSpeech()
# load_voice returns the pair that synthesize() forwards to tts_with_preset.
voice_samples, conditioning_latents = load_voice("train_dotrice")
print("TTS initialized successfully!")

# === Ask GPT-4o with improved error handling ===
def ask_gpt(prompt: str) -> str:
    """Send *prompt* to GPT-4o as a single user message and return the reply.

    Every ``Exception`` raised while talking to the API is caught, logged and
    converted into a string that starts with ``"[GPT-4 ERROR]"``; run_agent
    relies on that prefix to detect a failed request.

    Args:
        prompt: The user's message. Only its first 30 characters are logged.

    Returns:
        The reply text, or a ``"[GPT-4 ERROR] ..."`` description when the
        request failed or the model returned no text.
    """
    try:
        print(f"Sending request to GPT-4o: {prompt[:30]}...")
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=300,
        )
        content = response.choices[0].message.content
        # The message content can be absent; returning None here would break
        # the declared str return type and the caller's .startswith() check.
        if content is None or not content.strip():
            print("[GPT-4 ERROR] EmptyResponse: model returned no text content")
            return "[GPT-4 ERROR] Empty response: The model returned no text content."
        return content
    except Exception as e:
        error_type = type(e).__name__
        error_message = str(e)
        print(f"[GPT-4 ERROR] {error_type}: {error_message}")

        # Provide more helpful error messages, chosen by keywords found in the
        # exception text (lower-cased once for all three checks).
        lowered = error_message.lower()
        if "api_key" in lowered:
            return "[GPT-4 ERROR] API key issue: Check that your API key is valid and properly formatted in the .env file."
        if "rate limit" in lowered:
            return "[GPT-4 ERROR] Rate limit exceeded: Please wait a moment before trying again."
        if "connect" in lowered:
            return "[GPT-4 ERROR] Connection error: Check your internet connection and ensure OpenAI's API is accessible."
        return f"[GPT-4 ERROR] {error_type}: {error_message}"

# === Generate TTS Audio ===
def synthesize(text: str) -> "str | None":
    """Render *text* to speech with Tortoise and save it as a temporary WAV file.

    The file is created with ``delete=False`` so it outlives this call; the
    caller receives its path and is responsible for it afterwards.

    Args:
        text: The text to speak. Blank or whitespace-only text is rejected
            without invoking the TTS engine.

    Returns:
        The path of the generated ``.wav`` file, or ``None`` when there was
        nothing to synthesize or any step failed (the error is logged).
    """
    if not text or not text.strip():
        print("[TTS ERROR] ValueError: no text to synthesize")
        return None

    out_path = None
    try:
        print(f"Synthesizing speech for: {text[:30]}...")
        audio = tts.tts_with_preset(
            text=text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset="fast",
        )
        # Only reserve a unique file name here; the handle is closed before
        # torchaudio writes to the path so the file is never open twice.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            out_path = tmp.name
        # 24000 is the sample rate used by the original call — presumably
        # Tortoise's output rate; confirm before changing.
        torchaudio.save(out_path, audio.squeeze(0).cpu(), 24000)
        print(f"Audio saved to temporary file: {out_path}")
        return out_path
    except Exception as e:
        print(f"[TTS ERROR] {type(e).__name__}: {str(e)}")
        # The temp file was created with delete=False, so remove it ourselves
        # rather than leaving an orphaned (possibly partial) file behind.
        if out_path is not None:
            try:
                os.remove(out_path)
            except OSError:
                # Best-effort cleanup: the synthesis failure is already logged.
                pass
        return None

# === Unified Agent Logic with Enhanced Error Handling ===
def run_agent(audio_input, text_input):
    """Handle one button click: validate the inputs, query GPT, then speak the reply.

    Args:
        audio_input: Value of the microphone component; any truthy value
            short-circuits with a "not implemented" notice.
        text_input: Value of the text box; surrounding whitespace is ignored.

    Returns:
        A ``(message, audio_path)`` tuple. ``audio_path`` is ``None`` unless
        both the GPT request and the speech synthesis succeeded.
    """
    try:
        # Audio takes precedence over text, but is not supported yet.
        if audio_input:
            return "🧠 Voice transcription not implemented yet.", None

        message = text_input.strip() if text_input else ""
        if not message:
            return "⚠️ Please enter a message or audio input.", None
        if len(message) < 2:
            return "⚠️ Please enter more meaningful text.", None

        print("Processing text input...")
        reply = ask_gpt(message)
        # ask_gpt reports failures in-band via this prefix; skip TTS for them.
        if reply.startswith("[GPT-4 ERROR]"):
            return reply, None

        wav_path = synthesize(reply)
        if wav_path is None:
            # Still show the text answer when only the audio step failed.
            return f"{reply}\n\n[TTS ERROR] Failed to generate audio.", None

        return reply, wav_path
    except Exception as exc:
        kind = type(exc).__name__
        detail = str(exc)
        print(f"[AGENT ERROR] {kind}: {detail}")
        return f"⚠️ An unexpected error occurred: {kind}: {detail}", None

# === Gradio UI ===
# Components are declared inside the Blocks context in the order they should
# appear: a header, one row holding both inputs, the trigger button, then the
# two outputs that run_agent fills in.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 GPT-4o Voice Agent + Tortoise TTS")
    gr.Markdown("*Ensure your `.env` file with OPENAI_API_KEY is in the correct directory*")

    with gr.Row():
        # The microphone value is passed to run_agent as a file path; run_agent
        # currently answers any audio with a "not implemented" notice.
        mic_input = gr.Audio(
            label="🎙️ Mic Input (WAV format, not yet active)",
            type="filepath",
            format="wav"
        )
        text_input = gr.Textbox(
            lines=2,
            placeholder="Ask anything here...",
            label="💬 Text Input"
        )

    run_btn = gr.Button("🧠 Ask GPT-4o")
    gpt_output = gr.Textbox(label="🧠 GPT-4o Response")
    audio_output = gr.Audio(label="🔊 Spoken Response", autoplay=True)

    # The input/output order here must match run_agent's parameter order
    # (audio, text) and its (message, audio_path) return tuple.
    run_btn.click(
        fn=run_agent,
        inputs=[mic_input, text_input],
        outputs=[gpt_output, audio_output]
    )

# Launch for local + mobile access
# NOTE(review): share=True together with server_name="0.0.0.0" appears to make
# the app reachable beyond this machine, and every request spends the
# configured OpenAI key — confirm this exposure is intended.
# NOTE(review): launch() runs at import time (no __main__ guard), so importing
# this module starts the server.
print("Launching Gradio interface...")
demo.launch(share=True, server_name="0.0.0.0", server_port=7860, debug=True)