File size: 2,821 Bytes
1e82508
 
839f7b2
1e82508
839f7b2
 
 
 
1e82508
839f7b2
 
 
1e82508
839f7b2
 
1e82508
839f7b2
 
1e82508
839f7b2
 
1e82508
839f7b2
 
1e82508
839f7b2
 
 
 
 
 
 
1e82508
839f7b2
 
1e82508
839f7b2
 
 
 
 
1e82508
839f7b2
 
1e82508
839f7b2
 
1e82508
839f7b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e82508
 
839f7b2
 
1e82508
839f7b2
1e82508
839f7b2
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import os
import gradio as gr
from fastrtc import Stream, ReplyOnPause, AdditionalOutputs

# Import your modules
import stt
import tts
import cohereAPI

# Environment variables
# NOTE: os.getenv returns None when COHERE_API_KEY is unset; the value is
# passed through to cohereAPI.send_message as-is, so failures surface there.
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
# System prompt forwarded on every request to keep replies short.
system_message = "You respond concisely, in about 15 words or less"

# Initialize conversation history
# Module-level mutable state: rebound inside response() after each exchange.
conversation_history = []

async def response(audio_file_path):
    """Handle one voice turn: transcribe, query the LLM, synthesize a reply.

    Async generator used by fastrtc's ReplyOnPause. Yields, in order:
    an AdditionalOutputs with the user's transcript, an AdditionalOutputs
    with the assistant's text, and finally a (sample_rate, audio) tuple.
    Side effect: rebinds the module-level conversation_history.
    """
    global conversation_history

    # Speech -> text for the captured utterance.
    transcript = await stt.transcribe_audio(audio_file_path)

    # Surface the user's words to the UI immediately.
    yield AdditionalOutputs({"transcript": transcript, "role": "user"})

    # Ask Cohere for a reply, threading through the running history.
    reply_text, new_history = await cohereAPI.send_message(
        system_message,
        transcript,
        conversation_history,
        COHERE_API_KEY,
    )
    conversation_history = new_history

    # Text -> speech for the assistant's reply.
    _, (sample_rate, audio_samples) = await tts.generate_speech(
        reply_text,
        voice_preset="random"
    )

    # Surface the assistant's words to the UI.
    yield AdditionalOutputs({"transcript": reply_text, "role": "assistant"})

    # Finally, stream the synthesized audio back to the caller.
    yield (sample_rate, audio_samples)

# Create FastRTC stream with ReplyOnPause
stream = Stream(
    handler=ReplyOnPause(response),  # invokes `response` when the speaker pauses
    modality="audio",
    mode="send-receive",  # bidirectional: receive mic audio, send synthesized reply
    # NOTE(review): fastrtc's Stream commonly expects gradio components for
    # additional_outputs — confirm the installed version accepts this dict form.
    additional_outputs=[
        {"name": "transcript", "type": "text"},
        {"name": "role", "type": "text"}
    ]
)

# Create Gradio interface that uses the FastRTC stream
with gr.Blocks(title="Voice Chat Assistant with ReplyOnPause") as demo:
    gr.Markdown("# Voice Chat Assistant")
    gr.Markdown("Speak and pause to trigger a response.")
    
    # Tuple-style chat history: list of (user_text, assistant_text) pairs.
    chatbot = gr.Chatbot(label="Conversation")
    
    # Mount the FastRTC UI
    # NOTE(review): stream.ui usually builds a standalone Blocks app — confirm it
    # returns an embeddable object exposing output_components as used below.
    stream_ui = stream.ui(label="Speak")
    # Handle additional outputs from FastRTC to update the chatbot
    def update_chat(transcript, role, history):
        """Fold one (transcript, role) event into tuple-style chatbot history.

        Args:
            transcript: Text from STT (role "user") or the LLM (role "assistant").
            role: "user" or "assistant"; any other value leaves history unchanged.
            history: List of (user_text, assistant_text) pairs, or None when
                gradio delivers the event before the chatbot has a value.

        Returns:
            The updated history list (mutated in place when a list was given).
        """
        # Fix: gradio can pass None on the first event; the original crashed
        # with AttributeError on history.append in that case.
        if history is None:
            history = []
        if transcript and role:
            if role == "user":
                # Open a new exchange awaiting the assistant's reply.
                history.append((transcript, None))
            elif role == "assistant":
                if history and history[-1][1] is None:
                    # Complete the pending exchange in place.
                    history[-1] = (history[-1][0], transcript)
                else:
                    # No pending user turn — show the reply on its own row.
                    history.append((None, transcript))
        return history
    
    # NOTE(review): assumes stream_ui exposes exactly two output components
    # matching the "transcript" and "role" additional outputs declared on the
    # Stream — verify against the installed fastrtc version.
    stream_ui.change(
        update_chat,
        inputs=[stream_ui.output_components[0], stream_ui.output_components[1], chatbot],
        outputs=[chatbot]
    )
    
    clear_btn = gr.Button("Clear Conversation")
    # Resets only the visible transcript; the module-level conversation_history
    # kept for the LLM is NOT cleared here.
    clear_btn.click(lambda: [], outputs=[chatbot])

# Script entry point: start the web app.
if __name__ == "__main__":
    # Enable the request queue (required for generator/streaming handlers),
    # then serve on all interfaces without a public share link.
    app = demo.queue()
    app.launch(server_name="0.0.0.0", share=False, show_error=True)