Athspi committed on
Commit
740846d
·
verified ·
1 Parent(s): 7891270

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +129 -38
app.py CHANGED
@@ -1,59 +1,150 @@
1
- import os
2
  import asyncio
 
 
 
 
 
 
3
  from google import genai
4
  from google.genai import types
5
- import gradio as gr
6
 
7
API_KEY = os.getenv("GEMINI_API_KEY")
client = genai.Client(api_key=API_KEY)


async def generate_audio(text):
    """Stream TTS audio for *text* from the Gemini live API and save it to output.wav.

    Args:
        text: The text to speak.

    Returns:
        The path "output.wav" of the written 24 kHz 16-bit mono PCM WAV file.

    Raises:
        Re-raises any API/IO exception after printing it.
    """
    try:
        config = types.LiveConnectConfig(
            response_modalities=["audio"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Puck")
                )
            ),
            system_instruction=types.Content(
                parts=[types.Part.from_text(text="Repeat user input exactly without explanation")],
                role="user"
            ),
        )

        # Collect every raw PCM chunk the live session streams back.
        audio_data = b""
        async with client.aio.live.connect(model="models/gemini-2.0-flash-exp", config=config) as session:
            await session.send(input=text, end_of_turn=True)
            async for response in session.receive():
                if data := response.data:
                    audio_data += data

        # BUG FIX: the previous hand-rolled RIFF header was malformed — zero
        # RIFF/data chunk sizes, a missing 4-byte byte-rate field, and sample-rate
        # bytes encoding 32000 Hz despite the 24 kHz comment. Let the stdlib wave
        # module write a correct header for the 24 kHz 16-bit mono PCM stream.
        import wave
        with wave.open("output.wav", "wb") as wav_file:
            wav_file.setnchannels(1)        # mono
            wav_file.setsampwidth(2)        # 16-bit samples
            wav_file.setframerate(24000)    # Gemini live audio output rate
            wav_file.writeframes(audio_data)

        return "output.wav"

    except Exception as e:
        print(f"Error: {str(e)}")
        raise
43
-
44
def tts(text):
    """Synchronous wrapper: synthesize *text* and return the WAV file path.

    Args:
        text: User input; may be None (Gradio passes None for a cleared textbox).

    Returns:
        The output WAV path, or None for empty/whitespace-only/None input.
    """
    # BUG FIX: guard against None before calling .strip() — the original raised
    # AttributeError when Gradio supplied None instead of a string.
    if not text or not text.strip():
        return None
    return asyncio.run(generate_audio(text))
48
-
49
# Assemble the Gradio UI: one textbox in, one audio file path out.
text_box = gr.Textbox(label="Enter Text", placeholder="Type here...")
audio_out = gr.Audio(label="TTS Output", type="filepath")

iface = gr.Interface(
    fn=tts,
    inputs=text_box,
    outputs=audio_out,
    examples=["Hello, this is a test.", "How are you today?"],
    title="Gemini TTS Demo",
    description="Convert text to speech using Google's Gemini 2.0 Flash model"
)

if __name__ == "__main__":
    iface.launch(share=True)
 
 
1
+ import gradio as gr
2
  import asyncio
3
+ import base64
4
+ import io
5
+ import cv2
6
+ import pyaudio
7
+ import PIL.Image
8
+ import mss
9
  from google import genai
10
  from google.genai import types
 
11
 
12
# Configuration
FORMAT = pyaudio.paInt16               # 16-bit signed PCM sample format (unused in this file — presumably for mic capture; verify)
CHANNELS = 1                           # mono audio (unused in this file)
SEND_SAMPLE_RATE = 16000               # Hz; unused here — presumably the outbound mic rate, TODO confirm
RECEIVE_SAMPLE_RATE = 24000            # Hz; rate paired with the model's returned PCM in generate_response
CHUNK_SIZE = 1024                      # frames per PyAudio buffer (unused in this file)
MODEL = "models/gemini-2.0-flash-exp"  # Gemini live-API model identifier
19
 
20
class GeminiTTS:
    """Thin wrapper around the Gemini live API: text/camera/screen input -> speech reply."""

    def __init__(self, api_key):
        """Create the Gemini live client and audio/session state.

        Args:
            api_key: Google AI Studio API key used to authenticate the client.
        """
        self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
        self.pya = pyaudio.PyAudio()
        # NOTE(review): the queues and audio_stream below are initialized but
        # never used by the methods in this class — kept for compatibility.
        self.audio_in_queue = asyncio.Queue()
        self.out_queue = asyncio.Queue(maxsize=5)
        self.session = None
        self.audio_stream = None

        self.config = types.LiveConnectConfig(
            response_modalities=["audio"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Puck")
                )
            ),
            system_instruction=types.Content(
                parts=[types.Part.from_text(text="Answer user ask, replay same thing user say no other word explain")],
                role="user"
            ),
        )

    async def _get_frame(self, cap):
        """Grab one frame from an open cv2 capture as a base64 JPEG part dict, or None."""
        ret, frame = cap.read()
        if not ret:
            return None
        # cv2 delivers BGR; convert before handing to PIL.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img = PIL.Image.fromarray(frame_rgb)
        img.thumbnail([1024, 1024])  # bound the upload payload size
        image_io = io.BytesIO()
        img.save(image_io, format="jpeg")
        image_io.seek(0)
        return {"mime_type": "image/jpeg", "data": base64.b64encode(image_io.read()).decode()}

    async def _get_screen(self):
        """Capture the combined virtual screen as a base64 JPEG part dict."""
        sct = mss.mss()
        monitor = sct.monitors[0]  # index 0 = all monitors merged
        shot = sct.grab(monitor)
        img = PIL.Image.open(io.BytesIO(mss.tools.to_png(shot.rgb, shot.size)))
        image_io = io.BytesIO()
        img.save(image_io, format="jpeg")
        image_io.seek(0)
        return {"mime_type": "image/jpeg", "data": base64.b64encode(image_io.read()).decode()}

    async def process_input(self, text=None, mode="text"):
        """Send one input to the model and collect its reply.

        Args:
            text: Text prompt (used only when mode == "text").
            mode: "text", "camera", or "screen".

        Returns:
            Raw PCM bytes for an audio reply, a str for a text reply,
            "No response received" when the turn yields nothing, or an
            "Error: ..." string on failure (exceptions are not propagated).
        """
        try:
            async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
                self.session = session

                if mode == "text" and text:
                    # The guard above ensures text is truthy, so no fallback needed.
                    await session.send(input=text, end_of_turn=True)
                elif mode == "camera":
                    cap = cv2.VideoCapture(0)
                    frame = await self._get_frame(cap)
                    cap.release()
                    if frame:
                        # NOTE(review): image sends omit end_of_turn — confirm the
                        # live API still produces a turn for them.
                        await session.send(input=frame)
                elif mode == "screen":
                    frame = await self._get_screen()
                    if frame:
                        await session.send(input=frame)

                # BUG FIX: the previous version returned on the FIRST audio chunk,
                # truncating the spoken reply. Accumulate every chunk of the turn.
                audio = bytearray()
                text_reply = None
                async for response in session.receive():
                    if data := response.data:
                        audio.extend(data)
                    elif (part := response.text) and text_reply is None:
                        text_reply = part

                if audio:
                    return bytes(audio)
                if text_reply is not None:
                    return text_reply
                return "No response received"
        except Exception as e:
            return f"Error: {str(e)}"
93
+
94
def create_gradio_interface():
    """Build and return the Gradio Blocks UI for the Gemini TTS demo.

    Returns:
        The gr.Blocks demo object (caller is responsible for .launch()).
    """
    tts_handler = None

    def init_tts(api_key):
        # Created lazily so each user supplies their own API key at runtime.
        nonlocal tts_handler
        tts_handler = GeminiTTS(api_key)
        return "Gemini TTS Initialized!"

    async def generate_response(text, mode):
        if not tts_handler:
            raise gr.Error("Please initialize the TTS system first with your API key")

        result = await tts_handler.process_input(text, mode)

        if isinstance(result, bytes):
            # BUG FIX: gr.Audio expects (sample_rate, numpy_array); the old code
            # returned raw bytes after a no-op BytesIO write/seek/read round-trip.
            # Decode the 16-bit mono PCM stream into an int16 array instead.
            import numpy as np
            pcm = np.frombuffer(result, dtype=np.int16)
            return (RECEIVE_SAMPLE_RATE, pcm)
        # Text (or "Error: ...") response is passed through unchanged.
        return result

    with gr.Blocks(title="Gemini TTS Interface") as demo:
        gr.Markdown("# 🎤 Gemini Text-to-Speech Interface")

        with gr.Row():
            api_key = gr.Textbox(label="Gemini API Key", type="password")
            init_btn = gr.Button("Initialize TTS")

        init_output = gr.Textbox(label="Initialization Status", interactive=False)
        init_btn.click(init_tts, inputs=api_key, outputs=init_output)

        with gr.Tab("Text Input"):
            with gr.Row():
                text_input = gr.Textbox(label="Enter Text", lines=3)
                text_btn = gr.Button("Generate Speech")

            text_output = gr.Audio(label="Generated Speech")
            # Hidden gr.Text components carry the fixed mode flag to the handler.
            text_btn.click(generate_response, inputs=[text_input, gr.Text("text", visible=False)], outputs=text_output)

        with gr.Tab("Camera Input"):
            camera_btn = gr.Button("Capture and Process")
            camera_output = gr.Audio(label="Generated Speech from Camera")
            camera_btn.click(generate_response, inputs=[gr.Text("", visible=False), gr.Text("camera", visible=False)], outputs=camera_output)

        with gr.Tab("Screen Capture"):
            screen_btn = gr.Button("Capture Screen and Process")
            screen_output = gr.Audio(label="Generated Speech from Screen")
            screen_btn.click(generate_response, inputs=[gr.Text("", visible=False), gr.Text("screen", visible=False)], outputs=screen_output)

    return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
if __name__ == "__main__":
    # Build the UI and start serving it when run as a script.
    create_gradio_interface().launch()