Athspi committed on
Commit
af3c122
·
verified ·
1 Parent(s): c0a29a1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -77
app.py CHANGED
@@ -2,10 +2,7 @@ import gradio as gr
2
  import asyncio
3
  import base64
4
  import io
5
- import cv2
6
  import numpy as np
7
- import PIL.Image
8
- import mss
9
  from google import genai
10
  from google.genai import types
11
  import soundfile as sf
@@ -16,6 +13,8 @@ MODEL = "models/gemini-2.0-flash-exp"
16
 
17
  class GeminiTTS:
18
  def __init__(self, api_key):
 
 
19
  self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
20
  self.session = None
21
 
@@ -32,53 +31,25 @@ class GeminiTTS:
32
  ),
33
  )
34
 
35
- async def _get_frame(self, cap):
36
- ret, frame = cap.read()
37
- if not ret:
38
- return None
39
- frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
40
- img = PIL.Image.fromarray(frame_rgb)
41
- img.thumbnail([1024, 1024])
42
- image_io = io.BytesIO()
43
- img.save(image_io, format="jpeg")
44
- image_io.seek(0)
45
- return {"mime_type": "image/jpeg", "data": base64.b64encode(image_io.read()).decode()}
46
-
47
- async def _get_screen(self):
48
- sct = mss.mss()
49
- monitor = sct.monitors[0]
50
- i = sct.grab(monitor)
51
- img = PIL.Image.open(io.BytesIO(mss.tools.to_png(i.rgb, i.size)))
52
- image_io = io.BytesIO()
53
- img.save(image_io, format="jpeg")
54
- image_io.seek(0)
55
- return {"mime_type": "image/jpeg", "data": base64.b64encode(image_io.read()).decode()}
56
-
57
- async def process_input(self, text=None, mode="text"):
58
  try:
59
  async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
60
- self.session = session
61
-
62
- if mode == "text" and text:
63
- await session.send(input=text or ".", end_of_turn=True)
64
- elif mode == "camera":
65
- cap = cv2.VideoCapture(0)
66
- frame = await self._get_frame(cap)
67
- cap.release()
68
- if frame:
69
- await session.send(input=frame)
70
- elif mode == "screen":
71
- frame = await self._get_screen()
72
- if frame:
73
- await session.send(input=frame)
74
 
75
  # Get response
76
  turn = session.receive()
77
  async for response in turn:
78
  if data := response.data:
79
- # Convert to numpy array for Gradio Audio component
80
- audio_array = np.frombuffer(data, dtype=np.float32)
81
- return (SAMPLE_RATE, audio_array)
 
 
 
 
 
 
 
82
  if text := response.text:
83
  return text
84
 
@@ -91,56 +62,58 @@ def create_gradio_interface():
91
 
92
  def init_tts(api_key):
93
  nonlocal tts_handler
94
- tts_handler = GeminiTTS(api_key)
95
- return "Gemini TTS Initialized!"
 
 
 
96
 
97
- async def generate_response(text, mode):
98
  if not tts_handler:
99
  raise gr.Error("Please initialize the TTS system first with your API key")
100
- return await tts_handler.process_input(text, mode)
 
 
 
 
 
 
 
 
101
 
102
  with gr.Blocks(title="Gemini TTS Interface") as demo:
103
  gr.Markdown("# 🎤 Gemini Text-to-Speech Interface")
104
 
105
  with gr.Row():
106
- api_key = gr.Textbox(label="Gemini API Key", type="password")
 
 
 
 
107
  init_btn = gr.Button("Initialize TTS")
108
 
109
- init_output = gr.Textbox(label="Initialization Status", interactive=False)
110
- init_btn.click(init_tts, inputs=api_key, outputs=init_output)
111
 
112
- with gr.Tab("Text Input"):
113
- with gr.Row():
114
- text_input = gr.Textbox(label="Enter Text", lines=3)
115
- text_btn = gr.Button("Generate Speech")
116
-
117
- text_output = gr.Audio(label="Generated Speech")
118
- text_btn.click(
119
- generate_response,
120
- inputs=[text_input, gr.Text("text", visible=False)],
121
- outputs=text_output
122
  )
 
123
 
124
- with gr.Tab("Camera Input"):
125
- camera_btn = gr.Button("Capture and Process")
126
- camera_output = gr.Audio(label="Generated Speech from Camera")
127
- camera_btn.click(
128
- generate_response,
129
- inputs=[gr.Text("", visible=False), gr.Text("camera", visible=False)],
130
- outputs=camera_output
131
- )
132
 
133
- with gr.Tab("Screen Capture"):
134
- screen_btn = gr.Button("Capture Screen and Process")
135
- screen_output = gr.Audio(label="Generated Speech from Screen")
136
- screen_btn.click(
137
- generate_response,
138
- inputs=[gr.Text("", visible=False), gr.Text("screen", visible=False)],
139
- outputs=screen_output
140
- )
141
 
142
  return demo
143
 
144
  if __name__ == "__main__":
145
  demo = create_gradio_interface()
146
- demo.launch()
 
2
  import asyncio
3
  import base64
4
  import io
 
5
  import numpy as np
 
 
6
  from google import genai
7
  from google.genai import types
8
  import soundfile as sf
 
13
 
14
  class GeminiTTS:
15
  def __init__(self, api_key):
16
+ if not api_key:
17
+ raise ValueError("API key cannot be empty")
18
  self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
19
  self.session = None
20
 
 
31
  ),
32
  )
33
 
34
+ async def process_text(self, text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  try:
36
  async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
37
+ await session.send(input=text or ".", end_of_turn=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  # Get response
40
  turn = session.receive()
41
  async for response in turn:
42
  if data := response.data:
43
+ # Convert to properly formatted numpy array
44
+ audio_data = np.frombuffer(data, dtype=np.float32)
45
+
46
+ # Normalize audio to prevent processing warnings
47
+ if audio_data.size > 0:
48
+ max_val = np.max(np.abs(audio_data))
49
+ if max_val > 0:
50
+ audio_data = audio_data / max_val
51
+
52
+ return (SAMPLE_RATE, audio_data)
53
  if text := response.text:
54
  return text
55
 
 
62
 
63
  def init_tts(api_key):
64
  nonlocal tts_handler
65
+ try:
66
+ tts_handler = GeminiTTS(api_key)
67
+ return "Gemini TTS Initialized Successfully!"
68
+ except Exception as e:
69
+ return f"Initialization Failed: {str(e)}"
70
 
71
async def generate_response(text):
    """Send one prompt through the TTS handler and route the result.

    The click handler is wired to two output components (audio player and
    text box), so exactly two values are always returned.

    Args:
        text: Prompt typed by the user.

    Returns:
        An ``(audio, text)`` pair. ``audio`` is the ``(sample_rate, samples)``
        tuple when the handler produced audio, otherwise ``None``. ``text`` is
        whatever non-audio result the handler returned (possibly ``None``).

    Raises:
        gr.Error: If the handler has not been initialized yet.
    """
    if not tts_handler:
        raise gr.Error("Please initialize the TTS system first with your API key")

    result = await tts_handler.process_text(text)

    # Returning the bare (sample_rate, samples) tuple would be unpacked as the
    # values for *both* outputs, and a bare string would supply too few
    # values, so pair every result with an explicit None for the other slot.
    if isinstance(result, tuple) and len(result) == 2:
        return result, None
    return None, result
83
 
84
  with gr.Blocks(title="Gemini TTS Interface") as demo:
85
  gr.Markdown("# 🎤 Gemini Text-to-Speech Interface")
86
 
87
  with gr.Row():
88
+ api_key = gr.Textbox(
89
+ label="Gemini API Key",
90
+ type="password",
91
+ placeholder="Enter your Gemini API key here"
92
+ )
93
  init_btn = gr.Button("Initialize TTS")
94
 
95
+ init_status = gr.Textbox(label="Initialization Status", interactive=False)
96
+ init_btn.click(init_tts, inputs=api_key, outputs=init_status)
97
 
98
+ with gr.Group():
99
+ text_input = gr.Textbox(
100
+ label="Enter Text",
101
+ lines=3,
102
+ placeholder="Type something to convert to speech..."
 
 
 
 
 
103
  )
104
+ generate_btn = gr.Button("Generate Speech")
105
 
106
+ audio_output = gr.Audio(label="Generated Speech")
107
+ text_output = gr.Textbox(label="Text Response", visible=False)
 
 
 
 
 
 
108
 
109
+ generate_btn.click(
110
+ generate_response,
111
+ inputs=text_input,
112
+ outputs=[audio_output, text_output]
113
+ )
 
 
 
114
 
115
  return demo
116
 
117
  if __name__ == "__main__":
118
  demo = create_gradio_interface()
119
+ demo.launch(server_name="0.0.0.0", server_port=7860)