Spaces:

Athspi
/

Whshhs

Runtime error

App Files Files Community

Athspi committed on Mar 29

Commit

af3c122

verified ·

1 Parent(s): c0a29a1

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -77

app.py CHANGED Viewed

@@ -2,10 +2,7 @@ import gradio as gr
 import asyncio
 import base64
 import io
-import cv2
 import numpy as np
-import PIL.Image
-import mss
 from google import genai
 from google.genai import types
 import soundfile as sf
@@ -16,6 +13,8 @@ MODEL = "models/gemini-2.0-flash-exp"
 class GeminiTTS:
     def __init__(self, api_key):
         self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
         self.session = None
@@ -32,53 +31,25 @@ class GeminiTTS:
             ),
         )
-    async def _get_frame(self, cap):
-        ret, frame = cap.read()
-        if not ret:
-            return None
-        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        img = PIL.Image.fromarray(frame_rgb)
-        img.thumbnail([1024, 1024])
-        image_io = io.BytesIO()
-        img.save(image_io, format="jpeg")
-        image_io.seek(0)
-        return {"mime_type": "image/jpeg", "data": base64.b64encode(image_io.read()).decode()}
-    async def _get_screen(self):
-        sct = mss.mss()
-        monitor = sct.monitors[0]
-        i = sct.grab(monitor)
-        img = PIL.Image.open(io.BytesIO(mss.tools.to_png(i.rgb, i.size)))
-        image_io = io.BytesIO()
-        img.save(image_io, format="jpeg")
-        image_io.seek(0)
-        return {"mime_type": "image/jpeg", "data": base64.b64encode(image_io.read()).decode()}
-    async def process_input(self, text=None, mode="text"):
         try:
             async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
-                self.session = session
-                if mode == "text" and text:
-                    await session.send(input=text or ".", end_of_turn=True)
-                elif mode == "camera":
-                    cap = cv2.VideoCapture(0)
-                    frame = await self._get_frame(cap)
-                    cap.release()
-                    if frame:
-                        await session.send(input=frame)
-                elif mode == "screen":
-                    frame = await self._get_screen()
-                    if frame:
-                        await session.send(input=frame)
                 # Get response
                 turn = session.receive()
                 async for response in turn:
                     if data := response.data:
-                        # Convert to numpy array for Gradio Audio component
-                        audio_array = np.frombuffer(data, dtype=np.float32)
-                        return (SAMPLE_RATE, audio_array)
                     if text := response.text:
                         return text
@@ -91,56 +62,58 @@ def create_gradio_interface():
     def init_tts(api_key):
         nonlocal tts_handler
-        tts_handler = GeminiTTS(api_key)
-        return "Gemini TTS Initialized!"
-    async def generate_response(text, mode):
         if not tts_handler:
             raise gr.Error("Please initialize the TTS system first with your API key")
-        return await tts_handler.process_input(text, mode)
     with gr.Blocks(title="Gemini TTS Interface") as demo:
         gr.Markdown("# 🎤 Gemini Text-to-Speech Interface")
         with gr.Row():
-            api_key = gr.Textbox(label="Gemini API Key", type="password")
             init_btn = gr.Button("Initialize TTS")
-        init_output = gr.Textbox(label="Initialization Status", interactive=False)
-        init_btn.click(init_tts, inputs=api_key, outputs=init_output)
-        with gr.Tab("Text Input"):
-            with gr.Row():
-                text_input = gr.Textbox(label="Enter Text", lines=3)
-                text_btn = gr.Button("Generate Speech")
-            text_output = gr.Audio(label="Generated Speech")
-            text_btn.click(
-                generate_response,
-                inputs=[text_input, gr.Text("text", visible=False)],
-                outputs=text_output
             )
-        with gr.Tab("Camera Input"):
-            camera_btn = gr.Button("Capture and Process")
-            camera_output = gr.Audio(label="Generated Speech from Camera")
-            camera_btn.click(
-                generate_response,
-                inputs=[gr.Text("", visible=False), gr.Text("camera", visible=False)],
-                outputs=camera_output
-            )
-        with gr.Tab("Screen Capture"):
-            screen_btn = gr.Button("Capture Screen and Process")
-            screen_output = gr.Audio(label="Generated Speech from Screen")
-            screen_btn.click(
-                generate_response,
-                inputs=[gr.Text("", visible=False), gr.Text("screen", visible=False)],
-                outputs=screen_output
-            )
     return demo
 if __name__ == "__main__":
     demo = create_gradio_interface()
-    demo.launch()

 import asyncio
 import base64
 import io
 import numpy as np
 from google import genai
 from google.genai import types
 import soundfile as sf
 class GeminiTTS:
     def __init__(self, api_key):
+        if not api_key:
+            raise ValueError("API key cannot be empty")
         self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
         self.session = None
             ),
         )
+    async def process_text(self, text):
         try:
             async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
+                await session.send(input=text or ".", end_of_turn=True)
                 # Get response
                 turn = session.receive()
                 async for response in turn:
                     if data := response.data:
+                        # Convert to properly formatted numpy array
+                        audio_data = np.frombuffer(data, dtype=np.float32)
+                        # Normalize audio to prevent processing warnings
+                        if audio_data.size > 0:
+                            max_val = np.max(np.abs(audio_data))
+                            if max_val > 0:
+                                audio_data = audio_data / max_val
+                        return (SAMPLE_RATE, audio_data)
                     if text := response.text:
                         return text
     def init_tts(api_key):
         nonlocal tts_handler
+        try:
+            tts_handler = GeminiTTS(api_key)
+            return "Gemini TTS Initialized Successfully!"
+        except Exception as e:
+            return f"Initialization Failed: {str(e)}"
+    async def generate_response(text):
         if not tts_handler:
             raise gr.Error("Please initialize the TTS system first with your API key")
+        result = await tts_handler.process_text(text)
+        if isinstance(result, tuple) and len(result) == 2:
+            # Audio response (sample_rate, audio_data)
+            return result
+        else:
+            # Text response
+            return result
     with gr.Blocks(title="Gemini TTS Interface") as demo:
         gr.Markdown("# 🎤 Gemini Text-to-Speech Interface")
         with gr.Row():
+            api_key = gr.Textbox(
+                label="Gemini API Key",
+                type="password",
+                placeholder="Enter your Gemini API key here"
+            )
             init_btn = gr.Button("Initialize TTS")
+        init_status = gr.Textbox(label="Initialization Status", interactive=False)
+        init_btn.click(init_tts, inputs=api_key, outputs=init_status)
+        with gr.Group():
+            text_input = gr.Textbox(
+                label="Enter Text",
+                lines=3,
+                placeholder="Type something to convert to speech..."
             )
+            generate_btn = gr.Button("Generate Speech")
+        audio_output = gr.Audio(label="Generated Speech")
+        text_output = gr.Textbox(label="Text Response", visible=False)
+        generate_btn.click(
+            generate_response,
+            inputs=text_input,
+            outputs=[audio_output, text_output]
+        )
     return demo
 if __name__ == "__main__":
     demo = create_gradio_interface()
+    demo.launch(server_name="0.0.0.0", server_port=7860)