Update app.py
app.py CHANGED

@@ -1,59 +1,150 @@
-import
+import gradio as gr
 import asyncio
+import base64
+import io
+import cv2
+import pyaudio
+import PIL.Image
+import mss
 from google import genai
 from google.genai import types
-import gradio as gr
 
-
-
+# Configuration
+FORMAT = pyaudio.paInt16
+CHANNELS = 1
+SEND_SAMPLE_RATE = 16000
+RECEIVE_SAMPLE_RATE = 24000
+CHUNK_SIZE = 1024
+MODEL = "models/gemini-2.0-flash-exp"
 
-
-
-
+class GeminiTTS:
+    def __init__(self, api_key):
+        self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
+        self.pya = pyaudio.PyAudio()
+        self.audio_in_queue = asyncio.Queue()
+        self.out_queue = asyncio.Queue(maxsize=5)
+        self.session = None
+        self.audio_stream = None
+
+        self.config = types.LiveConnectConfig(
             response_modalities=["audio"],
             speech_config=types.SpeechConfig(
                 voice_config=types.VoiceConfig(
                     prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Puck")
                 )
            ),
-            # Fixed Part.from_text() call with explicit text parameter
             system_instruction=types.Content(
-                parts=[types.Part.from_text(text="
+                parts=[types.Part.from_text(text="Answer user ask, replay same thing user say no other word explain")],
                 role="user"
             ),
         )
 
-
-
-
-
-
-
 
-
-
-
-
 
-
 
-
-    print(f"Error: {str(e)}")
-    raise
-
-def tts(text):
-    if not text.strip():
-        return None
-    return asyncio.run(generate_audio(text))
-
-iface = gr.Interface(
-    fn=tts,
-    inputs=gr.Textbox(label="Enter Text", placeholder="Type here..."),
-    outputs=gr.Audio(label="TTS Output", type="filepath"),
-    examples=["Hello, this is a test.", "How are you today?"],
-    title="Gemini TTS Demo",
-    description="Convert text to speech using Google's Gemini 2.0 Flash model"
-)
+    async def _get_frame(self, cap):
+        ret, frame = cap.read()
+        if not ret:
+            return None
+        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        img = PIL.Image.fromarray(frame_rgb)
+        img.thumbnail([1024, 1024])
+        image_io = io.BytesIO()
+        img.save(image_io, format="jpeg")
+        image_io.seek(0)
+        return {"mime_type": "image/jpeg", "data": base64.b64encode(image_io.read()).decode()}
+
+    async def _get_screen(self):
+        sct = mss.mss()
+        monitor = sct.monitors[0]
+        i = sct.grab(monitor)
+        img = PIL.Image.open(io.BytesIO(mss.tools.to_png(i.rgb, i.size)))
+        image_io = io.BytesIO()
+        img.save(image_io, format="jpeg")
+        image_io.seek(0)
+        return {"mime_type": "image/jpeg", "data": base64.b64encode(image_io.read()).decode()}
+
+    async def process_input(self, text=None, mode="text"):
+        try:
+            async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
+                self.session = session
+
+                if mode == "text" and text:
+                    await session.send(input=text or ".", end_of_turn=True)
+                elif mode == "camera":
+                    cap = cv2.VideoCapture(0)
+                    frame = await self._get_frame(cap)
+                    cap.release()
+                    if frame:
+                        await session.send(input=frame)
+                elif mode == "screen":
+                    frame = await self._get_screen()
+                    if frame:
+                        await session.send(input=frame)
+
+                # Get response
+                turn = session.receive()
+                async for response in turn:
+                    if data := response.data:
+                        return data
+                    if text := response.text:
+                        return text
+
+                return "No response received"
+        except Exception as e:
+            return f"Error: {str(e)}"
+
+def create_gradio_interface():
+    tts_handler = None
+
+    def init_tts(api_key):
+        nonlocal tts_handler
+        tts_handler = GeminiTTS(api_key)
+        return "Gemini TTS Initialized!"
+
+    async def generate_response(text, mode):
+        if not tts_handler:
+            raise gr.Error("Please initialize the TTS system first with your API key")
+
+        result = await tts_handler.process_input(text, mode)
+
+        if isinstance(result, bytes):
+            # Audio response
+            with io.BytesIO() as wav_buffer:
+                wav_buffer.write(result)
+                wav_buffer.seek(0)
+                return (RECEIVE_SAMPLE_RATE, wav_buffer.read())
+        else:
+            # Text response
+            return result
+
+    with gr.Blocks(title="Gemini TTS Interface") as demo:
+        gr.Markdown("# 🎤 Gemini Text-to-Speech Interface")
+
+        with gr.Row():
+            api_key = gr.Textbox(label="Gemini API Key", type="password")
+            init_btn = gr.Button("Initialize TTS")
+
+        init_output = gr.Textbox(label="Initialization Status", interactive=False)
+        init_btn.click(init_tts, inputs=api_key, outputs=init_output)
+
+        with gr.Tab("Text Input"):
+            with gr.Row():
+                text_input = gr.Textbox(label="Enter Text", lines=3)
+                text_btn = gr.Button("Generate Speech")
+
+            text_output = gr.Audio(label="Generated Speech")
+            text_btn.click(generate_response, inputs=[text_input, gr.Text("text", visible=False)], outputs=text_output)
+
+        with gr.Tab("Camera Input"):
+            camera_btn = gr.Button("Capture and Process")
+            camera_output = gr.Audio(label="Generated Speech from Camera")
+            camera_btn.click(generate_response, inputs=[gr.Text("", visible=False), gr.Text("camera", visible=False)], outputs=camera_output)
+
+        with gr.Tab("Screen Capture"):
+            screen_btn = gr.Button("Capture Screen and Process")
+            screen_output = gr.Audio(label="Generated Speech from Screen")
+            screen_btn.click(generate_response, inputs=[gr.Text("", visible=False), gr.Text("screen", visible=False)], outputs=screen_output)
+
+    return demo
 
 if __name__ == "__main__":
-
+    demo = create_gradio_interface()
+    demo.launch()
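
Review note: the new version pulls in several packages with native dependencies beyond the original gradio/google-genai pair. A plausible requirements list for the Space, inferred only from the imports above (the package names and comments are assumptions about the environment, not part of this commit):

    gradio
    google-genai
    pyaudio          # wraps PortAudio; the PortAudio system library must also be installed
    opencv-python    # provides the cv2 module
    Pillow           # provides PIL.Image
    mss

Also worth flagging: pyaudio, cv2.VideoCapture(0), and mss all assume local hardware (microphone, camera, display), which a headless Space will not have, so the camera and screen tabs can only work when app.py runs locally.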
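Review note: generate_response writes the model's raw bytes into a BytesIO and immediately reads them back (a no-op round-trip), then returns (RECEIVE_SAMPLE_RATE, raw_bytes). gr.Audio expects a filepath or a (sample_rate, numpy_array) tuple, so the audio branch will likely fail at playback. A minimal conversion sketch, assuming the Live API returns 16-bit signed little-endian mono PCM at 24 kHz (which is what RECEIVE_SAMPLE_RATE suggests); pcm_to_gradio_audio is a hypothetical helper, not part of this commit:

    import numpy as np

    def pcm_to_gradio_audio(pcm_bytes, sample_rate=24000):
        # Interpret the raw bytes as 16-bit signed PCM samples (assumed format)
        samples = np.frombuffer(pcm_bytes, dtype=np.int16)
        # gr.Audio accepts a (sample_rate, ndarray) tuple directly
        return (sample_rate, samples)

With that helper, the audio branch could simply return pcm_to_gradio_audio(result, RECEIVE_SAMPLE_RATE) instead of the BytesIO round-trip.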