Athspi committed (verified)
Commit cb63aa0 · Parent(s): 3aa23bc

Update app.py

Files changed (1): app.py (+45 -14)
app.py CHANGED
@@ -3,24 +3,22 @@ import asyncio
 import base64
 import io
 import cv2
-import pyaudio
+import sounddevice as sd
+import numpy as np
 import PIL.Image
 import mss
 from google import genai
 from google.genai import types
+import soundfile as sf
 
 # Configuration
-FORMAT = pyaudio.paInt16
-CHANNELS = 1
-SEND_SAMPLE_RATE = 16000
-RECEIVE_SAMPLE_RATE = 24000
+SAMPLE_RATE = 24000
 CHUNK_SIZE = 1024
 MODEL = "models/gemini-2.0-flash-exp"
 
 class GeminiTTS:
     def __init__(self, api_key):
         self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
-        self.pya = pyaudio.PyAudio()
         self.audio_in_queue = asyncio.Queue()
         self.out_queue = asyncio.Queue(maxsize=5)
         self.session = None
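Note (illustrative, not part of this commit): unlike pyaudio, sounddevice needs no explicit PyAudio()/terminate() lifecycle, which is why self.pya disappears from __init__. A minimal sketch, reusing the SAMPLE_RATE constant added above, of validating the default devices against that rate before any stream is opened; the helper name is hypothetical and could be called once from GeminiTTS.__init__:

    import sounddevice as sd

    SAMPLE_RATE = 24000  # same value this commit introduces

    def check_audio_devices(sample_rate: int = SAMPLE_RATE) -> None:
        """Raise a PortAudioError early if the default input/output devices
        cannot handle the configured sample rate."""
        sd.check_input_settings(samplerate=sample_rate, channels=1, dtype='float32')
        sd.check_output_settings(samplerate=sample_rate, channels=1, dtype='float32')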
@@ -61,6 +59,21 @@ class GeminiTTS:
         image_io.seek(0)
         return {"mime_type": "image/jpeg", "data": base64.b64encode(image_io.read()).decode()}
 
+    async def record_audio(self, duration=5):
+        """Record audio using sounddevice"""
+        print(f"Recording for {duration} seconds...")
+        recording = sd.rec(int(duration * SAMPLE_RATE),
+                           samplerate=SAMPLE_RATE,
+                           channels=1,
+                           dtype='float32')
+        sd.wait()  # Wait until recording is finished
+        return recording
+
+    async def play_audio(self, audio_data):
+        """Play audio using sounddevice"""
+        sd.play(audio_data, samplerate=SAMPLE_RATE)
+        sd.wait()  # Wait until playback is finished
+
     async def process_input(self, text=None, mode="text"):
         try:
             async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
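Note (illustrative, not part of this commit): sd.wait() blocks the calling thread, so awaiting record_audio or play_audio directly on Gradio's event loop stalls other requests for the full duration. A minimal sketch, with hypothetical names, of running the blocking sounddevice calls in a worker thread instead:

    import asyncio
    import numpy as np
    import sounddevice as sd

    SAMPLE_RATE = 24000  # matches the constant added in this commit

    async def record_audio_threaded(duration: float = 5.0) -> np.ndarray:
        """Record without blocking the asyncio event loop."""
        def _record() -> np.ndarray:
            frames = sd.rec(int(duration * SAMPLE_RATE),
                            samplerate=SAMPLE_RATE, channels=1, dtype='float32')
            sd.wait()  # blocks only this worker thread
            return frames
        return await asyncio.to_thread(_record)

    async def play_audio_threaded(audio: np.ndarray) -> None:
        """Play back without blocking the asyncio event loop."""
        def _play() -> None:
            sd.play(audio, samplerate=SAMPLE_RATE)
            sd.wait()
        await asyncio.to_thread(_play)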
@@ -83,7 +96,11 @@ class GeminiTTS:
                 turn = session.receive()
                 async for response in turn:
                     if data := response.data:
-                        return data
+                        # Save audio to buffer
+                        with io.BytesIO() as wav_buffer:
+                            sf.write(wav_buffer, data, SAMPLE_RATE, format='WAV')
+                            wav_buffer.seek(0)
+                            return (SAMPLE_RATE, wav_buffer.read())
                     if text := response.text:
                         return text
 
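Note (illustrative, not part of this commit): response.data arrives as raw bytes (for the Live API's audio output, typically 16-bit little-endian PCM), while soundfile expects array-like samples, so the bytes generally have to be decoded into a NumPy array before sf.write. A hedged sketch of that conversion, with a hypothetical helper name:

    import io
    import numpy as np
    import soundfile as sf

    def pcm16_bytes_to_wav(data: bytes, sample_rate: int = 24000) -> bytes:
        """Wrap raw 16-bit PCM bytes in an in-memory WAV container."""
        samples = np.frombuffer(data, dtype=np.int16)
        buf = io.BytesIO()
        sf.write(buf, samples, sample_rate, format='WAV', subtype='PCM_16')
        return buf.getvalue()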
@@ -105,18 +122,27 @@ def create_gradio_interface():
 
         result = await tts_handler.process_input(text, mode)
 
-        if isinstance(result, bytes):
-            # Audio response
-            with io.BytesIO() as wav_buffer:
-                wav_buffer.write(result)
-                wav_buffer.seek(0)
-                return (RECEIVE_SAMPLE_RATE, wav_buffer.read())
+        if isinstance(result, tuple) and len(result) == 2:
+            # Audio response (sample_rate, audio_data)
+            return result
         else:
             # Text response
             return result
 
+    async def record_and_process():
+        if not tts_handler:
+            raise gr.Error("Please initialize the TTS system first with your API key")
+
+        # Record audio
+        recording = await tts_handler.record_audio(duration=5)
+
+        # Process audio (you would need to implement this part)
+        # For now, we'll just play it back
+        await tts_handler.play_audio(recording)
+        return (SAMPLE_RATE, recording.tobytes())
+
     with gr.Blocks(title="Gemini TTS Interface") as demo:
-        gr.Markdown("# 🎤 Gemini Text-to-Speech Interface")
+        gr.Markdown("# 🎤 Gemini Text-to-Speech Interface with SoundDevice")
 
         with gr.Row():
             api_key = gr.Textbox(label="Gemini API Key", type="password")
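Note (illustrative, not part of this commit): a gr.Audio output component expects either a filepath or a (sample_rate, numpy_array) tuple, so tuples carrying WAV bytes or recording.tobytes() may not render as audio. A minimal sketch, assuming the float32 array that sd.rec returns:

    import numpy as np

    SAMPLE_RATE = 24000

    def to_gradio_audio(recording: np.ndarray) -> tuple[int, np.ndarray]:
        """Shape a recording the way gr.Audio expects: (sample_rate, samples)."""
        return SAMPLE_RATE, np.squeeze(recording)  # drop the channel axis for mono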
@@ -133,6 +159,11 @@ def create_gradio_interface():
             text_output = gr.Audio(label="Generated Speech")
             text_btn.click(generate_response, inputs=[text_input, gr.Text("text", visible=False)], outputs=text_output)
 
+        with gr.Tab("Voice Input"):
+            record_btn = gr.Button("Record and Process (5 sec)")
+            voice_output = gr.Audio(label="Processed Audio")
+            record_btn.click(record_and_process, outputs=voice_output)
+
         with gr.Tab("Camera Input"):
             camera_btn = gr.Button("Capture and Process")
             camera_output = gr.Audio(label="Generated Speech from Camera")
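Note (illustrative, not part of this commit): sd.rec captures audio on the machine running app.py, not in the visitor's browser, so on a hosted Space the new Voice Input tab may find no input device. If browser-side capture is the goal, Gradio's microphone-enabled Audio input is an alternative; a hedged sketch (parameter names assume Gradio 4.x):

    import gradio as gr

    with gr.Blocks() as demo:
        # Captured in the visitor's browser; with type="numpy" the callback
        # receives a (sample_rate, numpy_array) tuple.
        mic_input = gr.Audio(sources=["microphone"], type="numpy", label="Speak")
        echo_output = gr.Audio(label="Echo")
        # Hypothetical passthrough handler, just to show the wiring.
        mic_input.change(lambda audio: audio, inputs=mic_input, outputs=echo_output)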
 