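"""TTS-Conv-Darija Space (app.py).

Generates a short Moroccan Darija conversation about a user-supplied topic with
Google Gemini, then voices each line with the darija-arabic-tts Space, using the
two uploaded reference voices.
"""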
import gradio as gr
import google.generativeai as genai
from gradio_client import Client, handle_file
import tempfile
import os
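
# Gemini is configured from the GOOGLE_API_KEY environment variable
# (e.g. set as a secret in the Hugging Face Space).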
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
model = genai.GenerativeModel('gemini-2.0-flash')
# Load the TTS client for the Darija Arabic TTS Space
tts_client = Client("https://medmac01-darija-arabic-tts.hf.space/")
def generate_conversation(subject, speaker1_audio, speaker2_audio):
    try:
        prompt = f"""
        Generate a natural Moroccan Darija conversation in Arabic script only between two people about: "{subject}".
        Rules:
        - Use only Arabic script for Darija
        - Do not include any transliterations or translations
        - Do not include any Latin characters or parentheses
        - Use "Speaker 1" and "Speaker 2" as the speaker names
        Format:
        Speaker 1: [Arabic Darija text only]
        Speaker 2: [Arabic Darija text only]
        Speaker 1: [Arabic Darija text only]
        Speaker 2: [Arabic Darija text only]
        Keep it short and casual (4 lines).
        """

        print("Sending prompt to Gemini API...")
        response = model.generate_content(prompt)
        print(f"Gemini API Response: {response}")

        if not response or not response.text:
            print("No response text received from Gemini API")
            return ["Error: No response from the model"] + [None] * 4

        result = response.text
        print(f"Generated text: {result}")

        # Split the response into lines, keep only "Speaker X: ..." lines, and
        # relabel them by position so the dialogue alternates 1, 2, 1, 2
        lines = []
        for line in result.split('\n'):
            line = line.strip()
            if ':' in line:
                # Extract the text after the colon
                text = line.split(':', 1)[1].strip()
                # Even indices become Speaker 1, odd indices Speaker 2
                if len(lines) % 2 == 0:
                    lines.append(f"Speaker 1: {text}")
                else:
                    lines.append(f"Speaker 2: {text}")

        print(f"Processed lines: {lines}")
        if not lines:
            print("No valid lines found in the response")
            return ["Error: No valid conversation generated"] + [None] * 4

        # Generate one audio clip per line with the TTS Space
        audio_paths = []
        for line in lines:
            # Pick the reference voice that matches the speaker label
            speaker_audio = speaker1_audio if line.startswith("Speaker 1") else speaker2_audio
            text = line.split(":", 1)[1].strip()

            # Create the TTS audio via the Space's API
            result = tts_client.predict(
                text=text,
                speaker_audio_path=handle_file(speaker_audio),
                temperature=0.75,
                api_name="/infer_EGTTS"
            )

            # Copy the returned file into a temporary .wav that Gradio can serve
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                with open(result, "rb") as f:
                    tmp.write(f.read())
                tmp.flush()
                audio_paths.append(tmp.name)

        # Format the conversation text
        conversation_text = "\n".join(lines)

        # Pad to exactly 4 audio paths so the outputs always match the UI
        while len(audio_paths) < 4:
            audio_paths.append(None)

        # Return all outputs in the correct order: text first, then the 4 clips
        return [conversation_text] + audio_paths[:4]

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return [f"Error: {str(e)}"] + [None] * 4
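
# Build the Gradio UI: a topic textbox, two reference-voice uploads, the
# generated conversation text, and one audio player per line.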
with gr.Blocks() as demo:
    gr.Markdown("# 🗣️ Moroccan Darija Conversation Generator")
    gr.Markdown("Enter a discussion topic and upload 2 speaker voices. We'll generate a Darija conversation!")

    with gr.Row():
        subject = gr.Textbox(label="Subject of the discussion", placeholder="e.g. Going to the souk")
    with gr.Row():
        speaker1 = gr.Audio(label="Speaker 1 Reference (4-5 sec)", type="filepath")
        speaker2 = gr.Audio(label="Speaker 2 Reference (4-5 sec)", type="filepath")

    btn = gr.Button("🎤 Generate Conversation")

    # Text output for the generated conversation
    conversation_output = gr.Textbox(label="Generated Conversation", lines=6)
    # Audio outputs, one per conversation line
    audio_outputs = [gr.Audio(label=f"Line {i+1}") for i in range(4)]

    btn.click(
        generate_conversation,
        inputs=[subject, speaker1, speaker2],
        outputs=[conversation_output] + audio_outputs
    )
demo.launch()
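
# To run locally (assuming gradio, google-generativeai and gradio_client are
# installed and GOOGLE_API_KEY is set in the environment): python app.py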