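"""Moroccan Darija Conversation Generator (Hugging Face Space).

Generates a short Darija dialogue with Google's Gemini API, then voice-clones
each line with a hosted Darija TTS Space using two user-provided reference clips.
"""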
import gradio as gr
import google.generativeai as genai
from gradio_client import Client, handle_file
import tempfile
import os

# GOOGLE_API_KEY must be available in the environment (e.g. as a Space secret)
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
model = genai.GenerativeModel('gemini-2.0-flash')

# Client for the hosted Darija TTS Space used for voice-cloned synthesis
tts_client = Client("https://medmac01-darija-arabic-tts.hf.space/")
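# If the endpoint name or its parameters ever change, the Space's API can be
# inspected from a separate session with gradio_client, e.g.:
#   Client("https://medmac01-darija-arabic-tts.hf.space/").view_api()
# which prints the available endpoints (such as /infer_EGTTS) and their arguments.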
def generate_conversation(subject, speaker1_audio, speaker2_audio):
    try:
        prompt = f"""
Generate a natural Moroccan Darija conversation in Arabic script only between two people about: "{subject}".
Rules:
- Use only Arabic script for Darija
- Do not include any transliterations or translations
- Do not include any Latin characters or parentheses
- Use "Speaker 1" and "Speaker 2" as the speaker names
Format:
Speaker 1: [Arabic Darija text only]
Speaker 2: [Arabic Darija text only]
Speaker 1: [Arabic Darija text only]
Speaker 2: [Arabic Darija text only]
Keep it short and casual (4 lines).
"""
print("Sending prompt to Gemini API...")
response = model.generate_content(prompt)
print(f"Gemini API Response: {response}")
if not response or not response.text:
print("No response text received from Gemini API")
return ["Error: No response from the model"] + [None] * 4
result = response.text
print(f"Generated text: {result}")
# Split the text into lines and process each line
lines = []
for line in result.split('\n'):
line = line.strip()
if ':' in line:
# Extract the text after the colon
text = line.split(':', 1)[1].strip()
# Add the appropriate speaker prefix
if len(lines) % 2 == 0:
lines.append(f"Speaker 1: {text}")
else:
lines.append(f"Speaker 2: {text}")
print(f"Processed lines: {lines}")
if not lines:
print("No valid lines found in the response")
return ["Error: No valid conversation generated"] + [None] * 4
        # Synthesize one audio clip per line with the TTS Space
        audio_paths = []
        for line in lines:
            speaker_audio = speaker1_audio if line.startswith("Speaker 1") else speaker2_audio
            text = line.split(":", 1)[1].strip()
            # Call the Space's TTS endpoint with the matching reference voice
            tts_result = tts_client.predict(
                text=text,
                speaker_audio_path=handle_file(speaker_audio),
                temperature=0.75,
                api_name="/infer_EGTTS"
            )
            # Copy the returned file into our own temporary .wav; these files are
            # not deleted automatically, so a long-running app may want to prune them
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                with open(tts_result, "rb") as f:
                    tmp.write(f.read())
                tmp.flush()
                audio_paths.append(tmp.name)
        # Format the conversation text for display
        conversation_text = "\n".join(lines)

        # Pad to exactly 4 audio outputs so the UI always gets a full list
        while len(audio_paths) < 4:
            audio_paths.append(None)

        # Return outputs in the order declared in btn.click()
        return [conversation_text] + audio_paths[:4]

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return [f"Error: {str(e)}"] + [None] * 4
with gr.Blocks() as demo:
    gr.Markdown("# 🗣️ Moroccan Darija Conversation Generator")
    gr.Markdown("Enter a discussion topic and upload two speaker voices. We'll generate a Darija conversation!")

    with gr.Row():
        subject = gr.Textbox(label="Subject of the discussion", placeholder="e.g. Going to the souk")
    with gr.Row():
        speaker1 = gr.Audio(label="Speaker 1 Reference (4-5 sec)", type="filepath")
        speaker2 = gr.Audio(label="Speaker 2 Reference (4-5 sec)", type="filepath")

    btn = gr.Button("🎤 Generate Conversation")

    # Text output for the full conversation
    conversation_output = gr.Textbox(label="Generated Conversation", lines=6)
    # One audio output per conversation line
    audio_outputs = [gr.Audio(label=f"Line {i+1}") for i in range(4)]

    btn.click(
        generate_conversation,
        inputs=[subject, speaker1, speaker2],
        outputs=[conversation_output] + audio_outputs
    )

demo.launch()
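# Run locally with `python app.py` (Gradio serves on http://127.0.0.1:7860 by
# default); on Hugging Face Spaces, app.py is executed automatically on startup.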