# Hugging Face Space: AiDi-UIR / TTS-Conv-Darija (status: Running)
# File size: 4,508 Bytes

import gradio as gr
import google.generativeai as genai
from gradio_client import Client, handle_file
import tempfile
import os

# Gemini setup: the API key is taken from the GOOGLE_API_KEY environment
# variable (None when the variable is unset).
_gemini_api_key = os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key=_gemini_api_key)
model = genai.GenerativeModel("gemini-2.0-flash")

# Remote text-to-speech client, pointed at the Darija/Arabic TTS Space URL.
_tts_space_url = "https://medmac01-darija-arabic-tts.hf.space/"
tts_client = Client(_tts_space_url)

# Number of conversation lines (and therefore audio clips) returned per request.
_MAX_CONVERSATION_LINES = 4


def _parse_conversation_turns(raw_text, max_turns=_MAX_CONVERSATION_LINES):
    """Extract up to *max_turns* ``(speaker_number, text)`` pairs from *raw_text*.

    A line counts as a turn only when it contains a colon AND has non-empty
    text after the first colon. Whatever precedes the colon is discarded and
    speakers are assigned by position (1, 2, 1, 2, ...), so the result is
    normalized regardless of how the model labelled each line.
    """
    turns = []
    for raw_line in raw_text.split("\n"):
        _label, colon, text = raw_line.strip().partition(":")
        text = text.strip()
        # Skip lines without a colon and lines such as "Here you go:" that
        # carry no spoken text; keeping them would shift the alternation and
        # send an empty string to the TTS service.
        if not colon or not text:
            continue
        speaker_number = 1 if len(turns) % 2 == 0 else 2
        turns.append((speaker_number, text))
        # Only max_turns clips can be returned, so stop parsing here.
        if len(turns) >= max_turns:
            break
    return turns


def generate_conversation(subject, speaker1_audio, speaker2_audio):
    """Generate a short Darija conversation and one audio clip per line.

    Args:
        subject: Topic of the conversation, inserted into the model prompt.
        speaker1_audio: File path of the reference voice used for Speaker 1.
        speaker2_audio: File path of the reference voice used for Speaker 2.

    Returns:
        A list of five items: the conversation text (or a message starting
        with ``"Error:"``) followed by four audio file paths. Slots with no
        generated audio are ``None``.

    The function never raises: invalid input is reported up front and any
    exception from the model or TTS calls is converted to an error message.
    """
    no_audio = [None] * _MAX_CONVERSATION_LINES

    # Validate inputs before spending any remote API calls.
    if not subject or not str(subject).strip():
        return ["Error: Please enter a subject for the discussion"] + no_audio
    if not speaker1_audio or not speaker2_audio:
        return ["Error: Please upload a reference audio for both speakers"] + no_audio

    try:
        prompt = f"""
        Generate a natural Moroccan Darija conversation in Arabic script only between two people about: "{subject}".
        Rules:
        - Use only Arabic script for Darija
        - Do not include any transliterations or translations
        - Do not include any Latin characters or parentheses
        - Use "Speaker 1" and "Speaker 2" as the speaker names
        
        Format:
        Speaker 1: [Arabic Darija text only]
        Speaker 2: [Arabic Darija text only]
        Speaker 1: [Arabic Darija text only]
        Speaker 2: [Arabic Darija text only]
        
        Keep it short and casual (4 lines).
        """

        print("Sending prompt to Gemini API...")
        response = model.generate_content(prompt)
        print(f"Gemini API Response: {response}")

        if not response or not response.text:
            print("No response text received from Gemini API")
            return ["Error: No response from the model"] + no_audio

        generated_text = response.text
        print(f"Generated text: {generated_text}")

        turns = _parse_conversation_turns(generated_text)
        lines = [f"Speaker {number}: {text}" for number, text in turns]
        print(f"Processed lines: {lines}")

        if not lines:
            print("No valid lines found in the response")
            return ["Error: No valid conversation generated"] + no_audio

        # Synthesize one clip per turn, cloning the matching reference voice.
        audio_paths = []
        for number, text in turns:
            speaker_audio = speaker1_audio if number == 1 else speaker2_audio
            tts_output = tts_client.predict(
                text=text,
                speaker_audio_path=handle_file(speaker_audio),
                temperature=0.75,
                api_name="/infer_EGTTS",
            )

            # Copy the TTS output into our own temp file. The source is opened
            # first so a failed open does not leave an empty temp file behind.
            # delete=False: the path is handed back to the caller, so the file
            # must outlive this function.
            with open(tts_output, "rb") as src, tempfile.NamedTemporaryFile(
                delete=False, suffix=".wav"
            ) as tmp:
                tmp.write(src.read())
            audio_paths.append(tmp.name)

        # Pad so the caller always receives exactly four audio slots.
        audio_paths += [None] * (_MAX_CONVERSATION_LINES - len(audio_paths))

        return ["\n".join(lines)] + audio_paths

    except Exception as e:
        # UI boundary: report the failure in the text output instead of raising.
        print(f"Error occurred: {str(e)}")
        return [f"Error: {str(e)}"] + no_audio

# Gradio UI: a topic box and two reference-voice uploads feed
# generate_conversation, whose five return values fill the text box and the
# four audio players below.
with gr.Blocks() as demo:
    # Page title and short usage hint.
    gr.Markdown("# 🗣️ Moroccan Darija Conversation Generator")
    gr.Markdown("Enter a discussion topic and upload 2 speaker voices. We'll generate a Darija conversation!")

    # Inputs: the topic on one row, both reference voices side by side on the next.
    with gr.Row():
        subject = gr.Textbox(
            label="Subject of the discussion",
            placeholder="e.g. Going to the souk",
        )
    with gr.Row():
        speaker1 = gr.Audio(label="Speaker 1 Reference (4-5 sec)", type="filepath")
        speaker2 = gr.Audio(label="Speaker 2 Reference (4-5 sec)", type="filepath")

    btn = gr.Button("🎤 Generate Conversation")

    # Outputs: the conversation text followed by one audio player per line.
    conversation_output = gr.Textbox(label="Generated Conversation", lines=6)

    audio_outputs = []
    for i in range(4):
        audio_outputs.append(gr.Audio(label=f"Line {i+1}"))

    # Output order must match the list returned by generate_conversation:
    # text first, then the four audio slots.
    btn.click(
        fn=generate_conversation,
        inputs=[subject, speaker1, speaker2],
        outputs=[conversation_output, *audio_outputs],
    )

demo.launch()