File size: 4,508 Bytes
b5b5c8d
e8c4e1d
e120b5a
c678629
 
b5b5c8d
68e69bf
e8c4e1d
 
e120b5a
 
cf6d1a0
c678629
e8c4e1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c678629
e8c4e1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c678629
e8c4e1d
 
 
 
 
 
c678629
e8c4e1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c678629
e8c4e1d
 
 
 
 
 
 
 
 
 
 
 
 
c678629
 
 
 
 
 
 
 
 
 
 
 
e120b5a
 
 
 
 
 
c678629
e120b5a
 
 
 
 
c678629
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import gradio as gr
import google.generativeai as genai
from gradio_client import Client, handle_file
import tempfile
import os

# Hand the Gemini SDK its API key, read from the GOOGLE_API_KEY environment
# variable (os.getenv yields None when the variable is not set).
google_api_key = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=google_api_key)

# Generative model used to write the Darija dialogue.
model = genai.GenerativeModel("gemini-2.0-flash")

# Client for the hosted Darija/Arabic TTS Space, addressed by its full URL.
tts_client = Client("https://medmac01-darija-arabic-tts.hf.space/")

# Number of audio slots returned after the conversation text. The UI in this
# file wires the function to one text box followed by four audio players.
_MAX_AUDIO_LINES = 4


def _error_outputs(message):
    """Return the output list for a failed run: error text plus empty audio slots."""
    return [f"Error: {message}"] + [None] * _MAX_AUDIO_LINES


def _parse_utterances(raw_text):
    """Extract the spoken text of each "<label>: <text>" line of the model reply."""
    utterances = []
    for raw_line in raw_text.split('\n'):
        _, colon, text = raw_line.strip().partition(':')
        text = text.strip()
        # Skip lines without a colon, and lines with nothing after it (for
        # example a "Here is the conversation:" preamble); an empty turn would
        # shift the speaker alternation and send empty text to the TTS service.
        if colon and text:
            utterances.append(text)
    return utterances


def _synthesize_line(text, speaker_audio):
    """Synthesize one utterance with the TTS client and return a temp .wav path."""
    # NOTE(review): the endpoint is assumed to return a local file path, since
    # the result is opened with open() below -- confirm against the Space's API.
    tts_output = tts_client.predict(
        text=text,
        speaker_audio_path=handle_file(speaker_audio),
        temperature=0.75,
        api_name="/infer_EGTTS"
    )

    # Copy the clip into a temp file that is not deleted on close, so the
    # returned path stays valid after this function returns.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        with open(tts_output, "rb") as f:
            tmp.write(f.read())
    return tmp.name


def generate_conversation(subject, speaker1_audio, speaker2_audio):
    """Generate a short Darija conversation and one audio clip per line.

    Args:
        subject: Topic of the conversation, inserted into the model prompt.
        speaker1_audio: File path of the reference voice for odd-numbered lines.
        speaker2_audio: File path of the reference voice for even-numbered lines.

    Returns:
        A list of five items: the conversation text (or an "Error: ..." message)
        followed by four audio file paths. Missing clips are None. Failures are
        reported through the first item rather than raised.
    """
    # Validate inputs up front so the user gets a clear message instead of an
    # opaque exception raised later while preparing the TTS request.
    if not subject or not subject.strip():
        return _error_outputs("Please enter a subject for the discussion")
    if not speaker1_audio or not speaker2_audio:
        return _error_outputs("Please upload a reference audio clip for both speakers")

    try:
        prompt = f"""
        Generate a natural Moroccan Darija conversation in Arabic script only between two people about: "{subject}".
        Rules:
        - Use only Arabic script for Darija
        - Do not include any transliterations or translations
        - Do not include any Latin characters or parentheses
        - Use "Speaker 1" and "Speaker 2" as the speaker names
        
        Format:
        Speaker 1: [Arabic Darija text only]
        Speaker 2: [Arabic Darija text only]
        Speaker 1: [Arabic Darija text only]
        Speaker 2: [Arabic Darija text only]
        
        Keep it short and casual (4 lines).
        """

        print("Sending prompt to Gemini API...")
        response = model.generate_content(prompt)
        print(f"Gemini API Response: {response}")

        if not response or not response.text:
            print("No response text received from Gemini API")
            return _error_outputs("No response from the model")

        generated_text = response.text
        print(f"Generated text: {generated_text}")

        # Speaker labels are re-assigned by position (1, 2, 1, 2, ...) whatever
        # label the model actually wrote before the colon.
        utterances = _parse_utterances(generated_text)
        lines = [
            f"Speaker {index % 2 + 1}: {text}"
            for index, text in enumerate(utterances)
        ]
        print(f"Processed lines: {lines}")

        if not lines:
            print("No valid lines found in the response")
            return _error_outputs("No valid conversation generated")

        # Only the first _MAX_AUDIO_LINES clips are returned, so synthesize
        # only those; extra lines still appear in the conversation text.
        audio_paths = []
        for index, text in enumerate(utterances[:_MAX_AUDIO_LINES]):
            speaker_audio = speaker1_audio if index % 2 == 0 else speaker2_audio
            audio_paths.append(_synthesize_line(text, speaker_audio))

        # Pad so the output list always has the same length.
        audio_paths += [None] * (_MAX_AUDIO_LINES - len(audio_paths))

        return ["\n".join(lines)] + audio_paths

    except Exception as e:
        # Top-level UI boundary: report the failure in the text output instead
        # of letting the exception propagate to the caller.
        print(f"Error occurred: {str(e)}")
        return _error_outputs(str(e))

# Build the page: a topic box, two reference-voice uploads, a trigger button,
# the generated text, and one audio player per conversation line.
with gr.Blocks() as demo:
    gr.Markdown("# 🗣️ Moroccan Darija Conversation Generator")
    gr.Markdown("Enter a discussion topic and upload 2 speaker voices. We'll generate a Darija conversation!")

    # Inputs: the topic on its own row, then the two voice samples side by side.
    with gr.Row():
        subject = gr.Textbox(
            placeholder="e.g. Going to the souk",
            label="Subject of the discussion",
        )
    with gr.Row():
        speaker1 = gr.Audio(type="filepath", label="Speaker 1 Reference (4-5 sec)")
        speaker2 = gr.Audio(type="filepath", label="Speaker 2 Reference (4-5 sec)")

    btn = gr.Button("🎤 Generate Conversation")

    # Outputs: the full conversation text followed by four audio players
    # labelled "Line 1" .. "Line 4".
    conversation_output = gr.Textbox(lines=6, label="Generated Conversation")
    audio_outputs = [gr.Audio(label=f"Line {number}") for number in range(1, 5)]

    # The handler returns five values, matched in order to the text box and
    # the four audio players.
    btn.click(
        fn=generate_conversation,
        inputs=[subject, speaker1, speaker2],
        outputs=[conversation_output, *audio_outputs],
    )

demo.launch()