File size: 16,281 Bytes
6343c61
 
f7f638f
 
 
aa11dfb
01f44ce
6343c61
d7859f0
6343c61
 
f7f638f
890121e
cb986c3
f7f638f
 
 
 
6343c61
890121e
 
cb986c3
 
 
 
 
890121e
6343c61
8154f9c
6343c61
 
890121e
145b9ee
890121e
 
 
 
 
 
 
 
2e4aab1
890121e
2e4aab1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
890121e
 
96e6ee2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9cf6b58
96e6ee2
890121e
 
 
 
 
 
 
 
 
96e6ee2
890121e
96e6ee2
890121e
 
cb986c3
890121e
 
2e4aab1
890121e
2e4aab1
 
 
 
 
 
 
b219932
890121e
2e4aab1
890121e
 
 
49d166b
 
 
b219932
 
49d166b
2e4aab1
 
 
b219932
2e4aab1
 
 
 
 
 
890121e
 
 
96e6ee2
cb986c3
 
 
145b9ee
cb986c3
 
 
 
 
 
 
 
 
145b9ee
cb986c3
 
 
 
 
145b9ee
cb986c3
 
 
 
e48ab41
 
cb986c3
e48ab41
 
 
 
 
 
 
 
 
145b9ee
 
 
 
cb986c3
 
145b9ee
 
cb986c3
96e6ee2
 
 
 
 
890121e
53332ff
 
96e6ee2
890121e
cb986c3
890121e
 
53332ff
96e6ee2
 
 
 
2e078ec
d7859f0
890121e
 
 
96e6ee2
 
 
890121e
 
 
96e6ee2
890121e
145b9ee
890121e
96e6ee2
 
 
890121e
 
 
 
 
 
 
 
 
 
 
 
 
 
145b9ee
cb986c3
145b9ee
 
 
 
 
 
 
96e6ee2
 
 
 
 
 
 
 
 
 
 
 
cb986c3
96e6ee2
 
 
 
d7859f0
 
 
 
1ff44b2
96e6ee2
 
 
 
145b9ee
 
 
96e6ee2
 
145b9ee
 
96e6ee2
 
 
1ff44b2
96e6ee2
 
1ff44b2
96e6ee2
 
cb986c3
96e6ee2
 
 
 
 
cb986c3
96e6ee2
f7f638f
6343c61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01f44ce
 
 
6343c61
 
01f44ce
6343c61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ff44b2
 
6343c61
 
 
 
 
01f44ce
6343c61
01f44ce
 
6343c61
 
 
 
 
01f44ce
 
 
6343c61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01f44ce
6343c61
 
 
 
01f44ce
6343c61
 
 
 
 
a228aa2
6343c61
 
 
 
 
 
 
 
 
 
 
 
01f44ce
6343c61
 
 
cb986c3
01f44ce
6343c61
 
f7f638f
 
 
 
 
 
 
 
 
cb986c3
f7f638f
 
 
 
 
 
 
 
 
baf5fdc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
import os
import gradio as gr
import threading
import discord
from discord import app_commands
from typing import List
from elevenlabs import set_api_key, voices, generate, Voice, VoiceSettings, User
import tempfile
import io
import speech_recognition as sr
from pydub import AudioSegment
import logging
import google.generativeai as genai
import asyncio

# Set up logging: INFO-level root configuration plus the module-level logger
# used by both the Discord commands and the Gradio callbacks in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configure Gemini AI (used only for accent rewriting).
# NOTE: `model` is bound only when GEMINI_API_KEY is set, so any code that
# touches `model` must check GEMINI_API_KEY first.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
    model = genai.GenerativeModel('gemini-pro')
else:
    logger.warning("GEMINI_API_KEY not found! Accent modification will be disabled.")

# Set your ElevenLabs API key (read from the ELEVENLABS_API_KEY environment
# variable; the value is passed through as-is, including None when unset).
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
set_api_key(ELEVENLABS_API_KEY)

# Role configurations.
# ALLOWED_ROLES: Discord role names that grant access to voice generation.
# CHAR_LIMITS: maximum prompt length (in characters) per role name; the
# "Administrator" entry is used for members with the administrator permission.
ALLOWED_ROLES = ["+mechanic", "+trusted"]
CHAR_LIMITS = {
    "+mechanic": 500,
    "+trusted": 3000,
    "Administrator": float('inf')
}

# Accent configurations.
# ACCENT_OPTIONS: the accent names offered as choices on the /create command.
ACCENT_OPTIONS = ["American", "Arabian", "Russian"]
# ACCENT_PROMPTS: per-accent instruction text that is prepended to the user's
# text before it is sent to Gemini. Every prompt insists on plain output (no
# annotations, asterisks, or parentheses) because the result is fed directly
# to text-to-speech.
# NOTE: modify_accent returns the text unchanged for "American", so that
# entry is currently not sent to the model.
ACCENT_PROMPTS = {
    "Russian": """Modify this text to sound like someone speaking with a strong Russian accent. Rules:
                 1. Use simple Russian words like 'da', 'nyet', 'tovarisch' occasionally
                 2. Drop articles ('the', 'a') sometimes and insist on the 'r' sometimes
                 3. DO NOT add any annotations, asterisks, or explanations
                 4. DO NOT add formatting or parentheses
                 5. Return ONLY the modified text
                 Example input: "We will work tomorrow my friend"
                 Example output: Ve vill vork tomorrow, tovarisch""",
    "Arabian": """Modify this text to sound like someone speaking with an Arabian accent. Rules:
                 1. Use simple Arabic words like 'habibi', 'yalla', 'wallah' occasionally
                 2. Modify 'th' sounds to 'z' sometimes
                 3. DO NOT add any annotations, asterisks, or explanations
                 4. DO NOT add formatting or parentheses
                 5. Return ONLY the modified text
                 Example input: "Hello my friend, how are you today?"
                 Example output: Habibi, how are you zis beautiful day?""",
    "American": """Modify this text to sound like someone speaking with a strong American accent. Rules:
                  1. Use American colloquialisms
                  2. Add casual American phrases
                  3. DO NOT add any annotations, asterisks, or explanations
                  4. DO NOT add formatting or parentheses
                  5. Return ONLY the modified text
                  Example input: "Hello there, how are you?"
                  Example output: Howdy partner, how ya doin'?"""
}

def get_available_voices():
    """Return a name -> voice_id mapping of the account's non-premade voices."""
    custom_voices = {}
    for entry in voices():
        # Voices whose category is "premade" are skipped; everything else is kept.
        if entry.category != "premade":
            custom_voices[entry.name] = entry.voice_id
    return custom_voices

def get_remaining_credits():
    """Look up the ElevenLabs subscription's character usage and quota.

    Returns a dict with the keys "character_count" and "character_limit",
    copied from the subscription object of the current API user.
    """
    plan = User.from_api().subscription
    return dict(
        character_count=plan.character_count,
        character_limit=plan.character_limit,
    )

def format_credits_message(credits_info):
    """Render a credits dict as a one-line "count / limit" status string.

    ``credits_info`` must provide the keys "character_count" and
    "character_limit" (the shape returned by get_remaining_credits).
    """
    used = credits_info['character_count']
    limit = credits_info['character_limit']
    return f"Credits Status: {used} / {limit}"

def has_permission(member: discord.Member) -> tuple[bool, int]:
    """Check whether a member may generate voices and find their character limit.

    Administrators are always allowed and get the "Administrator" limit.
    Any other member needs at least one role listed in ALLOWED_ROLES. When a
    member holds several allowed roles, the most generous configured limit
    applies, so someone with both "+mechanic" and "+trusted" is not capped at
    the lower "+mechanic" limit just because it is listed first.

    Args:
        member: The guild member invoking the command.

    Returns:
        ``(allowed, char_limit)``. ``char_limit`` is 0 when the member is not
        allowed and may be ``float('inf')`` for administrators.
    """
    if member.guild_permissions.administrator:
        return True, CHAR_LIMITS["Administrator"]

    member_roles = {role.name for role in member.roles}
    granted_limits = [
        CHAR_LIMITS.get(role_name, 0)
        for role_name in ALLOWED_ROLES
        if role_name in member_roles
    ]
    if granted_limits:
        return True, max(granted_limits)

    return False, 0

async def modify_accent(text: str, accent: str, enhance: bool = False) -> str:
    """Rewrite ``text`` so it reads as if spoken with ``accent``, using Gemini AI.

    The function is best-effort: the original ``text`` is returned unchanged
    when Gemini is not configured, when no accent (or "American") is
    requested, when the accent has no prompt in ACCENT_PROMPTS, when the
    model returns nothing usable, or when the request raises.

    Args:
        text: The text to rewrite.
        accent: Accent name; expected to be a key of ACCENT_PROMPTS.
        enhance: When True, extra instructions asking for more cultural
            phrases and native words are appended to the prompt.

    Returns:
        The first line of the model's answer with ``*``, ``(`` and ``)``
        removed, or the original ``text`` on any of the fallbacks above.
    """
    if not GEMINI_API_KEY or not accent or accent == "American":
        return text

    # An unknown accent used to raise KeyError here (outside the try below);
    # treat it like every other failure and fall back to the original text.
    base_prompt = ACCENT_PROMPTS.get(accent)
    if base_prompt is None:
        logger.warning("Unknown accent %r; leaving text unchanged", accent)
        return text

    if enhance:
        base_prompt += """\n\nAdd more authentic elements:
                         1. Include more cultural phrases
                         2. Use more native words (but keep text mostly understandable)
                         3. Adjust speech patterns
                         BUT REMEMBER:
                         - DO NOT add any annotations or explanations
                         - DO NOT use asterisks or parentheses
                         - Return ONLY the modified text"""

    prompt = f"{base_prompt}\n\nInput text: {text}\nModified text:"

    try:
        response = await model.generate_content_async(prompt)
        # Get content from the first part of the first candidate
        parts = response.candidates[0].content.parts
        if not parts:
            return text

        modified_text = parts[0].text.strip()

        # Clean up any remaining annotations or formatting in a single pass,
        # then keep only the first line of the answer.
        modified_text = modified_text.translate(str.maketrans('', '', '*()'))
        modified_text = modified_text.split('\n')[0]

        # If the response is empty or still carries a label, return the
        # original. (Asterisks were already stripped above, so no separate
        # check for '**' is needed.)
        if not modified_text or 'Enhanced Text:' in modified_text:
            return text

        return modified_text
    except Exception as e:
        # Accent rewriting is optional; never let it break voice generation.
        logger.error("Error modifying accent: %s", e)
        return text

# Get available voices early.
# This runs once, at import time, and the resulting name -> voice_id mapping
# is shared by the Discord commands and the Gradio dropdowns. No visible code
# refreshes it afterwards, so voices added later presumably require a restart.
VOICE_LIST = get_available_voices()

# Discord bot setup
class VoiceBot(discord.Client):
    """Discord client that owns the slash-command tree for one configured guild."""

    def __init__(self):
        default_intents = discord.Intents.default()
        super().__init__(intents=default_intents)

        # Command tree that the module-level slash commands register on.
        self.tree = app_commands.CommandTree(self)
        # Guild to sync commands to; falls back to 0 when the variable is unset.
        self.guild_id = int(os.getenv('DISCORD_GUILD_ID', '0'))
        self.activity = discord.Activity(
            name="voice creation | /create /list",
            type=discord.ActivityType.watching,
        )

    async def setup_hook(self):
        """This is called when the bot starts up; syncs commands to the guild."""
        target_guild = discord.Object(id=self.guild_id)
        self.tree.copy_global_to(guild=target_guild)
        await self.tree.sync(guild=target_guild)

# Single shared bot instance; `tree` is an alias for its command tree so the
# slash commands below can be registered with `@tree.command(...)`.
client = VoiceBot()
tree = client.tree

@tree.command(name="list", description="List all available voices", guild=discord.Object(id=int(os.getenv('DISCORD_GUILD_ID', '0'))))
async def voice_list(interaction: discord.Interaction):
    """Slash command ``/list``: show the available voices and the credit status.

    The response is deferred first, then an embed listing every name in
    VOICE_LIST followed by the formatted credits line is sent as a follow-up.
    """
    await interaction.response.defer()
    voice_lines = "\n".join(f"• {name}" for name in VOICE_LIST)

    # get_remaining_credits() is a synchronous call; run it in a worker thread
    # so the bot's event loop stays responsive while it waits.
    credits_info = await asyncio.to_thread(get_remaining_credits)
    credits_msg = format_credits_message(credits_info)

    embed = discord.Embed(
        title="Available Voices",
        description=f"{voice_lines}\n\n{credits_msg}",
        color=0x2B2D31
    )
    await interaction.followup.send(embed=embed)

async def voice_autocomplete(interaction: discord.Interaction, current: str) -> List[app_commands.Choice[str]]:
    """Suggest voice names containing ``current`` (case-insensitive), at most 25."""
    needle = current.lower()
    suggestions = []
    for candidate in VOICE_LIST.keys():
        if needle in candidate.lower():
            suggestions.append(app_commands.Choice(name=candidate, value=candidate))
    # Only the first 25 matches are returned.
    return suggestions[:25]

def _synthesize_wav(text, voice_id, stability, clarity, style):
    """Generate speech for ``text`` and return the path of a temporary WAV file.

    The audio returned by ElevenLabs is decoded as MP3 and re-encoded as
    22050 Hz, mono, 16-bit WAV. This function is synchronous; the caller owns
    the returned file and must delete it.
    """
    voice_settings = VoiceSettings(
        stability=stability,
        similarity_boost=clarity,
        style=style,
        use_speaker_boost=True
    )
    audio = generate(
        text=text,
        voice=Voice(
            voice_id=voice_id,
            settings=voice_settings
        )
    )

    audio_segment = AudioSegment.from_file(io.BytesIO(audio), format="mp3")
    audio_segment = audio_segment.set_frame_rate(22050).set_channels(1).set_sample_width(2)

    # Reserve a unique path, close our handle, then let pydub write to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
        temp_path = temp_file.name
    try:
        audio_segment.export(temp_path, format='wav')
    except Exception:
        # Do not leave an empty/partial file behind when the export fails.
        os.unlink(temp_path)
        raise
    return temp_path


@tree.command(name="create", description="Create a voice message", guild=discord.Object(id=int(os.getenv('DISCORD_GUILD_ID', '0'))))
@app_commands.describe(
    text="Text to convert to speech",
    voice_name="Select a voice to use",
    stability="Voice stability (0-1)",
    clarity="Voice clarity (0-1)",
    style="Speaking style (0-1)",
    accent="Select an accent style (optional)",
    accent_enhance="Add cultural elements to enhance the accent"
)
@app_commands.choices(accent=[
    app_commands.Choice(name=accent, value=accent)
    for accent in ACCENT_OPTIONS
])
@app_commands.autocomplete(voice_name=voice_autocomplete)
async def voice_create(
    interaction: discord.Interaction,
    text: str,
    voice_name: str,
    stability: float = 0.5,
    clarity: float = 0.75,
    style: float = 0.5,
    accent: str = None,
    accent_enhance: bool = False
):
    """Slash command ``/create``: turn text into a voice message.

    Steps: check the caller's permission and character limit, validate the
    voice name, optionally rewrite the text for the chosen accent, synthesize
    the audio, and reply with an embed plus the WAV file. Every failure is
    reported to the user through a follow-up message.

    Args:
        interaction: The Discord interaction for this command invocation.
        text: Text to convert to speech.
        voice_name: Key of VOICE_LIST selecting the voice.
        stability: Voice stability setting (described to users as 0-1).
        clarity: Passed to ElevenLabs as ``similarity_boost``.
        style: Speaking style setting.
        accent: Optional accent name from ACCENT_OPTIONS.
        accent_enhance: Whether to ask for extra cultural elements.
    """
    await interaction.response.defer()

    # Check permissions
    has_perm, char_limit = has_permission(interaction.user)
    if not has_perm:
        embed = discord.Embed(
            title="Permission Denied",
            description="You need to be an administrator or have the +mechanic/+trusted role to use this command.",
            color=0xFF0000
        )
        await interaction.followup.send(embed=embed)
        return

    # Check character limit
    if len(text) > char_limit:
        embed = discord.Embed(
            title="Character Limit Exceeded",
            description=f"Your message exceeds your character limit of {char_limit}. Current length: {len(text)}",
            color=0xFF0000
        )
        await interaction.followup.send(embed=embed)
        return

    # Validate the voice before the accent rewrite so that a request that is
    # going to be rejected anyway does not cost a Gemini call.
    if voice_name not in VOICE_LIST:
        embed = discord.Embed(
            title="Voice Not Found",
            description=f"The voice '{voice_name}' was not found. Use `/list` to see available voices.",
            color=0x2B2D31
        )
        await interaction.followup.send(embed=embed)
        return

    # Process accent if specified
    if accent:
        text = await modify_accent(text, accent, accent_enhance)

    temp_path = None
    try:
        # Synthesis, transcoding and the credits lookup are synchronous;
        # run them in worker threads so the event loop is not blocked.
        temp_path = await asyncio.to_thread(
            _synthesize_wav, text, VOICE_LIST[voice_name], stability, clarity, style
        )
        credits_info = await asyncio.to_thread(get_remaining_credits)
        credits_msg = format_credits_message(credits_info)

        accent_info = f"\nAccent: {accent}" if accent else ""
        accent_enhance_info = f"\nAccent Enhancement: {'On' if accent_enhance else 'Off'}" if accent else ""

        embed = discord.Embed(
            title="Voice Generated",
            description=f"Prompt: {text}\nVoice: {voice_name}\nStability: {stability}\nClarity: {clarity}\nStyle: {style}{accent_info}{accent_enhance_info}\n\n{credits_msg}",
            color=0x57F287
        )
        await interaction.followup.send(
            embed=embed,
            file=discord.File(temp_path)
        )

    except Exception as e:
        logger.error(f"Error generating audio: {str(e)}")
        await interaction.followup.send(f"Error generating audio: {str(e)}")
    finally:
        # Always remove the temporary WAV, including when sending failed.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError as cleanup_error:
                logger.warning("Could not remove temp file %s: %s", temp_path, cleanup_error)

@client.event
async def on_ready():
    """Log the logged-in account and publish the bot's configured activity."""
    logger.info(f"Bot is ready and logged in as {client.user}")
    configured_activity = client.activity
    await client.change_presence(activity=configured_activity)

# Gradio interface functions
def text_to_speech(text, voice_name, stability, clarity, style):
    """Convert text to speech using the selected voice and parameters.

    Args:
        text: Text to synthesize; must not be empty or whitespace-only.
        voice_name: Key of VOICE_LIST selecting the voice.
        stability: Voice stability setting.
        clarity: Passed to ElevenLabs as ``similarity_boost``.
        style: Speaking style setting.

    Returns:
        ``(mp3_path, credits_message)``: the path of a temporary MP3 file
        holding the generated audio (it is not deleted here, so the caller
        can still read it) and the formatted credits line.

    Raises:
        gr.Error: If ``text`` is empty or ``voice_name`` is not a known voice,
            e.g. when no voice has been picked in the dropdown. Without this
            check the lookup below fails with a bare ``KeyError``.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert.")
    if voice_name not in VOICE_LIST:
        raise gr.Error("Please select a valid voice.")

    voice_settings = VoiceSettings(
        stability=stability,
        similarity_boost=clarity,
        style=style,
        use_speaker_boost=True
    )

    audio = generate(
        text=text,
        voice=Voice(
            voice_id=VOICE_LIST[voice_name],
            settings=voice_settings
        )
    )

    credits_info = get_remaining_credits()
    credits_message = format_credits_message(credits_info)

    # delete=False: the file must outlive this function so it can be served.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
        temp_file.write(audio)
        output_path = temp_file.name
    return output_path, credits_message

def speech_to_text(audio_file):
    """Convert speech to text using speech recognition.

    The input is converted to a temporary WAV file, loaded with
    ``speech_recognition`` and sent to the Google recognizer. The temporary
    file is always removed, including when conversion or loading fails.

    Args:
        audio_file: Path (or file-like object) of the audio to transcribe.

    Returns:
        The recognized text, or a message starting with "Error" or
        "Could not" when there is no input, the audio is not understood, or
        the recognition service request fails.
    """
    if not audio_file:
        return "Error: no audio provided"

    recognizer = sr.Recognizer()

    # mkstemp creates the file atomically; tempfile.mktemp only invents a name
    # and is deprecated because another process can claim it first.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        AudioSegment.from_file(audio_file).export(wav_path, format="wav")

        with sr.AudioFile(wav_path) as source:
            audio_data = recognizer.record(source)

        # The whole recording is in memory now, so the file is already closed
        # before recognition runs and before it is deleted below.
        try:
            return recognizer.recognize_google(audio_data)
        except sr.UnknownValueError:
            return "Could not understand audio"
        except sr.RequestError:
            return "Error in speech recognition service"
    finally:
        os.unlink(wav_path)

def speech_to_speech(audio_file, voice_name, stability, clarity, style):
    """Transcribe the input audio, then synthesize the transcript in the chosen voice.

    Returns ``(audio_path, recognized_text, credits_message)``. When the
    transcription step reports a failure, the audio path is None, the failure
    message takes the place of the recognized text, and the credits message
    is empty.
    """
    recognized = speech_to_text(audio_file)

    # speech_to_text signals failure through messages with these prefixes.
    if recognized.startswith(("Error", "Could not")):
        return None, recognized, ""

    generated_audio, credits_message = text_to_speech(
        recognized, voice_name, stability, clarity, style
    )
    return generated_audio, recognized, credits_message

# Create Gradio interface: two tabs (Text to Speech, Speech to Speech) that
# share the voice list and call the helper functions defined above.
with gr.Blocks() as demo:
    gr.Markdown("# ElevenLabs Voice Generation")

    # Credits shown in the header are fetched once, when the UI is built.
    credits_info = get_remaining_credits()
    credits_display = gr.Markdown(format_credits_message(credits_info))

    with gr.Tab("Text to Speech"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(label="Text to convert", lines=5)
                voice_dropdown = gr.Dropdown(choices=list(VOICE_LIST.keys()), label="Select Voice")

                with gr.Row():
                    stability = gr.Slider(minimum=0, maximum=1, value=0.5, label="Stability")
                    clarity = gr.Slider(minimum=0, maximum=1, value=0.75, label="Clarity/Similarity Boost")
                    style = gr.Slider(minimum=0, maximum=1, value=0.5, label="Style")

                convert_btn = gr.Button("Convert")

            with gr.Column():
                audio_output = gr.Audio(label="Generated Audio")
                credits_output = gr.Markdown()

        convert_btn.click(
            fn=text_to_speech,
            inputs=[text_input, voice_dropdown, stability, clarity, style],
            outputs=[audio_output, credits_output]
        )

    with gr.Tab("Speech to Speech"):
        with gr.Row():
            with gr.Column():
                # type="filepath": speech_to_speech passes this value on to
                # speech_to_text, which loads it with AudioSegment.from_file
                # and therefore needs a path rather than Gradio's default
                # (sample_rate, numpy array) tuple.
                audio_input = gr.Audio(label="Input Audio", sources=["microphone", "upload"], type="filepath")
                voice_dropdown_s2s = gr.Dropdown(choices=list(VOICE_LIST.keys()), label="Select Voice")

                with gr.Row():
                    stability_s2s = gr.Slider(minimum=0, maximum=1, value=0.5, label="Stability")
                    clarity_s2s = gr.Slider(minimum=0, maximum=1, value=0.75, label="Clarity/Similarity Boost")
                    style_s2s = gr.Slider(minimum=0, maximum=1, value=0.5, label="Style")

                convert_btn_s2s = gr.Button("Convert")

            with gr.Column():
                text_output = gr.Textbox(label="Recognized Text", lines=3)
                audio_output_s2s = gr.Audio(label="Generated Audio")
                credits_output_s2s = gr.Markdown()

        convert_btn_s2s.click(
            fn=speech_to_speech,
            # Use this tab's own sliders; the stability input previously
            # pointed at the Text to Speech tab's slider by mistake.
            inputs=[audio_input, voice_dropdown_s2s, stability_s2s, clarity_s2s, style_s2s],
            outputs=[audio_output_s2s, text_output, credits_output_s2s]
        )

def start_discord_bot():
    """Run the Discord client if DISCORD_BOT_TOKEN is set; otherwise log and return.

    Any exception raised while starting or running the client is logged
    rather than propagated.
    """
    token = os.getenv("DISCORD_BOT_TOKEN")
    if token:
        logger.info("Starting Discord bot...")
        try:
            # Install a fresh event loop as the current one before running.
            fresh_loop = asyncio.new_event_loop()
            asyncio.set_event_loop(fresh_loop)
            client.run(token)
        except Exception as e:
            logger.error(f"Failed to start Discord bot: {str(e)}")
    else:
        logger.error("DISCORD_BOT_TOKEN not found!")

# Start Discord bot in a separate thread so it runs alongside the Gradio
# server. daemon=True means this thread does not keep the process alive once
# the main thread exits.
discord_thread = threading.Thread(target=start_discord_bot, daemon=True)
discord_thread.start()

# Launch Gradio interface on all network interfaces (0.0.0.0); share=False
# means no public Gradio share link is requested.
demo.launch(server_name="0.0.0.0", share=False)