import torch
import soundfile as sf
import os
import re
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
# Define paths and device
model_path = "HAMMALE/speecht5-darija"  # Path to your model on HF Hub
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Load models
processor = SpeechT5Processor.from_pretrained(model_path)
model = SpeechT5ForTextToSpeech.from_pretrained(model_path).to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
# Load speaker embedding model
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": device},
    savedir=os.path.join("/tmp", "spkrec-xvect-voxceleb"),
)
# Load pre-computed speaker embeddings; fall back to random vectors
# (random embeddings still run, but produce an arbitrary, unnatural voice)
male_embedding = (
    torch.load("male_embedding.pt")
    if os.path.exists("male_embedding.pt")
    else torch.randn(1, 512)
)
female_embedding = (
    torch.load("female_embedding.pt")
    if os.path.exists("female_embedding.pt")
    else torch.randn(1, 512)
)
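
# Optional sketch: derive a real (1, 512) x-vector from a reference recording
# using the speaker_model loaded above (which is otherwise unused in this
# script). "reference_male.wav" is an assumed 16 kHz mono file, not shipped
# with the repo.
def compute_speaker_embedding(wav_path):
    waveform, _sr = sf.read(wav_path)
    with torch.no_grad():
        emb = speaker_model.encode_batch(
            torch.tensor(waveform, dtype=torch.float32).unsqueeze(0)
        )
        emb = torch.nn.functional.normalize(emb, dim=2).squeeze(1).cpu()  # -> (1, 512)
    return emb

# e.g. torch.save(compute_speaker_embedding("reference_male.wav"), "male_embedding.pt")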
# Text normalization function
def normalize_text(text):
    """Normalize text for TTS processing."""
    text = text.lower()
    # Keep letters, digits, whitespace, apostrophes, and Arabic-block characters
    text = re.sub(r"[^\w\s'\u0600-\u06FF]", '', text)
    text = ' '.join(text.split())
    return text
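
# Illustrative behavior: punctuation is stripped while Latin, digit, and
# Arabic-script characters survive, e.g.
#   normalize_text("Wach nta mezyan?!")  ->  "wach nta mezyan"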
# Function to synthesize speech
def synthesize_speech(text, voice_type="male", speed=1.0):
    """Generate speech from text using the specified voice type."""
    try:
        # Select speaker embedding based on voice type
        if voice_type == "male":
            speaker_embeddings = male_embedding.to(device)
        else:
            speaker_embeddings = female_embedding.to(device)

        # Normalize and tokenize input text
        normalized_text = normalize_text(text)
        inputs = processor(text=normalized_text, return_tensors="pt").to(device)

        # Generate speech
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings,
                vocoder=vocoder,
            )
        # Convert to a NumPy array and adjust speed if needed
        speech_np = speech.cpu().numpy()

        # Simple speed adjustment by resampling: fewer (or more) samples played
        # back at a fixed rate shorten or stretch the audio, but this also
        # shifts pitch; a pitch-preserving alternative is sketched below.
        if speed != 1.0:
            from scipy import signal
            new_length = int(len(speech_np) / speed)
            speech_np = signal.resample(speech_np, new_length)
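        # Pitch-preserving alternative (an assumption: librosa is installed;
        # it is not used elsewhere in this script):
        #   import librosa
        #   speech_np = librosa.effects.time_stretch(speech_np, rate=speed)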
        # Save temporary audio file at the model's 16 kHz sampling rate
        output_file = "output_speech.wav"
        sf.write(output_file, speech_np, 16000)

        return output_file, None

    except Exception as e:
        return None, f"Error generating speech: {str(e)}"
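
# Quick smoke test (a sketch; assumes the models above loaded successfully):
#   wav_path, err = synthesize_speech("Salam, labas?", voice_type="female", speed=1.0)
#   print(wav_path or err)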
# Build the Gradio UI
import gradio as gr
# Custom CSS for better design
custom_css = """
.gradio-container {
    font-family: 'Poppins', 'Arial', sans-serif;
    max-width: 750px;
    margin: auto;
}
.main-header {
    background: linear-gradient(90deg, #c31432, #240b36);
    color: white;
    padding: 1.5em;
    border-radius: 10px;
    text-align: center;
    margin-bottom: 1em;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.main-header h1 {
    font-size: 2.2em;
    margin-bottom: 0.3em;
}
.main-header p {
    font-size: 1.1em;
    opacity: 0.9;
}
footer {
    text-align: center;
    margin-top: 2em;
    color: #555;
    font-size: 0.9em;
}
.flag-icon {
    width: 24px;
    height: 24px;
    vertical-align: middle;
    margin-right: 8px;
}
.example-header {
    font-weight: bold;
    color: #c31432;
    margin-top: 1em;
}
.info-box {
    background-color: #f9f9f9;
    border-left: 4px solid #c31432;
    padding: 1em;
    margin: 1em 0;
    border-radius: 5px;
}
.voice-selector {
    display: flex;
    justify-content: center;
    gap: 20px;
    margin: 10px 0;
}
.voice-option {
    border: 2px solid #ddd;
    border-radius: 10px;
    padding: 10px 15px;
    transition: all 0.3s ease;
    cursor: pointer;
}
.voice-option.selected {
    border-color: #c31432;
    background-color: #fff5f5;
}
.slider-container {
    margin: 20px 0;
}
"""
# Create Gradio interface with improved design
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(
        """
        <div class="main-header">
            <h1>🇲🇦 Moroccan Darija Text-to-Speech 🎧</h1>
            <p>Convert Moroccan Arabic (Darija) text into natural-sounding speech</p>
        </div>
        """
    )
    with gr.Row():
        with gr.Column():
            gr.HTML(
                """
                <div class="info-box">
                    <p>This model was fine-tuned on the DODa audio dataset to produce high-quality
                    Darija speech from text input. You can adjust the voice and speed below.</p>
                </div>
                """
            )

            text_input = gr.Textbox(
                label="Enter Darija Text",
                placeholder="Kteb chi jomla b darija hna...",
                lines=3
            )

            with gr.Row():
                voice_type = gr.Radio(
                    ["male", "female"],
                    label="Voice Type",
                    value="male"
                )
                speed = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speech Speed"
                )

            generate_btn = gr.Button("Generate Speech", variant="primary")

            gr.HTML(
                """
                <div class="example-header">Example phrases:</div>
                <ul>
                    <li>"Ana Nadi Bezzaaf hhh"</li>
                    <li>"Lyoum ajwaa zwina bezzaf."</li>
                    <li>"lmaghrib ahssan blad fi l3alam"</li>
                </ul>
                """
            )
        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech")
            # Hidden by default; error messages land here (set visible=True to surface them)
            error_output = gr.Textbox(label="Error (if any)", visible=False)

            gr.Examples(
                examples=[
                    ["Ana Nadi Bezzaaf hhh", "male", 1.0],
                    ["Lyoum ajwaa zwina bezzaf.", "female", 1.0],
                    ["lmaghrib ahssan blad fi l3alam", "male", 1.0],
                    ["Filistine hora mina lbar ila lbahr", "female", 0.8],
                ],
                inputs=[text_input, voice_type, speed],
                outputs=[audio_output, error_output],
                fn=synthesize_speech
            )
    gr.HTML(
        """
        <footer>
            <p>Developed by HAMMALE | Powered by Microsoft SpeechT5 | Data: DODa</p>
        </footer>
        """
    )
    # Set button click action
    generate_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_type, speed],
        outputs=[audio_output, error_output]
    )
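
# Note: synthesize_speech returns (audio_path, error_message); Gradio assigns
# the pair to audio_output and error_output in order, so a failure populates
# the (hidden) error box rather than raising in the UI.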
# Launch the demo
if __name__ == "__main__":
    demo.launch()