import torch
import soundfile as sf
import os
import re
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
# Define paths and device
model_path = "HAMMALE/speecht5-darija"  # Path to your model on HF Hub
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Load models
processor = SpeechT5Processor.from_pretrained(model_path)
model = SpeechT5ForTextToSpeech.from_pretrained(model_path).to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
# Load speaker embedding model
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": device},
    savedir=os.path.join("/tmp", "spkrec-xvect-voxceleb"),
)
# Load pre-computed speaker embeddings; fall back to random vectors
# (random embeddings still run, but produce an arbitrary, unnatural voice)
male_embedding = (
    torch.load("male_embedding.pt")
    if os.path.exists("male_embedding.pt")
    else torch.randn(1, 512)
)
female_embedding = (
    torch.load("female_embedding.pt")
    if os.path.exists("female_embedding.pt")
    else torch.randn(1, 512)
)
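
# Optional sketch: derive a real (1, 512) x-vector from a reference recording
# using the speaker_model loaded above (which is otherwise unused in this
# script). "reference_male.wav" is an assumed 16 kHz mono file, not shipped
# with the repo.
def compute_speaker_embedding(wav_path):
    waveform, _sr = sf.read(wav_path)
    with torch.no_grad():
        emb = speaker_model.encode_batch(
            torch.tensor(waveform, dtype=torch.float32).unsqueeze(0)
        )
        emb = torch.nn.functional.normalize(emb, dim=2).squeeze(1).cpu()  # -> (1, 512)
    return emb

# e.g. torch.save(compute_speaker_embedding("reference_male.wav"), "male_embedding.pt")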
# Text normalization function
def normalize_text(text):
    """Normalize text for TTS processing."""
    text = text.lower()
    # Keep letters, digits, whitespace, apostrophes, and Arabic-block characters
    text = re.sub(r"[^\w\s'\u0600-\u06FF]", '', text)
    text = ' '.join(text.split())
    return text
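
# Illustrative behavior: punctuation is stripped while Latin, digit, and
# Arabic-script characters survive, e.g.
#   normalize_text("Wach nta mezyan?!")  ->  "wach nta mezyan"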
# Function to synthesize speech
def synthesize_speech(text, voice_type="male", speed=1.0):
    """Generate speech from text using the specified voice type."""
    try:
        # Select speaker embedding based on voice type
        if voice_type == "male":
            speaker_embeddings = male_embedding.to(device)
        else:
            speaker_embeddings = female_embedding.to(device)

        # Normalize and tokenize input text
        normalized_text = normalize_text(text)
        inputs = processor(text=normalized_text, return_tensors="pt").to(device)

        # Generate speech
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings,
                vocoder=vocoder,
            )
        # Convert to a NumPy array and adjust speed if needed
        speech_np = speech.cpu().numpy()

        # Simple speed adjustment by resampling: fewer (or more) samples played
        # back at a fixed rate shorten or stretch the audio, but this also
        # shifts pitch; a pitch-preserving alternative is sketched below.
        if speed != 1.0:
            from scipy import signal
            new_length = int(len(speech_np) / speed)
            speech_np = signal.resample(speech_np, new_length)
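        # Pitch-preserving alternative (an assumption: librosa is installed;
        # it is not used elsewhere in this script):
        #   import librosa
        #   speech_np = librosa.effects.time_stretch(speech_np, rate=speed)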
        # Save temporary audio file at the model's 16 kHz sampling rate
        output_file = "output_speech.wav"
        sf.write(output_file, speech_np, 16000)

        return output_file, None

    except Exception as e:
        return None, f"Error generating speech: {str(e)}"
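
# Quick smoke test (a sketch; assumes the models above loaded successfully):
#   wav_path, err = synthesize_speech("Salam, labas?", voice_type="female", speed=1.0)
#   print(wav_path or err)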
# Build the Gradio UI
import gradio as gr
# Custom CSS for better design
custom_css = """
.gradio-container {
    font-family: 'Poppins', 'Arial', sans-serif;
    max-width: 750px;
    margin: auto;
}
.main-header {
    background: linear-gradient(90deg, #c31432, #240b36);
    color: white;
    padding: 1.5em;
    border-radius: 10px;
    text-align: center;
    margin-bottom: 1em;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.main-header h1 {
    font-size: 2.2em;
    margin-bottom: 0.3em;
}
.main-header p {
    font-size: 1.1em;
    opacity: 0.9;
}
footer {
    text-align: center;
    margin-top: 2em;
    color: #555;
    font-size: 0.9em;
}
.flag-icon {
    width: 24px;
    height: 24px;
    vertical-align: middle;
    margin-right: 8px;
}
.example-header {
    font-weight: bold;
    color: #c31432;
    margin-top: 1em;
}
.info-box {
    background-color: #f9f9f9;
    border-left: 4px solid #c31432;
    padding: 1em;
    margin: 1em 0;
    border-radius: 5px;
}
.voice-selector {
    display: flex;
    justify-content: center;
    gap: 20px;
    margin: 10px 0;
}
.voice-option {
    border: 2px solid #ddd;
    border-radius: 10px;
    padding: 10px 15px;
    transition: all 0.3s ease;
    cursor: pointer;
}
.voice-option.selected {
    border-color: #c31432;
    background-color: #fff5f5;
}
.slider-container {
    margin: 20px 0;
}
"""
# Create Gradio interface with improved design
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(
        """
        <div class="main-header">
            <h1>🇲🇦 Moroccan Darija Text-to-Speech 🎧</h1>
            <p>Convert Moroccan Arabic (Darija) text into natural-sounding speech</p>
        </div>
        """
    )
    with gr.Row():
        with gr.Column():
            gr.HTML(
                """
                <div class="info-box">
                    <p>This model was fine-tuned on the DODa audio dataset to produce high-quality
                    Darija speech from text input. You can adjust the voice and speed below.</p>
                </div>
                """
            )

            text_input = gr.Textbox(
                label="Enter Darija Text",
                placeholder="Kteb chi jomla b darija hna...",
                lines=3
            )

            with gr.Row():
                voice_type = gr.Radio(
                    ["male", "female"],
                    label="Voice Type",
                    value="male"
                )
                speed = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speech Speed"
                )

            generate_btn = gr.Button("Generate Speech", variant="primary")

            gr.HTML(
                """
                <div class="example-header">Example phrases:</div>
                <ul>
                    <li>"Ana Nadi Bezzaaf hhh"</li>
                    <li>"Lyoum ajwaa zwina bezzaf."</li>
                    <li>"lmaghrib ahssan blad fi l3alam"</li>
                </ul>
                """
            )
        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech")
            # Hidden by default; error messages land here (set visible=True to surface them)
            error_output = gr.Textbox(label="Error (if any)", visible=False)

            gr.Examples(
                examples=[
                    ["Ana Nadi Bezzaaf hhh", "male", 1.0],
                    ["Lyoum ajwaa zwina bezzaf.", "female", 1.0],
                    ["lmaghrib ahssan blad fi l3alam", "male", 1.0],
                    ["Filistine hora mina lbar ila lbahr", "female", 0.8],
                ],
                inputs=[text_input, voice_type, speed],
                outputs=[audio_output, error_output],
                fn=synthesize_speech
            )
    gr.HTML(
        """
        <footer>
            <p>Developed by HAMMALE | Powered by Microsoft SpeechT5 | Data: DODa</p>
        </footer>
        """
    )
    # Set button click action
    generate_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_type, speed],
        outputs=[audio_output, error_output]
    )
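
# Note: synthesize_speech returns (audio_path, error_message); Gradio assigns
# the pair to audio_output and error_output in order, so a failure populates
# the (hidden) error box rather than raising in the UI.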
# Launch the demo
if __name__ == "__main__":
    demo.launch()