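# Sindhi text-to-speech demo (Hugging Face Space): a Gradio app that synthesizes Sindhi speech
# with a Piper voice downloaded from fahadqazi/piper-sindhi.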
import gradio as gr
import torch
import soundfile as sf
import spaces
import os
import numpy as np
import re
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, AutoTokenizer
from speechbrain.pretrained import EncoderClassifier
from datasets import load_dataset
from huggingface_hub import hf_hub_download
import uuid
import wave
import io
import tempfile
import shutil
from piper import PiperVoice
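# The transformers/speechbrain/datasets imports above are kept for the commented-out SpeechT5 path
# below and are unused by the active Piper pipeline.

# Download the Piper Sindhi voice (ONNX model + config) from the Hub and load it, using CUDA when available.
# "hf_token" is read from the environment (Space secret); the `or True` fallback reuses a cached login token.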
device = "cuda" if torch.cuda.is_available() else "cpu"
auth_token = os.environ.get("hf_token") or True

model_path = hf_hub_download(repo_id="fahadqazi/piper-sindhi", filename="model.onnx", use_auth_token=auth_token)
config_path = hf_hub_download(repo_id="fahadqazi/piper-sindhi", filename="model.onnx.json", use_auth_token=auth_token)
voice = PiperVoice.load(model_path=model_path, config_path=config_path, use_cuda=device == "cuda")
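# Default Piper synthesis options: single speaker (id 0) with 0.5 s of silence after each sentence.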
synthesize_args = {
    "speaker_id": 0,
    "sentence_silence": 0.5,
}
# def load_models_and_data():
#     auth_token = os.environ.get("hf_token") or True
#     model_name = "microsoft/speecht5_tts"
#     processor = SpeechT5Processor.from_pretrained(model_name)
#     tokenizer = AutoTokenizer.from_pretrained("fahadqazi/testts1234", use_auth_token=auth_token)
#     processor.tokenizer = tokenizer
#     model = SpeechT5ForTextToSpeech.from_pretrained("fahadqazi/testts1234", use_auth_token=auth_token).to(device)
#     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
#     return model, processor, vocoder

# model, processor, vocoder = load_models_and_data()

# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# speaker_embeddings = embeddings_dataset[7306]["xvector"]
# speaker_embeddings = torch.tensor(speaker_embeddings).to(device)
# default_embedding = speaker_embeddings
replacements = [
    # ("۾", "مين"), #
    # ("۽", "ائين"), #
]
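# Sindhi words for 0-20, the tens, 100 and 1000; used by number_to_words below.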
number_words = {
    0: "ٻڙي",
    1: "هڪ",
    2: "ٻہ",
    3: "ٽي",
    4: "چار",
    5: "پنج",
    6: "ڇه",
    7: "ست",
    8: "اٺ",
    9: "نوه",
    10: "ڏهہ",
    11: "يارنهن", 12: "ٻارنهن", 13: "تيرنهن", 14: "چوڏنهن", 15: "پنڌرنهن", 16: "سورنهن", 17: "سترنهن",
    18: "ارڙنهن", 19: "اوڻينهن", 20: "ويهہ", 30: "ٽيهہ", 40: "چاليهہ", 50: "پنجها", 60: "سٺ", 70: "ستر",
    80: "اسي", 90: "نوي", 100: "سوه", 1000: "هزار"
}
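# Spell out an integer in Sindhi words, recursing on the remainder; values of a trillion or more
# are returned unchanged as digits.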
def number_to_words(number):
    if number < 20:
        return number_words[number]
    elif number < 100:
        tens, unit = divmod(number, 10)
        # Unit word before the tens word; avoid a stray leading space for exact tens (20, 30, ...)
        return (number_words[unit] + " " if unit else "") + number_words[tens * 10]
    elif number < 1000:
        hundreds, remainder = divmod(number, 100)
        return (number_words[hundreds] + " سوه" if hundreds > 1 else "سوه") + (" " + number_to_words(remainder) if remainder else "")
    elif number < 1000000:
        thousands, remainder = divmod(number, 1000)
        return (number_to_words(thousands) + " هزار" if thousands > 1 else "هزار") + (" " + number_to_words(remainder) if remainder else "")
    elif number < 1000000000:
        millions, remainder = divmod(number, 1000000)
        return number_to_words(millions) + " ملين" + (" " + number_to_words(remainder) if remainder else "")
    elif number < 1000000000000:
        billions, remainder = divmod(number, 1000000000)
        return number_to_words(billions) + " بلين" + (" " + number_to_words(remainder) if remainder else "")
    else:
        return str(number)
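# Replace every standalone run of Western digits in the text with its Sindhi spelling.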
def replace_numbers_with_words(text):
    def replace(match):
        number = int(match.group())
        return number_to_words(number)

    # Find whole numbers and replace them with their word form
    result = re.sub(r'\b\d+\b', replace, text)
    return result
def normalize_text(text):
    # Convert to lowercase
    text = text.lower()

    # Replace numbers followed by "ع" with "عيسوي"
    text = re.sub(r'(\d+)\s*ع', r'\1 عيسوي', text)

    # Replace numbers with words
    text = replace_numbers_with_words(text)

    # Apply character replacements
    for old, new in replacements:
        text = text.replace(old, new)

    # # Remove punctuation
    # text = re.sub(r'[^\w\s]', '', text)

    return text
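# Main synthesis path: normalize the text, split it into sentence-like segments, run Piper on each
# segment, and stitch the audio together with short pauses into a single WAV file.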
def text_to_speech(text, audio_file=None):
    # Clear all outputs
    # yield gr.update(value=None), gr.update(value=None)

    # Normalize the input text
    normalized_text = normalize_text(text)
    print("Normalized text: ", normalized_text)

    # Split the text while preserving "..." (ellipsis)
    segments = re.split(r'(\.\.\.|[\n.۔])', normalized_text)
    segments = [x.strip() for x in segments]
    # print("segments: ", segments)

    # Merge the ellipsis back into the previous segment
    combined_segments = []
    temp_segment = ""
    for segment in segments:
        if segment == '...':
            temp_segment += " ..."  # Keep the ellipsis as part of the previous segment
        elif segment in ['.', '\n', '۔']:
            if temp_segment:
                combined_segments.append(temp_segment.strip())
                temp_segment = ""
        else:
            if temp_segment:
                combined_segments.append(temp_segment.strip())
            temp_segment = segment
    if temp_segment:
        combined_segments.append(temp_segment.strip())
    # print("combined_segments: ", combined_segments)

    # Silence lengths (50 ms for '.', '\n', '۔'; 150 ms for '...')
    short_silence = np.zeros(int(22050 * 0.05), dtype=np.int16)  # 50 ms pause
    long_silence = np.zeros(int(22050 * 0.15), dtype=np.int16)   # 150 ms pause for "..."

    # Create a temporary directory for storing individual segment WAV files
    temp_dir = tempfile.mkdtemp()

    try:
        output_file = f"{uuid.uuid4()}.wav"

        # Open the final output WAV file
        with sf.SoundFile(output_file, 'w', samplerate=22050, channels=1, subtype='PCM_16') as output:
            # Synthesize and save each segment to a WAV file
            for i, segment in enumerate(combined_segments):
                segment_path = os.path.join(temp_dir, f"segment_{i}.wav")
                with wave.open(segment_path, "wb") as wav_file:
                    voice.synthesize(segment, wav_file, **synthesize_args)

                # Read the segment and write it to the final output
                audio_segment, _ = sf.read(segment_path, dtype='int16')
                output.write(audio_segment)

                # Stream the current progress
                # yield output_file

                # Add silence after each segment
                if segment.endswith("...") or segment.endswith("…"):
                    output.write(long_silence)
                elif segment.endswith(".") or segment.endswith("\n") or segment.endswith("۔"):
                    output.write(short_silence)
    finally:
        # Clean up the temporary directory
        shutil.rmtree(temp_dir)

    # Return the final WAV file
    yield output_file
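# Example (direct call outside Gradio) -- a minimal sketch; text_to_speech is a generator, so take
# its single yielded value:
#   wav_path = next(text_to_speech("هيلو ڪهڙا حال آهن"))
#   print("wrote", wav_path)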
# def text_to_speech(text, audio_file=None):
#     # Normalize the input text
#     normalized_text = normalize_text(text)
#     print("normalized text: ", normalized_text)
#
#     # Generate speech: write to file
#     output_file = f"{uuid.uuid4()}.wav"
#     with wave.open(output_file, "wb") as wav_file:
#         voice.synthesize(normalized_text, wav_file, **synthesize_args)
#     return output_file
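# Gradio UI: one Sindhi textbox in, the generated speech audio out.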
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Enter Sindhi text to convert to speech", value="هيلو ڪهڙا حال آهن")
    ],
    outputs=[
        # text_to_speech yields a path to a WAV file, so expose the output as a filepath
        gr.Audio(label="Generated Speech", type="filepath")
    ],
    title="Sindhi Text-to-Speech Demo",
    description="Enter Sindhi text, and listen to the generated speech. Use shorter messages for better results."
)

iface.launch(share=True)