# voice_cloner.py
import os

import torch
import torchaudio
from transformers import AutoProcessor, BarkModel


def clone_and_generate_text(text, reference_audio_path, language="English", emotion="Neutral"):
    """Synthesize *text* with Bark and write the result to ``output_voice.wav``.

    The "suno/bark" processor and model are loaded on every call and moved to
    CUDA when available, otherwise CPU.

    NOTE(review): despite the function name, no voice cloning happens yet.
    The reference audio is loaded, resampled to 16 kHz and down-mixed to
    mono, but the result is never passed to the model; speech is always
    generated with the fixed ``v2/en_speaker_9`` preset.

    Args:
        text: Text to be spoken.
        reference_audio_path: Path to an audio file readable by
            ``torchaudio.load``. Currently only loaded and preprocessed,
            so an unreadable path still fails the call.
        language: Accepted for interface compatibility; not used.
        emotion: Accepted for interface compatibility; not used.

    Returns:
        The relative path of the written WAV file (``"output_voice.wav"``,
        resolved against the current working directory).

    Raises:
        Any exception raised by ``torchaudio.load`` for the reference file,
        or by model loading, generation, or saving, is propagated unchanged.
    """
    processor = AutoProcessor.from_pretrained("suno/bark")
    model = BarkModel.from_pretrained("suno/bark")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Load and process reference audio.
    # NOTE(review): `speech_array` is not consumed below. It is kept so an
    # invalid reference path keeps failing early, and as the hook for a
    # future cloning step.
    speech_array, sampling_rate = torchaudio.load(reference_audio_path)
    speech_array = torchaudio.functional.resample(speech_array, sampling_rate, 16000)
    speech_array = speech_array.mean(dim=0).unsqueeze(0)  # mono

    inputs = processor(
        text=text,
        voice_preset="v2/en_speaker_9",  # generic fallback voice
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        speech = model.generate(**inputs)

    # Use the rate the model actually generates at instead of a hard-coded
    # 22050 Hz; a mismatched rate makes playback slower and lower-pitched.
    sample_rate = model.generation_config.sample_rate

    output_path = "output_voice.wav"
    torchaudio.save(output_path, speech.cpu(), sample_rate)
    return output_path