"""Streamlit front-end for OuteTTS speech synthesis with optional voice cloning."""

import os
import tempfile

import outetts
import streamlit as st
from pydub import AudioSegment


@st.cache_resource
def load_interface():
    """Build the OuteTTS interface once per server process.

    Streamlit re-executes this script on every widget interaction;
    ``st.cache_resource`` prevents reloading the 500M model each rerun.
    """
    model_config = outetts.HFModelConfig_v1(
        model_path="OuteAI/OuteTTS-0.2-500M",
        language="en",  # Supported languages: en, zh, ja, ko
    )
    return outetts.InterfaceHF(model_version="0.2", cfg=model_config)


interface = load_interface()

# Streamlit UI
st.title("OuteTTS Speech Synthesis")
st.write("Enter text below to generate speech.")

# Sidebar for reference voice
st.sidebar.title("Voice Cloning")
reference_audio = st.sidebar.file_uploader(
    "Upload a reference audio (any format)",
    type=["wav", "mp3", "ogg", "flac", "m4a"],
)


def convert_to_wav(audio_file):
    """Transcode an uploaded audio file to a temporary WAV file.

    Args:
        audio_file: File-like object (e.g. a Streamlit ``UploadedFile``)
            in any format pydub/ffmpeg can decode.

    Returns:
        Filesystem path of the temporary ``.wav`` file. The caller is
        responsible for deleting it.
    """
    # mkstemp + close avoids leaking an open descriptor (the original kept
    # a NamedTemporaryFile handle open, which also breaks re-open on Windows).
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    AudioSegment.from_file(audio_file).export(wav_path, format="wav")
    return wav_path


ref_audio_path = convert_to_wav(reference_audio) if reference_audio else None

# Recording functionality (placeholder)
if ref_audio_path is None:
    st.sidebar.write("Or record your voice below:")
    if st.sidebar.button("Record Voice"):
        st.sidebar.warning("Recording functionality not implemented yet. Please upload a file.")

text_input = st.text_area("Text to convert to speech:", "Hello, this is an AI-generated voice.")

if st.button("Generate Speech"):
    with st.spinner("Generating audio..."):
        try:
            # NOTE(review): OuteTTS's InterfaceHF.generate takes a speaker
            # profile via `speaker=` (built with create_speaker); the original
            # `speaker_wav=` kwarg is a coqui-TTS parameter and would raise a
            # TypeError here. Confirm against the installed outetts version.
            speaker = (
                interface.create_speaker(audio_path=ref_audio_path)
                if ref_audio_path
                else None
            )
            output = interface.generate(
                text=text_input,
                temperature=0.1,
                repetition_penalty=1.1,
                max_length=4096,
                speaker=speaker,
            )

            # Save the synthesized speech and play it back in the app.
            output_path = "output.wav"
            output.save(output_path)
            st.audio(output_path, format="audio/wav")
            st.success("Speech generated successfully!")
        finally:
            # Remove the transcoded reference file even if generation fails,
            # so failed runs don't accumulate temp WAVs.
            if ref_audio_path:
                os.remove(ref_audio_path)