# Spaces: Running
import os | |
try: | |
from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_audioclips | |
print("Moviepy importado com sucesso!") | |
except ModuleNotFoundError: | |
print("Erro: 'moviepy' não está instalado. Adicione 'moviepy' ao requirements.txt ou instale com 'pip install moviepy'.") | |
raise SystemExit | |
from transformers import WhisperProcessor, WhisperForConditionalGeneration, M2M100Tokenizer, M2M100ForConditionalGeneration | |
from transformers import AutoProcessor, AutoModelForTextToSpeech | |
import librosa | |
import soundfile as sf | |
import numpy as np | |
import streamlit as st | |
# Loading the models
# Streamlit re-executes this script from the top on every widget interaction,
# so the loads go through a resource cache when one is available; otherwise
# every rerun would reload all six objects from disk.
_cache_resource = getattr(st, "cache_resource", None) or getattr(
    st, "experimental_singleton", None
)


def _load_models():
    """Load the speech-recognition, translation and text-to-speech models.

    Returns:
        A 6-tuple ``(whisper_processor, whisper_model, m2m_tokenizer,
        m2m_model, bark_processor, bark_model)`` built with
        ``from_pretrained`` from the checkpoints named below.
    """
    return (
        WhisperProcessor.from_pretrained("openai/whisper-small"),
        WhisperForConditionalGeneration.from_pretrained("openai/whisper-small"),
        M2M100Tokenizer.from_pretrained("facebook/m2m100_418M"),
        M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M"),
        AutoProcessor.from_pretrained("suno/bark-small"),
        AutoModelForTextToSpeech.from_pretrained("suno/bark-small"),
    )


if _cache_resource is not None:
    # Keep a single shared copy of the models across script reruns.
    _load_models = _cache_resource(_load_models)

(
    whisper_processor,
    whisper_model,
    m2m_tokenizer,
    m2m_model,
    bark_processor,
    bark_model,
) = _load_models()
# Adjust the playback speed of an audio signal
def adjust_speed(audio, original_duration, target_duration, sample_rate=24000):
    """Time-stretch ``audio`` so that it lasts ``target_duration``.

    Args:
        audio: Audio samples, passed to ``librosa.effects.time_stretch``.
        original_duration: Current duration of ``audio``.
        target_duration: Desired duration, in the same unit as
            ``original_duration`` (only their ratio is used).
        sample_rate: Unused by the computation; kept so existing callers
            that pass it keep working.

    Returns:
        The stretched audio, or ``audio`` itself (unchanged) when either
        duration is not positive, because no valid stretch rate exists then.
    """
    # A zero target would divide by zero, and a zero/negative rate is not a
    # meaningful stretch factor, so leave the signal as it is.
    if original_duration <= 0 or target_duration <= 0:
        return audio
    rate = original_duration / target_duration
    return librosa.effects.time_stretch(audio, rate=rate)
# Main function to process the video
def process_video(video_path):
    """Replace the audio of a video with a Portuguese dub of its speech.

    The audio track is extracted, transcribed with Whisper (language forced
    to English), translated segment by segment to Portuguese with M2M100,
    synthesized with Bark, time-stretched to each segment's original length
    and written into a silent track at the segment's start time. The result
    is muxed with the original video into ``output_video.mp4``.

    NOTE(review): the audio is passed to the Whisper processor in a single
    call; presumably only the first 30 seconds are transcribed - confirm
    against the installed transformers version.

    Args:
        video_path: Path of the input video file.

    Returns:
        ``"output_video.mp4"`` on success, otherwise a string starting with
        ``"Erro ao processar o vídeo: "`` followed by the exception message.
        The function does not raise.

    Side effects:
        Writes ``temp_audio.wav`` and ``new_audio.wav`` in the working
        directory and removes them before returning, on success or failure.
    """
    sample_rate = 24000  # rate at which the synthesized track is written
    temp_audio_path = "temp_audio.wav"
    new_audio_path = "new_audio.wav"
    output_path = "output_video.mp4"
    video = None
    new_audio = None
    try:
        video = VideoFileClip(video_path)
        original_duration = video.duration
        if video.audio is None:
            raise ValueError("o vídeo não possui faixa de áudio")
        video.audio.write_audiofile(temp_audio_path)

        segments = _transcribe_segments(temp_audio_path, original_duration)
        translated_segments = _translate_segments(segments)
        full_speech = _synthesize_timeline(
            translated_segments, original_duration, sample_rate
        )

        sf.write(new_audio_path, full_speech, sample_rate)
        new_audio = AudioFileClip(new_audio_path)
        # The track is built at least as long as the video, so this only trims.
        final_audio = new_audio.set_duration(original_duration)
        final_video = video.set_audio(final_audio)
        final_video.write_videofile(output_path)
        return output_path
    except Exception as e:
        return f"Erro ao processar o vídeo: {str(e)}"
    finally:
        # Release the readers before deleting the files they point at.
        for clip in (new_audio, video):
            if clip is not None:
                try:
                    clip.close()
                except Exception:
                    # Best-effort cleanup: a failing close must not mask the
                    # value being returned.
                    pass
        for temp_file in (temp_audio_path, new_audio_path):
            if os.path.exists(temp_file):
                os.remove(temp_file)


def _transcribe_segments(audio_path, fallback_end):
    """Transcribe ``audio_path`` into ``{"text", "start", "end"}`` segments (seconds)."""
    audio_data, sample_rate = librosa.load(audio_path, sr=16000)
    input_features = whisper_processor(
        audio_data, sampling_rate=sample_rate, return_tensors="pt"
    ).input_features
    predicted_ids = whisper_model.generate(
        input_features, return_timestamps=True, language="en"
    )
    # Whisper exposes timestamped segments through output_offsets; each entry
    # carries "text" and a "timestamp" (start, end) pair.
    decoded = whisper_processor.decode(
        predicted_ids[0], skip_special_tokens=True, output_offsets=True
    )

    segments = []
    for chunk in decoded.get("offsets", []):
        text = (chunk.get("text") or "").strip()
        start, end = chunk.get("timestamp", (None, None))
        if not text:
            continue
        start = 0.0 if start is None else float(start)
        end = float(fallback_end) if end is None else float(end)
        segments.append({"text": text, "start": start, "end": end})

    if not segments:
        # No timestamped segment was produced: treat the whole transcription
        # as one segment spanning the clip.
        text = (decoded.get("text") or "").strip()
        if text:
            segments.append({"text": text, "start": 0.0, "end": float(fallback_end)})
    return segments


def _translate_segments(segments):
    """Translate each segment's text from English to Portuguese, keeping its timing."""
    detected_lang = "en"
    m2m_tokenizer.src_lang = detected_lang
    target_lang_id = m2m_tokenizer.get_lang_id("pt")
    translated_segments = []
    for segment in segments:
        inputs = m2m_tokenizer(segment["text"], return_tensors="pt")
        translated_ids = m2m_model.generate(**inputs, forced_bos_token_id=target_lang_id)
        translated_text = m2m_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
        translated_segments.append(
            {"text": translated_text, "start": segment["start"], "end": segment["end"]}
        )
    return translated_segments


def _synthesize_timeline(translated_segments, total_duration, sample_rate):
    """Synthesize each segment and place it at its start time on a silent track."""
    # Round up so the track is never shorter than the video it is attached to.
    total_samples = max(int(np.ceil(total_duration * sample_rate)), 0)
    timeline = np.zeros(total_samples, dtype=np.float32)
    for segment in translated_segments:
        if not segment["text"].strip():
            continue
        inputs = bark_processor(segment["text"], return_tensors="pt")
        # generate() returns a batch; squeeze it to a 1-D waveform so that its
        # length is the number of samples.
        speech = bark_model.generate(**inputs).cpu().numpy().squeeze()
        speech = np.atleast_1d(speech).astype(np.float32)
        if speech.size == 0:
            continue
        generated_duration = speech.size / sample_rate
        target_duration = segment["end"] - segment["start"]
        if target_duration > 0:
            speech = adjust_speed(speech, generated_duration, target_duration)
        begin = int(round(segment["start"] * sample_rate))
        if begin < 0 or begin >= total_samples:
            continue
        finish = min(begin + len(speech), total_samples)
        timeline[begin:finish] = speech[: finish - begin]
    return timeline
# Create the silence file
def create_silence_file():
    """Write one second of silence to ``silence.wav`` at 24000 Hz."""
    sample_rate = 24000
    n_samples = int(sample_rate * 1.0)
    sf.write("silence.wav", np.zeros(n_samples), sample_rate)
# Streamlit interface
def main():
    """Render the upload page, run the dubbing pipeline and offer the result.

    The uploaded file is saved as ``input_video.mp4`` and passed to
    ``process_video``. A successful result is remembered in
    ``st.session_state`` so that script reruns for the same upload (for
    example after clicking the download button) do not process it again.
    """
    st.title("Conversor de Áudio de Vídeo para Português")
    st.write("Carregue um vídeo e obtenha uma versão com o áudio traduzido para português usando vozes realistas.")
    uploaded_file = st.file_uploader("Escolha um vídeo", type=["mp4", "avi", "mov"])
    if uploaded_file is None:
        return

    upload_key = (uploaded_file.name, uploaded_file.size)
    cached = st.session_state.get("processed_video")
    if cached is not None and cached[0] == upload_key and os.path.exists(cached[1]):
        result = cached[1]
    else:
        with open("input_video.mp4", "wb") as f:
            # getvalue() returns the full content regardless of the current
            # read position, unlike read() after the buffer has been consumed.
            f.write(uploaded_file.getvalue())
        with st.spinner("Processando..."):
            result = process_video("input_video.mp4")
        if os.path.exists(result):
            st.session_state["processed_video"] = (upload_key, result)

    # process_video returns either a file path or an error message.
    if os.path.exists(result):
        st.video(result)
        with open(result, "rb") as file:
            st.download_button("Baixar vídeo traduzido", file, file_name="output_video.mp4")
    else:
        st.error(result)
# Script entry point: write silence.wav first, then start the Streamlit UI.
if __name__ == "__main__":
    create_silence_file()
    main()