# traducao-videos / app.py
# Streamlit app: transcribe a video's audio, translate it to Portuguese,
# and re-dub the video with synthesized speech.
import os

# Guard the moviepy import so a missing dependency yields a clear, actionable
# message instead of a bare traceback.
try:
    from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_audioclips
    print("Moviepy importado com sucesso!")
except ModuleNotFoundError:
    print("Erro: 'moviepy' não está instalado. Adicione 'moviepy' ao requirements.txt ou instale com 'pip install moviepy'.")
    # FIX: a bare `raise SystemExit` exits with status 0, which makes the
    # failure invisible to the process supervisor; exit nonzero instead.
    raise SystemExit(1)
from transformers import WhisperProcessor, WhisperForConditionalGeneration, M2M100Tokenizer, M2M100ForConditionalGeneration
# FIX: transformers exposes no `AutoModelForTextToSpeech` class, so the
# original import raised ImportError at startup; Bark (a text-to-waveform
# model) is loaded through `AutoModelForTextToWaveform` instead.
from transformers import AutoProcessor, AutoModelForTextToWaveform
import librosa
import soundfile as sf
import numpy as np
import streamlit as st

# Load all models once at module import time, shared across requests.
# Speech-to-text: transcription with timestamps.
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-small")
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
# Machine translation: many-to-many model, used here for en -> pt.
m2m_tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
m2m_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
# Text-to-speech: Bark (small variant), 24 kHz output.
bark_processor = AutoProcessor.from_pretrained("suno/bark-small")
bark_model = AutoModelForTextToWaveform.from_pretrained("suno/bark-small")
# Adjust the playback speed of an audio segment to match a target duration.
def adjust_speed(audio, original_duration, target_duration, sample_rate=24000):
    """Time-stretch *audio* so that it lasts *target_duration* seconds.

    Pitch is preserved (librosa time-stretching), only tempo changes.

    Args:
        audio: 1-D mono waveform (float array) to stretch.
        original_duration: Current duration of *audio*, in seconds.
        target_duration: Desired duration, in seconds.
        sample_rate: Unused; kept for backward compatibility with callers.

    Returns:
        The time-stretched waveform, or *audio* unchanged for degenerate
        (non-positive) durations.
    """
    # FIX: guard against ZeroDivisionError / nonsensical stretch factors for
    # zero-length segments (possible when word offsets have length 0).
    if original_duration <= 0 or target_duration <= 0:
        return audio
    # rate > 1 speeds the audio up (shorter); rate < 1 slows it down.
    rate = original_duration / target_duration
    return librosa.effects.time_stretch(audio, rate=rate)
# Main pipeline: transcribe, translate and re-dub a video.
def process_video(video_path):
    """Transcribe a video's audio, translate it to Portuguese and re-dub it.

    Pipeline: extract audio -> Whisper transcription with word offsets ->
    M2M100 translation (en -> pt) -> Bark speech synthesis per segment ->
    time-stretch each segment to its original timing -> mux the new audio
    back onto the video.

    Args:
        video_path: Path to the input video file.

    Returns:
        The output video path ("output_video.mp4") on success, or an error
        message string (prefixed "Erro ao processar o vídeo:") on failure.
    """
    try:
        video = VideoFileClip(video_path)
        original_duration = video.duration
        audio = video.audio
        audio.write_audiofile("temp_audio.wav")
        # Whisper expects 16 kHz mono input.
        audio_data, sample_rate = librosa.load("temp_audio.wav", sr=16000)
        input_features = whisper_processor(audio_data, sampling_rate=sample_rate, return_tensors="pt").input_features
        predicted_ids = whisper_model.generate(input_features, return_timestamps=True, language="en")
        # NOTE(review): `output_word_offsets` is a Wav2Vec2-CTC tokenizer kwarg;
        # the Whisper tokenizer exposes segment offsets via `output_offsets=True`
        # and may return a plain string here — verify against the installed
        # transformers version.
        transcription = whisper_processor.decode(predicted_ids[0], skip_special_tokens=True, output_word_offsets=True)
        segments = transcription.get("word_offsets", []) if isinstance(transcription, dict) else []
        detected_lang = "en"  # transcription language is forced to English above
        translated_segments = []
        for segment in segments:
            text = segment.get("word", "")
            start = segment.get("offset", 0) / 1000  # offsets assumed in ms — TODO confirm
            end = start + segment.get("length", 0) / 1000
            if text.strip():
                m2m_tokenizer.src_lang = detected_lang
                inputs = m2m_tokenizer(text, return_tensors="pt")
                translated_ids = m2m_model.generate(**inputs, forced_bos_token_id=m2m_tokenizer.get_lang_id("pt"))
                translated_text = m2m_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
                translated_segments.append({"text": translated_text, "start": start, "end": end})
        # FIX: np.concatenate([]) raises ValueError; fail with a clear message
        # when no speech was detected instead.
        if not translated_segments:
            return "Erro ao processar o vídeo: nenhum segmento de fala foi detectado."
        speech_segments = []
        for segment in translated_segments:
            inputs = bark_processor(segment["text"], return_tensors="pt")
            # FIX: generate() returns a (batch, samples) tensor, so len() on it
            # counted batches (always 1); squeeze to a 1-D waveform first.
            speech = bark_model.generate(**inputs).cpu().numpy().squeeze()
            generated_duration = len(speech) / 24000  # Bark outputs 24 kHz audio
            target_duration = segment["end"] - segment["start"]
            adjusted_speech = adjust_speed(speech, generated_duration, target_duration)
            speech_segments.append(adjusted_speech)
        full_speech = np.concatenate(speech_segments)
        # FIX: pad with zeros up to the video length instead of relying on
        # "silence.wav" — that file is only 1 s long, gets deleted after the
        # first run, and set_duration() cannot extend it beyond its real length.
        deficit = int(round(original_duration * 24000)) - len(full_speech)
        if deficit > 0:
            full_speech = np.concatenate([full_speech, np.zeros(deficit)])
        sf.write("new_audio.wav", full_speech, 24000)
        new_audio = AudioFileClip("new_audio.wav")
        # Trim (or no-op) so the dubbed track matches the video exactly.
        final_audio = new_audio.set_duration(original_duration)
        final_video = video.set_audio(final_audio)
        output_path = "output_video.mp4"
        final_video.write_videofile(output_path)
        return output_path
    except Exception as e:
        return f"Erro ao processar o vídeo: {str(e)}"
    finally:
        # FIX: always remove scratch files, even when processing fails.
        for temp_file in ["temp_audio.wav", "new_audio.wav", "silence.wav"]:
            if os.path.exists(temp_file):
                os.remove(temp_file)
# Write the silence padding file used when the dubbed audio is too short.
def create_silence_file():
    """Write one second of 24 kHz silence to "silence.wav"."""
    rate = 24000
    sf.write("silence.wav", np.zeros(rate), rate)
# Streamlit user interface.
def main():
    """Render the upload UI, run the dubbing pipeline, and offer the result."""
    st.title("Conversor de Áudio de Vídeo para Português")
    st.write("Carregue um vídeo e obtenha uma versão com o áudio traduzido para português usando vozes realistas.")
    uploaded_file = st.file_uploader("Escolha um vídeo", type=["mp4", "avi", "mov"])
    if uploaded_file is None:
        return
    # Persist the upload to disk so moviepy/ffmpeg can read it by path.
    with open("input_video.mp4", "wb") as f:
        f.write(uploaded_file.read())
    st.write("Processando...")
    result = process_video("input_video.mp4")
    # process_video returns a path on success, an error message otherwise.
    if not os.path.exists(result):
        st.error(result)
        return
    st.video(result)
    with open(result, "rb") as file:
        st.download_button("Baixar vídeo traduzido", file, file_name="output_video.mp4")
# Script entry point: prepare the 1-second silence padding file, then launch
# the Streamlit interface.
if __name__ == "__main__":
    create_silence_file()
    main()