import os
try:
    from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_audioclips
    print("moviepy imported successfully!")
except ModuleNotFoundError:
    print("Error: 'moviepy' is not installed. Add 'moviepy' to requirements.txt or install it with 'pip install moviepy'.")
    raise SystemExit
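# Note: the model classes below also require torch at import time; on a hosted app,
# transformers, torch, librosa, soundfile and streamlit must all be listed in
# requirements.txt alongside moviepy.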
from transformers import WhisperProcessor, WhisperForConditionalGeneration, M2M100Tokenizer, M2M100ForConditionalGeneration
# transformers has no AutoModelForTextToSpeech class; Bark is loaded via BarkModel
from transformers import AutoProcessor, BarkModel
import librosa
import soundfile as sf
import numpy as np
import streamlit as st
# Load the models
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-small")
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
m2m_tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
m2m_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
bark_processor = AutoProcessor.from_pretrained("suno/bark-small")
bark_model = BarkModel.from_pretrained("suno/bark-small")
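# Note (assumption, not in the original): Streamlit reruns the whole script on every
# widget interaction, so these from_pretrained() calls repeat on each rerun; wrapping
# the loads in a function decorated with @st.cache_resource keeps one copy per process.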
# Time-stretch the generated audio so it fits the original segment's duration
def adjust_speed(audio, original_duration, target_duration, sample_rate=24000):
    if target_duration <= 0:
        return audio  # guard against zero-length segments
    rate = original_duration / target_duration
    adjusted_audio = librosa.effects.time_stretch(audio, rate=rate)
    return adjusted_audio
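# Example: a 6 s Bark clip that must fit a 4 s slot gets rate = 6/4 = 1.5, i.e. it is
# played 50% faster; a rate below 1 stretches the audio to run slower.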
# Main function that processes the video
def process_video(video_path):
    try:
        video = VideoFileClip(video_path)
        original_duration = video.duration
        audio = video.audio
        audio.write_audiofile("temp_audio.wav")
        audio_data, sample_rate = librosa.load("temp_audio.wav", sr=16000)
        input_features = whisper_processor(audio_data, sampling_rate=sample_rate, return_tensors="pt").input_features
        predicted_ids = whisper_model.generate(input_features, return_timestamps=True, language="en")
        # Whisper exposes segment timestamps via output_offsets; output_word_offsets
        # is a Wav2Vec2 feature and is not supported by the Whisper processor
        transcription = whisper_processor.decode(predicted_ids[0], skip_special_tokens=True, output_offsets=True)
        segments = transcription.get("offsets", [])
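        # Each offsets entry is {"text": ..., "timestamp": (start_s, end_s)} with times
        # in seconds; whisper-small transcribes at most 30 s of audio per pass, so
        # longer videos would need chunked transcription (not handled here).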
        detected_lang = "en"  # source language is hardcoded to English
        translated_segments = []
        for segment in segments:
            text = segment.get("text", "")
            start, end = segment.get("timestamp", (0.0, 0.0))
            if text.strip():
                m2m_tokenizer.src_lang = detected_lang
                inputs = m2m_tokenizer(text, return_tensors="pt")
                # forced_bos_token_id steers M2M100 toward the Portuguese target language
                translated_ids = m2m_model.generate(**inputs, forced_bos_token_id=m2m_tokenizer.get_lang_id("pt"))
                translated_text = m2m_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
                translated_segments.append({"text": translated_text, "start": start, "end": end})
        speech_segments = []
        for segment in translated_segments:
            inputs = bark_processor(segment["text"], return_tensors="pt")
            # generate() returns a (1, n_samples) tensor; squeeze it to 1-D so that
            # len() and librosa operate on samples, not on the batch dimension
            speech = bark_model.generate(**inputs).cpu().numpy().squeeze()
            generated_duration = len(speech) / 24000  # Bark outputs 24 kHz audio
            target_duration = segment["end"] - segment["start"]
            adjusted_speech = adjust_speed(speech, generated_duration, target_duration)
            speech_segments.append(adjusted_speech)
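            # Assumption, not in the original: the default (English-accented) Bark voice
            # is used; bark_processor also accepts a voice_preset argument (e.g.
            # "v2/pt_speaker_0") that would select a Portuguese speaker.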
        if not speech_segments:
            return "Error: no speech was generated for this video."
        full_speech = np.concatenate(speech_segments)
        sf.write("new_audio.wav", full_speech, 24000)
        new_audio = AudioFileClip("new_audio.wav")
        if new_audio.duration < original_duration:
            # Pad the tail with silence exactly as long as the gap; stretching a fixed
            # 1-second clip with set_duration would read past the end of the file
            create_silence_file(original_duration - new_audio.duration)
            silence = AudioFileClip("silence.wav")
            final_audio = concatenate_audioclips([new_audio, silence])
        else:
            final_audio = new_audio.set_duration(original_duration)
        final_video = video.set_audio(final_audio)
        output_path = "output_video.mp4"
        final_video.write_videofile(output_path)
        # Remove intermediate files
        for temp_file in ["temp_audio.wav", "new_audio.wav", "silence.wav"]:
            if os.path.exists(temp_file):
                os.remove(temp_file)
        return output_path
    except Exception as e:
        return f"Error while processing the video: {str(e)}"
# Write a silence file of the requested duration (used to pad the translated audio)
def create_silence_file(duration=1.0, sample_rate=24000):
    silence = np.zeros(int(sample_rate * duration))
    sf.write("silence.wav", silence, sample_rate)
# Streamlit interface
def main():
    st.title("Video Audio Translator to Portuguese")
    st.write("Upload a video and get back a version with the audio translated into Portuguese using realistic voices.")
    uploaded_file = st.file_uploader("Choose a video", type=["mp4", "avi", "mov"])
    if uploaded_file is not None:
        with open("input_video.mp4", "wb") as f:
            f.write(uploaded_file.read())
        st.write("Processing...")
        result = process_video("input_video.mp4")
        # process_video returns a file path on success and an error string on failure
        if os.path.exists(result):
            st.video(result)
            with open(result, "rb") as file:
                st.download_button("Download translated video", file, file_name="output_video.mp4")
        else:
            st.error(result)
if __name__ == "__main__":
    create_silence_file()
    main()