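"""Streamlit app that dubs a video's audio track into Portuguese.

Pipeline: extract the audio with moviepy, transcribe it with Whisper
(openai/whisper-small), translate each timestamped segment with M2M100
(facebook/m2m100_418M), synthesize Portuguese speech with Bark
(suno/bark-small), time-stretch each clip to fit its original slot, and
remux the new track onto the video.
"""
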
import os
try:
    from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_audioclips
    print("moviepy imported successfully!")
except ModuleNotFoundError:
    print("Error: 'moviepy' is not installed. Add 'moviepy' to requirements.txt or install it with 'pip install moviepy'.")
    raise SystemExit

from transformers import WhisperProcessor, WhisperForConditionalGeneration, M2M100Tokenizer, M2M100ForConditionalGeneration
from transformers import AutoProcessor, BarkModel
import librosa
import soundfile as sf
import numpy as np
import streamlit as st

# Load the models (all downloaded from the Hugging Face Hub on first run)
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-small")
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
m2m_tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
m2m_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
bark_processor = AutoProcessor.from_pretrained("suno/bark-small")
bark_model = BarkModel.from_pretrained("suno/bark-small")
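
# Note: everything runs on CPU as written; if a GPU is available, moving the
# models over (e.g. whisper_model.to("cuda")) would speed processing up
# considerably, with the input tensors moved to the same device.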

# Stretch the synthesized audio so it fits the original segment's time slot;
# rate > 1 speeds the audio up, rate < 1 slows it down.
def adjust_speed(audio, original_duration, target_duration, sample_rate=24000):
    if original_duration <= 0 or target_duration <= 0:
        return audio  # degenerate segment: nothing sensible to stretch
    rate = original_duration / target_duration
    return librosa.effects.time_stretch(audio, rate=rate)
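
# Worked example (hypothetical numbers): a 3.0 s Bark clip that must fit a
# 2.0 s slot gets rate = 3.0 / 2.0 = 1.5 and is played 1.5x faster; a clip
# shorter than its slot gets rate < 1 and is slowed down.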

# Main pipeline: extract audio, transcribe, translate, synthesize, remux
def process_video(video_path):
    try:
        video = VideoFileClip(video_path)
        original_duration = video.duration
        audio = video.audio
        audio.write_audiofile("temp_audio.wav")

        # Whisper expects 16 kHz input; note that its feature extractor pads
        # or truncates to a 30 s window, so longer videos would need chunking.
        audio_data, sample_rate = librosa.load("temp_audio.wav", sr=16000)
        input_features = whisper_processor(audio_data, sampling_rate=sample_rate, return_tensors="pt").input_features
        predicted_ids = whisper_model.generate(input_features, return_timestamps=True, language="en")
        # With output_offsets=True the tokenizer returns segment-level
        # timestamps alongside the text: {"text": ..., "offsets": [...]}
        decoded = whisper_processor.decode(predicted_ids[0], skip_special_tokens=True, output_offsets=True)
        segments = decoded["offsets"]

        # Translate each transcribed segment into Portuguese with M2M100
        detected_lang = "en"  # transcription language is forced to English above
        translated_segments = []
        for segment in segments:
            text = segment["text"]
            start, end = segment["timestamp"]
            if text.strip():
                m2m_tokenizer.src_lang = detected_lang
                inputs = m2m_tokenizer(text, return_tensors="pt")
                translated_ids = m2m_model.generate(**inputs, forced_bos_token_id=m2m_tokenizer.get_lang_id("pt"))
                translated_text = m2m_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
                translated_segments.append({"text": translated_text, "start": start, "end": end})

        # Synthesize Portuguese speech for each segment with Bark (24 kHz)
        speech_segments = []
        for segment in translated_segments:
            inputs = bark_processor(segment["text"], return_tensors="pt")
            # generate() returns a (batch, samples) tensor; squeeze to 1-D
            speech = bark_model.generate(**inputs).cpu().numpy().squeeze()
            generated_duration = len(speech) / 24000
            target_duration = segment["end"] - segment["start"]
            adjusted_speech = adjust_speed(speech, generated_duration, target_duration)
            speech_segments.append(adjusted_speech)

        # Segments are concatenated back to back, so pauses between sentences
        # in the source video are not reproduced.
        full_speech = np.concatenate(speech_segments)
        sf.write("new_audio.wav", full_speech, 24000)

        # Pad with silence if the dubbed track is shorter than the video.
        # The silence is written at the exact missing length: stretching a
        # fixed 1 s clip via set_duration would read past the end of the file.
        new_audio = AudioFileClip("new_audio.wav")
        if new_audio.duration < original_duration:
            create_silence_file(original_duration - new_audio.duration)
            silence = AudioFileClip("silence.wav")
            final_audio = concatenate_audioclips([new_audio, silence])
        else:
            final_audio = new_audio.set_duration(original_duration)

        final_video = video.set_audio(final_audio)
        output_path = "output_video.mp4"
        final_video.write_videofile(output_path)

        for temp_file in ["temp_audio.wav", "new_audio.wav", "silence.wav"]:
            if os.path.exists(temp_file):
                os.remove(temp_file)

        return output_path

    except Exception as e:
        return f"Error processing the video: {e}"

# Write a silence file of the given duration (seconds) at Bark's 24 kHz rate
def create_silence_file(duration=1.0):
    silence = np.zeros(int(24000 * duration))
    sf.write("silence.wav", silence, 24000)

# Streamlit interface
def main():
    st.title("Video-to-Portuguese Audio Converter")
    st.write("Upload a video and get back a version with the audio translated to Portuguese using realistic voices.")

    uploaded_file = st.file_uploader("Choose a video", type=["mp4", "avi", "mov"])
    if uploaded_file is not None:
        with open("input_video.mp4", "wb") as f:
            f.write(uploaded_file.read())
        st.write("Processing...")
        result = process_video("input_video.mp4")
        if os.path.exists(result):
            st.video(result)
            with open(result, "rb") as file:
                st.download_button("Download translated video", file, file_name="output_video.mp4")
        else:
            st.error(result)

if __name__ == "__main__":
    main()
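
# To run locally (assuming this file is saved as app.py):
#   streamlit run app.py
# Requires: streamlit, moviepy, transformers, torch, librosa, soundfile, numpy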