File size: 3,439 Bytes
50c4728
 
 
 
 
d03632e
9e780c8
50c4728
83ebf54
50c4728
 
 
 
 
 
 
 
 
83ebf54
50c4728
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83ebf54
d03632e
9e780c8
 
 
 
83ebf54
9e780c8
 
 
 
d03632e
83ebf54
50c4728
38e25e4
83ebf54
9e780c8
 
83ebf54
9e780c8
83ebf54
38e25e4
50c4728
83ebf54
50c4728
38e25e4
50c4728
 
 
83ebf54
50c4728
83ebf54
50c4728
 
83ebf54
50c4728
 
 
 
38e25e4
 
50c4728
 
 
38e25e4
 
50c4728
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import os
import gradio as gr
import time
from moviepy.editor import VideoFileClip
from faster_whisper import WhisperModel
from pytube import YouTube
from pytube.exceptions import VideoUnavailable, PytubeError

# Convert a video file to MP3.
def convert_mp4_to_mp3(video_file_path, output_dir):
    """Extract the audio track of a video file and write it as an MP3.

    Args:
        video_file_path: Path of the source video file.
        output_dir: Directory the MP3 is written into. The output keeps the
            video's base name with a ``.mp3`` extension.

    Returns:
        The path of the written MP3 file.

    Raises:
        ValueError: If the video exposes no audio track.
    """
    base_name = os.path.splitext(os.path.basename(video_file_path))[0]
    output_path = os.path.join(output_dir, base_name + ".mp3")

    video = VideoFileClip(video_file_path)
    try:
        audio = video.audio
        # Fail with a clear message instead of an AttributeError on None.
        if audio is None:
            raise ValueError(
                f"No audio track found in video: {video_file_path}"
            )
        try:
            audio.write_audiofile(output_path)
        finally:
            audio.close()
    finally:
        # Always release the clip, even when the export fails.
        video.close()
    return output_path

# Transcribe an MP3 file to text using the Whisper model.
def transcribe_audio(model_size, audio_file):
    """Transcribe an audio file with a Whisper model and format a report.

    Args:
        model_size: Model identifier passed to ``WhisperModel``.
        audio_file: Audio file handed to ``model.transcribe``.

    Returns:
        A string containing the detected-language line, one timestamped line
        per segment, and the elapsed time. If a ``PermissionError`` or
        ``ValueError`` is raised while transcribing, a
        ``"PermissionError: ..."`` / ``"ValueError: ..."`` message is
        returned instead.
    """
    # The model is created before the timer starts, so the reported elapsed
    # time does not include model construction.
    whisper = WhisperModel(model_size, device="cpu", compute_type="int8")
    started = time.time()

    try:
        segments, info = whisper.transcribe(audio_file, beam_size=5)
        language_line = (
            f"Detected language '{info.language!s}' "
            f"with probability {info.language_probability:f}"
        )
        # Iteration stays inside the try block so errors raised while walking
        # the segments are reported as messages too.
        lines = [
            f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text!s}"
            for seg in segments
        ]
    except PermissionError as err:
        return f"PermissionError: {err}"
    except ValueError as err:
        return f"ValueError: {err}"

    transcript = "\n".join(lines)
    duration = time.time() - started

    return (
        f"{language_line}\n\nTranscription:\n{transcript}"
        f"\n\nElapsed time: {duration:.2f} seconds"
    )

# Download a video from a YouTube URL.
def download_youtube_video(url, output_dir):
    """Download the first MP4 stream of a YouTube video.

    Args:
        url: URL of the YouTube video.
        output_dir: Directory the stream is downloaded into.

    Returns:
        A ``(output_path, error)`` tuple. On success ``error`` is ``None``;
        on failure ``output_path`` is ``None`` and ``error`` is a message.
    """
    try:
        stream = YouTube(url).streams.filter(file_extension='mp4').first()
        # first() gives None when nothing matches the filter; report that
        # through the error slot instead of failing on None.download().
        if stream is None:
            return None, "No MP4 stream available for this video."
        return stream.download(output_dir), None
    except VideoUnavailable:
        return None, "Video unavailable. Please check the URL."
    except PytubeError as e:
        return None, f"An error occurred: {e}"

# Main function used by the Gradio interface.
def process_video(model_size, video_file=None, video_url=None):
    """Transcribe an uploaded video or a video downloaded from a URL.

    Exactly one of ``video_file`` and ``video_url`` must be provided.

    Args:
        model_size: Whisper model size forwarded to ``transcribe_audio``.
        video_file: Uploaded video, either a path string or an object whose
            ``name`` attribute holds the path.
        video_url: URL of a video to download. Blank or whitespace-only
            values count as "not provided".

    Returns:
        The transcription report, or an error/usage message string.
    """
    work_dir = "/tmp"
    # An empty text box (or stray whitespace) must not count as a URL.
    video_url = (video_url or "").strip()

    if video_url and not video_file:
        print(f"Downloading video from URL: {video_url}")
        video_file_path, error = download_youtube_video(video_url, work_dir)
        if error:
            print(f"Error downloading video: {error}")
            return error
        print(f"Downloaded video to: {video_file_path}")
    elif video_file and not video_url:
        # NOTE(review): the upload presumably arrives as a temp-file object
        # or a plain path depending on the Gradio version - both accepted.
        video_file_path = getattr(video_file, "name", video_file)
        print(f"Using uploaded video file: {video_file_path}")
    else:
        return "Please upload a video file or provide a video URL, but not both."

    mp3_file_path = convert_mp4_to_mp3(video_file_path, work_dir)
    print(f"Converted video to MP3: {mp3_file_path}")
    transcription = transcribe_audio(model_size, mp3_file_path)
    print("Transcription complete")
    return transcription

# Gradio interface definition.
# Live mode is intentionally left off: with live=True the interface re-runs
# process_video on every input change (including each edit of the URL box),
# which would repeatedly start the download/convert/transcribe pipeline.
iface = gr.Interface(
    fn=process_video,
    inputs=[
        # A default selection keeps model_size from being None on submit.
        gr.Dropdown(
            ["tiny", "base", "small", "medium", "large"],
            value="base",
            label="Model Size",
        ),
        gr.File(label="Upload Video File"),
        gr.Textbox(label="Video URL"),
    ],
    outputs="text",
    title="Video to Text Converter using Whisper",
    description="Upload a video file or provide a video URL, select the Whisper model size, and get the transcribed text.",
)

if __name__ == "__main__":
    iface.launch()