# video_splitter / app.py
import datetime

import gradio as gr
import srt
import whisper
LANGUAGE_OPTIONS = {
"Afrikaans": "af",
"Arabic": "ar",
"Azerbaijani": "az",
"Belarusian": "be",
"Bulgarian": "bg",
"Bengali": "bn",
"Catalan": "ca",
"Czech": "cs",
"Welsh": "cy",
"Danish": "da",
"German": "de",
"Greek": "el",
"English": "en",
"Spanish": "es",
"Estonian": "et",
"Persian": "fa",
"Finnish": "fi",
"French": "fr",
"Irish": "ga",
"Galician": "gl",
"Gujarati": "gu",
"Hebrew": "he",
"Hindi": "hi",
"Croatian": "hr",
"Hungarian": "hu",
"Armenian": "hy",
"Indonesian": "id",
"Icelandic": "is",
"Italian": "it",
"Japanese": "ja",
"Georgian": "ka",
"Kazakh": "kk",
"Khmer": "km",
"Kannada": "kn",
"Korean": "ko",
"Lithuanian": "lt",
"Latvian": "lv",
"Macedonian": "mk",
"Malayalam": "ml",
"Mongolian": "mn",
"Marathi": "mr",
"Malay": "ms",
"Maltese": "mt",
"Nepali": "ne",
"Dutch": "nl",
"Norwegian": "no",
"Odia": "or",
"Punjabi": "pa",
"Polish": "pl",
"Portuguese": "pt",
"Romanian": "ro",
"Russian": "ru",
"Sinhala": "si",
"Slovak": "sk",
"Slovenian": "sl",
"Albanian": "sq",
"Serbian": "sr",
"Swedish": "sv",
"Swahili": "sw",
"Tamil": "ta",
"Telugu": "te",
"Thai": "th",
"Turkish": "tr",
"Ukrainian": "uk",
"Urdu": "ur",
"Vietnamese": "vi",
"Chinese": "zh"
}
def transcribe_audio(audio_file_path, model_size="base", language="en"):
    """Transcribe an audio file with Whisper and return (text, SRT text, segments)."""
    # Load the Whisper model on CPU (the Space runs without a GPU).
    model = whisper.load_model(model_size, device="cpu")
    result = model.transcribe(audio_file_path, language=language)
    transcription = result["text"]
    segments = result["segments"]
    # Whisper does not expose a plain string SRT formatter, so build the SRT text
    # from the segments with the srt package.
    srt_text = generate_srt(segments)
    return transcription, srt_text, segments
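
# Hedged usage sketch (not executed by the app); "example.mp3" is a hypothetical file:
#
#     text, srt_text, segments = transcribe_audio("example.mp3", model_size="base", language="en")
#     # text     -> full transcript as one string
#     # srt_text -> the same content formatted as numbered SRT blocks
#     # segments -> list of dicts with "start", "end", and "text" keys
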
def generate_srt(segments):
    """Convert Whisper segments into SRT-formatted subtitle text."""
    subtitles = []
    for i, seg in enumerate(segments):
        start_td = datetime.timedelta(seconds=seg["start"])
        end_td = datetime.timedelta(seconds=seg["end"])
        subtitles.append(srt.Subtitle(index=i + 1, start=start_td, end=end_td, content=seg["text"]))
    return srt.compose(subtitles)
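
# Hedged sketch of generate_srt's output: the segment below is made up but mirrors the
# "start"/"end"/"text" keys Whisper returns, and srt.compose emits standard SRT blocks:
#
#     generate_srt([{"start": 0.0, "end": 2.5, "text": "Hello and welcome."}])
#     # -> "1\n00:00:00,000 --> 00:00:02,500\nHello and welcome.\n\n"
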
def prepare_chapter_prompt(srt_text):
"""
Prepare a complete prompt (system + user instructions) for ChatGPT models.
Although the prompt is in English, it instructs the model to output chapter headers in the same language as the provided SRT transcript.
The output format should be one chapter per line: "mm:ss Chapter Title".
"""
system_prompt = (
"You are a highly skilled video content segmentation and optimization expert. "
"Your task is to analyze a transcript of a YouTube video provided in SRT format and produce engaging and concise chapter headers. "
"Each chapter header must be on its own line in the exact format: 'mm:ss Chapter Title'.\n\n"
"- 'mm:ss' represents the starting time of the chapter (minutes and seconds).\n"
"- 'Chapter Title' must be a catchy, audience-friendly title that summarizes the key idea or transition at that point in the video.\n\n"
"IMPORTANT: Although these instructions are in English, please ensure that your output is in the same language as the provided SRT transcript."
)
user_prompt = (
"Below is the transcript of a YouTube video in SRT format:\n\n"
"```\n"
f"{srt_text}\n"
"```\n\n"
"Please generate only the chapter breakdown using the guidelines above. "
"Each chapter header should be formatted as:\n"
"mm:ss Chapter Title"
)
final_prompt = system_prompt + "\n\n" + user_prompt
return final_prompt
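
# Illustrative (made-up) model output that the prompt above requests, one chapter per line:
#
#     00:00 Welcome and Overview
#     01:42 Setting Up the Project
#     07:15 Final Thoughts
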
def format_prompt_html(prompt):
"""
Displays the prompt in a read-only textarea using Gradio's color variables for background and text.
Includes a 'Copy Prompt' button (blue) and a short 'Prompt Copied!' confirmation message.
"""
html_content = f"""
<div style="display: flex; flex-direction: column; gap: 10px; margin-top: 10px;">
<textarea id="prompt_text" rows="10"
style="width: 100%; resize: vertical;
background-color: var(--block-background-fill);
color: var(--block-text-color);
border: 1px solid var(--block-border-color);
border-radius: 4px;"
readonly>{prompt}</textarea>
<button
style="width: 150px; padding: 8px;
background-color: #007bff;
color: white;
border: none;
border-radius: 4px;
cursor: pointer;"
onclick="
navigator.clipboard.writeText(document.getElementById('prompt_text').value);
const copiedMsg = document.getElementById('copied_msg');
copiedMsg.style.display = 'inline';
setTimeout(() => copiedMsg.style.display = 'none', 2000);
">
Copy Prompt
</button>
<span id="copied_msg" style="display: none; color: var(--primary-text-color); font-weight: bold;">Prompt Copied!</span>
</div>
"""
return html_content
def process_audio(audio, language_name):
    """Gradio callback: transcribe the uploaded audio and build the chapter prompt."""
    lang_code = LANGUAGE_OPTIONS.get(language_name, "en")
    try:
        transcription, srt_text, segments = transcribe_audio(audio, model_size="base", language=lang_code)
    except Exception as e:
        return f"Error during transcription: {str(e)}", "", ""
    chapter_prompt = prepare_chapter_prompt(srt_text)
    prompt_html = format_prompt_html(chapter_prompt)
    return transcription, srt_text, prompt_html
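
# Hedged local sketch: the Gradio callback can also be called directly; "talk.mp3" and the
# language name "English" are illustrative inputs:
#
#     transcription, srt_text, prompt_html = process_audio("talk.mp3", "English")
#     # prompt_html wraps the full ChatGPT prompt in a read-only <textarea> with a copy button.
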
iface = gr.Interface(
fn=process_audio,
inputs=[
gr.Audio(type="filepath", label="Upload Audio"),
gr.Dropdown(choices=list(LANGUAGE_OPTIONS.keys()), label="Audio Language", value="English")
],
outputs=[
gr.Textbox(label="Full Transcription", lines=10),
gr.Textbox(label="SRT File Content", lines=10),
gr.HTML(label="Prepared Chapter Prompt (Copy & Paste into ChatGPT)")
],
title="Video Chapter Splitter from Audio (MP3)",
description=(
"Upload an audio file (e.g., MP3) of your YouTube video and select the audio language. "
"The app will transcribe the audio using Whisper, generate subtitles in SRT format, "
"and prepare a single, complete prompt that instructs ChatGPT -> o1 model to generate a chapter breakdown in the format 'mm:ss Chapter Title'.\n\n"
"Click the 'Copy Prompt' button to copy the entire prompt, and a brief 'Prompt Copied!' message will appear."
)
)
if __name__ == "__main__":
iface.launch()