Spaces:
Running
Running
File size: 7,061 Bytes
91f8d48 818e336 e6d59c3 91f8d48 e6d59c3 91f8d48 e6d59c3 818e336 224f399 91f8d48 4620e6c 818e336 43fec16 91f8d48 818e336 91f8d48 818e336 224f399 43fec16 224f399 818e336 43fec16 818e336 43fec16 818e336 224f399 43fec16 818e336 43fec16 818e336 43fec16 224f399 43fec16 224f399 43fec16 224f399 43fec16 224f399 43fec16 224f399 43fec16 91f8d48 818e336 43fec16 224f399 43fec16 224f399 43fec16 224f399 43fec16 224f399 43fec16 91f8d48 43fec16 224f399 818e336 43fec16 91f8d48 818e336 43fec16 818e336 91f8d48 818e336 91f8d48 818e336 91f8d48 818e336 91f8d48 43fec16 91f8d48 818e336 91f8d48 818e336 91f8d48 818e336 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 |
import os
import re
import google.generativeai as genai
from moviepy.video.io.VideoFileClip import VideoFileClip
import tempfile
import logging
import gradio as gr
from datetime import timedelta
from pydub import AudioSegment
# Silence moviepy: only records at ERROR level or above are emitted.
logging.getLogger("moviepy").setLevel(logging.ERROR)

# Gemini client setup. The key is read from the environment at import time,
# so a missing GEMINI_API_KEY surfaces immediately as a KeyError.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
model = genai.GenerativeModel(model_name="gemini-2.0-pro-exp-02-05")
# Languages offered in the UI dropdowns. The first entry is the
# "detect automatically" sentinel; the rest are concrete languages.
SUPPORTED_LANGUAGES = [
    "Auto Detect",
    "English", "Spanish", "French",
    "German", "Italian", "Portuguese",
    "Russian", "Japanese", "Korean",
    "Arabic", "Hindi", "Chinese",
    "Dutch", "Turkish", "Polish",
    "Vietnamese", "Thai",
]
# Prompt templates for the Gemini model.
#
# TRANSCRIPTION_PROMPT is sent as-is together with an uploaded audio chunk
# (see process_audio_chunk). It asks for "[start -> end]" timestamp lines
# followed by the subtitle text; downstream parsing relies on that layout.
TRANSCRIPTION_PROMPT = """Generate precise subtitles with accurate timestamps:
1. Use [HH:MM:SS.ms -> HH:MM:SS.ms] format
2. Each subtitle 3-7 words
3. Include speaker changes
4. Preserve emotional tone
5. Example:
[00:00:05.250 -> 00:00:08.100]
Example subtitle text
Return ONLY subtitles with timestamps."""
# TRANSLATION_PROMPT is a template with two placeholders, {target_language}
# and {subtitles}. NOTE(review): presumably filled in with str.format by the
# translation helper, which is not visible in this part of the file — confirm.
TRANSLATION_PROMPT = """Translate these subtitles to {target_language}:
1. Keep timestamps identical
2. Match text length to timing
3. Preserve technical terms
4. Use natural speech patterns
ORIGINAL:
{subtitles}
TRANSLATED:"""
def split_audio(audio_path, chunk_duration=60):
    """Split a WAV file into consecutive chunks stored in unique temp files.

    Each chunk is written to its own ``mkstemp`` file, so concurrent calls
    (e.g. several web requests at once) can never overwrite each other's
    chunks the way fixed ``chunk_<n>.wav`` names in a shared directory would.

    Args:
        audio_path: Path to the WAV file to split.
        chunk_duration: Length of each chunk in seconds (default: 60). The
            final chunk may be shorter.

    Returns:
        List of chunk file paths in playback order. The caller owns these
        files and is responsible for deleting them.

    Raises:
        ValueError: If ``chunk_duration`` is not a positive duration.
    """
    chunk_ms = int(chunk_duration * 1000)
    if chunk_ms <= 0:
        raise ValueError(
            f"chunk_duration must be positive, got {chunk_duration!r}"
        )

    audio = AudioSegment.from_wav(audio_path)
    chunks = []
    try:
        # The segment is indexed in milliseconds, hence the *1000 step.
        for offset_ms in range(0, len(audio), chunk_ms):
            fd, chunk_path = tempfile.mkstemp(
                prefix=f"chunk_{offset_ms // 1000}_", suffix=".wav"
            )
            # Only the unique name is needed; export() reopens the path.
            os.close(fd)
            chunks.append(chunk_path)
            audio[offset_ms:offset_ms + chunk_ms].export(chunk_path, format="wav")
    except Exception:
        # Do not leave partial output behind if an export fails midway.
        for leftover in chunks:
            if os.path.exists(leftover):
                os.remove(leftover)
        raise
    return chunks
# A timestamp range at the start of a line: optional brackets, "->" or "-->"
# as the arrow, and (group 3) any subtitle text placed on the same line.
_CHUNK_TIMESTAMP_LINE = re.compile(
    r'^\s*\[?\s*(\d[\d:.,]*)\s*-+>\s*(\d[\d:.,]*)\s*\]?\s*(.*)$'
)


def process_audio_chunk(chunk_path, start_time):
    """Transcribe one audio chunk and shift its timestamps to video time.

    The chunk is uploaded to Gemini, transcribed with TRANSCRIPTION_PROMPT,
    and every timestamp line in the response is rewritten as
    ``[start -> end]`` with ``start_time`` added to both ends. Lines that are
    not timestamp ranges (subtitle text, blank lines) pass through unchanged.

    Args:
        chunk_path: Path to the chunk's WAV file. The file is deleted before
            this function returns, whether or not transcription succeeded.
        start_time: Offset of this chunk from the start of the full audio,
            in seconds.

    Returns:
        The transcription text with adjusted timestamps, lines joined by
        newlines.
    """
    try:
        # Upload file using Gemini's File API
        uploaded_file = genai.upload_file(path=chunk_path)
        response = model.generate_content([TRANSCRIPTION_PROMPT, uploaded_file])

        adjusted_transcription = []
        for line in response.text.splitlines():
            match = _CHUNK_TIMESTAMP_LINE.match(line)
            if match is None:
                adjusted_transcription.append(line)
                continue
            try:
                chunk_start = parse_timestamp(match.group(1))
                chunk_end = parse_timestamp(match.group(2))
            except ValueError:
                # Looked like a range but is not parseable; keep the line
                # rather than discarding the whole chunk's transcription.
                adjusted_transcription.append(line)
                continue

            adjusted_transcription.append(
                f"[{format_timestamp(chunk_start + start_time)} -> "
                f"{format_timestamp(chunk_end + start_time)}]"
            )
            # Text that shared the timestamp line moves to its own line so
            # the output always has the "timestamp, then text" layout.
            same_line_text = match.group(3).strip()
            if same_line_text:
                adjusted_transcription.append(same_line_text)
        return "\n".join(adjusted_transcription)
    finally:
        # Best-effort cleanup: a missing file must not mask the real error.
        try:
            os.remove(chunk_path)
        except FileNotFoundError:
            pass
def parse_timestamp(timestamp_str):
    """Parse ``HH:MM:SS.ss`` or ``MM:SS.ss`` into a number of seconds.

    Surrounding brackets and spaces are ignored, and a comma is accepted as
    the decimal separator.

    Raises:
        ValueError: If the string does not have two or three colon-separated
            fields, or a field is not a valid number.
    """
    fields = timestamp_str.strip("[] ").replace(',', '.').split(':')
    if len(fields) not in (2, 3):
        raise ValueError(f"Invalid timestamp: {timestamp_str}")

    # The hour field is optional; whatever precedes minutes/seconds is it.
    *hour_field, minute_field, second_field = fields

    total = 0.0
    if hour_field:
        total += float(hour_field[0]) * 3600
    total += float(minute_field) * 60
    total += float(second_field)
    return total
def format_timestamp(seconds):
    """Format a number of seconds as an SRT timestamp, ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond. Hours are zero-padded to
    two digits and keep counting past 24 instead of rolling over into days.
    Negative inputs are clamped to zero because SRT has no negative times.

    Args:
        seconds: Time offset in seconds (int or float).

    Returns:
        The timestamp string, e.g. ``"00:01:05,250"`` for ``65.25``.
    """
    total_ms = max(0, round(seconds * 1000))
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    whole_seconds, millis = divmod(remainder_ms, 1000)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{millis:03d}"
# A timestamp range as produced by the transcription step ("[a -> b]") or as
# found in SRT text ("a --> b"): brackets optional, one or more dashes in the
# arrow, fractional seconds optional.
_SRT_TIME_RANGE = re.compile(
    r'\[?\s*((?:\d+:)?\d+:\d+(?:[.,]\d+)?)\s*-+>\s*'
    r'((?:\d+:)?\d+:\d+(?:[.,]\d+)?)\s*\]?'
)


def create_srt(subtitles_text):
    """Convert timestamped subtitle text into SRT format.

    The input is split into entries on blank lines. Each entry must contain a
    timestamp range; everything after the range is the subtitle text, so a
    leading SRT index line or the range itself never leaks into the text.
    Entries without a parseable range are skipped, and the output is numbered
    consecutively from 1 regardless of how many entries were skipped.

    Args:
        subtitles_text: Raw transcription (``[start -> end]`` lines followed
            by text) or text that is already SRT-like (``start --> end``).

    Returns:
        The SRT document as a string; empty if no entry could be converted.
    """
    entries = re.split(r'\n\s*\n', subtitles_text.strip())
    srt_output = []
    for position, entry in enumerate(entries, 1):
        time_match = _SRT_TIME_RANGE.search(entry)
        if not time_match:
            continue
        try:
            start_time = parse_timestamp(time_match.group(1))
            end_time = parse_timestamp(time_match.group(2))
        except ValueError as e:
            print(f"Skipping invalid entry {position}: {str(e)}")
            continue

        text = entry[time_match.end():].strip()
        # Number by emitted entries so skipped input leaves no gaps.
        srt_index = len(srt_output) + 1
        srt_output.append(
            f"{srt_index}\n"
            f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}\n"
            f"{text}\n"
        )
    return "\n".join(srt_output)
def process_video(video_path, source_lang, target_lang):
    """Run the full pipeline: audio extraction, transcription, translation.

    Args:
        video_path: Path of the uploaded video.
        source_lang: Source language selected in the UI. Currently unused by
            this function.
        target_lang: Language to translate into, or the string ``"None"`` to
            skip translation.

    Returns:
        ``(original_srt_path, translated_srt_path)``. The second element is
        ``None`` when no translation was requested. On any processing error
        the error is printed and ``(None, None)`` is returned.

    NOTE(review): ``extract_audio`` and ``translate_subtitles`` are not
    defined in the part of the file visible here; they are called exactly as
    before.
    """
    # One value drives both the split size and the timestamp offsets, so the
    # two can never drift apart.
    chunk_seconds = 60
    # Bound before the try so the cleanup below is safe even when the very
    # first step fails.
    audio_path = None
    chunks = []
    try:
        # Extract audio
        audio_path = extract_audio(video_path)
        # Split into chunks
        chunks = split_audio(audio_path, chunk_seconds)
        # Process each chunk, offsetting timestamps by the chunk's position
        full_transcription = [
            process_audio_chunk(chunk_path, index * chunk_seconds)
            for index, chunk_path in enumerate(chunks)
        ]
        # Combine results
        srt_original = create_srt("\n\n".join(full_transcription))

        # A per-call directory keeps the download names stable
        # ("original.srt") without concurrent requests overwriting each other.
        output_dir = tempfile.mkdtemp(prefix="subtitles_")
        original_srt = os.path.join(output_dir, "original.srt")
        # Explicit UTF-8: subtitles routinely contain non-ASCII text.
        with open(original_srt, "w", encoding="utf-8") as f:
            f.write(srt_original)

        # Translate if needed
        translated_srt = None
        if target_lang != "None":
            translated_text = translate_subtitles(srt_original, target_lang)
            translated_srt = os.path.join(output_dir, "translated.srt")
            with open(translated_srt, "w", encoding="utf-8") as f:
                f.write(create_srt(translated_text))
        return original_srt, translated_srt
    except Exception as e:
        # Top-level boundary for the UI callback: report and return empty
        # outputs instead of propagating.
        print(f"Processing error: {str(e)}")
        return None, None
    finally:
        # Remove the extracted audio and any chunk that was never processed
        # (e.g. because an earlier chunk failed).
        for leftover in [audio_path, *chunks]:
            if leftover and os.path.exists(leftover):
                os.remove(leftover)
# Gradio Interface
# NOTE(review): the nesting below is reconstructed; the source this was taken
# from had lost its indentation. Confirm the layout against the running app.
with gr.Blocks(title="AI Subtitle Studio", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🎬 Professional Subtitle Generator")

    # Inputs: the video on one side, language choices and the button beside it.
    with gr.Row():
        video_input = gr.Video(sources=["upload"], label="Upload Video")
        with gr.Column():
            source_lang = gr.Dropdown(
                choices=SUPPORTED_LANGUAGES,
                value="Auto Detect",
                label="Source Language",
            )
            # "Auto Detect" makes no sense as a translation target, so it is
            # replaced by a "None" (no translation) option.
            target_lang = gr.Dropdown(
                choices=["None", *SUPPORTED_LANGUAGES[1:]],
                value="None",
                label="Translate To",
            )
            process_btn = gr.Button("Generate", variant="primary")

    # Outputs: downloadable subtitle files.
    with gr.Row():
        original_sub = gr.File(label="Original Subtitles")
        translated_sub = gr.File(label="Translated Subtitles")

    process_btn.click(
        fn=process_video,
        inputs=[video_input, source_lang, target_lang],
        outputs=[original_sub, translated_sub],
    )
# Start the web UI only when run as a script (not on import).
if __name__ == "__main__":
    app.launch(server_port=7860, share=True)