Spaces:
Sleeping
Sleeping
import os | |
import google.generativeai as genai | |
from moviepy.video.io.VideoFileClip import VideoFileClip | |
import tempfile | |
import logging | |
import gradio as gr | |
# Keep moviepy quiet: only ERROR-level records from its logger get through.
logging.getLogger("moviepy").setLevel(logging.ERROR)

# Authenticate the Gemini client. The key is read from the environment, so a
# missing GEMINI_API_KEY fails here, at import time, with a KeyError.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Sampling settings attached to `model` below.
generation_config = dict(
    temperature=0.7,
    top_p=0.9,
    top_k=40,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

# Shared Gemini model handle used for both transcription and translation.
model = genai.GenerativeModel(
    model_name="gemini-2.0-pro-exp-02-05",
    generation_config=generation_config,
)
# Languages offered in the UI dropdowns, in display order.
# The first entry, "Auto Detect", is a UI option rather than a language; the
# "Translate To" dropdown slices it off with SUPPORTED_LANGUAGES[1:].
SUPPORTED_LANGUAGES = [
    "Auto Detect", "English", "Chinese", "German", "Spanish",
    "Russian", "Korean", "French", "Japanese", "Portuguese",
    "Turkish", "Polish", "Catalan", "Dutch", "Arabic",
    "Swedish", "Italian", "Indonesian", "Hindi", "Finnish",
    "Vietnamese", "Hebrew", "Ukrainian", "Greek", "Malay",
    "Czech", "Romanian", "Danish", "Hungarian", "Tamil",
    "Norwegian", "Thai", "Urdu", "Croatian", "Bulgarian",
    "Lithuanian", "Latin", "Maori", "Malayalam", "Welsh",
    "Slovak", "Telugu", "Persian", "Latvian", "Bengali",
    "Serbian", "Azerbaijani", "Slovenian", "Kannada", "Estonian",
    "Macedonian", "Breton", "Basque", "Icelandic", "Armenian",
    "Nepali", "Mongolian", "Bosnian", "Kazakh", "Albanian",
    "Swahili", "Galician", "Marathi", "Punjabi", "Sinhala",
    "Khmer", "Shona", "Yoruba", "Somali", "Afrikaans",
    "Occitan", "Georgian", "Belarusian", "Tajik", "Sindhi",
    "Gujarati", "Amharic", "Yiddish", "Lao", "Uzbek",
    "Faroese", "Haitian Creole", "Pashto", "Turkmen", "Nynorsk",
    "Maltese", "Sanskrit", "Luxembourgish", "Burmese", "Tibetan",
    "Tagalog", "Malagasy", "Assamese", "Tatar", "Hawaiian",
    "Lingala", "Hausa", "Bashkir", "Javanese", "Sundanese",
]
def extract_audio_from_video(video_file):
    """Extract the audio track of a video into a temporary 16 kHz WAV file.

    Args:
        video_file: Path of the video to read.

    Returns:
        Path of the newly written WAV file. The caller owns the file and is
        responsible for deleting it.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_file)
    try:
        if video.audio is None:
            raise ValueError("The video has no audio track to transcribe.")

        # A unique name per call keeps concurrent requests from overwriting
        # each other's audio (a single fixed filename would be shared).
        fd, audio_file = tempfile.mkstemp(prefix="extracted_audio_", suffix=".wav")
        os.close(fd)
        try:
            video.audio.write_audiofile(audio_file, fps=16000, logger=None)  # Suppress logs
        except Exception:
            # Do not leave a partial file behind when the export fails.
            if os.path.exists(audio_file):
                os.remove(audio_file)
            raise
    finally:
        # Release the reader resources held by the clip.
        video.close()
    return audio_file
def transcribe_audio_with_gemini(audio_file):
    """Transcribe a WAV file with Gemini, one timestamped sentence per line.

    The whole file is read into memory and sent inline together with the
    instructions in a single request, so the model sees the audio and the
    prompt at the same time.

    Args:
        audio_file: Path of the WAV file to transcribe.

    Returns:
        The model's text reply with surrounding whitespace stripped. The
        prompt asks for lines shaped like "[HH:MM:SS] sentence"; the reply is
        not validated here.
    """
    with open(audio_file, "rb") as f:
        audio_data = f.read()

    # Inline audio part in the blob form accepted by the Gemini SDK.
    audio_blob = {
        'mime_type': 'audio/wav',
        'data': audio_data
    }

    # Magic prompt for transcription with timestamps
    prompt = """
You are a professional transcriber. Transcribe this audio accurately and verbatim in the original language.
Include timestamps for each sentence in the following format:
[HH:MM:SS] Sentence 1
[HH:MM:SS] Sentence 2
don't change any in format
...
Ensure the timestamps are accurate and correspond to the start of each sentence.
Respond only with the transcription and timestamps. Do not add explanations or extra text.
"""

    # One multimodal request: sending the prompt as its own chat turn first
    # would cost an extra model call that has no audio to work with.
    response = model.generate_content([prompt, audio_blob])
    return response.text.strip()
def _timestamp_to_ms(stamp):
    """Parse "HH:MM:SS" or "MM:SS" (optional fractional seconds) into milliseconds.

    Returns None when the text is not a usable timestamp.
    """
    fields = stamp.strip().split(":")
    if not 2 <= len(fields) <= 3:
        return None
    whole, _, fraction = fields[-1].replace(",", ".").partition(".")
    try:
        units = [int(field) for field in fields[:-1]] + [int(whole)]
        millis = int(fraction.ljust(3, "0")[:3]) if fraction else 0
    except ValueError:
        return None
    if min(units) < 0 or millis < 0:
        return None
    total_seconds = 0
    for unit in units:
        total_seconds = total_seconds * 60 + unit
    return total_seconds * 1000 + millis


def _srt_timestamp(total_ms):
    """Format a millisecond offset as an SRT timestamp, "HH:MM:SS,mmm"."""
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"


def generate_subtitles(transcription, default_duration=5):
    """Generate SRT subtitles from a transcription with "[HH:MM:SS] text" lines.

    Args:
        transcription: Text with one sentence per line, each optionally
            prefixed by a bracketed timestamp. Blank lines are ignored. A line
            without a parseable timestamp becomes a cue starting at 00:00:00.
        default_duration: Seconds a cue stays on screen when the next cue does
            not start sooner.

    Returns:
        The SRT document as a string; empty when there is nothing to show.
        Cues are numbered consecutively from 1, and a cue ends at the next
        cue's start when that comes before the default duration elapses.
    """
    cues = []
    for raw_line in transcription.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        start_ms, text = 0, line
        if line.startswith("["):
            # partition keeps any later "]" characters inside the text.
            stamp, closing, remainder = line[1:].partition("]")
            parsed = _timestamp_to_ms(stamp) if closing else None
            if parsed is not None:
                start_ms, text = parsed, remainder.strip()
        if text:
            cues.append((start_ms, text))

    duration_ms = int(default_duration * 1000)
    blocks = []
    for index, (start_ms, text) in enumerate(cues, start=1):
        end_ms = start_ms + duration_ms
        if index < len(cues):
            # `index` is 1-based, so cues[index] is the following cue.
            next_start = cues[index][0]
            if start_ms < next_start < end_ms:
                end_ms = next_start  # avoid overlapping the next subtitle
        blocks.append(
            f"{index}\n{_srt_timestamp(start_ms)} --> {_srt_timestamp(end_ms)}\n{text}\n\n"
        )
    return "".join(blocks)
def time_to_seconds(time_str):
    """Convert a colon-separated timestamp to a whole number of seconds.

    Accepts "HH:MM:SS" as well as the shorter "MM:SS" and "SS" forms.

    Args:
        time_str: Timestamp text made of integer fields separated by ":".

    Returns:
        The total number of seconds as an int.

    Raises:
        ValueError: If there are more than three fields or a field is not an
            integer.
    """
    parts = time_str.strip().split(":")
    if len(parts) > 3:
        raise ValueError(f"Unsupported timestamp format: {time_str!r}")
    total = 0
    # Each field to the left is worth 60x the one on its right.
    for part in parts:
        total = total * 60 + int(part)
    return total
def seconds_to_time(seconds):
    """Format a number of seconds as a zero-padded "HH:MM:SS" string."""
    hours, leftover = divmod(seconds, 3600)
    minutes, secs = divmod(leftover, 60)
    return f"{hours:02}:{minutes:02}:{secs:02}"
def translate_srt(srt_text, target_language):
    """Ask Gemini to translate SRT subtitles while keeping the SRT layout.

    Args:
        srt_text: The SRT document to translate.
        target_language: Name of the language to translate into.

    Returns:
        The model's reply text, returned as-is.
    """
    # Instructions plus the subtitles themselves, sent as one text request.
    request = f"""
Translate the following SRT subtitles into {target_language}.
Preserve the SRT format (timestamps and structure).
Translate only the text after the timestamp.
Do not add explanations or extra text.
Ensure the translation is accurate and culturally appropriate.
Here is the SRT file:
{srt_text}
"""
    return model.generate_content(request).text
def process_video(video_file, language="Auto Detect", translate_to=None):
    """Process a video file to generate and optionally translate subtitles.

    Args:
        video_file: Path of the uploaded video.
        language: Source language chosen in the UI. Currently not forwarded to
            the transcription step.
        translate_to: Target language name; None or "None" skips translation.

    Returns:
        A tuple (original_srt_path, translated_srt_path_or_None, status_text).

    Raises:
        gr.Error: If no video was provided.
    """
    if not video_file:
        raise gr.Error("Please upload a video file before generating subtitles.")

    # Extract audio from the video
    audio_file = extract_audio_from_video(video_file)
    try:
        # Transcribe audio using Gemini
        transcription = transcribe_audio_with_gemini(audio_file)
    finally:
        # Remove the extracted audio even when transcription fails.
        try:
            os.remove(audio_file)
        except FileNotFoundError:
            pass

    # Generate subtitles
    subtitles = generate_subtitles(transcription)

    # A fresh directory per request keeps the download names stable while
    # preventing concurrent requests from overwriting each other's files.
    output_dir = tempfile.mkdtemp(prefix="autosubgen_")

    # Save original subtitles to an SRT file
    original_srt_file = os.path.join(output_dir, "original_subtitles.srt")
    with open(original_srt_file, "w", encoding="utf-8") as f:
        f.write(subtitles)

    # Translate subtitles if a target language is provided
    translated_srt_file = None
    if translate_to and translate_to != "None":
        translated_subtitles = translate_srt(subtitles, translate_to)
        translated_srt_file = os.path.join(output_dir, "translated_subtitles.srt")
        with open(translated_srt_file, "w", encoding="utf-8") as f:
            f.write(translated_subtitles)

    # NOTE(review): the status text is fixed; no language detection result is
    # available from the transcription step.
    return original_srt_file, translated_srt_file, "Detected Language: Auto"
# Gradio front end: one tab with the upload, the two language pickers, a
# trigger button and the three result widgets.
# NOTE(review): the container nesting below follows the `with` statements in
# their original order — confirm the layout renders as intended.
with gr.Blocks(title="AutoSubGen - AI Video Subtitle Generator") as demo:
    # Page header
    with gr.Column():
        gr.Markdown("# 🎥 AutoSubGen")
        gr.Markdown("### AI-Powered Video Subtitle Generator")
        gr.Markdown("Automatically generate and translate subtitles for your videos in **SRT format**. Supports **100+ languages** and **auto-detection**.")

    # Working area
    with gr.Tab("Generate Subtitles"):
        gr.Markdown("### Upload a video file to generate subtitles.")

        # Inputs, side by side
        with gr.Row():
            video_input = gr.Video(label="Upload Video File", scale=2)
            language_dropdown = gr.Dropdown(
                label="Select Language",
                choices=SUPPORTED_LANGUAGES,
                value="Auto Detect",
                scale=1,
            )
            translate_to_dropdown = gr.Dropdown(
                label="Translate To",
                # "Auto Detect" is not a valid translation target, so drop it.
                choices=["None"] + SUPPORTED_LANGUAGES[1:],
                value="None",
                scale=1,
            )

        generate_button = gr.Button("Generate Subtitles", variant="primary")

        # Results, side by side
        with gr.Row():
            original_subtitle_output = gr.File(label="Download Original Subtitles (SRT)")
            translated_subtitle_output = gr.File(label="Download Translated Subtitles (SRT)")
            detected_language_output = gr.Textbox(label="Detected Language")

    # Wire the button to the processing pipeline; the order of inputs and
    # outputs matches process_video's parameters and return tuple.
    generate_button.click(
        fn=process_video,
        inputs=[video_input, language_dropdown, translate_to_dropdown],
        outputs=[
            original_subtitle_output,
            translated_subtitle_output,
            detected_language_output,
        ],
    )

# Start the app and request a public share link.
demo.launch(share=True)