Added movie title and optional subtitles
app.py CHANGED
@@ -1,5 +1,3 @@
-
-
 import gradio as gr
 import torch
 import torchaudio
@@ -11,7 +9,6 @@ import os
 from PIL import Image
 
 def estimate_chunk_durations(text, words_per_second=2.5, min_sec=5, max_sec=7):
-    print("Estimating chunk durations...")
     words = text.split()
     chunks = []
     current_chunk = []
@@ -26,89 +23,80 @@ def estimate_chunk_durations(text, words_per_second=2.5, min_sec=5, max_sec=7):
             current_duration = 0
     if current_chunk:
         chunks.append(" ".join(current_chunk))
-
-    total_time = sum([min(max(len(chunk.split()) / words_per_second, min_sec), max_sec) for chunk in chunks])
-    print(f"Total estimated time for video: {total_time:.2f} seconds")
     return chunks
 
 def generate_speech(text):
-    print("Generating speech...")
     tts = TTS("tts_models/en/ljspeech/tacotron2-DDC")
-    print("TTS model loaded")
-    tts.tts_to_file(text="Hello world!", file_path="test.wav")
     wav_path = "speech.wav"
     tts.tts_to_file(text=text, file_path=wav_path)
-    print("Speech generated")
     return wav_path
 
 def generate_images(chunks, image_size=(640, 480), use_diffusion=True, num_steps=40):
-    print("Generating images...")
     image_paths = []
-
     if use_diffusion:
         pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
-        def custom_checker(images, **kwargs):
-            return images, [False] * len(images)  # Force all images to be "safe"
-
-        pipe.safety_checker = custom_checker
         pipe.to("cuda" if torch.cuda.is_available() else "cpu")
 
     for i, chunk in enumerate(chunks):
-        print(f"Generating image for chunk {i + 1} of {len(chunks)}: {chunk[:50]}...")
         if use_diffusion:
             image = pipe(chunk, num_inference_steps=num_steps).images[0]
             image = image.resize(image_size)
         else:
             image = Image.new("RGB", image_size, (0, 0, 0))
-
         img_path = f"image_{i}.png"
         image.save(img_path)
         image_paths.append(img_path)
-
     return image_paths
 
-def create_video(images, durations, speech_path, image_size=(640, 480)):
+def create_video(images, durations, speech_path, movie_title, add_subtitles, chunks, image_size=(640, 480)):
     clips = []
-
-
-
-
-
+    title_clip = mp.TextClip(movie_title, fontsize=50, color='white', size=image_size)
+    title_clip = title_clip.set_duration(1).set_position('center')
+    black_start = mp.ColorClip(image_size, color=(0,0,0), duration=1).set_opacity(0.8)
+    clips.append(mp.CompositeVideoClip([black_start, title_clip]))
+
+    for img, dur, chunk in zip(images, durations, chunks):
+        frame = np.array(Image.open(img).resize(image_size, Image.Resampling.LANCZOS))
         clip = mp.ImageClip(frame).set_duration(dur)
+        if add_subtitles:
+            txt_clip = mp.TextClip(chunk, fontsize=30, color='white', size=(image_size[0] - 20, None), method='caption')
+            txt_clip = txt_clip.set_duration(dur).set_position(('center', 'bottom'))
+            clip = mp.CompositeVideoClip([clip, txt_clip])
         clips.append(clip)
-
-    black_start = mp.ColorClip(image_size, color=(0,0,0), duration=1)
+
     black_end = mp.ColorClip(image_size, color=(0,0,0), duration=2)
-    video = mp.concatenate_videoclips(
+    video = mp.concatenate_videoclips(clips + [black_end])
    audio = mp.AudioFileClip(speech_path)
    final_video = video.set_audio(audio)
    final_video.write_videofile("output.mp4", fps=24)
    return "output.mp4"
 
-def process_text(text, image_size, use_diffusion, num_steps):
+def process_text(text, movie_title, image_size, use_diffusion, num_steps, add_subtitles):
     chunks = estimate_chunk_durations(text)
     speech_path = generate_speech(text)
     image_paths = generate_images(chunks, image_size, use_diffusion, num_steps)
     durations = [min(10, max(5, len(chunk.split()) / 2.5)) for chunk in chunks]
-    video_path = create_video(image_paths, durations, speech_path, image_size)
+    video_path = create_video(image_paths, durations, speech_path, movie_title, add_subtitles, chunks, image_size)
     return video_path
 
 with gr.Blocks() as demo:
     gr.Markdown("# Text-to-Video Generator using AI 🎥")
     text_input = gr.Textbox(label="Enter your text")
+    movie_title_input = gr.Textbox(label="Movie Title", value="")
     file_input = gr.File(label="Or upload a .txt file")
     image_size_input = gr.Radio(choices=["640x480", "800x600", "1024x768"], label="Select Image Size", value="640x480")
     use_diffusion_input = gr.Checkbox(label="Use Diffusion Images", value=True)
     num_steps_input = gr.Slider(minimum=1, maximum=50, step=1, value=5, label="Diffusion Model Steps")
+    add_subtitles_input = gr.Checkbox(label="Add Subtitles", value=False)
     process_btn = gr.Button("Generate Video")
     output_video = gr.Video()
 
-    def handle_request(text, file, image_size, use_diffusion, num_steps):
+    def handle_request(text, movie_title, file, image_size, use_diffusion, num_steps, add_subtitles):
         if file is not None:
             text = open(file.name, "r").read()
         image_size_dict = {"640x480": (640, 480), "800x600": (800, 600), "1024x768": (1024, 768)}
-        return process_text(text, image_size_dict[image_size], use_diffusion, num_steps)
+        return process_text(text, movie_title, image_size_dict[image_size], use_diffusion, num_steps, add_subtitles)
 
-    process_btn.click(handle_request, inputs=[text_input, file_input, image_size_input, use_diffusion_input, num_steps_input], outputs=output_video)
+    process_btn.click(handle_request, inputs=[text_input, movie_title_input, file_input, image_size_input, use_diffusion_input, num_steps_input, add_subtitles_input], outputs=output_video)
 
 demo.launch()
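The title-card and subtitle logic added in this commit can be tried in isolation with a minimal sketch, not part of the commit, that uses the same MoviePy calls as the new create_video: it assumes the MoviePy 1.x moviepy.editor API, an ImageMagick install (required by TextClip), and a pre-existing frame file named image_0.png such as the ones generate_images writes.

# Minimal sketch (assumptions: moviepy 1.x, ImageMagick for TextClip, image_0.png exists)
import moviepy.editor as mp

size = (640, 480)

# 1-second title card: centered white text composited over a black background clip.
title = mp.TextClip("My Movie", fontsize=50, color="white", size=size)
title = title.set_duration(1).set_position("center")
background = mp.ColorClip(size, color=(0, 0, 0), duration=1)
title_card = mp.CompositeVideoClip([background, title])

# One still image shown for 5 seconds with a bottom-aligned, word-wrapped caption.
image_clip = mp.ImageClip("image_0.png").set_duration(5)
caption = mp.TextClip("First chunk of narration", fontsize=30, color="white",
                      size=(size[0] - 20, None), method="caption")
caption = caption.set_duration(5).set_position(("center", "bottom"))
subtitled = mp.CompositeVideoClip([image_clip, caption])

# Chain the title card and the subtitled frame into a short preview video.
final = mp.concatenate_videoclips([title_card, subtitled])
final.write_videofile("preview.mp4", fps=24)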