Spaces · Running on Zero
Commit dc44a16 (1 parent: e952ea3): adding checkbox for diffusion

app.py CHANGED
@@ -1,3 +1,5 @@
+
+
 import gradio as gr
 import torch
 import torchaudio
@@ -9,7 +11,7 @@ import os
 from PIL import Image
 
 def estimate_chunk_durations(text, words_per_second=2.5, min_sec=5, max_sec=10):
-    print("Estimating chunk durations...")
+    print("Estimating chunk durations...")
     words = text.split()
     chunks = []
     current_chunk = []
@@ -37,33 +39,35 @@ def generate_speech(text):
     print("Speech generated")
     return wav_path
 
-def generate_images(chunks, image_size=(640, 480)):
+def generate_images(chunks, image_size=(640, 480), use_diffusion=True, num_steps=5):
     print("Generating images...")
-    pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
-    pipe.to("cuda" if torch.cuda.is_available() else "cpu")
     image_paths = []
 
-
-
-
+    if use_diffusion:
+        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+        pipe.to("cuda" if torch.cuda.is_available() else "cpu")
+
     for i, chunk in enumerate(chunks):
-        print(f"Generating image for chunk {i + 1} of {len(chunks)}: {chunk[:50]}...")
-
-
+        print(f"Generating image for chunk {i + 1} of {len(chunks)}: {chunk[:50]}...")
+        if use_diffusion:
+            image = pipe(chunk, num_inference_steps=num_steps).images[0]
+            image = image.resize(image_size)
+        else:
+            image = Image.new("RGB", image_size, (0, 0, 0))
+
         img_path = f"image_{i}.png"
         image.save(img_path)
         image_paths.append(img_path)
-
+
     return image_paths
 
-
 def create_video(images, durations, speech_path, image_size=(640, 480)):
     clips = []
     for img, dur in zip(images, durations):
-        pil_image = Image.open(img)
-        pil_image = pil_image.resize(image_size, Image.Resampling.LANCZOS)
-        img_resized_path = f"resized_{os.path.basename(img)}"
-        pil_image.save(img_resized_path)
+        pil_image = Image.open(img)
+        pil_image = pil_image.resize(image_size, Image.Resampling.LANCZOS)
+        img_resized_path = f"resized_{os.path.basename(img)}"
+        pil_image.save(img_resized_path)
 
         clip = mp.ImageClip(img_resized_path).set_duration(dur)
         clips.append(clip)
@@ -76,28 +80,30 @@ def create_video(images, durations, speech_path, image_size=(640, 480)):
     final_video.write_videofile("output.mp4", fps=24)
     return "output.mp4"
 
-def process_text(text, image_size):
+def process_text(text, image_size, use_diffusion, num_steps):
    chunks = estimate_chunk_durations(text)
    speech_path = generate_speech(text)
-    image_paths = generate_images(chunks, image_size)
+    image_paths = generate_images(chunks, image_size, use_diffusion, num_steps)
    durations = [min(10, max(5, len(chunk.split()) / 2.5)) for chunk in chunks]
    video_path = create_video(image_paths, durations, speech_path, image_size)
    return video_path
 
 with gr.Blocks() as demo:
     gr.Markdown("# Text-to-Video Generator using AI 🎥")
-    text_input = gr.Textbox(label="Enter your text")
+    text_input = gr.Textbox(label="Enter your text", placeholder="Ancient Egypt, one of the most fascinating and enduring civilizations in history, flourished along the Nile River for over 3,000 years. Its civilization developed around 3100 BCE and lasted until Alexander the Great conquered Egypt in 332 BCE. Egypt is famous for its monumental achievements in architecture, art, and culture, many of which have had a lasting influence on the world.")
     file_input = gr.File(label="Or upload a .txt file")
     image_size_input = gr.Radio(choices=["640x480", "800x600", "1024x768"], label="Select Image Size", value="640x480")
+    use_diffusion_input = gr.Checkbox(label="Use Diffusion Images", value=True)
+    num_steps_input = gr.Slider(minimum=1, maximum=50, step=1, value=5, label="Diffusion Model Steps")
     process_btn = gr.Button("Generate Video")
     output_video = gr.Video()
 
-    def handle_request(text, file, image_size):
+    def handle_request(text, file, image_size, use_diffusion, num_steps):
         if file is not None:
             text = open(file.name, "r").read()
         image_size_dict = {"640x480": (640, 480), "800x600": (800, 600), "1024x768": (1024, 768)}
-        return process_text(text, image_size_dict[image_size])
+        return process_text(text, image_size_dict[image_size], use_diffusion, num_steps)
 
-    process_btn.click(handle_request, inputs=[text_input, file_input, image_size_input], outputs=output_video)
+    process_btn.click(handle_request, inputs=[text_input, file_input, image_size_input, use_diffusion_input, num_steps_input], outputs=output_video)
 
 demo.launch()
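For reference, the diffusion branch introduced by this commit can be exercised outside the Gradio UI. The sketch below is a minimal reconstruction, not part of the commit: it reuses the same model id and the slider's default of 5 inference steps, and assumes the torch, diffusers, and Pillow packages that app.py already depends on are installed.

# Minimal sketch (not part of this commit): the new diffusion branch in isolation.
# Assumes torch, diffusers, and Pillow are installed, as app.py already requires.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
pipe.to("cuda" if torch.cuda.is_available() else "cpu")

# num_inference_steps mirrors the new "Diffusion Model Steps" slider (1-50, default 5);
# fewer steps run faster at the cost of image quality.
image = pipe("Ancient Egypt flourished along the Nile", num_inference_steps=5).images[0]
image = image.resize((640, 480))  # matches the app's default frame size
image.save("image_0.png")

When the "Use Diffusion Images" checkbox is unchecked, none of this runs: generate_images skips loading the pipeline entirely and writes a black PIL placeholder frame per chunk instead, which keeps the CPU-only path fast.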