nezihtopaloglu committed
Commit dc44a16 · 1 Parent(s): e952ea3

adding checkbox for diffusion

Files changed (1): app.py (+28, -22)
app.py CHANGED

@@ -1,3 +1,5 @@
+
+
 import gradio as gr
 import torch
 import torchaudio
@@ -9,7 +11,7 @@ import os
 from PIL import Image
 
 def estimate_chunk_durations(text, words_per_second=2.5, min_sec=5, max_sec=10):
-    print("Estimating chunk durations...")  # Printing part of the text
+    print("Estimating chunk durations...")
     words = text.split()
     chunks = []
     current_chunk = []
@@ -37,33 +39,35 @@ def generate_speech(text):
     print("Speech generated")
     return wav_path
 
-def generate_images(chunks, image_size=(640, 480)):
+def generate_images(chunks, image_size=(640, 480), use_diffusion=True, num_steps=5):
     print("Generating images...")
-    pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
-    pipe.to("cuda" if torch.cuda.is_available() else "cpu")
     image_paths = []
 
-    # Set number of inference steps to 10 for faster image generation
-    num_inference_steps = 5
-
+    if use_diffusion:
+        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+        pipe.to("cuda" if torch.cuda.is_available() else "cpu")
+
     for i, chunk in enumerate(chunks):
-        print(f"Generating image for chunk {i + 1} of {len(chunks)}: {chunk[:50]}...")  # Printing part of the chunk
-        image = pipe(chunk, num_inference_steps=num_inference_steps).images[0]
-        image = image.resize(image_size)
+        print(f"Generating image for chunk {i + 1} of {len(chunks)}: {chunk[:50]}...")
+        if use_diffusion:
+            image = pipe(chunk, num_inference_steps=num_steps).images[0]
+            image = image.resize(image_size)
+        else:
+            image = Image.new("RGB", image_size, (0, 0, 0))
+
         img_path = f"image_{i}.png"
         image.save(img_path)
         image_paths.append(img_path)
-
+
     return image_paths
 
-
 def create_video(images, durations, speech_path, image_size=(640, 480)):
     clips = []
     for img, dur in zip(images, durations):
-        pil_image = Image.open(img)  # Open the image with PIL
-        pil_image = pil_image.resize(image_size, Image.Resampling.LANCZOS)  # Resize with the new resampling filter
-        img_resized_path = f"resized_{os.path.basename(img)}"  # Temporary file to store resized image
-        pil_image.save(img_resized_path)  # Save resized image to file
+        pil_image = Image.open(img)
+        pil_image = pil_image.resize(image_size, Image.Resampling.LANCZOS)
+        img_resized_path = f"resized_{os.path.basename(img)}"
+        pil_image.save(img_resized_path)
 
         clip = mp.ImageClip(img_resized_path).set_duration(dur)
         clips.append(clip)
@@ -76,28 +80,30 @@ def create_video(images, durations, speech_path, image_size=(640, 480)):
     final_video.write_videofile("output.mp4", fps=24)
     return "output.mp4"
 
-def process_text(text, image_size):
+def process_text(text, image_size, use_diffusion, num_steps):
     chunks = estimate_chunk_durations(text)
     speech_path = generate_speech(text)
-    image_paths = generate_images(chunks, image_size)
+    image_paths = generate_images(chunks, image_size, use_diffusion, num_steps)
     durations = [min(10, max(5, len(chunk.split()) / 2.5)) for chunk in chunks]
     video_path = create_video(image_paths, durations, speech_path, image_size)
     return video_path
 
 with gr.Blocks() as demo:
     gr.Markdown("# Text-to-Video Generator using AI 🎥")
-    text_input = gr.Textbox(label="Enter your text")
+    text_input = gr.Textbox(label="Enter your text", placeholder="Ancient Egypt, one of the most fascinating and enduring civilizations in history, flourished along the Nile River for over 3,000 years. Its civilization developed around 3100 BCE and lasted until Alexander the Great conquered Egypt in 332 BCE. Egypt is famous for its monumental achievements in architecture, art, and culture, many of which have had a lasting influence on the world.")
     file_input = gr.File(label="Or upload a .txt file")
     image_size_input = gr.Radio(choices=["640x480", "800x600", "1024x768"], label="Select Image Size", value="640x480")
+    use_diffusion_input = gr.Checkbox(label="Use Diffusion Images", value=True)
+    num_steps_input = gr.Slider(minimum=1, maximum=50, step=1, value=5, label="Diffusion Model Steps")
     process_btn = gr.Button("Generate Video")
     output_video = gr.Video()
 
-    def handle_request(text, file, image_size):
+    def handle_request(text, file, image_size, use_diffusion, num_steps):
         if file is not None:
             text = open(file.name, "r").read()
         image_size_dict = {"640x480": (640, 480), "800x600": (800, 600), "1024x768": (1024, 768)}
-        return process_text(text, image_size_dict[image_size])
+        return process_text(text, image_size_dict[image_size], use_diffusion, num_steps)
 
-    process_btn.click(handle_request, inputs=[text_input, file_input, image_size_input], outputs=output_video)
+    process_btn.click(handle_request, inputs=[text_input, file_input, image_size_input, use_diffusion_input, num_steps_input], outputs=output_video)
 
 demo.launch()
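
For quick local verification of the new checkbox path, the following is a minimal sketch (not part of this commit) that mimics the use_diffusion=False branch added to generate_images: each chunk gets a plain black placeholder frame instead of a Stable Diffusion render, so clip assembly can be tested on CPU in seconds. It assumes moviepy.editor is the "mp" that app.py uses (the import lines are not shown in this diff); the chunk texts and output filename are made up for illustration.

# Hypothetical test harness, not part of the commit: exercises the
# use_diffusion=False placeholder branch without loading Stable Diffusion.
from PIL import Image
import moviepy.editor as mp  # assumption: app.py's "mp" is moviepy.editor

chunks = ["Ancient Egypt flourished along the Nile.", "Its monuments still stand today."]  # made-up sample chunks
image_size = (640, 480)

image_paths = []
for i, chunk in enumerate(chunks):
    # Same placeholder generate_images() produces when the checkbox is off
    image = Image.new("RGB", image_size, (0, 0, 0))
    img_path = f"image_{i}.png"
    image.save(img_path)
    image_paths.append(img_path)

# Same duration rule as process_text(): ~2.5 words per second, clamped to 5-10 s
durations = [min(10, max(5, len(chunk.split()) / 2.5)) for chunk in chunks]

clips = [mp.ImageClip(p).set_duration(d) for p, d in zip(image_paths, durations)]
mp.concatenate_videoclips(clips, method="compose").write_videofile("placeholder.mp4", fps=24)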