nezihtopaloglu committed
Commit db78349 · verified · 1 Parent(s): 872b88b

Added movie title and optional subtitles

Files changed (1)
app.py +21 -33
app.py CHANGED
@@ -1,5 +1,3 @@
-
-
 import gradio as gr
 import torch
 import torchaudio
@@ -11,7 +9,6 @@ import os
 from PIL import Image
 
 def estimate_chunk_durations(text, words_per_second=2.5, min_sec=5, max_sec=7):
-    print("Estimating chunk durations...")
     words = text.split()
     chunks = []
     current_chunk = []
@@ -26,89 +23,80 @@ def estimate_chunk_durations(text, words_per_second=2.5, min_sec=5, max_sec=7):
             current_duration = 0
     if current_chunk:
         chunks.append(" ".join(current_chunk))
-
-    total_time = sum([min(max(len(chunk.split()) / words_per_second, min_sec), max_sec) for chunk in chunks])
-    print(f"Total estimated time for video: {total_time:.2f} seconds")
     return chunks
 
 def generate_speech(text):
-    print("Generating speech...")
     tts = TTS("tts_models/en/ljspeech/tacotron2-DDC")
-    print("TTS model loaded")
-    tts.tts_to_file(text="Hello world!", file_path="test.wav")
     wav_path = "speech.wav"
     tts.tts_to_file(text=text, file_path=wav_path)
-    print("Speech generated")
     return wav_path
 
 def generate_images(chunks, image_size=(640, 480), use_diffusion=True, num_steps=40):
-    print("Generating images...")
     image_paths = []
-
     if use_diffusion:
         pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
-        def custom_checker(images, **kwargs):
-            return images, [False] * len(images)  # Force all images to be "safe"
-
-        pipe.safety_checker = custom_checker
         pipe.to("cuda" if torch.cuda.is_available() else "cpu")
 
     for i, chunk in enumerate(chunks):
-        print(f"Generating image for chunk {i + 1} of {len(chunks)}: {chunk[:50]}...")
         if use_diffusion:
            image = pipe(chunk, num_inference_steps=num_steps).images[0]
            image = image.resize(image_size)
         else:
            image = Image.new("RGB", image_size, (0, 0, 0))
-
         img_path = f"image_{i}.png"
         image.save(img_path)
         image_paths.append(img_path)
-
     return image_paths
 
-def create_video(images, durations, speech_path, image_size=(640, 480)):
+def create_video(images, durations, speech_path, movie_title, add_subtitles, chunks, image_size=(640, 480)):
     clips = []
-    for img, dur in zip(images, durations):
-        pil_image = Image.open(img)
-        pil_image = pil_image.resize(image_size, Image.Resampling.LANCZOS)
-        frame = np.array(pil_image)  # Convert to NumPy array
-        print(f"Duration: {dur}")
+    title_clip = mp.TextClip(movie_title, fontsize=50, color='white', size=image_size)
+    title_clip = title_clip.set_duration(1).set_position('center')
+    black_start = mp.ColorClip(image_size, color=(0,0,0), duration=1).set_opacity(0.8)
+    clips.append(mp.CompositeVideoClip([black_start, title_clip]))
+
+    for img, dur, chunk in zip(images, durations, chunks):
+        frame = np.array(Image.open(img).resize(image_size, Image.Resampling.LANCZOS))
         clip = mp.ImageClip(frame).set_duration(dur)
+        if add_subtitles:
+            txt_clip = mp.TextClip(chunk, fontsize=30, color='white', size=(image_size[0] - 20, None), method='caption')
+            txt_clip = txt_clip.set_duration(dur).set_position(('center', 'bottom'))
+            clip = mp.CompositeVideoClip([clip, txt_clip])
         clips.append(clip)
-
-    black_start = mp.ColorClip(image_size, color=(0,0,0), duration=1)
+
     black_end = mp.ColorClip(image_size, color=(0,0,0), duration=2)
-    video = mp.concatenate_videoclips([black_start] + clips + [black_end])
+    video = mp.concatenate_videoclips(clips + [black_end])
     audio = mp.AudioFileClip(speech_path)
     final_video = video.set_audio(audio)
     final_video.write_videofile("output.mp4", fps=24)
     return "output.mp4"
 
-def process_text(text, image_size, use_diffusion, num_steps):
+def process_text(text, movie_title, image_size, use_diffusion, num_steps, add_subtitles):
     chunks = estimate_chunk_durations(text)
     speech_path = generate_speech(text)
     image_paths = generate_images(chunks, image_size, use_diffusion, num_steps)
     durations = [min(10, max(5, len(chunk.split()) / 2.5)) for chunk in chunks]
-    video_path = create_video(image_paths, durations, speech_path, image_size)
+    video_path = create_video(image_paths, durations, speech_path, movie_title, add_subtitles, chunks, image_size)
     return video_path
 
 with gr.Blocks() as demo:
     gr.Markdown("# Text-to-Video Generator using AI 🎥")
     text_input = gr.Textbox(label="Enter your text")
+    movie_title_input = gr.Textbox(label="Movie Title", value="")
     file_input = gr.File(label="Or upload a .txt file")
     image_size_input = gr.Radio(choices=["640x480", "800x600", "1024x768"], label="Select Image Size", value="640x480")
     use_diffusion_input = gr.Checkbox(label="Use Diffusion Images", value=True)
     num_steps_input = gr.Slider(minimum=1, maximum=50, step=1, value=5, label="Diffusion Model Steps")
+    add_subtitles_input = gr.Checkbox(label="Add Subtitles", value=False)
     process_btn = gr.Button("Generate Video")
     output_video = gr.Video()
 
-    def handle_request(text, file, image_size, use_diffusion, num_steps):
+    def handle_request(text, movie_title, file, image_size, use_diffusion, num_steps, add_subtitles):
         if file is not None:
             text = open(file.name, "r").read()
         image_size_dict = {"640x480": (640, 480), "800x600": (800, 600), "1024x768": (1024, 768)}
-        return process_text(text, image_size_dict[image_size], use_diffusion, num_steps)
+        return process_text(text, movie_title, image_size_dict[image_size], use_diffusion, num_steps, add_subtitles)
 
-    process_btn.click(handle_request, inputs=[text_input, file_input, image_size_input, use_diffusion_input, num_steps_input], outputs=output_video)
+    process_btn.click(handle_request, inputs=[text_input, movie_title_input, file_input, image_size_input, use_diffusion_input, num_steps_input, add_subtitles_input], outputs=output_video)
 
 demo.launch()
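
For context on the new title card and subtitle logic, it is plain MoviePy compositing: a TextClip layered over a dark ColorClip for the 1-second intro, and an optional caption TextClip layered over each image clip. Below is a minimal, standalone sketch of that pattern, assuming MoviePy 1.x (moviepy.editor) with ImageMagick available for TextClip; the title text, caption text, and placeholder frame are illustrative stand-ins, not values from the app.

import numpy as np
import moviepy.editor as mp
from PIL import Image

image_size = (640, 480)

# 1-second title card: centered TextClip composited over a dark ColorClip.
title = mp.TextClip("My Movie", fontsize=50, color="white", size=image_size)
title = title.set_duration(1).set_position("center")
backdrop = mp.ColorClip(image_size, color=(0, 0, 0), duration=1).set_opacity(0.8)
title_card = mp.CompositeVideoClip([backdrop, title])

# One content clip with an optional caption pinned to the bottom edge.
frame = np.array(Image.new("RGB", image_size, (30, 30, 30)))  # stand-in image
clip = mp.ImageClip(frame).set_duration(5)
caption = mp.TextClip("First chunk of narration", fontsize=30, color="white",
                      size=(image_size[0] - 20, None), method="caption")
caption = caption.set_duration(5).set_position(("center", "bottom"))
clip = mp.CompositeVideoClip([clip, caption])

# Concatenate and render, mirroring the write_videofile call in app.py.
mp.concatenate_videoclips([title_card, clip]).write_videofile("demo.mp4", fps=24)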