AZILS committed (verified)
Commit ce8dacb · 1 Parent(s): e591a65

Update app.py

Files changed (1)
  1. app.py +398 -353
app.py CHANGED
@@ -6,7 +6,8 @@ import random
 import tempfile
 import requests
 import numpy as np
-from PIL import Image
+import uuid
+from PIL import Image, ImageDraw, ImageFont
 from io import BytesIO
 from datetime import datetime
 import gradio as gr
@@ -16,19 +17,28 @@ from moviepy.editor import *
 from moviepy.audio.fx.all import volumex
 from moviepy.video.fx.all import crop

+# Suppress the asyncio "Event loop is closed" warning on Windows
+import sys
+if sys.platform.startswith('win'):
+    import asyncio
+    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+
 # Load environment variables from .env file if present
 load_dotenv()

-# Constants
+# Directory structure constants
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+STATIC_DIR = os.path.join(BASE_DIR, "static")
+MUSIC_DIR = os.path.join(STATIC_DIR, "music")
+FONTS_DIR = os.path.join(STATIC_DIR, "fonts")
+# Use temp directory for faster file operations
 CACHE_DIR = os.path.join(tempfile.gettempdir(), "yt_shorts_generator")
-ASSETS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")
-MUSIC_DIR = os.path.join(ASSETS_DIR, "background_music")
-FONTS_DIR = os.path.join(ASSETS_DIR, "fonts")

 # Create necessary directories
-os.makedirs(CACHE_DIR, exist_ok=True)
+os.makedirs(STATIC_DIR, exist_ok=True)
 os.makedirs(MUSIC_DIR, exist_ok=True)
 os.makedirs(FONTS_DIR, exist_ok=True)
+os.makedirs(CACHE_DIR, exist_ok=True)

 # Helper functions for logging
 def info(message):
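Side note on the new Windows guard at the top of the file: a minimal, self-contained sketch (assuming Python 3.8+, where `WindowsSelectorEventLoopPolicy` exists) of why the policy switch matters. The default proactor loop on Windows can log "Event loop is closed" when transports are garbage-collected after `asyncio.run()` returns, which is exactly the pattern the TTS code uses.

```python
# Minimal sketch, not from the commit: the same policy switch in isolation.
import sys
import asyncio

if sys.platform.startswith("win"):
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

async def main() -> None:
    await asyncio.sleep(0)  # stand-in for a network-backed call such as edge-tts

asyncio.run(main())  # with the selector policy, teardown is quiet on Windows
```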
@@ -75,7 +85,7 @@ def get_font_files():
     if not font_files:
         return ["default"]

-    return ["default"] + font_files
+    return ["random"] + font_files

 def choose_random_music():
     """Selects a random music file from the music directory."""
@@ -90,6 +100,19 @@ def choose_random_music():

     return os.path.join(MUSIC_DIR, random.choice(music_files))

+def choose_random_font():
+    """Selects a random font file from the fonts directory."""
+    if not os.path.exists(FONTS_DIR):
+        error(f"Fonts directory {FONTS_DIR} does not exist")
+        return "default"
+
+    font_files = [f for f in os.listdir(FONTS_DIR) if f.endswith(('.ttf', '.otf'))]
+    if not font_files:
+        warning(f"No font files found in {FONTS_DIR}")
+        return None
+
+    return font_files[0].split('.')[0] if len(font_files) == 1 else random.choice([f.split('.')[0] for f in font_files])
+
 class YouTube:
     def __init__(self, niche: str, language: str,
                  text_gen="g4f", text_model="gpt-4",
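One caveat worth flagging on the new `choose_random_font`: `split('.')[0]` keeps only the text before the first dot, so multi-dot font filenames lose part of their stem. A quick sketch with hypothetical filenames, using `os.path.splitext` as the safer alternative:

```python
# Hypothetical filenames, not from the repo's assets.
import os

names = ["Roboto.ttf", "Open.Sans.ttf"]
print([n.split(".")[0] for n in names])         # ['Roboto', 'Open']  <- truncated
print([os.path.splitext(n)[0] for n in names])  # ['Roboto', 'Open.Sans']
```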
@@ -211,20 +234,17 @@ class YouTube:
                 ).choices[0].message.content

             else:
-                # Default to g4f if other methods aren't available
-                self.log(f"Using default G4F model as fallback")
-                import g4f
-                response = g4f.ChatCompletion.create(
-                    model="gpt-3.5-turbo",
-                    messages=[{"role": "user", "content": prompt}]
-                )
+                # No fallback, raise an exception for unsupported text generator
+                error_msg = f"Unsupported text generator: {self.text_gen}"
+                self.log(error(error_msg))
+                raise ValueError(error_msg)

             self.log(f"Response generated successfully, length: {len(response)} characters")
             return response

         except Exception as e:
             error_msg = f"Error generating response: {str(e)}"
-            self.log(error_msg)
+            self.log(error(error_msg))
             raise Exception(error_msg)

     def generate_topic(self) -> str:
@@ -281,8 +301,8 @@ class YouTube:
             raise Exception("Failed to generate a script. Please try again.")

         if len(completion) > 5000:
-            self.log(warning("Generated Script is too long. Retrying..."))
-            return self.generate_script()
+            self.log(warning("Generated script is too long."))
+            raise ValueError("Generated script exceeds 5000 characters. Please try again.")

         self.script = completion
         self.log(success(f"Generated script ({len(completion)} chars)"))
@@ -299,8 +319,8 @@ class YouTube:
         )

         if len(title) > 100:
-            self.log(warning("Generated Title is too long. Retrying..."))
-            return self.generate_metadata()
+            self.log(warning("Generated title exceeds 100 characters."))
+            raise ValueError("Generated title exceeds 100 characters. Please try again.")

         description = self.generate_response(
             f"Please generate a YouTube Video Description for the following script: {self.script}. "
@@ -370,15 +390,8 @@ class YouTube:
         r = re.compile(r"\[.*\]", re.DOTALL)
         matches = r.findall(completion)
         if len(matches) == 0:
-            self.log(warning("Failed to extract array. Creating generic image prompts."))
-            # Create generic prompts based on the subject
-            image_prompts = [
-                f"A beautiful image showing {self.subject}, photorealistic",
-                f"A detailed visualization of {self.subject}, high quality",
-                f"An artistic representation of {self.subject}, vibrant colors",
-                f"A photorealistic image about {self.subject}, high resolution",
-                f"A dramatic scene related to {self.subject}, cinema quality"
-            ]
+            self.log(warning("Failed to extract array. Unable to create image prompts."))
+            raise ValueError("Failed to generate valid image prompts. Please try again.")
         else:
             try:
                 image_prompts = json.loads(matches[0])
@@ -390,15 +403,13 @@ class YouTube:
                 if strings:
                     image_prompts = strings
                 else:
-                    # Last resort - split by commas and clean up
-                    image_prompts = [
-                        s.strip().strip('"').strip("'")
-                        for s in matches[0].strip('[]').split(',')
-                    ]
+                    self.log(error("Failed to extract strings from regex match."))
+                    raise ValueError("Failed to parse image prompts. Please try again.")

         # Ensure we have the requested number of prompts
-        while len(image_prompts) < count:
-            image_prompts.append(f"A high-quality image about {self.subject}")
+        if len(image_prompts) < count:
+            self.log(warning(f"Received fewer prompts ({len(image_prompts)}) than requested ({count})."))
+            raise ValueError(f"Received only {len(image_prompts)} prompts instead of {count}. Please try again.")

         # Limit to the requested count
         image_prompts = image_prompts[:count]
@@ -414,9 +425,10 @@ class YouTube:
         """Generate an image using the selected image generation model."""
         self.log(f"Generating image for prompt: {prompt[:50]}...")

+        # Use simpler file naming for speed
+        image_path = os.path.join(CACHE_DIR, f"img_{len(self.images)}_{int(time.time())}.png")
+
         try:
-            image_path = os.path.join(CACHE_DIR, f"img_{len(self.images)}_{int(time.time())}.png")
-
             if self.image_gen == "prodia":
                 self.log("Using Prodia provider for image generation")
                 s = requests.Session()
@@ -496,31 +508,28 @@ class YouTube:

             elif self.image_gen == "g4f":
                 self.log("Using G4F provider for image generation")
-                try:
-                    from g4f.client import Client
-                    client = Client()
-                    response = client.images.generate(
-                        model=self.image_model,
-                        prompt=prompt,
-                        response_format="url"
-                    )
-
-                    if response and response.data and len(response.data) > 0:
-                        image_url = response.data[0].url
-                        image_response = requests.get(image_url)
-
-                        if image_response.status_code == 200:
-                            with open(image_path, "wb") as f:
-                                f.write(image_response.content)
-                            self.images.append(image_path)
-                            self.log(success(f"Image saved to: {image_path}"))
-                            return image_path
-                        else:
-                            raise Exception(f"Failed to download image from {image_url}")
-                    else:
-                        raise Exception("No image URL received from G4F")
-                except Exception as e:
-                    raise Exception(f"G4F image generation failed: {str(e)}")
+                from g4f.client import Client
+                client = Client()
+                response = client.images.generate(
+                    model=self.image_model,
+                    prompt=prompt,
+                    response_format="url"
+                )
+
+                if response and response.data and len(response.data) > 0:
+                    image_url = response.data[0].url
+                    image_response = requests.get(image_url)
+
+                    if image_response.status_code == 200:
+                        with open(image_path, "wb") as f:
+                            f.write(image_response.content)
+                        self.images.append(image_path)
+                        self.log(success(f"Image saved to: {image_path}"))
+                        return image_path
+                    else:
+                        raise Exception(f"Failed to download image from {image_url}")
+                else:
+                    raise Exception("No image URL received from G4F")

             elif self.image_gen == "segmind":
                 self.log("Using Segmind provider for image generation")
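The g4f hunk drops the inner try/except, so download failures now propagate to the method's outer handler, which writes a flat gray placeholder instead of aborting. A condensed sketch of that fallback pattern (illustrative names, not the repo's helpers):

```python
# Pattern sketch: fetch an image URL, fall back to a flat placeholder on failure.
import requests
from PIL import Image

def fetch_image(url: str, path: str) -> str:
    try:
        r = requests.get(url, timeout=30)
        if r.status_code != 200:
            raise Exception(f"Failed to download image from {url}")
        with open(path, "wb") as f:
            f.write(r.content)
    except Exception:
        # Mirrors the outer except in generate_image: gray 800x800 stand-in
        Image.new("RGB", (800, 800), color=(200, 200, 200)).save(path)
    return path
```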
@@ -569,10 +578,8 @@ class YouTube:
                 raise Exception(f"Pollinations request failed with status code: {response.status_code}")

             else:
-                # Default to generating a colored placeholder image
+                # Create a fallback colored placeholder image instead of throwing an error
                 self.log(f"Unknown provider '{self.image_gen}'. Generating placeholder image.")
-
-                # Create a placeholder colored image with the prompt text
                 img = Image.new('RGB', (800, 800), color=(random.randint(0, 255),
                                                           random.randint(0, 255),
                                                           random.randint(0, 255)))
@@ -585,7 +592,7 @@ class YouTube:
             error_msg = f"Image generation failed: {str(e)}"
             self.log(error(error_msg))

-            # Create a fallback image
+            # Create a fallback image instead of failing completely
             try:
                 img = Image.new('RGB', (800, 800), color=(200, 200, 200))
                 image_path = os.path.join(CACHE_DIR, f"error_img_{len(self.images)}_{int(time.time())}.png")
@@ -607,6 +614,7 @@ class YouTube:

         self.log(f"Using TTS Engine: {self.tts_engine}, Voice: {self.tts_voice}")

+        # Use simpler file naming for speed
         audio_path = os.path.join(CACHE_DIR, f"speech_{int(time.time())}.{output_format}")

         try:
@@ -624,16 +632,35 @@ class YouTube:

                 payload = {
                     "text": text,
-                    "model_id": "eleven_monolingual_v1",
+                    "model_id": "eleven_turbo_v2",  # Using latest and most capable model
                     "voice_settings": {
                         "stability": 0.5,
                         "similarity_boost": 0.5,
                         "style": 0.0,
                         "use_speaker_boost": True
-                    }
+                    },
+                    "output_format": "mp3_44100_128",  # Higher quality audio (44.1kHz, 128kbps)
+                    "optimize_streaming_latency": 0  # Optimize for quality over latency
                 }

-                voice_id = self.tts_voice if self.tts_voice not in ["Sarah", "default"] else "21m00Tcm4TlvDq8ikWAM"
+                # Map voice names to ElevenLabs voice IDs
+                voice_id_mapping = {
+                    "Sarah": "21m00Tcm4TlvDq8ikWAM",
+                    "Brian": "hxppwzoRmvxK7YkDrjhQ",
+                    "Lily": "p7TAj7L6QVq1fE6XGyjR",
+                    "Monika Sogam": "Fc3XhIu9tfgOPOsU1hMr",
+                    "George": "o7lPjDgzlF8ZAeSpqmaN",
+                    "River": "f0k5evLkhJxrIRJXQJvy",
+                    "Matilda": "XrExE9yKIg1WjnnlVkGX",
+                    "Will": "pvKWM1B1sNRNTlEYYAEZ",
+                    "Jessica": "A5EAMYWMCSsLNL1wYxOv",
+                    "default": "21m00Tcm4TlvDq8ikWAM"  # Default to Sarah
+                }
+
+                # Get the voice ID from mapping or use the voice name as ID if not found
+                voice_id = voice_id_mapping.get(self.tts_voice, self.tts_voice)
+
+                self.log(f"Using ElevenLabs voice: {self.tts_voice} (ID: {voice_id})")

                 response = requests.post(
                     url=f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}",
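For readers following the ElevenLabs change, a standalone sketch of the same request shape. The endpoint, payload fields, and the "Sarah" voice ID come from this hunk; the `xi-api-key` header name is an assumption based on ElevenLabs' documented REST API and is not shown in the diff:

```python
import requests

ELEVENLABS_API_KEY = "..."  # assumption: supplied via env var or the API Keys tab
voice_id = "21m00Tcm4TlvDq8ikWAM"  # "Sarah", the commit's default

response = requests.post(
    f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}",
    headers={"xi-api-key": ELEVENLABS_API_KEY},  # assumed header name
    json={
        "text": "Hello from the generator!",
        "model_id": "eleven_turbo_v2",
        "voice_settings": {"stability": 0.5, "similarity_boost": 0.5},
    },
    timeout=60,
)
response.raise_for_status()
with open("speech.mp3", "wb") as f:
    f.write(response.content)  # the API returns raw audio bytes on success
```

Note the improved error branch below catches `ValueError` around `response.json()`; that also covers `requests`' JSON decode error, which subclasses `ValueError`.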
@@ -646,8 +673,15 @@ class YouTube:
                         f.write(response.content)
                     self.log(success(f"Speech generated successfully using ElevenLabs at {audio_path}"))
                 else:
-                    raise Exception(f"ElevenLabs API error: {response.text}")
-
+                    try:
+                        error_data = response.json()
+                        error_message = error_data.get('detail', {}).get('message', response.text)
+                        error_status = error_data.get('status', 'error')
+                        raise Exception(f"ElevenLabs API error ({response.status_code}, {error_status}): {error_message}")
+                    except ValueError:
+                        # If JSON parsing fails, use the raw response
+                        raise Exception(f"ElevenLabs API error ({response.status_code}): {response.text}")
+
             elif self.tts_engine == "gtts":
                 self.log("Using Google TTS provider for speech generation")
                 from gtts import gTTS
@@ -685,11 +719,18 @@ class YouTube:
                 asyncio.run(generate())

             else:
-                # Fallback to gtts
-                self.log(f"Unknown TTS engine '{self.tts_engine}'. Falling back to gTTS.")
-                from gtts import gTTS
-                tts = gTTS(text=text, lang=self.language[:2].lower(), slow=False)
-                tts.save(audio_path)
+                # Default to edge TTS if other methods aren't available
+                self.log(f"Using default Edge TTS as fallback")
+                import edge_tts
+                import asyncio
+
+                voice = "en-US-AriaNeural"
+
+                async def generate():
+                    communicate = edge_tts.Communicate(text, voice)
+                    await communicate.save(audio_path)
+
+                asyncio.run(generate())

             self.log(success(f"Speech generated and saved to: {audio_path}"))
             self.tts_path = audio_path
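Since the unknown-engine branch now falls back to Edge TTS with a hard-coded `en-US-AriaNeural`, here is a small discovery sketch (assuming the `edge-tts` package, whose `list_voices()` coroutine returns voice metadata dicts) for finding other short names:

```python
# Sketch: list a few Edge TTS voice short names to swap into the fallback.
import asyncio
import edge_tts

async def show_voices() -> None:
    voices = await edge_tts.list_voices()
    print(sorted(v["ShortName"] for v in voices)[:5])

asyncio.run(show_voices())
```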
@@ -715,24 +756,18 @@ class YouTube:
             self.log(error("Failed to create silent audio fallback"))
             return None

-    def generate_subtitles(self, audio_path):
-        """Generate word-level subtitles for the video."""
-        if not self.subtitles_enabled:
-            self.log("Subtitles are disabled. Skipping subtitle generation.")
-            return None
-
-        self.progress(0.65, desc="Creating subtitles")
-        self.log("Starting subtitle generation process")
-
+    def generate_subtitles(self, audio_path: str) -> dict:
+        """Generate subtitles from audio using AssemblyAI."""
+        self.log("Generating subtitles from audio")
         try:
-            assemblyai_api_key = os.environ.get("ASSEMBLYAI_API_KEY", "")
+            import assemblyai as aai

-            if not assemblyai_api_key:
-                self.log(warning("AssemblyAI API key not set. Generating simulated subtitles."))
-                return self._generate_simulated_subtitles()
+            # Check if API key is set
+            aai_api_key = os.environ.get("ASSEMBLYAI_API_KEY", "")
+            if not aai_api_key:
+                raise ValueError("AssemblyAI API key is not set. Please provide a valid API key.")

-            import assemblyai as aai
-            aai.settings.api_key = assemblyai_api_key
+            aai.settings.api_key = aai_api_key

             config = aai.TranscriptionConfig(speaker_labels=False, word_boost=[], format_text=True)
             transcriber = aai.Transcriber(config=config)
@@ -741,51 +776,63 @@ class YouTube:
             transcript = transcriber.transcribe(audio_path)

             if not transcript or not transcript.words:
-                self.log(warning("Transcription returned no words. Using simulated subtitles."))
-                return self._generate_simulated_subtitles()
+                raise ValueError("Transcription returned no words.")

             # Process word-level information
             wordlevel_info = []
             for word in transcript.words:
                 word_data = {
                     "word": word.text.strip(),
-                    "start": word.start / 1000.0,
-                    "end": word.end / 1000.0
+                    "start": word.start / 1000.0,  # Convert from ms to seconds
+                    "end": word.end / 1000.0  # Convert from ms to seconds
                 }
                 wordlevel_info.append(word_data)

             self.log(success(f"Transcription successful. Got {len(wordlevel_info)} words."))

             # Define constants for subtitle generation
-            FONT = self.subtitle_font
+            # Handle random font selection if configured
+            if self.subtitle_font == "random":
+                FONT = choose_random_font()
+                self.log(f"Using random font: {FONT}")
+            else:
+                FONT = self.subtitle_font
+
             FONTSIZE = self.font_size
             COLOR = self.text_color
             BG_COLOR = self.highlight_color if self.highlighting_enabled else None
-            FRAME_SIZE = (1080, 1920)
-            MAX_CHARS = 30
-            MAX_DURATION = 3.0
-            MAX_GAP = 2.5
+            FRAME_SIZE = (1080, 1920)  # Vertical video format
+
+            # Constants for line splitting
+            MAX_CHARS = 30  # Maximum characters per line for vertical video format
+            MAX_DURATION = 3.0  # Maximum duration for a single line
+            MAX_GAP = 1.5  # Split if nothing is spoken for this many seconds

-            # Split text into lines based on character count, duration, and gap
+            # Split text into lines
             subtitles = []
             line = []
             line_duration = 0

             for idx, word_data in enumerate(wordlevel_info):
+                word = word_data["word"]
+                start = word_data["start"]
+                end = word_data["end"]
+
                 line.append(word_data)
-                line_duration += word_data["end"] - word_data["start"]
+                line_duration += end - start
+
                 temp = " ".join(item["word"] for item in line)
                 new_line_chars = len(temp)
+
                 duration_exceeded = line_duration > MAX_DURATION
                 chars_exceeded = new_line_chars > MAX_CHARS

                 if idx > 0:
-                    gap = word_data['start'] - wordlevel_info[idx - 1]['end']
+                    gap = word_data['start'] - wordlevel_info[idx-1]['end']
                     maxgap_exceeded = gap > MAX_GAP
                 else:
                     maxgap_exceeded = False

-                # Check if any condition is exceeded to finalize the current line
                 if duration_exceeded or chars_exceeded or maxgap_exceeded:
                     if line:
                         subtitle_line = {
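A toy walk-through of the splitting rule in this hunk, with hypothetical timings. Because the loop appends before checking, the word that trips a threshold is flushed with the current line rather than starting the next one:

```python
# Append-then-check line splitting, mirroring the hunk's thresholds.
MAX_CHARS, MAX_DURATION, MAX_GAP = 30, 3.0, 1.5

words = [{"word": w, "start": 0.4 * i, "end": 0.4 * i + 0.3}
         for i, w in enumerate("This splitter keeps captions short and readable".split())]

lines, line, dur = [], [], 0.0
for i, w in enumerate(words):
    line.append(w)                        # word joins the line first...
    dur += w["end"] - w["start"]
    text = " ".join(x["word"] for x in line)
    gap = w["start"] - words[i - 1]["end"] if i else 0.0
    if dur > MAX_DURATION or len(text) > MAX_CHARS or gap > MAX_GAP:
        lines.append(text)                # ...so the triggering word flushes with it
        line, dur = [], 0.0
if line:
    lines.append(" ".join(x["word"] for x in line))

print(lines)  # ['This splitter keeps captions short', 'and readable']
```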
@@ -798,7 +845,7 @@ class YouTube:
                         line = []
                         line_duration = 0

-            # Add the remaining words as the last subtitle line if any
+            # Add remaining words as last line
            if line:
                subtitle_line = {
                    "text": " ".join(item["word"] for item in line),
@@ -809,6 +856,8 @@ class YouTube:
                 subtitles.append(subtitle_line)

             self.log(success(f"Generated {len(subtitles)} subtitle lines"))
+
+            # Return the subtitle data and settings
             return {
                 "wordlevel": wordlevel_info,
                 "linelevel": subtitles,
@@ -823,101 +872,152 @@ class YouTube:
             }

         except Exception as e:
-            error_msg = f"Subtitle generation failed: {str(e)}"
+            error_msg = f"Error generating subtitles: {str(e)}"
             self.log(error(error_msg))
-            return self._generate_simulated_subtitles()
-
-    def _generate_simulated_subtitles(self):
-        """Generate simulated subtitles when AssemblyAI is not available."""
-        self.log("Generating simulated subtitles")
-
-        # Split script into words
-        words = self.script.split()
-
-        # Estimate audio duration based on word count (average speaking rate)
-        estimated_duration = len(words) * 0.3  # 0.3 seconds per word on average
-
-        # Generate word-level timings
-        wordlevel_info = []
-        current_time = 0
-
-        for word in words:
-            # Adjust duration based on word length
-            word_duration = 0.2 + min(0.05 * len(word), 0.3)  # Between 0.2 and 0.5 seconds
-
-            word_data = {
-                "word": word,
-                "start": current_time,
-                "end": current_time + word_duration
-            }
-            wordlevel_info.append(word_data)
-
-            # Add a small gap between words
-            current_time += word_duration + 0.05
-
-        # Generate line-level subtitles
-        subtitles = []
-        line = []
-        line_start = 0
-        line_text = ""
-
-        for word_data in wordlevel_info:
-            # Check if adding this word would exceed character limit
-            if len(line_text + " " + word_data["word"]) > 30 and line:
-                # Finalize current line
-                subtitle_line = {
-                    "text": line_text,
-                    "start": line_start,
-                    "end": line[-1]["end"],
-                    "words": line.copy()
-                }
-                subtitles.append(subtitle_line)
-
-                # Start new line
-                line = [word_data]
-                line_start = word_data["start"]
-                line_text = word_data["word"]
-            else:
-                # Add word to current line
-                line.append(word_data)
-                line_text = (line_text + " " + word_data["word"]).strip()
-                if len(line) == 1:
-                    line_start = word_data["start"]
-
-        # Add final line if not empty
-        if line:
-            subtitle_line = {
-                "text": line_text,
-                "start": line_start,
-                "end": line[-1]["end"],
-                "words": line
-            }
-            subtitles.append(subtitle_line)
-
-        self.log(success(f"Generated {len(wordlevel_info)} simulated word timings and {len(subtitles)} subtitle lines"))
-
-        # Define settings for subtitle display
-        settings = {
-            "font": self.subtitle_font,
-            "fontsize": self.font_size,
-            "color": self.text_color,
-            "bg_color": self.highlight_color if self.highlighting_enabled else None,
-            "position": self.subtitle_position,
-            "highlighting_enabled": self.highlighting_enabled
-        }
-
-        return {
-            "wordlevel": wordlevel_info,
-            "linelevel": subtitles,
-            "settings": settings
-        }
+            raise Exception(error_msg)
+
+    def create_subtitle_clip(self, subtitle_data, frame_size):
+        """Create subtitle clips for a line of text with word-level highlighting."""
+        settings = subtitle_data["settings"]
+        font_name = settings["font"]
+        fontsize = settings["fontsize"]
+        color = settings["color"]
+        bg_color = settings["bg_color"]
+        highlighting_enabled = settings["highlighting_enabled"]
+
+        def create_text_clip(text, font_size, color, bg_color=None):
+            try:
+                # Try to use the specified font, fallback to default
+                try:
+                    # Check if font is a path or just a name
+                    font_path = os.path.join(FONTS_DIR, f"{font_name}.ttf")
+                    if os.path.exists(font_path):
+                        pil_font = ImageFont.truetype(font_path, font_size)
+                    else:
+                        self.log(warning(f"Font {font_name} not found, using default"))
+                        pil_font = ImageFont.load_default()
+                except Exception as e:
+                    self.log(warning(f"Error loading font: {str(e)}"))
+                    pil_font = ImageFont.load_default()
+
+                # Get text size
+                text_width, text_height = pil_font.getbbox(text)[2:4]
+
+                # Add padding
+                padding = 10
+                img_width = text_width + padding * 2
+                img_height = text_height + padding * 2
+
+                # Create image with background color or transparent
+                if bg_color:
+                    if bg_color.startswith('#'):
+                        bg_color_rgb = tuple(int(bg_color.lstrip('#')[i:i+2], 16) for i in (0, 2, 4))
+                    else:
+                        bg_color_rgb = (0, 0, 255)  # Default blue
+                    img = Image.new('RGB', (img_width, img_height), color=bg_color_rgb)
+                else:
+                    img = Image.new('RGBA', (img_width, img_height), color=(0, 0, 0, 0))
+
+                # Draw text
+                draw = ImageDraw.Draw(img)
+                if color.startswith('#'):
+                    text_color_rgb = tuple(int(color.lstrip('#')[i:i+2], 16) for i in (0, 2, 4))
+                else:
+                    text_color_rgb = (255, 255, 255)  # Default white
+
+                draw.text((padding, padding), text, font=pil_font, fill=text_color_rgb)
+
+                # Convert to numpy array for MoviePy
+                img_array = np.array(img)
+                clip = ImageClip(img_array)
+                return clip, img_width, img_height
+
+            except Exception as e:
+                self.log(warning(f"Error creating text clip: {str(e)}"))
+                # Create a simple colored rectangle as fallback
+                img = Image.new('RGB', (100, 50), color=(100, 100, 100))
+                img_array = np.array(img)
+                clip = ImageClip(img_array)
+                return clip, 100, 50
+
+        subtitle_clips = []
+
+        for line in subtitle_data["linelevel"]:
+            x_pos = 0
+            y_pos = 0
+            word_positions = []
+
+            # Calculate vertical position based on subtitle position setting
+            if settings["position"] == "top":
+                y_buffer = frame_size[1] * 0.1  # 10% from top
+            elif settings["position"] == "middle":
+                y_buffer = frame_size[1] * 0.4  # 40% from top
+            else:  # bottom
+                y_buffer = frame_size[1] * 0.7  # 70% from top
+
+            x_buffer = frame_size[0] * 0.1  # 10% from left
+            space_width = 20
+
+            # Create clips for each word in the line
+            for word_data in line["words"]:
+                word = word_data["word"]
+                start_time = word_data["start"]
+                end_time = word_data["end"]
+                duration = end_time - start_time
+
+                # Create word clip
+                word_clip, word_width, word_height = create_text_clip(word, fontsize, color)
+
+                # Check if word fits on current line
+                if x_pos + word_width + space_width > frame_size[0] - 2 * x_buffer:
+                    x_pos = 0
+                    y_pos += word_height + 20
+
+                # Store word position info
+                word_positions.append({
+                    "word": word,
+                    "x_pos": x_pos + x_buffer,
+                    "y_pos": y_pos + y_buffer,
+                    "width": word_width,
+                    "height": word_height,
+                    "start": start_time,
+                    "end": end_time
+                })
+
+                # Set position and timing for word clip
+                word_clip = word_clip.set_position((x_pos + x_buffer, y_pos + y_buffer))
+                word_clip = word_clip.set_start(line["start"]).set_duration(line["end"] - line["start"])
+                subtitle_clips.append(word_clip)
+
+                # Add space after word
+                space_clip, _, _ = create_text_clip(" ", fontsize, color)
+                space_clip = space_clip.set_position((x_pos + word_width + x_buffer, y_pos + y_buffer))
+                space_clip = space_clip.set_start(line["start"]).set_duration(line["end"] - line["start"])
+                subtitle_clips.append(space_clip)
+
+                x_pos += word_width + space_width
+
+            # Add highlighted words if enabled
+            if highlighting_enabled and bg_color:
+                for word_pos in word_positions:
+                    highlight_clip, _, _ = create_text_clip(
+                        word_pos["word"],
+                        fontsize,
+                        color,
+                        bg_color
+                    )
+                    highlight_clip = highlight_clip.set_position((word_pos["x_pos"], word_pos["y_pos"]))
+                    highlight_clip = highlight_clip.set_start(word_pos["start"]).set_duration(word_pos["end"] - word_pos["start"])
+                    subtitle_clips.append(highlight_clip)
+
+        return subtitle_clips

     def combine(self) -> str:
         """Combine images, audio, and subtitles into a final video."""
         self.progress(0.8, desc="Creating final video")
         self.log("Combining images and audio into final video")
-
         try:
+            # Use simple file naming for faster processing
             output_path = os.path.join(CACHE_DIR, f"output_{int(time.time())}.mp4")

             # Check for required files
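The new `create_text_clip` helper replaces MoviePy's ImageMagick-backed `TextClip` with a PIL render converted to a numpy frame. A condensed sketch of that technique (assumes Pillow and moviepy 1.x; `#RRGGBB` colors only, since 3-digit forms like `#fff` would need expanding first):

```python
# PIL -> numpy -> ImageClip, the same pipeline as the hunk's create_text_clip.
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from moviepy.editor import ImageClip

def hex_to_rgb(value: str) -> tuple:
    # "#RRGGBB" -> (r, g, b), matching the hunk's generator expression
    return tuple(int(value.lstrip('#')[i:i + 2], 16) for i in (0, 2, 4))

font = ImageFont.load_default()
text = "WORD"
w, h = font.getbbox(text)[2:4]  # right/bottom of the text bounding box
img = Image.new("RGBA", (w + 20, h + 20), (0, 0, 0, 0))  # transparent canvas
ImageDraw.Draw(img).text((10, 10), text, font=font, fill=hex_to_rgb("#FFFFFF"))
clip = ImageClip(np.array(img)).set_duration(1.0)  # ready to composite
```

This sidesteps the ImageMagick dependency entirely, at the cost of doing layout (padding, wrapping, word positions) by hand as the method above does.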
@@ -989,6 +1089,11 @@ class YouTube:
             final_clip = concatenate_videoclips(clips)
             final_clip = final_clip.set_fps(30)

+            # Add subtitles if enabled
+            if self.subtitles_enabled and hasattr(self, 'subtitle_data'):
+                subtitle_clips = self.create_subtitle_clip(self.subtitle_data, (1080, 1920))
+                final_clip = CompositeVideoClip([final_clip] + subtitle_clips)
+
             # Add background music if available
             music_path = None
             if self.music_file == "random":
@@ -1002,120 +1107,35 @@ class YouTube:
                     music_clip = AudioFileClip(music_path)
                     # Loop music if it's shorter than the video
                     if music_clip.duration < max_duration:
-                        repeats = int(max_duration / music_clip.duration) + 1
-                        music_clip = concatenate_audioclips([music_clip] * repeats)
-                    # Trim if it's longer
+                        num_loops = int(np.ceil(max_duration / music_clip.duration))
+                        music_clip = concatenate_audioclips([music_clip] * num_loops)
+                    # Trim music if it's longer than the video
                     music_clip = music_clip.subclip(0, max_duration)
-                    # Reduce volume
-                    music_clip = music_clip.fx(volumex, 0.1)
-
-                    # Combine audio tracks
-                    comp_audio = CompositeAudioClip([tts_clip, music_clip])
-                    final_clip = final_clip.set_audio(comp_audio)
+                    # Reduce music volume
+                    music_clip = music_clip.volumex(0.1)
+                    # Combine with TTS audio
+                    final_audio = CompositeAudioClip([tts_clip, music_clip])
                 except Exception as e:
-                    self.log(warning(f"Error adding background music: {str(e)}"))
-                    final_clip = final_clip.set_audio(tts_clip)
+                    self.log(warning(f"Error processing music: {str(e)}"))
+                    final_audio = tts_clip
             else:
-                self.log("No background music found, using TTS audio only")
-                final_clip = final_clip.set_audio(tts_clip)
-
-            # Set final duration
-            final_clip = final_clip.set_duration(tts_clip.duration)
-
-            # Generate subtitles if enabled
-            subtitle_clips = []
-            if self.subtitles_enabled:
-                subtitles = self.generate_subtitles(self.tts_path)
-
-                if subtitles and 'wordlevel' in subtitles:
-                    self.log("Adding word-level subtitles")
-
-                    from moviepy.video.tools.subtitles import TextClip
-
-                    # Define subtitle styles
-                    font = subtitles['settings']['font'] if subtitles['settings']['font'] != "default" and os.path.exists(os.path.join(FONTS_DIR, f"{subtitles['settings']['font']}.ttf")) else None
-                    fontsize = subtitles['settings']['fontsize']
-                    color = subtitles['settings']['color']
-                    bg_color = subtitles['settings']['bg_color'] if subtitles['settings']['highlighting_enabled'] else None
-
-                    # Calculate position based on subtitle_position setting
-                    frame_width, frame_height = 1080, 1920
-                    if self.subtitle_position == "top":
-                        y_pos = frame_height * 0.1  # Position at 10% from top
-                    elif self.subtitle_position == "middle":
-                        y_pos = frame_height * 0.5  # Position at middle
-                    else:  # bottom (default)
-                        y_pos = frame_height * 0.85  # Position at 85% from top
-
-                    for subtitle in subtitles['linelevel']:
-                        full_duration = subtitle['end'] - subtitle['start']
-
-                        # Initialize position for each subtitle line
-                        x_pos = 0
-                        x_buffer = frame_width * 1 / 10
-
-                        # Handle word-level subtitles if highlighting is enabled
-                        if self.highlighting_enabled:
-                            # Add each word with proper timing and highlighting
-                            for word_data in subtitle['words']:
-                                word = word_data['word']
-                                start = word_data['start']
-                                end = word_data['end']
-
-                                # Create text clip for word
-                                try:
-                                    word_clip = TextClip(
-                                        txt=word,
-                                        font=font,
-                                        fontsize=fontsize,
-                                        color=color,
-                                        bg_color=bg_color,
-                                        stroke_color='black',
-                                        stroke_width=1
-                                    ).set_position((x_pos + x_buffer, y_pos)).set_start(start).set_duration(end - start)
-
-                                    subtitle_clips.append(word_clip)
-                                    x_pos += word_clip.w + 10  # Add spacing between words
-
-                                    # Wrap to next line if needed
-                                    if x_pos + word_clip.w > frame_width - 2 * x_buffer:
-                                        x_pos = 0
-                                        y_pos += word_clip.h + 10
-                                except Exception as e:
-                                    self.log(warning(f"Error creating subtitle for word '{word}': {str(e)}"))
-                        else:
-                            # Show entire line without word-level highlighting
-                            try:
-                                line_clip = TextClip(
-                                    txt=subtitle['text'],
-                                    font=font,
-                                    fontsize=fontsize,
-                                    color=color,
-                                    bg_color=None,
-                                    stroke_color='black',
-                                    stroke_width=1,
-                                    method='caption',
-                                    size=(frame_width - 2 * x_buffer, None),
-                                    align='center'
-                                ).set_position(('center', y_pos)).set_start(subtitle['start']).set_duration(full_duration)
-
-                                subtitle_clips.append(line_clip)
-                            except Exception as e:
-                                self.log(warning(f"Error creating subtitle line: {str(e)}"))
-
-            # Add subtitles to video if any were created
-            if subtitle_clips:
-                self.log(f"Adding {len(subtitle_clips)} subtitle clips to video")
-                final_clip = CompositeVideoClip([final_clip] + subtitle_clips)
-
-            # Write final video
+                final_audio = tts_clip
+
+            # Set final audio
+            final_clip = final_clip.set_audio(final_audio)
+
+            # Write final video - use faster encoding settings
             self.log("Writing final video file")
-            final_clip.write_videofile(output_path, threads=4, codec='libx264', audio_codec='aac')
-
-            success_msg = f"Video successfully created at: {output_path}"
-            self.log(success(success_msg))
-            self.video_path = output_path
+            final_clip.write_videofile(
+                output_path,
+                fps=30,
+                codec="libx264",
+                audio_codec="aac",
+                threads=4,
+                # Remove preset parameter for faster encoding
+            )

+            self.log(success(f"Video saved to: {output_path}"))
             return output_path

         except Exception as e:
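Worked numbers for the new looping math, with hypothetical durations: a 45 s video over a 20 s track needs ceil(45/20) = 3 copies (60 s), which `subclip(0, 45)` then trims back to the video length. Unlike the old `int(...) + 1`, `ceil` avoids concatenating an extra copy when the durations divide evenly.

```python
import numpy as np

max_duration, music_duration = 45.0, 20.0
num_loops = int(np.ceil(max_duration / music_duration))
print(num_loops, num_loops * music_duration)  # 3 60.0 -> trimmed to 45.0

# Old behaviour on an even split: int(40/20) + 1 = 3 copies instead of 2.
print(int(40.0 / 20.0) + 1, int(np.ceil(40.0 / 20.0)))  # 3 2
```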
@@ -1135,7 +1155,6 @@ class YouTube:
                 video_clip.write_videofile(fallback_path, threads=2, codec='libx264', audio_codec='aac')

                 self.log(warning(f"Created fallback video at: {fallback_path}"))
-                self.video_path = fallback_path
                 return fallback_path
             else:
                 raise Exception("Cannot create fallback video: missing images or audio")
@@ -1148,6 +1167,11 @@ class YouTube:
         try:
             self.log("Starting video generation process")

+            # Create a simple generation directory - avoid complex numbering schemes
+            self.generation_folder = os.path.join(CACHE_DIR, f"gen_{int(time.time())}")
+            os.makedirs(self.generation_folder, exist_ok=True)
+            self.log(f"Created generation folder: {self.generation_folder}")
+
             # Step 1: Generate topic
             self.log("Generating topic")
             self.generate_topic()
@@ -1181,17 +1205,23 @@ class YouTube:
             self.log("Generating speech")
             self.generate_speech(self.script)

-            # Step 7: Combine all elements into final video
+            # Step 7: Generate subtitles
+            self.progress(0.7, desc="Generating subtitles")
+            if self.subtitles_enabled and hasattr(self, 'tts_path') and os.path.exists(self.tts_path):
+                self.subtitle_data = self.generate_subtitles(self.tts_path)
+
+            # Step 8: Combine all elements into final video
             self.progress(0.8, desc="Creating final video")
             self.log("Combining all elements into final video")
             path = self.combine()

             self.progress(0.95, desc="Finalizing")
-            self.log(f"Video generation complete. File saved at: {path}")
+            self.log(f"Video generation complete. Files saved in: {self.generation_folder}")

             # Return the result
             return {
                 'video_path': path,
+                'generation_folder': self.generation_folder,
                 'title': self.metadata['title'],
                 'description': self.metadata['description'],
                 'subject': self.subject,
@@ -1202,7 +1232,13 @@ class YouTube:
         except Exception as e:
             error_msg = f"Error during video generation: {str(e)}"
             self.log(error(error_msg))
-            raise Exception(error_msg)
+
+            # Return basic data even on error
+            return {
+                'video_path': getattr(self, 'video_path', None),
+                'error': str(e),
+                'logs': self.logs
+            }

 # Data for dynamic dropdowns
 def get_text_generator_models(generator):
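This hunk changes the method's error contract: instead of re-raising, it now returns a dict either way. A sketch of what that means for callers (the entry-point method's name is not shown in this hunk, so `generate_video` below is a hypothetical stand-in):

```python
# Illustrative caller under the new contract: branch on 'error' rather than
# wrapping the call in try/except. 'yt' is a configured YouTube instance.
result = yt.generate_video()  # hypothetical entry-point name
if result.get("error"):
    print("generation failed:", result["error"])
else:
    print("video at:", result["video_path"])
```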
@@ -1269,15 +1305,15 @@ def get_tts_voices(engine):
     """Get available voices for the selected TTS engine."""
     voices = {
         "elevenlabs": [
-            "Sarah",
-            "Brian",
-            "Lily",
-            "Monika Sogam",
-            "George",
-            "River",
-            "Matilda",
-            "Will",
-            "Jessica"
+            "Sarah",         # Female, American accent
+            "Brian",         # Male, British accent
+            "Lily",          # Female, British accent
+            "Monika Sogam",  # Female, Indian accent
+            "George",        # Male, American accent
+            "River",         # Female, American accent
+            "Matilda",       # Female, British accent
+            "Will",          # Male, American accent
+            "Jessica"        # Female, American accent
         ],
         "openai": [
             "alloy",
@@ -1310,7 +1346,7 @@ def get_tts_voices(engine):

 # Create the Gradio interface
 def create_interface():
-    with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo"), title="YouTube Shorts Generator") as demo:
+    with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo", radius_size="lg"), title="YouTube Shorts Generator") as demo:
         with gr.Row():
             gr.Markdown(
                 """
@@ -1321,7 +1357,7 @@ def create_interface():

         with gr.Row(equal_height=True):
             # Left panel: Content Settings
-            with gr.Column(scale=1, min_width=400):
+            with gr.Column(scale=2, min_width=500):
                 with gr.Group():
                     gr.Markdown("### 📝 Content")
                     niche = gr.Textbox(
@@ -1336,7 +1372,7 @@ def create_interface():
                         value="English"
                     )

-                # Middle panel: Generator Settings
+                # Generator Settings
                 with gr.Group():
                     gr.Markdown("### 🔧 Generator Settings")
                     with gr.Tabs():
@@ -1375,10 +1411,13 @@ def create_interface():
                                 label="Voice",
                                 value="en-US-AriaNeural"
                             )
+                            # Fix for music_file - Get available music and set proper default
+                            music_choices = get_music_files()
+                            default_music = "none" if "random" not in music_choices else "random"
                             music_file = gr.Dropdown(
-                                choices=get_music_files(),
+                                choices=music_choices,
                                 label="Background Music",
-                                value="random"
+                                value=default_music
                             )

                         with gr.TabItem("Subtitles"):
@@ -1387,7 +1426,7 @@ def create_interface():
                             subtitle_font = gr.Dropdown(
                                 choices=get_font_files(),
                                 label="Font",
-                                value="default"
+                                value="random"
                             )
                             with gr.Row():
                                 font_size = gr.Slider(
@@ -1406,50 +1445,51 @@ def create_interface():
                                 text_color = gr.ColorPicker(label="Text Color", value="#FFFFFF")
                                 highlight_color = gr.ColorPicker(label="Highlight Color", value="#0000FF")

-                # API Keys section
-                with gr.Accordion("🔑 API Keys", open=False):
-                    gemini_api_key = gr.Textbox(
-                        label="Gemini API Key",
-                        type="password",
-                        value=os.environ.get("GEMINI_API_KEY", "")
-                    )
-                    assemblyai_api_key = gr.Textbox(
-                        label="AssemblyAI API Key",
-                        type="password",
-                        value=os.environ.get("ASSEMBLYAI_API_KEY", "")
-                    )
-                    elevenlabs_api_key = gr.Textbox(
-                        label="ElevenLabs API Key",
-                        type="password",
-                        value=os.environ.get("ELEVENLABS_API_KEY", "")
-                    )
-                    segmind_api_key = gr.Textbox(
-                        label="Segmind API Key",
-                        type="password",
-                        value=os.environ.get("SEGMIND_API_KEY", "")
-                    )
-                    openai_api_key = gr.Textbox(
-                        label="OpenAI API Key",
-                        type="password",
-                        value=os.environ.get("OPENAI_API_KEY", "")
-                    )
-
                 # Generate button
                 generate_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")

             # Right panel: Output display
-            with gr.Column(scale=1, min_width=400):
+            with gr.Column(scale=1, min_width=300):
                 with gr.Tabs():
                     with gr.TabItem("Video"):
-                        video_output = gr.Video(label="Generated Video", height=600)
+                        # Larger video preview with proper mobile proportions
+                        video_output = gr.Video(label="Generated Video", height=580, width=330)

                     with gr.TabItem("Metadata"):
                         title_output = gr.Textbox(label="Title", lines=2)
                         description_output = gr.Textbox(label="Description", lines=4)
                         script_output = gr.Textbox(label="Script", lines=8)
+
+                    # API Keys section as a tab
+                    with gr.TabItem("🔑 API Keys"):
+                        gemini_api_key = gr.Textbox(
+                            label="Gemini API Key",
+                            type="password",
+                            value=os.environ.get("GEMINI_API_KEY", "")
+                        )
+                        assemblyai_api_key = gr.Textbox(
+                            label="AssemblyAI API Key",
+                            type="password",
+                            value=os.environ.get("ASSEMBLYAI_API_KEY", "")
+                        )
+                        elevenlabs_api_key = gr.Textbox(
+                            label="ElevenLabs API Key",
+                            type="password",
+                            value=os.environ.get("ELEVENLABS_API_KEY", "")
+                        )
+                        segmind_api_key = gr.Textbox(
+                            label="Segmind API Key",
+                            type="password",
+                            value=os.environ.get("SEGMIND_API_KEY", "")
+                        )
+                        openai_api_key = gr.Textbox(
+                            label="OpenAI API Key",
+                            type="password",
+                            value=os.environ.get("OPENAI_API_KEY", "")
+                        )

                     with gr.TabItem("Log"):
-                        log_output = gr.Textbox(label="Process Log", lines=20, max_lines=100)
+                        log_output = gr.Textbox(label="Process Log", lines=15, max_lines=100)

         # Dynamic dropdown updates
         def update_text_models(generator):
@@ -1467,12 +1507,13 @@ def create_interface():
         tts_engine.change(fn=update_tts_voices, inputs=tts_engine, outputs=tts_voice)

         # Main generation function
-        def generate_youtube_short(niche, language, gemini_api_key, assemblyai_api_key,
-                                   elevenlabs_api_key, segmind_api_key, openai_api_key,
-                                   text_gen, text_model, image_gen, image_model,
+        def generate_youtube_short(niche, language, text_gen, text_model, image_gen, image_model,
                                    tts_engine, tts_voice, subtitles_enabled, highlighting_enabled,
                                    subtitle_font, font_size, subtitle_position,
-                                   text_color, highlight_color, music_file, progress=gr.Progress()):
+                                   text_color, highlight_color, music_file,
+                                   gemini_api_key, assemblyai_api_key,
+                                   elevenlabs_api_key, segmind_api_key, openai_api_key,
+                                   progress=gr.Progress()):

             if not niche.strip():
                 return {
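The reordered signature only works because Gradio passes input values positionally, so the function's parameters must match `inputs=[...]` element for element; the click wiring in the next hunk mirrors this order. A minimal sketch with hypothetical components:

```python
# Positional wiring: the inputs list order IS the handler's argument order.
import gradio as gr

def handler(name, excited):
    return f"Hello {name}{'!' if excited else '.'}"

with gr.Blocks() as demo:
    name = gr.Textbox(label="Name")
    excited = gr.Checkbox(label="Excited")
    out = gr.Textbox(label="Greeting")
    btn = gr.Button("Go")
    btn.click(fn=handler, inputs=[name, excited], outputs=out)
```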
@@ -1551,20 +1592,23 @@ def create_interface():
         generate_btn.click(
             fn=generate_youtube_short,
             inputs=[
-                niche, language, gemini_api_key, assemblyai_api_key, elevenlabs_api_key,
-                segmind_api_key, openai_api_key, text_gen, text_model, image_gen, image_model,
+                niche, language, text_gen, text_model, image_gen, image_model,
                 tts_engine, tts_voice, subtitles_enabled, highlighting_enabled,
-                subtitle_font, font_size, subtitle_position, text_color, highlight_color, music_file
+                subtitle_font, font_size, subtitle_position, text_color, highlight_color, music_file,
+                gemini_api_key, assemblyai_api_key, elevenlabs_api_key, segmind_api_key, openai_api_key
             ],
             outputs=[video_output, title_output, description_output, script_output, log_output]
         )

         # Add examples
+        music_choices = get_music_files()
+        default_music = "none" if "random" not in music_choices else "random"
+
         gr.Examples(
             [
-                ["Historical Facts", "English", "g4f", "gpt-4", "g4f", "flux", "edge", "en-US-AriaNeural", True, True, "default", 80, "bottom", "#FFFFFF", "#0000FF", "random"],
-                ["Cooking Tips", "English", "g4f", "gpt-4", "g4f", "flux", "edge", "en-US-AriaNeural", True, True, "default", 80, "bottom", "#FFFFFF", "#FF0000", "random"],
-                ["Technology News", "English", "g4f", "gpt-4", "g4f", "flux", "edge", "en-US-GuyNeural", True, True, "default", 80, "bottom", "#FFFFFF", "#00FF00", "random"],
+                ["Historical Facts", "English", "g4f", "gpt-4", "g4f", "flux", "edge", "en-US-AriaNeural", True, True, "default", 80, "bottom", "#FFFFFF", "#0000FF", default_music],
+                ["Cooking Tips", "English", "g4f", "gpt-4", "g4f", "flux", "edge", "en-US-AriaNeural", True, True, "default", 80, "bottom", "#FFFFFF", "#FF0000", default_music],
+                ["Technology News", "English", "g4f", "gpt-4", "g4f", "flux", "edge", "en-US-GuyNeural", True, True, "default", 80, "bottom", "#FFFFFF", "#00FF00", default_music],
             ],
             [niche, language, text_gen, text_model, image_gen, image_model, tts_engine, tts_voice,
              subtitles_enabled, highlighting_enabled, subtitle_font, font_size,
@@ -1577,9 +1621,10 @@ def create_interface():
 # Create and launch the interface
 if __name__ == "__main__":
     # Create necessary directories
-    os.makedirs(CACHE_DIR, exist_ok=True)
+    os.makedirs(STATIC_DIR, exist_ok=True)
     os.makedirs(MUSIC_DIR, exist_ok=True)
     os.makedirs(FONTS_DIR, exist_ok=True)
+    os.makedirs(CACHE_DIR, exist_ok=True)

     # Launch the app
     demo = create_interface()