Prof-Hunt committed
Commit 02fe023 · verified · 1 Parent(s): 6722825

Update app.py

Files changed (1)
  1. app.py +46 -53
app.py CHANGED
@@ -56,25 +56,20 @@ model_lm = AutoModelForCausalLM.from_pretrained(checkpoint).to("cuda")
  # Initialize Kokoro TTS pipeline
  pipeline = KPipeline(lang_code='a') # 'a' for American English

- def load_sd_model():
- """Load Stable Diffusion model only when needed"""
- pipe = StableDiffusionPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5",
- torch_dtype=torch.float16,
- )
- pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
- pipe.to("cuda")
- pipe.enable_attention_slicing()
- return pipe
+ # Load Stable Diffusion model at startup
+ pipe_sd = StableDiffusionPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5",
+ torch_dtype=torch.float16,
+ ).to("cuda")
+ pipe_sd.scheduler = DPMSolverMultistepScheduler.from_config(pipe_sd.scheduler.config)
+ pipe_sd.enable_attention_slicing()

  @torch.inference_mode()
  @spaces.GPU(duration=30)
  def generate_image():
  """Generate a random landscape image."""
  clear_memory()
-
- pipe = load_sd_model()
-
+
  default_prompt = "a beautiful, professional landscape photograph"
  default_negative_prompt = "blurry, bad quality, distorted, deformed"
  default_steps = 30
@@ -84,7 +79,7 @@ def generate_image():
  generator = torch.Generator("cuda").manual_seed(default_seed)

  try:
- image = pipe(
+ image = pipe_sd(
  prompt=default_prompt,
  negative_prompt=default_negative_prompt,
  num_inference_steps=default_steps,
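
The two hunks above drop the per-call load_sd_model() helper in favor of a single pipe_sd created at import time and shared by every generation function. A minimal sketch of that pattern, reusing the same diffusers calls shown in the diff (the generate_landscape() wrapper below is hypothetical, added only for illustration):

    import torch
    from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler

    # Load once at import time; every generation function reuses this object.
    pipe_sd = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16,
    ).to("cuda")
    pipe_sd.scheduler = DPMSolverMultistepScheduler.from_config(pipe_sd.scheduler.config)
    pipe_sd.enable_attention_slicing()  # trades a little speed for lower peak VRAM

    def generate_landscape(seed=42):
        # Hypothetical wrapper mirroring generate_image() in the diff.
        generator = torch.Generator("cuda").manual_seed(seed)
        return pipe_sd(
            prompt="a beautiful, professional landscape photograph",
            negative_prompt="blurry, bad quality, distorted, deformed",
            num_inference_steps=30,
            generator=generator,
        ).images[0]
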
@@ -148,7 +143,7 @@ def analyze_image(image):

  # Split into sentences and take only the first three
  sentences = re.split(r'(?<=[.!?])\s+', description)
- description = ' '.join(sentences[:3])
+ description = ' '.join(sentences[:9])

  clear_memory()
  return description
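
The change above only widens the slice from the first three sentences to the first nine (the unchanged comment still says "first three"); the splitting regex is untouched. A quick illustration of what that split produces (the sample text is made up; variable names follow the diff):

    import re

    description = "Champ ran to the park. He barked! Could he make a new friend? Yes. The end."
    sentences = re.split(r'(?<=[.!?])\s+', description)
    # ['Champ ran to the park.', 'He barked!', 'Could he make a new friend?', 'Yes.', 'The end.']
    description = ' '.join(sentences[:9])  # shorter texts pass through unchanged
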
@@ -168,7 +163,7 @@ def generate_story(image_description):
  Requirements:
  1. Main character: An English bulldog named Champ
  2. Include these values: confidence, teamwork, caring, and hope
- 3. Theme: "We are stronger together than as individuals"
+ 3. Theme: "Doing the right thing is important"
  4. Keep it simple and engaging for young children
  5. End with a simple moral lesson"""

@@ -207,7 +202,7 @@ def generate_image_prompts(story_text):
  all_prompts = []
  prompt_instruction = '''Here is a story paragraph: {paragraph}

- Start your response with "Watercolor bulldog" and describe what Champ is doing in this scene. Add where it takes place and one mood detail. Keep it short.'''
+ Start your response with "Watercolor bulldog" and describe what Champ is doing in this scene. Include any friends. Add where it takes place and one mood detail. Keep it short.'''

  try:
  for i, paragraph in enumerate(paragraphs, 1):
@@ -243,10 +238,8 @@ def generate_image_prompts(story_text):
  def generate_story_image(prompt, seed=-1):
  clear_memory()

- pipe = load_sd_model()
-
  try:
- pipe.load_lora_weights("Prof-Hunt/lora-bulldog")
+ pipe_sd.load_lora_weights("Prof-Hunt/lora-bulldog")

  generator = torch.Generator("cuda")
  if seed != -1:
@@ -256,7 +249,7 @@ def generate_story_image(prompt, seed=-1):

  enhanced_prompt = f"{prompt}, watercolor style, children's book illustration, soft colors"

- image = pipe(
+ image = pipe_sd(
  prompt=enhanced_prompt,
  negative_prompt="deformed, ugly, blurry, bad art, poor quality, distorted",
  num_inference_steps=50,
@@ -264,16 +257,13 @@ def generate_story_image(prompt, seed=-1):
  generator=generator
  ).images[0]

- pipe.unload_lora_weights()
- del pipe
+ pipe_sd.unload_lora_weights()
  clear_memory()
  return image

  except Exception as e:
  print(f"Error generating image: {e}")
- if 'pipe' in locals():
- pipe.unload_lora_weights()
- del pipe
+ pipe_sd.unload_lora_weights()
  clear_memory()
  return None

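
The generate_story_image hunks switch to the shared pipe_sd and bracket generation with load_lora_weights / unload_lora_weights so the bulldog LoRA does not leak into later calls. A sketch of that load/generate/unload shape, assuming the same Prof-Hunt/lora-bulldog adapter; the try/finally form is a simplification for illustration, not the committed code:

    def generate_bulldog_image(prompt, seed=-1):
        # Hypothetical wrapper mirroring generate_story_image() in the diff.
        pipe_sd.load_lora_weights("Prof-Hunt/lora-bulldog")
        try:
            generator = torch.Generator("cuda")
            if seed != -1:
                generator = generator.manual_seed(seed)
            enhanced = f"{prompt}, watercolor style, children's book illustration, soft colors"
            return pipe_sd(
                prompt=enhanced,
                negative_prompt="deformed, ugly, blurry, bad art, poor quality, distorted",
                num_inference_steps=50,
                generator=generator,
            ).images[0]
        finally:
            # Unload on every path so later landscape generations stay LoRA-free.
            pipe_sd.unload_lora_weights()
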
@@ -522,10 +512,9 @@ def generate_combined_audio_from_story(story_text, voice='af_heart', speed=1):

  print(f"Processing paragraph {i+1}/{len(paragraphs)}")
  print(f"Paragraph length: {len(paragraph)}")
- print(f"Paragraph text: {paragraph[:100]}...") # Print first 100 chars
+ print(f"Paragraph text: {paragraph[:100]}...")

  try:
- # Generate audio for each sentence separately
  sentences = [s.strip() for s in paragraph.split('.') if s.strip()]
  print(f"Split into {len(sentences)} sentences")

@@ -533,35 +522,39 @@ def generate_combined_audio_from_story(story_text, voice='af_heart', speed=1):
  print(f"Processing sentence {j+1}/{len(sentences)}")
  print(f"Sentence length: {len(sentence)}")

- # Add more robust error handling around the generator
  try:
  generator = pipeline(
- sentence + '.', # Add period back
+ sentence + '.',
  voice=voice,
  speed=speed,
  split_pattern=r'\n+'
  )

- # Add type checking and validation for generator output
  if generator is None:
  print(f"Warning: Generator returned None for sentence: {sentence[:50]}...")
  continue

- # Process generator output with additional error handling
  for batch_idx, metadata, audio in generator:
- print(f"Processing batch {batch_idx}, audio length: {len(audio) if audio is not None else 0}")
+ print(f"Batch {batch_idx}")
+ print(f"Audio type: {type(audio)}")

  if audio is not None and len(audio) > 0:
- # Validate audio data
- if isinstance(audio, (list, np.ndarray)):
- combined_audio.extend(audio)
- else:
- print(f"Warning: Invalid audio type: {type(audio)}")
- else:
- print(f"Warning: Empty audio generated for sentence: {sentence[:50]}...")
+ print(f"Audio shape/length: {getattr(audio, 'shape', len(audio))}")
+ print(f"Audio dtype: {getattr(audio, 'dtype', type(audio[0]))}")
+ print(f"First few values: {audio[:5]}")
+
+ # Convert to float32 numpy array before extending
+ if isinstance(audio, list):
+ audio = np.array(audio, dtype=np.float32)
+ elif isinstance(audio, np.ndarray):
+ audio = audio.astype(np.float32)

- # Add a small pause between sentences
- combined_audio.extend([0] * 1000) # 1000 samples of silence
+ combined_audio.extend(audio.tolist())
+ else:
+ print(f"Warning: Empty audio for sentence: {sentence[:50]}...")
+
+ # Add silence between sentences (as float32)
+ combined_audio.extend(np.zeros(1000, dtype=np.float32).tolist())

  except Exception as e:
  print(f"Error processing sentence {j+1}: {str(e)}")
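
The large hunk above changes how Kokoro output is accumulated: every chunk is cast to float32 before it is appended, and the inter-sentence silence is built as float32 too, so the list never mixes integer and float samples. A condensed sketch of that collection loop, assuming the usual `from kokoro import KPipeline` import and the same call signature and `(batch_idx, metadata, audio)` unpacking used in the diff:

    import numpy as np
    from kokoro import KPipeline  # assumed import; app.py only shows the KPipeline(...) call

    pipeline = KPipeline(lang_code='a')  # 'a' for American English
    combined_audio = []

    for sentence in ["Champ helped his friends", "Everyone cheered"]:
        generator = pipeline(sentence + '.', voice='af_heart', speed=1, split_pattern=r'\n+')
        for batch_idx, metadata, audio in generator:
            if audio is not None and len(audio) > 0:
                # Cast whatever Kokoro returns (list, ndarray, CPU tensor) to float32 samples.
                audio = np.asarray(audio, dtype=np.float32)
                combined_audio.extend(audio.tolist())
        # 1000 samples of float32 silence between sentences (about 42 ms at 24 kHz).
        combined_audio.extend(np.zeros(1000, dtype=np.float32).tolist())
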
@@ -569,8 +562,8 @@ def generate_combined_audio_from_story(story_text, voice='af_heart', speed=1):
  print(traceback.format_exc())
  continue

- # Add a longer pause between paragraphs
- combined_audio.extend([0] * 2000) # 2000 samples of silence
+ # Add silence between paragraphs (as float32)
+ combined_audio.extend(np.zeros(2000, dtype=np.float32).tolist())

  except Exception as e:
  print(f"Error processing paragraph {i+1}: {str(e)}")
@@ -582,20 +575,20 @@ def generate_combined_audio_from_story(story_text, voice='af_heart', speed=1):
  print("No audio was generated")
  return None

- # Convert combined audio to NumPy array and normalize
- combined_audio = np.array(combined_audio)
+ # Convert to numpy array and ensure float32
+ combined_audio = np.array(combined_audio, dtype=np.float32)
  if len(combined_audio) > 0:
- # Print audio statistics
  print(f"Final audio length: {len(combined_audio)}")
+ print(f"Final audio dtype: {combined_audio.dtype}")
  print(f"Audio min/max values: {np.min(combined_audio)}/{np.max(combined_audio)}")

- # Normalize audio to prevent clipping
- max_val = np.max(np.abs(combined_audio))
- if max_val > 0:
- combined_audio = combined_audio * 0.9 / max_val
+ # Only normalize if we have non-zero values
+ if np.max(np.abs(combined_audio)) > 0:
+ combined_audio = combined_audio / np.max(np.abs(combined_audio)) * 0.9
  print("Audio normalized successfully")
+ else:
+ print("Warning: Audio contains only zeros")

- # Save audio with error handling
  try:
  filename = "combined_story.wav"
  sf.write(filename, combined_audio, 24000)
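
The normalization hunk converts the accumulated samples to a float32 array, peak-normalizes to 0.9 only when the signal is non-zero, and writes a 24 kHz WAV via soundfile. The same steps in isolation (the sample values are made up; in app.py the array comes from the Kokoro loop above):

    import numpy as np
    import soundfile as sf

    # Stand-in for the accumulated sample list.
    combined_audio = np.array([0.0, 0.25, -0.5, 0.8, -0.9], dtype=np.float32)

    peak = np.max(np.abs(combined_audio))
    if peak > 0:
        combined_audio = combined_audio / peak * 0.9  # peak-normalize to 0.9 to leave headroom
    else:
        print("Warning: Audio contains only zeros")

    sf.write("combined_story.wav", combined_audio, 24000)  # Kokoro output is 24 kHz
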
@@ -617,7 +610,7 @@ def generate_combined_audio_from_story(story_text, voice='af_heart', speed=1):

  finally:
  clear_memory()
-
+
  # Helper functions
  def clean_story_output(story):
  """Clean up the generated story text."""