Spaces:

Prof-Hunt
/

TECH_TALES

Runtime error

App Files Files Community

Prof-Hunt committed on Jan 31

Commit

02fe023

verified ·

1 Parent(s): 6722825

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -53

app.py CHANGED Viewed

@@ -56,25 +56,20 @@ model_lm = AutoModelForCausalLM.from_pretrained(checkpoint).to("cuda")
 # Initialize Kokoro TTS pipeline
 pipeline = KPipeline(lang_code='a')  # 'a' for American English
-def load_sd_model():
-    """Load Stable Diffusion model only when needed"""
-    pipe = StableDiffusionPipeline.from_pretrained(
-        "runwayml/stable-diffusion-v1-5",
-        torch_dtype=torch.float16,
-    )
-    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
-    pipe.to("cuda")
-    pipe.enable_attention_slicing()
-    return pipe
 @torch.inference_mode()
 @spaces.GPU(duration=30)
 def generate_image():
     """Generate a random landscape image."""
     clear_memory()
-    pipe = load_sd_model()
     default_prompt = "a beautiful, professional landscape photograph"
     default_negative_prompt = "blurry, bad quality, distorted, deformed"
     default_steps = 30
@@ -84,7 +79,7 @@ def generate_image():
     generator = torch.Generator("cuda").manual_seed(default_seed)
     try:
-        image = pipe(
             prompt=default_prompt,
             negative_prompt=default_negative_prompt,
             num_inference_steps=default_steps,
@@ -148,7 +143,7 @@ def analyze_image(image):
         # Split into sentences and take only the first three
         sentences = re.split(r'(?<=[.!?])\s+', description)
-        description = ' '.join(sentences[:3])
         clear_memory()
         return description
@@ -168,7 +163,7 @@ def generate_story(image_description):
     Requirements:
     1. Main character: An English bulldog named Champ
     2. Include these values: confidence, teamwork, caring, and hope
-    3. Theme: "We are stronger together than as individuals"
     4. Keep it simple and engaging for young children
     5. End with a simple moral lesson"""
@@ -207,7 +202,7 @@ def generate_image_prompts(story_text):
     all_prompts = []
     prompt_instruction = '''Here is a story paragraph: {paragraph}
-    Start your response with "Watercolor bulldog" and describe what Champ is doing in this scene. Add where it takes place and one mood detail. Keep it short.'''
     try:
         for i, paragraph in enumerate(paragraphs, 1):
@@ -243,10 +238,8 @@ def generate_image_prompts(story_text):
 def generate_story_image(prompt, seed=-1):
     clear_memory()
-    pipe = load_sd_model()
     try:
-        pipe.load_lora_weights("Prof-Hunt/lora-bulldog")
         generator = torch.Generator("cuda")
         if seed != -1:
@@ -256,7 +249,7 @@ def generate_story_image(prompt, seed=-1):
         enhanced_prompt = f"{prompt}, watercolor style, children's book illustration, soft colors"
-        image = pipe(
             prompt=enhanced_prompt,
             negative_prompt="deformed, ugly, blurry, bad art, poor quality, distorted",
             num_inference_steps=50,
@@ -264,16 +257,13 @@ def generate_story_image(prompt, seed=-1):
             generator=generator
         ).images[0]
-        pipe.unload_lora_weights()
-        del pipe
         clear_memory()
         return image
     except Exception as e:
         print(f"Error generating image: {e}")
-        if 'pipe' in locals():
-            pipe.unload_lora_weights()
-            del pipe
         clear_memory()
         return None
@@ -522,10 +512,9 @@ def generate_combined_audio_from_story(story_text, voice='af_heart', speed=1):
             print(f"Processing paragraph {i+1}/{len(paragraphs)}")
             print(f"Paragraph length: {len(paragraph)}")
-            print(f"Paragraph text: {paragraph[:100]}...")  # Print first 100 chars
             try:
-                # Generate audio for each sentence separately
                 sentences = [s.strip() for s in paragraph.split('.') if s.strip()]
                 print(f"Split into {len(sentences)} sentences")
@@ -533,35 +522,39 @@ def generate_combined_audio_from_story(story_text, voice='af_heart', speed=1):
                     print(f"Processing sentence {j+1}/{len(sentences)}")
                     print(f"Sentence length: {len(sentence)}")
-                    # Add more robust error handling around the generator
                     try:
                         generator = pipeline(
-                            sentence + '.',  # Add period back
                             voice=voice,
                             speed=speed,
                             split_pattern=r'\n+'
                         )
-                        # Add type checking and validation for generator output
                         if generator is None:
                             print(f"Warning: Generator returned None for sentence: {sentence[:50]}...")
                             continue
-                        # Process generator output with additional error handling
                         for batch_idx, metadata, audio in generator:
-                            print(f"Processing batch {batch_idx}, audio length: {len(audio) if audio is not None else 0}")
                             if audio is not None and len(audio) > 0:
-                                # Validate audio data
-                                if isinstance(audio, (list, np.ndarray)):
-                                    combined_audio.extend(audio)
-                                else:
-                                    print(f"Warning: Invalid audio type: {type(audio)}")
-                            else:
-                                print(f"Warning: Empty audio generated for sentence: {sentence[:50]}...")
-                        # Add a small pause between sentences
-                        combined_audio.extend([0] * 1000)  # 1000 samples of silence
                     except Exception as e:
                         print(f"Error processing sentence {j+1}: {str(e)}")
@@ -569,8 +562,8 @@ def generate_combined_audio_from_story(story_text, voice='af_heart', speed=1):
                         print(traceback.format_exc())
                         continue
-                # Add a longer pause between paragraphs
-                combined_audio.extend([0] * 2000)  # 2000 samples of silence
             except Exception as e:
                 print(f"Error processing paragraph {i+1}: {str(e)}")
@@ -582,20 +575,20 @@ def generate_combined_audio_from_story(story_text, voice='af_heart', speed=1):
             print("No audio was generated")
             return None
-        # Convert combined audio to NumPy array and normalize
-        combined_audio = np.array(combined_audio)
         if len(combined_audio) > 0:
-            # Print audio statistics
             print(f"Final audio length: {len(combined_audio)}")
             print(f"Audio min/max values: {np.min(combined_audio)}/{np.max(combined_audio)}")
-            # Normalize audio to prevent clipping
-            max_val = np.max(np.abs(combined_audio))
-            if max_val > 0:
-                combined_audio = combined_audio * 0.9 / max_val
                 print("Audio normalized successfully")
-            # Save audio with error handling
             try:
                 filename = "combined_story.wav"
                 sf.write(filename, combined_audio, 24000)
@@ -617,7 +610,7 @@ def generate_combined_audio_from_story(story_text, voice='af_heart', speed=1):
     finally:
         clear_memory()
 # Helper functions
 def clean_story_output(story):
     """Clean up the generated story text."""

 # Initialize Kokoro TTS pipeline
 pipeline = KPipeline(lang_code='a')  # 'a' for American English
+# Load Stable Diffusion model at startup
+pipe_sd = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+).to("cuda")
+pipe_sd.scheduler = DPMSolverMultistepScheduler.from_config(pipe_sd.scheduler.config)
+pipe_sd.enable_attention_slicing()
 @torch.inference_mode()
 @spaces.GPU(duration=30)
 def generate_image():
     """Generate a random landscape image."""
     clear_memory()
     default_prompt = "a beautiful, professional landscape photograph"
     default_negative_prompt = "blurry, bad quality, distorted, deformed"
     default_steps = 30
     generator = torch.Generator("cuda").manual_seed(default_seed)
     try:
+        image = pipe_sd(
             prompt=default_prompt,
             negative_prompt=default_negative_prompt,
             num_inference_steps=default_steps,
         # Split into sentences and take only the first nine
         sentences = re.split(r'(?<=[.!?])\s+', description)
+        description = ' '.join(sentences[:9])
         clear_memory()
         return description
     Requirements:
     1. Main character: An English bulldog named Champ
     2. Include these values: confidence, teamwork, caring, and hope
+    3. Theme: "Doing the right thing is important"
     4. Keep it simple and engaging for young children
     5. End with a simple moral lesson"""
     all_prompts = []
     prompt_instruction = '''Here is a story paragraph: {paragraph}
+    Start your response with "Watercolor bulldog" and describe what Champ is doing in this scene. Include any friends. Add where it takes place and one mood detail. Keep it short.'''
     try:
         for i, paragraph in enumerate(paragraphs, 1):
 def generate_story_image(prompt, seed=-1):
     clear_memory()
     try:
+        pipe_sd.load_lora_weights("Prof-Hunt/lora-bulldog")
         generator = torch.Generator("cuda")
         if seed != -1:
         enhanced_prompt = f"{prompt}, watercolor style, children's book illustration, soft colors"
+        image = pipe_sd(
             prompt=enhanced_prompt,
             negative_prompt="deformed, ugly, blurry, bad art, poor quality, distorted",
             num_inference_steps=50,
             generator=generator
         ).images[0]
+        pipe_sd.unload_lora_weights()
         clear_memory()
         return image
     except Exception as e:
         print(f"Error generating image: {e}")
+        pipe_sd.unload_lora_weights()
         clear_memory()
         return None
             print(f"Processing paragraph {i+1}/{len(paragraphs)}")
             print(f"Paragraph length: {len(paragraph)}")
+            print(f"Paragraph text: {paragraph[:100]}...")
             try:
                 sentences = [s.strip() for s in paragraph.split('.') if s.strip()]
                 print(f"Split into {len(sentences)} sentences")
                     print(f"Processing sentence {j+1}/{len(sentences)}")
                     print(f"Sentence length: {len(sentence)}")
                     try:
                         generator = pipeline(
+                            sentence + '.',
                             voice=voice,
                             speed=speed,
                             split_pattern=r'\n+'
                         )
                         if generator is None:
                             print(f"Warning: Generator returned None for sentence: {sentence[:50]}...")
                             continue
                         for batch_idx, metadata, audio in generator:
+                            print(f"Batch {batch_idx}")
+                            print(f"Audio type: {type(audio)}")
                             if audio is not None and len(audio) > 0:
+                                print(f"Audio shape/length: {getattr(audio, 'shape', len(audio))}")
+                                print(f"Audio dtype: {getattr(audio, 'dtype', type(audio[0]))}")
+                                print(f"First few values: {audio[:5]}")
+                                # Convert to float32 numpy array before extending
+                                if isinstance(audio, list):
+                                    audio = np.array(audio, dtype=np.float32)
+                                elif isinstance(audio, np.ndarray):
+                                    audio = audio.astype(np.float32)
+                                combined_audio.extend(audio.tolist())
+                            else:
+                                print(f"Warning: Empty audio for sentence: {sentence[:50]}...")
+                        # Add silence between sentences (as float32)
+                        combined_audio.extend(np.zeros(1000, dtype=np.float32).tolist())
                     except Exception as e:
                         print(f"Error processing sentence {j+1}: {str(e)}")
                         print(traceback.format_exc())
                         continue
+                # Add silence between paragraphs (as float32)
+                combined_audio.extend(np.zeros(2000, dtype=np.float32).tolist())
             except Exception as e:
                 print(f"Error processing paragraph {i+1}: {str(e)}")
             print("No audio was generated")
             return None
+        # Convert to numpy array and ensure float32
+        combined_audio = np.array(combined_audio, dtype=np.float32)
         if len(combined_audio) > 0:
             print(f"Final audio length: {len(combined_audio)}")
+            print(f"Final audio dtype: {combined_audio.dtype}")
             print(f"Audio min/max values: {np.min(combined_audio)}/{np.max(combined_audio)}")
+            # Only normalize if we have non-zero values
+            if np.max(np.abs(combined_audio)) > 0:
+                combined_audio = combined_audio / np.max(np.abs(combined_audio)) * 0.9
                 print("Audio normalized successfully")
+            else:
+                print("Warning: Audio contains only zeros")
             try:
                 filename = "combined_story.wav"
                 sf.write(filename, combined_audio, 24000)
     finally:
         clear_memory()
 # Helper functions
 def clean_story_output(story):
     """Clean up the generated story text."""