Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -56,25 +56,20 @@ model_lm = AutoModelForCausalLM.from_pretrained(checkpoint).to("cuda")
|
|
56 |
# Initialize Kokoro TTS pipeline
|
57 |
pipeline = KPipeline(lang_code='a') # 'a' for American English
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
pipe.to("cuda")
|
67 |
-
pipe.enable_attention_slicing()
|
68 |
-
return pipe
|
69 |
|
70 |
@torch.inference_mode()
|
71 |
@spaces.GPU(duration=30)
|
72 |
def generate_image():
|
73 |
"""Generate a random landscape image."""
|
74 |
clear_memory()
|
75 |
-
|
76 |
-
pipe = load_sd_model()
|
77 |
-
|
78 |
default_prompt = "a beautiful, professional landscape photograph"
|
79 |
default_negative_prompt = "blurry, bad quality, distorted, deformed"
|
80 |
default_steps = 30
|
@@ -84,7 +79,7 @@ def generate_image():
|
|
84 |
generator = torch.Generator("cuda").manual_seed(default_seed)
|
85 |
|
86 |
try:
|
87 |
-
image =
|
88 |
prompt=default_prompt,
|
89 |
negative_prompt=default_negative_prompt,
|
90 |
num_inference_steps=default_steps,
|
@@ -148,7 +143,7 @@ def analyze_image(image):
|
|
148 |
|
149 |
# Split into sentences and take only the first three
|
150 |
sentences = re.split(r'(?<=[.!?])\s+', description)
|
151 |
-
description = ' '.join(sentences[:
|
152 |
|
153 |
clear_memory()
|
154 |
return description
|
@@ -168,7 +163,7 @@ def generate_story(image_description):
|
|
168 |
Requirements:
|
169 |
1. Main character: An English bulldog named Champ
|
170 |
2. Include these values: confidence, teamwork, caring, and hope
|
171 |
-
3. Theme: "
|
172 |
4. Keep it simple and engaging for young children
|
173 |
5. End with a simple moral lesson"""
|
174 |
|
@@ -207,7 +202,7 @@ def generate_image_prompts(story_text):
|
|
207 |
all_prompts = []
|
208 |
prompt_instruction = '''Here is a story paragraph: {paragraph}
|
209 |
|
210 |
-
Start your response with "Watercolor bulldog" and describe what Champ is doing in this scene. Add where it takes place and one mood detail. Keep it short.'''
|
211 |
|
212 |
try:
|
213 |
for i, paragraph in enumerate(paragraphs, 1):
|
@@ -243,10 +238,8 @@ def generate_image_prompts(story_text):
|
|
243 |
def generate_story_image(prompt, seed=-1):
|
244 |
clear_memory()
|
245 |
|
246 |
-
pipe = load_sd_model()
|
247 |
-
|
248 |
try:
|
249 |
-
|
250 |
|
251 |
generator = torch.Generator("cuda")
|
252 |
if seed != -1:
|
@@ -256,7 +249,7 @@ def generate_story_image(prompt, seed=-1):
|
|
256 |
|
257 |
enhanced_prompt = f"{prompt}, watercolor style, children's book illustration, soft colors"
|
258 |
|
259 |
-
image =
|
260 |
prompt=enhanced_prompt,
|
261 |
negative_prompt="deformed, ugly, blurry, bad art, poor quality, distorted",
|
262 |
num_inference_steps=50,
|
@@ -264,16 +257,13 @@ def generate_story_image(prompt, seed=-1):
|
|
264 |
generator=generator
|
265 |
).images[0]
|
266 |
|
267 |
-
|
268 |
-
del pipe
|
269 |
clear_memory()
|
270 |
return image
|
271 |
|
272 |
except Exception as e:
|
273 |
print(f"Error generating image: {e}")
|
274 |
-
|
275 |
-
pipe.unload_lora_weights()
|
276 |
-
del pipe
|
277 |
clear_memory()
|
278 |
return None
|
279 |
|
@@ -522,10 +512,9 @@ def generate_combined_audio_from_story(story_text, voice='af_heart', speed=1):
|
|
522 |
|
523 |
print(f"Processing paragraph {i+1}/{len(paragraphs)}")
|
524 |
print(f"Paragraph length: {len(paragraph)}")
|
525 |
-
print(f"Paragraph text: {paragraph[:100]}...")
|
526 |
|
527 |
try:
|
528 |
-
# Generate audio for each sentence separately
|
529 |
sentences = [s.strip() for s in paragraph.split('.') if s.strip()]
|
530 |
print(f"Split into {len(sentences)} sentences")
|
531 |
|
@@ -533,35 +522,39 @@ def generate_combined_audio_from_story(story_text, voice='af_heart', speed=1):
|
|
533 |
print(f"Processing sentence {j+1}/{len(sentences)}")
|
534 |
print(f"Sentence length: {len(sentence)}")
|
535 |
|
536 |
-
# Add more robust error handling around the generator
|
537 |
try:
|
538 |
generator = pipeline(
|
539 |
-
sentence + '.',
|
540 |
voice=voice,
|
541 |
speed=speed,
|
542 |
split_pattern=r'\n+'
|
543 |
)
|
544 |
|
545 |
-
# Add type checking and validation for generator output
|
546 |
if generator is None:
|
547 |
print(f"Warning: Generator returned None for sentence: {sentence[:50]}...")
|
548 |
continue
|
549 |
|
550 |
-
# Process generator output with additional error handling
|
551 |
for batch_idx, metadata, audio in generator:
|
552 |
-
print(f"
|
|
|
553 |
|
554 |
if audio is not None and len(audio) > 0:
|
555 |
-
|
556 |
-
|
557 |
-
|
558 |
-
|
559 |
-
|
560 |
-
|
561 |
-
|
|
|
|
|
562 |
|
563 |
-
|
564 |
-
|
|
|
|
|
|
|
|
|
565 |
|
566 |
except Exception as e:
|
567 |
print(f"Error processing sentence {j+1}: {str(e)}")
|
@@ -569,8 +562,8 @@ def generate_combined_audio_from_story(story_text, voice='af_heart', speed=1):
|
|
569 |
print(traceback.format_exc())
|
570 |
continue
|
571 |
|
572 |
-
# Add
|
573 |
-
combined_audio.extend(
|
574 |
|
575 |
except Exception as e:
|
576 |
print(f"Error processing paragraph {i+1}: {str(e)}")
|
@@ -582,20 +575,20 @@ def generate_combined_audio_from_story(story_text, voice='af_heart', speed=1):
|
|
582 |
print("No audio was generated")
|
583 |
return None
|
584 |
|
585 |
-
# Convert
|
586 |
-
combined_audio = np.array(combined_audio)
|
587 |
if len(combined_audio) > 0:
|
588 |
-
# Print audio statistics
|
589 |
print(f"Final audio length: {len(combined_audio)}")
|
|
|
590 |
print(f"Audio min/max values: {np.min(combined_audio)}/{np.max(combined_audio)}")
|
591 |
|
592 |
-
#
|
593 |
-
|
594 |
-
|
595 |
-
combined_audio = combined_audio * 0.9 / max_val
|
596 |
print("Audio normalized successfully")
|
|
|
|
|
597 |
|
598 |
-
# Save audio with error handling
|
599 |
try:
|
600 |
filename = "combined_story.wav"
|
601 |
sf.write(filename, combined_audio, 24000)
|
@@ -617,7 +610,7 @@ def generate_combined_audio_from_story(story_text, voice='af_heart', speed=1):
|
|
617 |
|
618 |
finally:
|
619 |
clear_memory()
|
620 |
-
|
621 |
# Helper functions
|
622 |
def clean_story_output(story):
|
623 |
"""Clean up the generated story text."""
|
|
|
56 |
# Initialize Kokoro TTS pipeline
|
57 |
pipeline = KPipeline(lang_code='a') # 'a' for American English
|
58 |
|
59 |
+
# Load Stable Diffusion model at startup
|
60 |
+
pipe_sd = StableDiffusionPipeline.from_pretrained(
|
61 |
+
"runwayml/stable-diffusion-v1-5",
|
62 |
+
torch_dtype=torch.float16,
|
63 |
+
).to("cuda")
|
64 |
+
pipe_sd.scheduler = DPMSolverMultistepScheduler.from_config(pipe_sd.scheduler.config)
|
65 |
+
pipe_sd.enable_attention_slicing()
|
|
|
|
|
|
|
66 |
|
67 |
@torch.inference_mode()
|
68 |
@spaces.GPU(duration=30)
|
69 |
def generate_image():
|
70 |
"""Generate a random landscape image."""
|
71 |
clear_memory()
|
72 |
+
|
|
|
|
|
73 |
default_prompt = "a beautiful, professional landscape photograph"
|
74 |
default_negative_prompt = "blurry, bad quality, distorted, deformed"
|
75 |
default_steps = 30
|
|
|
79 |
generator = torch.Generator("cuda").manual_seed(default_seed)
|
80 |
|
81 |
try:
|
82 |
+
image = pipe_sd(
|
83 |
prompt=default_prompt,
|
84 |
negative_prompt=default_negative_prompt,
|
85 |
num_inference_steps=default_steps,
|
|
|
143 |
|
144 |
# Split into sentences and keep at most the first nine
|
145 |
sentences = re.split(r'(?<=[.!?])\s+', description)
|
146 |
+
description = ' '.join(sentences[:9])
|
147 |
|
148 |
clear_memory()
|
149 |
return description
|
|
|
163 |
Requirements:
|
164 |
1. Main character: An English bulldog named Champ
|
165 |
2. Include these values: confidence, teamwork, caring, and hope
|
166 |
+
3. Theme: "Doing the right thing is important"
|
167 |
4. Keep it simple and engaging for young children
|
168 |
5. End with a simple moral lesson"""
|
169 |
|
|
|
202 |
all_prompts = []
|
203 |
prompt_instruction = '''Here is a story paragraph: {paragraph}
|
204 |
|
205 |
+
Start your response with "Watercolor bulldog" and describe what Champ is doing in this scene. Include any friends. Add where it takes place and one mood detail. Keep it short.'''
|
206 |
|
207 |
try:
|
208 |
for i, paragraph in enumerate(paragraphs, 1):
|
|
|
238 |
def generate_story_image(prompt, seed=-1):
|
239 |
clear_memory()
|
240 |
|
|
|
|
|
241 |
try:
|
242 |
+
pipe_sd.load_lora_weights("Prof-Hunt/lora-bulldog")
|
243 |
|
244 |
generator = torch.Generator("cuda")
|
245 |
if seed != -1:
|
|
|
249 |
|
250 |
enhanced_prompt = f"{prompt}, watercolor style, children's book illustration, soft colors"
|
251 |
|
252 |
+
image = pipe_sd(
|
253 |
prompt=enhanced_prompt,
|
254 |
negative_prompt="deformed, ugly, blurry, bad art, poor quality, distorted",
|
255 |
num_inference_steps=50,
|
|
|
257 |
generator=generator
|
258 |
).images[0]
|
259 |
|
260 |
+
pipe_sd.unload_lora_weights()
|
|
|
261 |
clear_memory()
|
262 |
return image
|
263 |
|
264 |
except Exception as e:
|
265 |
print(f"Error generating image: {e}")
|
266 |
+
pipe_sd.unload_lora_weights()
|
|
|
|
|
267 |
clear_memory()
|
268 |
return None
|
269 |
|
|
|
512 |
|
513 |
print(f"Processing paragraph {i+1}/{len(paragraphs)}")
|
514 |
print(f"Paragraph length: {len(paragraph)}")
|
515 |
+
print(f"Paragraph text: {paragraph[:100]}...")
|
516 |
|
517 |
try:
|
|
|
518 |
sentences = [s.strip() for s in paragraph.split('.') if s.strip()]
|
519 |
print(f"Split into {len(sentences)} sentences")
|
520 |
|
|
|
522 |
print(f"Processing sentence {j+1}/{len(sentences)}")
|
523 |
print(f"Sentence length: {len(sentence)}")
|
524 |
|
|
|
525 |
try:
|
526 |
generator = pipeline(
|
527 |
+
sentence + '.',
|
528 |
voice=voice,
|
529 |
speed=speed,
|
530 |
split_pattern=r'\n+'
|
531 |
)
|
532 |
|
|
|
533 |
if generator is None:
|
534 |
print(f"Warning: Generator returned None for sentence: {sentence[:50]}...")
|
535 |
continue
|
536 |
|
|
|
537 |
for batch_idx, metadata, audio in generator:
|
538 |
+
print(f"Batch {batch_idx}")
|
539 |
+
print(f"Audio type: {type(audio)}")
|
540 |
|
541 |
if audio is not None and len(audio) > 0:
|
542 |
+
print(f"Audio shape/length: {getattr(audio, 'shape', len(audio))}")
|
543 |
+
print(f"Audio dtype: {getattr(audio, 'dtype', type(audio[0]))}")
|
544 |
+
print(f"First few values: {audio[:5]}")
|
545 |
+
|
546 |
+
# Convert to float32 numpy array before extending
|
547 |
+
if isinstance(audio, list):
|
548 |
+
audio = np.array(audio, dtype=np.float32)
|
549 |
+
elif isinstance(audio, np.ndarray):
|
550 |
+
audio = audio.astype(np.float32)
|
551 |
|
552 |
+
combined_audio.extend(audio.tolist())
|
553 |
+
else:
|
554 |
+
print(f"Warning: Empty audio for sentence: {sentence[:50]}...")
|
555 |
+
|
556 |
+
# Add silence between sentences (as float32)
|
557 |
+
combined_audio.extend(np.zeros(1000, dtype=np.float32).tolist())
|
558 |
|
559 |
except Exception as e:
|
560 |
print(f"Error processing sentence {j+1}: {str(e)}")
|
|
|
562 |
print(traceback.format_exc())
|
563 |
continue
|
564 |
|
565 |
+
# Add silence between paragraphs (as float32)
|
566 |
+
combined_audio.extend(np.zeros(2000, dtype=np.float32).tolist())
|
567 |
|
568 |
except Exception as e:
|
569 |
print(f"Error processing paragraph {i+1}: {str(e)}")
|
|
|
575 |
print("No audio was generated")
|
576 |
return None
|
577 |
|
578 |
+
# Convert to numpy array and ensure float32
|
579 |
+
combined_audio = np.array(combined_audio, dtype=np.float32)
|
580 |
if len(combined_audio) > 0:
|
|
|
581 |
print(f"Final audio length: {len(combined_audio)}")
|
582 |
+
print(f"Final audio dtype: {combined_audio.dtype}")
|
583 |
print(f"Audio min/max values: {np.min(combined_audio)}/{np.max(combined_audio)}")
|
584 |
|
585 |
+
# Only normalize if we have non-zero values
|
586 |
+
if np.max(np.abs(combined_audio)) > 0:
|
587 |
+
combined_audio = combined_audio / np.max(np.abs(combined_audio)) * 0.9
|
|
|
588 |
print("Audio normalized successfully")
|
589 |
+
else:
|
590 |
+
print("Warning: Audio contains only zeros")
|
591 |
|
|
|
592 |
try:
|
593 |
filename = "combined_story.wav"
|
594 |
sf.write(filename, combined_audio, 24000)
|
|
|
610 |
|
611 |
finally:
|
612 |
clear_memory()
|
613 |
+
|
614 |
# Helper functions
|
615 |
def clean_story_output(story):
|
616 |
"""Clean up the generated story text."""
|