Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -485,129 +485,44 @@ def overlay_text_on_image(image, text):
|
|
485 |
return None
|
486 |
|
487 |
def generate_combined_audio_from_story(story_text, voice='af_heart', speed=1):
|
488 |
-
"""Generate
|
489 |
-
|
490 |
-
|
491 |
-
|
492 |
-
print("No story text provided")
|
493 |
-
return None
|
494 |
|
495 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
496 |
|
497 |
-
|
498 |
-
|
499 |
-
if not paragraphs:
|
500 |
-
print("No valid paragraphs found in story")
|
501 |
-
return None
|
502 |
|
503 |
-
|
504 |
combined_audio = []
|
505 |
-
|
506 |
-
|
507 |
-
|
508 |
-
|
509 |
-
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
|
514 |
-
|
515 |
-
|
516 |
-
|
517 |
-
|
518 |
-
|
519 |
-
|
520 |
-
|
521 |
-
|
522 |
-
|
523 |
-
|
524 |
-
generator = pipeline(
|
525 |
-
sentence + '.',
|
526 |
-
voice=voice,
|
527 |
-
speed=speed,
|
528 |
-
split_pattern=r'\n+'
|
529 |
-
)
|
530 |
-
|
531 |
-
if generator is None:
|
532 |
-
print(f"Warning: Generator returned None for sentence: {sentence[:50]}...")
|
533 |
-
continue
|
534 |
-
|
535 |
-
for batch_idx, metadata, audio in generator:
|
536 |
-
print(f"Batch {batch_idx}")
|
537 |
-
print(f"Audio type: {type(audio)}")
|
538 |
-
|
539 |
-
if audio is not None and len(audio) > 0:
|
540 |
-
print(f"Audio shape/length: {getattr(audio, 'shape', len(audio))}")
|
541 |
-
print(f"Audio dtype: {getattr(audio, 'dtype', type(audio[0]))}")
|
542 |
-
print(f"First few values: {audio[:5]}")
|
543 |
-
|
544 |
-
# Convert to float32 numpy array before extending
|
545 |
-
if isinstance(audio, list):
|
546 |
-
audio = np.array(audio, dtype=np.float32)
|
547 |
-
elif isinstance(audio, np.ndarray):
|
548 |
-
audio = audio.astype(np.float32)
|
549 |
-
|
550 |
-
combined_audio.extend(audio.tolist())
|
551 |
-
else:
|
552 |
-
print(f"Warning: Empty audio for sentence: {sentence[:50]}...")
|
553 |
-
|
554 |
-
# Add silence between sentences (as float32)
|
555 |
-
combined_audio.extend(np.zeros(1000, dtype=np.float32).tolist())
|
556 |
-
|
557 |
-
except Exception as e:
|
558 |
-
print(f"Error processing sentence {j+1}: {str(e)}")
|
559 |
-
import traceback
|
560 |
-
print(traceback.format_exc())
|
561 |
-
continue
|
562 |
-
|
563 |
-
# Add silence between paragraphs (as float32)
|
564 |
-
combined_audio.extend(np.zeros(2000, dtype=np.float32).tolist())
|
565 |
-
|
566 |
-
except Exception as e:
|
567 |
-
print(f"Error processing paragraph {i+1}: {str(e)}")
|
568 |
-
import traceback
|
569 |
-
print(traceback.format_exc())
|
570 |
-
continue
|
571 |
-
|
572 |
-
if not combined_audio:
|
573 |
-
print("No audio was generated")
|
574 |
-
return None
|
575 |
-
|
576 |
-
# Convert to numpy array and ensure float32
|
577 |
-
combined_audio = np.array(combined_audio, dtype=np.float32)
|
578 |
-
if len(combined_audio) > 0:
|
579 |
-
print(f"Final audio length: {len(combined_audio)}")
|
580 |
-
print(f"Final audio dtype: {combined_audio.dtype}")
|
581 |
-
print(f"Audio min/max values: {np.min(combined_audio)}/{np.max(combined_audio)}")
|
582 |
-
|
583 |
-
# Only normalize if we have non-zero values
|
584 |
-
if np.max(np.abs(combined_audio)) > 0:
|
585 |
-
combined_audio = combined_audio / np.max(np.abs(combined_audio)) * 0.9
|
586 |
-
print("Audio normalized successfully")
|
587 |
-
else:
|
588 |
-
print("Warning: Audio contains only zeros")
|
589 |
-
|
590 |
-
try:
|
591 |
-
filename = "combined_story.wav"
|
592 |
-
sf.write(filename, combined_audio, 24000)
|
593 |
-
print(f"Successfully saved audio to {filename}")
|
594 |
-
return filename
|
595 |
-
except Exception as e:
|
596 |
-
print(f"Error saving audio file: {str(e)}")
|
597 |
-
return None
|
598 |
-
else:
|
599 |
-
print("Error: Combined audio array is empty")
|
600 |
-
return None
|
601 |
-
|
602 |
-
except Exception as e:
|
603 |
-
print(f"Error generating audio: {str(e)}")
|
604 |
-
import traceback
|
605 |
-
print(traceback.format_exc())
|
606 |
-
clear_memory()
|
607 |
-
return None
|
608 |
-
|
609 |
-
finally:
|
610 |
-
clear_memory()
|
611 |
|
612 |
# Helper functions
|
613 |
def clean_story_output(story):
|
|
|
485 |
return None
|
486 |
|
487 |
def generate_combined_audio_from_story(story_text, voice='af_heart', speed=1):
    """Generate a single audio file covering every paragraph of the story.

    The story is split into paragraphs on blank lines, each paragraph is
    passed to the module-level ``pipeline``, and all audio chunks it yields
    are concatenated and written to ``combined_story.wav`` at 24000 Hz.

    Args:
        story_text: Full story text; paragraphs are separated by blank lines.
        voice: Voice identifier forwarded to ``pipeline``.
        speed: Speed value forwarded to ``pipeline``.

    Returns:
        The name of the written WAV file, or ``None`` when there is no story
        text, no non-empty paragraph, or the pipeline produced no audio.

    Raises:
        Whatever ``pipeline`` or ``sf.write`` raise; ``clear_memory()`` is
        still called in that case.
    """
    # Guard clauses run before any model work, so nothing needs cleaning up.
    if not story_text:
        print("No story text provided")
        return None

    paragraphs = _split_story_paragraphs(story_text)
    if not paragraphs:
        print("No valid paragraphs found in story")
        return None

    try:
        # Keep one array per yielded chunk and concatenate once at the end,
        # instead of copying every sample into a Python list.
        chunks = []
        for paragraph in paragraphs:
            generator = pipeline(
                paragraph,
                voice=voice,
                speed=speed,
                split_pattern=r'\n+'  # Split on newlines
            )
            for _, _, audio in generator:
                if audio is None:
                    continue
                # NOTE(review): presumably the pipeline may yield torch
                # tensors (possibly on GPU) - confirm; duck-typed so this
                # block needs no torch import.
                if hasattr(audio, 'detach'):
                    audio = audio.detach().cpu().numpy()
                # assumes mono audio, so flattening keeps sample order - TODO confirm
                chunk = np.asarray(audio, dtype=np.float32).ravel()
                if chunk.size:
                    chunks.append(chunk)

        if not chunks:
            print("No audio was generated")
            return None

        combined_audio = np.concatenate(chunks)
        filename = "combined_story.wav"
        sf.write(filename, combined_audio, 24000)  # Save audio as .wav
        return filename
    finally:
        # Release memory on success and on failure alike.
        clear_memory()


def _split_story_paragraphs(story_text):
    """Return the paragraphs of *story_text*, using blank lines as separators.

    Each line is stripped, and the lines of one paragraph are joined with a
    single space. Blank-only input yields an empty list.
    """
    paragraphs = []
    current_paragraph = []
    for line in story_text.split('\n'):
        line = line.strip()
        if line:
            current_paragraph.append(line)
        elif current_paragraph:  # Empty line closes the open paragraph
            paragraphs.append(' '.join(current_paragraph))
            current_paragraph = []
    if current_paragraph:
        paragraphs.append(' '.join(current_paragraph))
    return paragraphs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
526 |
|
527 |
# Helper functions
|
528 |
def clean_story_output(story):
|