root committed on
Commit
4ddd8f4
·
1 Parent(s): db4c558

syllables trying first

Browse files
Files changed (3) hide show
  1. app.py +509 -52
  2. requirements.txt +2 -1
  3. utils.py +43 -29
app.py CHANGED
@@ -3,6 +3,8 @@ import io
3
  import gradio as gr
4
  import torch
5
  import numpy as np
 
 
6
  from transformers import (
7
  AutoModelForAudioClassification,
8
  AutoFeatureExtractor,
@@ -103,6 +105,41 @@ llm_pipeline = pipeline(
103
  # Initialize music emotion analyzer
104
  music_analyzer = MusicAnalyzer()
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  def extract_audio_features(audio_file):
107
  """Extract audio features from an audio file."""
108
  try:
@@ -228,19 +265,83 @@ def detect_music(audio_data):
228
  print(f"Error in music detection: {str(e)}")
229
  return False, []
230
 
 
231
  def detect_beats(y, sr):
232
- """Detect beats in the audio using librosa."""
233
  # Get tempo and beat frames
234
  tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
235
 
236
  # Convert beat frames to time in seconds
237
  beat_times = librosa.frames_to_time(beat_frames, sr=sr)
238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  return {
240
  "tempo": tempo,
241
  "beat_frames": beat_frames,
242
  "beat_times": beat_times,
243
- "beat_count": len(beat_times)
 
 
 
 
244
  }
245
 
246
  def detect_sections(y, sr):
@@ -300,6 +401,124 @@ def detect_sections(y, sr):
300
 
301
  return sections
302
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  def estimate_syllables_per_section(beats_info, sections):
304
  """Estimate the number of syllables needed for each section based on beats."""
305
  syllables_per_section = []
@@ -314,17 +533,31 @@ def estimate_syllables_per_section(beats_info, sections):
314
  # Calculate syllables based on section type and beat count
315
  beat_count = len(section_beats)
316
 
317
- # Adjust syllable count based on section type and genre conventions
318
- if section["type"] == "verse":
319
- # Verses typically have more syllables per beat (more words)
320
- syllable_count = beat_count * 1.2
321
- elif section["type"] == "chorus":
322
- # Choruses often have fewer syllables per beat (more sustained notes)
323
- syllable_count = beat_count * 0.9
324
- elif section["type"] == "bridge":
325
- syllable_count = beat_count * 1.0
326
- else: # intro, outro
327
- syllable_count = beat_count * 0.5 # Often instrumental or sparse lyrics
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
 
329
  syllables_per_section.append({
330
  "type": section["type"],
@@ -332,7 +565,8 @@ def estimate_syllables_per_section(beats_info, sections):
332
  "end": section["end"],
333
  "duration": section["duration"],
334
  "beat_count": beat_count,
335
- "syllable_count": int(syllable_count)
 
336
  })
337
 
338
  return syllables_per_section
@@ -342,40 +576,71 @@ def calculate_detailed_song_structure(audio_data):
342
  y = audio_data["waveform"]
343
  sr = audio_data["sample_rate"]
344
 
345
- # Detect beats
346
  beats_info = detect_beats(y, sr)
347
 
348
  # Detect sections
349
  sections = detect_sections(y, sr)
350
 
351
- # Estimate syllables per section
352
  syllables_info = estimate_syllables_per_section(beats_info, sections)
353
 
 
 
 
 
 
 
 
354
  return {
355
  "beats": beats_info,
356
  "sections": sections,
357
- "syllables": syllables_info
 
358
  }
359
 
360
- def generate_lyrics(genre, duration, emotion_results):
361
- """Generate lyrics based on the genre and with appropriate length."""
362
- # Calculate appropriate lyrics length based on audio duration
363
- lines_count = calculate_lyrics_length(duration)
364
-
365
- # Calculate approximate number of verses and chorus
366
- if lines_count <= 6:
367
- # Very short song - one verse and chorus
368
- verse_lines = 2
369
- chorus_lines = 2
370
- elif lines_count <= 10:
371
- # Medium song - two verses and chorus
372
- verse_lines = 3
373
- chorus_lines = 2
374
- else:
375
- # Longer song - two verses, chorus, and bridge
376
- verse_lines = 3
377
- chorus_lines = 2
378
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
379
  # Extract emotion and theme data from analysis results
380
  primary_emotion = emotion_results["emotion_analysis"]["primary_emotion"]
381
  primary_theme = emotion_results["theme_analysis"]["primary_theme"]
@@ -389,8 +654,129 @@ def generate_lyrics(genre, duration, emotion_results):
389
  key = emotion_results["tonal_analysis"]["key"]
390
  mode = emotion_results["tonal_analysis"]["mode"]
391
 
392
- # Create prompt for the LLM
393
- prompt = f"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
  You are a talented songwriter who specializes in {genre} music.
395
  Write original {genre} song lyrics for a song that is {duration:.1f} seconds long.
396
 
@@ -400,18 +786,58 @@ Music analysis has detected the following qualities in the music:
400
  - Primary emotion: {primary_emotion}
401
  - Primary theme: {primary_theme}
402
 
 
 
 
 
 
 
 
 
 
403
  The lyrics should:
404
  - Perfectly capture the essence and style of {genre} music
405
  - Express the {primary_emotion} emotion and {primary_theme} theme
406
- - Be approximately {lines_count} lines long
407
- - Have a coherent theme and flow
408
  - Follow this structure:
409
  * Verse: {verse_lines} lines
410
  * Chorus: {chorus_lines} lines
411
- * {f'Bridge: 2 lines' if lines_count > 10 else ''}
412
  - Be completely original
413
  - Match the song duration of {duration:.1f} seconds
414
- - Keep each line concise and impactful
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415
 
416
  Your lyrics:
417
  """
@@ -429,19 +855,31 @@ Your lyrics:
429
  # Extract and clean generated lyrics
430
  lyrics = response[0]["generated_text"].strip()
431
 
432
- # Add section labels if they're not present
433
- if "Verse" not in lyrics and "Chorus" not in lyrics:
 
 
 
 
434
  lines = lyrics.split('\n')
435
  formatted_lyrics = []
436
- current_section = "Verse"
 
437
  for i, line in enumerate(lines):
438
- if i == 0:
 
 
 
 
439
  formatted_lyrics.append("[Verse]")
440
- elif i == verse_lines:
441
  formatted_lyrics.append("\n[Chorus]")
442
- elif i == verse_lines + chorus_lines and lines_count > 10:
443
  formatted_lyrics.append("\n[Bridge]")
 
444
  formatted_lyrics.append(line)
 
 
445
  lyrics = '\n'.join(formatted_lyrics)
446
 
447
  return lyrics
@@ -496,10 +934,10 @@ def process_audio(audio_file):
496
  # Continue with a simpler approach if this fails
497
  song_structure = None
498
 
499
- # Generate lyrics based on top genre and emotion analysis
500
  try:
501
  primary_genre, _ = top_genres[0]
502
- lyrics = generate_lyrics(primary_genre, audio_data["duration"], emotion_results)
503
  except Exception as e:
504
  print(f"Error generating lyrics: {str(e)}")
505
  lyrics = f"Error generating lyrics: {str(e)}"
@@ -555,7 +993,25 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
555
  emotion_text += "\n\nSong Structure:\n"
556
  for section in song_structure["syllables"]:
557
  emotion_text += f"- {section['type'].capitalize()}: {section['start']:.1f}s to {section['end']:.1f}s "
558
- emotion_text += f"({section['duration']:.1f}s, {section['beat_count']} beats, ~{section['syllable_count']} syllables)\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559
  except Exception as e:
560
  print(f"Error displaying song structure: {str(e)}")
561
  # Continue without showing structure details
@@ -590,8 +1046,9 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
590
  2. The system will classify the genre using the dima806/music_genres_classification model
591
  3. The system will analyze the musical emotion and theme using advanced audio processing
592
  4. The system will identify the song structure, beats, and timing patterns
593
- 5. Based on the detected genre, emotion, and structure, it will generate lyrics that match the beats, sections, and flow of the music
594
- 6. The lyrics will include appropriate section markings and syllable counts to align with the music
 
595
  """)
596
 
597
  # Launch the app
 
3
  import gradio as gr
4
  import torch
5
  import numpy as np
6
+ import re
7
+ import pronouncing # Add this to requirements.txt for syllable counting
8
  from transformers import (
9
  AutoModelForAudioClassification,
10
  AutoFeatureExtractor,
 
105
  # Initialize music emotion analyzer
106
  music_analyzer = MusicAnalyzer()
107
 
def _estimate_syllables(word):
    """Estimate the syllables of one lowercase word by counting vowel groups."""
    vowels = "aeiouy"
    count = 0
    prev_is_vowel = False

    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of a run starts a new syllable
        if is_vowel and not prev_is_vowel:
            count += 1
        prev_is_vowel = is_vowel

    # A trailing 'e' is treated as silent ...
    if word.endswith('e'):
        count -= 1
    # ... unless it is a consonant + 'le' ending ("table"), which is voiced
    if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
        count += 1

    # Every word is counted as at least one syllable
    return max(count, 1)


def count_syllables(text):
    """Count syllables in a given text using the pronouncing library.

    Each word is looked up with ``pronouncing.phones_for_word``; the first
    pronunciation returned is used. Words with no pronunciation fall back to
    a vowel-group estimate.

    Contractions are kept as a single word ("I'll", "don't"). Splitting them
    at the apostrophe leaves a vowel-less fragment ("ll", "t") that the
    fallback would count as an extra syllable.

    Args:
        text: Free-form text, e.g. one lyric line.

    Returns:
        Total syllable count as an int; 0 when the text contains no words.
    """
    # Fold the typographic apostrophe to ASCII so contractions are recognised
    normalized = text.lower().replace("\u2019", "'")

    # Letters with optional internal apostrophes; digits and punctuation are ignored
    words = re.findall(r"[a-z]+(?:'[a-z]+)*", normalized)

    syllable_count = 0
    for word in words:
        pronunciations = pronouncing.phones_for_word(word)
        if pronunciations:
            # Count syllables in the first pronunciation
            syllable_count += pronouncing.syllable_count(pronunciations[0])
        else:
            # Apostrophes carry no sound, so drop them before estimating
            syllable_count += _estimate_syllables(word.replace("'", ""))

    return syllable_count
142
+
143
  def extract_audio_features(audio_file):
144
  """Extract audio features from an audio file."""
145
  try:
 
265
  print(f"Error in music detection: {str(e)}")
266
  return False, []
267
 
def detect_beats(y, sr):
    """Detect beats and create a detailed rhythmic map of the audio.

    Args:
        y: Audio waveform, passed unchanged to librosa.
        sr: Sample rate of ``y``.

    Returns:
        dict with the keys:
            tempo: estimated tempo as a plain Python float.
            beat_frames: beat positions exactly as returned by librosa.
            beat_times: beat positions converted to seconds.
            beat_count: number of detected beats.
            beat_strengths: one onset strength per beat, scaled so the
                strongest beat is 1.0.
            intervals: seconds between consecutive beats.
            time_signature: 3 or 4 (heuristic; 4 unless the check below fires).
            phrases: lists of beat indices, one list per detected phrase.
    """
    # Get tempo and beat frames
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)

    # librosa may return the tempo wrapped in an array; callers format it with
    # "{tempo:.1f}", which only works on a scalar.
    tempo = float(np.asarray(tempo, dtype=float).ravel()[0])

    # Convert beat frames to time in seconds
    beat_times = librosa.frames_to_time(beat_frames, sr=sr)

    # Onset strength sampled at each beat frame that lies inside the envelope
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    beat_strengths = [
        float(onset_env[frame]) for frame in beat_frames if frame < len(onset_env)
    ]

    if beat_strengths:
        # Scale to 0..1: the syllable-template formula (0.8 + strength * 0.4)
        # and the 1.0 defaults used elsewhere presume that range.
        peak = max(beat_strengths)
        if peak > 0:
            beat_strengths = [strength / peak for strength in beat_strengths]

        # Beats without a measured strength get the average of the measured ones
        avg_strength = sum(beat_strengths) / len(beat_strengths)
        missing = len(beat_times) - len(beat_strengths)
        beat_strengths.extend([avg_strength] * missing)
    else:
        beat_strengths = [1.0] * len(beat_times)

    # Time intervals between consecutive beats (for rhythm pattern detection)
    intervals = [
        float(later - earlier)
        for earlier, later in zip(beat_times[:-1], beat_times[1:])
    ]

    # Try to detect time signature based on beat pattern
    time_signature = 4  # Default assumption of 4/4 time
    if len(beat_strengths) > 8:
        # NOTE(review): beats are compared in pairs (even index vs. the next
        # one), so this measures strong/weak alternation rather than a
        # strong-weak-weak grouping - confirm this is the intended 3/4 test.
        pair_ratios = [
            first / (second + 0.0001)
            for first, second in zip(beat_strengths[0::2], beat_strengths[1::2])
        ]
        if pair_ratios:
            accented_share = sum(1 for r in pair_ratios if r > 1.2) / len(pair_ratios)
            if accented_share > 0.6:
                time_signature = 3

    # Group beats into phrases
    phrases = []
    current_phrase = []
    # Hoisted: the average gap does not change while scanning the beats
    avg_gap = sum(intervals) / len(intervals) if intervals else 0.0
    last_index = len(beat_times) - 1

    for i in range(len(beat_times)):
        current_phrase.append(i)

        # The last beat has no successor to compare against
        if i >= last_index:
            continue

        # A clearly stronger next beat suggests the start of a new phrase
        is_stronger_next = (
            i < len(beat_strengths) - 1
            and beat_strengths[i + 1] > beat_strengths[i] * 1.2
        )
        # So does an unusually long pause before the next beat
        is_longer_gap = bool(intervals) and intervals[i] > avg_gap * 1.3

        # Never close a phrase shorter than two beats
        if (is_stronger_next or is_longer_gap) and len(current_phrase) >= 2:
            phrases.append(current_phrase)
            current_phrase = []

    # Add the last phrase if not empty
    if current_phrase:
        phrases.append(current_phrase)

    return {
        "tempo": tempo,
        "beat_frames": beat_frames,
        "beat_times": beat_times,
        "beat_count": len(beat_times),
        "beat_strengths": beat_strengths,
        "intervals": intervals,
        "time_signature": time_signature,
        "phrases": phrases
    }
346
 
347
  def detect_sections(y, sr):
 
401
 
402
  return sections
403
 
def create_flexible_syllable_templates(beats_info):
    """Create a syllable template from beat patterns, without assuming song structure.

    Args:
        beats_info: dict with a required "beat_times" sequence and optional
            "beat_strengths" (one value per beat, expected in 0..1),
            "phrases" (lists of indices into the beat lists) and "tempo"
            (BPM, default 120).

    Returns:
        A "-"-joined string of per-phrase syllable counts, e.g. "7-7-3".
        The result is never empty, so callers can always parse each part
        with int(): with no beats at all it is "4", and with too few beats
        to form a phrase the available beats are treated as one phrase.
    """
    beat_times = beats_info["beat_times"]
    total_beats = len(beat_times)

    # No beats means no rhythm to follow; hand back a neutral default rather
    # than an empty string that would make int("") fail in callers.
    if total_beats == 0:
        return "4"

    beat_strengths = beats_info.get("beat_strengths", [1.0] * total_beats)
    phrases = beats_info.get("phrases", [])

    # If no phrases were detected, default to 4-beat phrases and drop a
    # trailing remainder shorter than 2 beats
    if not phrases:
        phrases = [
            list(range(start, min(start + 4, total_beats)))
            for start in range(0, total_beats, 4)
            if min(start + 4, total_beats) - start >= 2
        ]

    # Still nothing (a single beat): use whatever beats exist as one phrase
    if not phrases:
        phrases = [list(range(total_beats))]

    # Base rate: 1-2 syllables per beat depending on tempo (same for all phrases)
    tempo = beats_info.get("tempo", 120)
    if tempo > 120:    # Fast tempo
        base_syllables_per_beat = 1.0
    elif tempo > 90:   # Medium tempo
        base_syllables_per_beat = 1.5
    else:              # Slow tempo
        base_syllables_per_beat = 2.0

    syllable_templates = []
    for phrase in phrases:
        beat_count = len(phrase)
        phrase_strengths = [beat_strengths[i] for i in phrase if i < len(beat_strengths)]
        avg_strength = sum(phrase_strengths) / len(phrase_strengths) if phrase_strengths else 1.0

        # Keep the strength factor inside 0.8..1.2; unscaled onset strengths
        # would otherwise inflate the syllable count many times over.
        avg_strength = min(max(avg_strength, 0.0), 1.0)
        syllables_per_beat = base_syllables_per_beat * (0.8 + (avg_strength * 0.4))

        # Total syllables for the phrase, never fewer than 2
        phrase_syllables = max(int(beat_count * syllables_per_beat), 2)
        syllable_templates.append(str(phrase_syllables))

    return "-".join(syllable_templates)
450
+
def analyze_flexible_structure(audio_data):
    """Analyze music structure without assuming traditional song sections.

    The audio is split into segments with agglomerative clustering over MFCC
    features, and each segment gets a syllable template built from the beats
    that fall inside it.

    Args:
        audio_data: dict providing "waveform" and "sample_rate".

    Returns:
        dict with "beats" (the full detect_beats result) and "segments", a
        list of dicts with "start", "end", "duration" (seconds) and
        "syllable_template" (a "-"-joined string of syllable counts).
    """
    y = audio_data["waveform"]
    sr = audio_data["sample_rate"]

    # Enhanced beat detection
    beats_info = detect_beats(y, sr)
    beat_times = beats_info["beat_times"]

    # Identify segments with similar audio features (using MFCC)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

    # Use agglomerative clustering to find segment boundaries
    segment_boundaries = librosa.segment.agglomerative(mfcc, 3)
    segment_times = [float(t) for t in librosa.frames_to_time(segment_boundaries, sr=sr)]

    # librosa documents these boundaries as the *left* edge of each segment,
    # so the end of the audio has to be added or the last segment is lost.
    # NOTE(review): len(y) / sr assumes a mono (1-D) waveform - confirm.
    total_duration = float(len(y)) / sr
    if not segment_times or segment_times[-1] < total_duration:
        segment_times.append(total_duration)

    segments = []
    for start, end in zip(segment_times[:-1], segment_times[1:]):
        # Global indices of the beats that fall inside this segment
        segment_beats = [j for j, time in enumerate(beat_times) if start <= time < end]

        if segment_beats:
            # Map global beat index -> position inside this segment. The
            # per-segment lists below are indexed locally, so phrases must
            # be translated too or they would point at the wrong beats.
            local_index = {beat_idx: pos for pos, beat_idx in enumerate(segment_beats)}

            segment_beats_info = {
                "beat_times": [beat_times[j] for j in segment_beats],
                "tempo": beats_info.get("tempo", 120)
            }

            if "beat_strengths" in beats_info:
                strengths = beats_info["beat_strengths"]
                segment_beats_info["beat_strengths"] = [
                    strengths[j] for j in segment_beats if j < len(strengths)
                ]

            if "intervals" in beats_info:
                segment_beats_info["intervals"] = beats_info["intervals"]

            if "phrases" in beats_info:
                # Keep only the beats of each phrase that lie in this segment
                segment_phrases = []
                for phrase in beats_info["phrases"]:
                    segment_phrase = [
                        local_index[beat_idx] for beat_idx in phrase if beat_idx in local_index
                    ]
                    if len(segment_phrase) >= 2:
                        segment_phrases.append(segment_phrase)

                segment_beats_info["phrases"] = segment_phrases

            # Guard against an empty template so callers can always parse it
            syllable_template = create_flexible_syllable_templates(segment_beats_info) or "4"
        else:
            syllable_template = "4"  # Default fallback

        segments.append({
            "start": start,
            "end": end,
            "duration": end - start,
            "syllable_template": syllable_template
        })

    return {
        "beats": beats_info,
        "segments": segments
    }
520
+
521
+ # Enhanced estimate_syllables_per_section function
522
  def estimate_syllables_per_section(beats_info, sections):
523
  """Estimate the number of syllables needed for each section based on beats."""
524
  syllables_per_section = []
 
533
  # Calculate syllables based on section type and beat count
534
  beat_count = len(section_beats)
535
 
536
+ # Extract beat strengths for this section if available
537
+ section_beat_strengths = []
538
+ if "beat_strengths" in beats_info:
539
+ for i, beat_time in enumerate(beats_info["beat_times"]):
540
+ if section["start"] <= beat_time < section["end"] and i < len(beats_info["beat_strengths"]):
541
+ section_beat_strengths.append(beats_info["beat_strengths"][i])
542
+
543
+ # Create a segment-specific beat info structure for template creation
544
+ segment_beats_info = {
545
+ "beat_times": section_beats,
546
+ "tempo": beats_info.get("tempo", 120)
547
+ }
548
+
549
+ if section_beat_strengths:
550
+ segment_beats_info["beat_strengths"] = section_beat_strengths
551
+
552
+ if "intervals" in beats_info:
553
+ segment_beats_info["intervals"] = beats_info["intervals"]
554
+
555
+ # Create a detailed syllable template for this section
556
+ syllable_template = create_flexible_syllable_templates(segment_beats_info)
557
+
558
+ # Calculate estimated syllable count
559
+ expected_counts = [int(count) for count in syllable_template.split("-")]
560
+ total_syllables = sum(expected_counts)
561
 
562
  syllables_per_section.append({
563
  "type": section["type"],
 
565
  "end": section["end"],
566
  "duration": section["duration"],
567
  "beat_count": beat_count,
568
+ "syllable_count": total_syllables,
569
+ "syllable_template": syllable_template
570
  })
571
 
572
  return syllables_per_section
 
576
  y = audio_data["waveform"]
577
  sr = audio_data["sample_rate"]
578
 
579
+ # Enhanced beat detection
580
  beats_info = detect_beats(y, sr)
581
 
582
  # Detect sections
583
  sections = detect_sections(y, sr)
584
 
585
+ # Create enhanced syllable info per section
586
  syllables_info = estimate_syllables_per_section(beats_info, sections)
587
 
588
+ # Get flexible structure analysis as an alternative approach
589
+ try:
590
+ flexible_structure = analyze_flexible_structure(audio_data)
591
+ except Exception as e:
592
+ print(f"Warning: Flexible structure analysis failed: {str(e)}")
593
+ flexible_structure = None
594
+
595
  return {
596
  "beats": beats_info,
597
  "sections": sections,
598
+ "syllables": syllables_info,
599
+ "flexible_structure": flexible_structure
600
  }
601
 
def _expected_syllables(template):
    """Return the total syllables a template asks for, or None if unusable.

    Accepts either a "-"-joined string of counts or a dict carrying such a
    string under "syllable_template".
    """
    if isinstance(template, dict) and "syllable_template" in template:
        template = template["syllable_template"]
    if not isinstance(template, str):
        return None

    try:
        counts = [int(part) for part in template.split("-") if part.strip()]
    except ValueError:
        # A malformed template must not abort lyric generation
        return None

    return sum(counts) if counts else None


def verify_flexible_syllable_counts(lyrics, templates):
    """Verify that the generated lyrics match the required syllable counts.

    Lyric line i is compared with templates[i]. Blank lines and bracketed
    section labels such as "[Verse]" are skipped so they do not consume a
    template; reported line numbers therefore count lyric lines only.
    Templates that cannot be parsed are ignored for that line.

    Args:
        lyrics: Generated lyrics as one newline-separated string.
        templates: Sequence of template strings ("4-6-3") or dicts with a
            "syllable_template" key.

    Returns:
        The lyrics unchanged when every checked line is within 2 syllables of
        its template, otherwise the lyrics followed by a note listing the
        mismatching lines.
    """
    # Split into non-empty lines and drop section labels
    lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
    lines = [line for line in lines if not (line.startswith("[") and line.endswith("]"))]

    verification_notes = []

    # zip stops at the shorter of the two, i.e. lines beyond the templates are not checked
    for line_number, (line, template) in enumerate(zip(lines, templates), start=1):
        total_expected = _expected_syllables(template)
        if total_expected is None:
            continue

        actual_count = count_syllables(line)

        # Allow small differences
        if abs(actual_count - total_expected) > 2:
            verification_notes.append(
                f"Line {line_number}: Expected {total_expected} syllables, got {actual_count}"
            )

    # If we found issues, add them as notes at the end of the lyrics
    if verification_notes:
        lyrics += "\n\n[Note: Potential rhythm mismatches in these lines:]\n"
        lyrics += "\n".join(verification_notes)
        lyrics += "\n[You may want to adjust these lines to match the music's rhythm better]"

    return lyrics
640
+
641
+ # Modified generate_lyrics function
642
+ def generate_lyrics(genre, duration, emotion_results, song_structure=None):
643
+ """Generate lyrics based on the genre, emotion, and structure analysis."""
644
  # Extract emotion and theme data from analysis results
645
  primary_emotion = emotion_results["emotion_analysis"]["primary_emotion"]
646
  primary_theme = emotion_results["theme_analysis"]["primary_theme"]
 
654
  key = emotion_results["tonal_analysis"]["key"]
655
  mode = emotion_results["tonal_analysis"]["mode"]
656
 
657
+ # Format syllable templates for the prompt
658
+ syllable_guidance = ""
659
+ templates_for_verification = []
660
+
661
+ if song_structure:
662
+ # Try to use flexible structure if available
663
+ if "flexible_structure" in song_structure and song_structure["flexible_structure"]:
664
+ flexible = song_structure["flexible_structure"]
665
+ if "segments" in flexible and flexible["segments"]:
666
+ syllable_guidance = "Follow these exact syllable patterns for each line:\n"
667
+
668
+ for i, segment in enumerate(flexible["segments"]):
669
+ if i < 15: # Limit to 15 lines to keep prompt manageable
670
+ syllable_guidance += f"Line {i+1}: {segment['syllable_template']} syllables\n"
671
+ templates_for_verification.append(segment["syllable_template"])
672
+
673
+ # Fallback to traditional sections if needed
674
+ elif "syllables" in song_structure and song_structure["syllables"]:
675
+ syllable_guidance = "Follow these syllable patterns for each section:\n"
676
+
677
+ for section in song_structure["syllables"]:
678
+ if "syllable_template" in section:
679
+ syllable_guidance += f"[{section['type'].capitalize()}]: {section['syllable_template']} syllables per line\n"
680
+ elif "syllable_count" in section:
681
+ syllable_guidance += f"[{section['type'].capitalize()}]: ~{section['syllable_count']} syllables total\n"
682
+
683
+ if "syllable_template" in section:
684
+ templates_for_verification.append(section)
685
+
686
+ # If we couldn't get specific templates, use general guidance
687
+ if not syllable_guidance:
688
+ syllable_guidance = "Make sure each line has an appropriate syllable count for singing:\n"
689
+ syllable_guidance += "- For faster sections (tempo > 120 BPM): 4-6 syllables per line\n"
690
+ syllable_guidance += "- For medium tempo sections: 6-8 syllables per line\n"
691
+ syllable_guidance += "- For slower sections (tempo < 90 BPM): 8-10 syllables per line\n"
692
+
693
+ # Add examples of syllable counting
694
+ syllable_guidance += "\nExamples of syllable counting:\n"
695
+ syllable_guidance += "- 'I can see the light' = 4 syllables\n"
696
+ syllable_guidance += "- 'When it fades a-way' = 4 syllables\n"
697
+ syllable_guidance += "- 'The sun is shin-ing bright to-day' = 8 syllables\n"
698
+ syllable_guidance += "- 'I'll be wait-ing for you' = 6 syllables\n"
699
+
700
+ # Determine if we should use traditional sections or not
701
+ use_sections = True
702
+ if song_structure and "flexible_structure" in song_structure and song_structure["flexible_structure"]:
703
+ # If we have more than 4 segments, it's likely not a traditional song structure
704
+ if "segments" in song_structure["flexible_structure"]:
705
+ segments = song_structure["flexible_structure"]["segments"]
706
+ if len(segments) > 4:
707
+ use_sections = False
708
+
709
+ # Create enhanced prompt for the LLM
710
+ if use_sections:
711
+ # Traditional approach with sections
712
+ # Calculate appropriate lyrics length and section distribution
713
+ try:
714
+ if song_structure and "beats" in song_structure:
715
+ beats_info = song_structure["beats"]
716
+ tempo = beats_info.get("tempo", 120)
717
+ time_signature = beats_info.get("time_signature", 4)
718
+ lines_structure = calculate_lyrics_length(duration, tempo, time_signature)
719
+
720
+ # Handle both possible return types
721
+ if isinstance(lines_structure, dict):
722
+ total_lines = lines_structure["lines_count"]
723
+
724
+ # Extract section line counts if available
725
+ verse_lines = 0
726
+ chorus_lines = 0
727
+ bridge_lines = 0
728
+
729
+ for section in lines_structure["sections"]:
730
+ if section["type"] == "verse":
731
+ verse_lines = section["lines"]
732
+ elif section["type"] == "chorus":
733
+ chorus_lines = section["lines"]
734
+ elif section["type"] == "bridge":
735
+ bridge_lines = section["lines"]
736
+ else:
737
+ # The function returned just an integer (old behavior)
738
+ total_lines = lines_structure
739
+
740
+ # Default section distribution based on total lines
741
+ if total_lines <= 6:
742
+ verse_lines = 2
743
+ chorus_lines = 2
744
+ bridge_lines = 0
745
+ elif total_lines <= 10:
746
+ verse_lines = 3
747
+ chorus_lines = 2
748
+ bridge_lines = 0
749
+ else:
750
+ verse_lines = 3
751
+ chorus_lines = 2
752
+ bridge_lines = 2
753
+ else:
754
+ # Fallback to simple calculation
755
+ total_lines = max(4, int(duration / 10))
756
+
757
+ # Default section distribution
758
+ if total_lines <= 6:
759
+ verse_lines = 2
760
+ chorus_lines = 2
761
+ bridge_lines = 0
762
+ elif total_lines <= 10:
763
+ verse_lines = 3
764
+ chorus_lines = 2
765
+ bridge_lines = 0
766
+ else:
767
+ verse_lines = 3
768
+ chorus_lines = 2
769
+ bridge_lines = 2
770
+ except Exception as e:
771
+ print(f"Error calculating lyrics length: {str(e)}")
772
+ total_lines = max(4, int(duration / 10))
773
+
774
+ # Default section distribution
775
+ verse_lines = 3
776
+ chorus_lines = 2
777
+ bridge_lines = 0
778
+
779
+ prompt = f"""
780
  You are a talented songwriter who specializes in {genre} music.
781
  Write original {genre} song lyrics for a song that is {duration:.1f} seconds long.
782
 
 
786
  - Primary emotion: {primary_emotion}
787
  - Primary theme: {primary_theme}
788
 
789
+ IMPORTANT: The lyrics must match the rhythm of the music exactly!
790
+ {syllable_guidance}
791
+
792
+ When writing the lyrics:
793
+ 1. Count syllables carefully for each line to match the specified pattern
794
+ 2. Ensure words fall naturally on the beat
795
+ 3. Place stressed syllables on strong beats
796
+ 4. Create a coherent theme throughout the lyrics
797
+
798
  The lyrics should:
799
  - Perfectly capture the essence and style of {genre} music
800
  - Express the {primary_emotion} emotion and {primary_theme} theme
801
+ - Be approximately {total_lines} lines long
 
802
  - Follow this structure:
803
  * Verse: {verse_lines} lines
804
  * Chorus: {chorus_lines} lines
805
+ * {f'Bridge: {bridge_lines} lines' if bridge_lines > 0 else ''}
806
  - Be completely original
807
  - Match the song duration of {duration:.1f} seconds
808
+
809
+ Your lyrics:
810
+ """
811
+ else:
812
+ # Flexible approach without traditional sections
813
+ prompt = f"""
814
+ You are a talented songwriter who specializes in {genre} music.
815
+ Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long.
816
+
817
+ Music analysis has detected the following qualities:
818
+ - Tempo: {tempo:.1f} BPM
819
+ - Key: {key} {mode}
820
+ - Primary emotion: {primary_emotion}
821
+ - Primary theme: {primary_theme}
822
+
823
+ IMPORTANT: The lyrics must match the rhythm of the music exactly!
824
+ {syllable_guidance}
825
+
826
+ When writing the lyrics:
827
+ 1. Count syllables carefully for each line to match the specified pattern
828
+ 2. Ensure words fall naturally on the beat
829
+ 3. Place stressed syllables on strong beats
830
+ 4. Create coherent lyrics that would work for this music segment
831
+
832
+ The lyrics should:
833
+ - Perfectly capture the essence and style of {genre} music
834
+ - Express the {primary_emotion} emotion and {primary_theme} theme
835
+ - Be completely original
836
+ - Maintain a consistent theme throughout
837
+ - Match the audio segment duration of {duration:.1f} seconds
838
+
839
+ DON'T include any section labels like [Verse] or [Chorus] unless specifically instructed.
840
+ Instead, write lyrics that flow naturally and match the music's rhythm.
841
 
842
  Your lyrics:
843
  """
 
855
  # Extract and clean generated lyrics
856
  lyrics = response[0]["generated_text"].strip()
857
 
858
+ # Verify syllable counts if we have templates
859
+ if templates_for_verification:
860
+ lyrics = verify_flexible_syllable_counts(lyrics, templates_for_verification)
861
+
862
+ # Add section labels if they're not present and we're using the traditional approach
863
+ if use_sections and "Verse" not in lyrics and "Chorus" not in lyrics:
864
  lines = lyrics.split('\n')
865
  formatted_lyrics = []
866
+
867
+ line_count = 0
868
  for i, line in enumerate(lines):
869
+ if not line.strip():
870
+ formatted_lyrics.append(line)
871
+ continue
872
+
873
+ if line_count == 0:
874
  formatted_lyrics.append("[Verse]")
875
+ elif line_count == verse_lines:
876
  formatted_lyrics.append("\n[Chorus]")
877
+ elif line_count == verse_lines + chorus_lines and bridge_lines > 0:
878
  formatted_lyrics.append("\n[Bridge]")
879
+
880
  formatted_lyrics.append(line)
881
+ line_count += 1
882
+
883
  lyrics = '\n'.join(formatted_lyrics)
884
 
885
  return lyrics
 
934
  # Continue with a simpler approach if this fails
935
  song_structure = None
936
 
937
+ # Generate lyrics based on top genre, emotion analysis, and song structure
938
  try:
939
  primary_genre, _ = top_genres[0]
940
+ lyrics = generate_lyrics(primary_genre, audio_data["duration"], emotion_results, song_structure)
941
  except Exception as e:
942
  print(f"Error generating lyrics: {str(e)}")
943
  lyrics = f"Error generating lyrics: {str(e)}"
 
993
  emotion_text += "\n\nSong Structure:\n"
994
  for section in song_structure["syllables"]:
995
  emotion_text += f"- {section['type'].capitalize()}: {section['start']:.1f}s to {section['end']:.1f}s "
996
+ emotion_text += f"({section['duration']:.1f}s, {section['beat_count']} beats, "
997
+
998
+ if "syllable_template" in section:
999
+ emotion_text += f"template: {section['syllable_template']})\n"
1000
+ else:
1001
+ emotion_text += f"~{section['syllable_count']} syllables)\n"
1002
+
1003
+ # Add flexible structure info if available
1004
+ if "flexible_structure" in song_structure and song_structure["flexible_structure"]:
1005
+ flexible = song_structure["flexible_structure"]
1006
+ if "segments" in flexible and flexible["segments"]:
1007
+ emotion_text += "\nDetailed Rhythm Analysis:\n"
1008
+ for i, segment in enumerate(flexible["segments"][:5]): # Show first 5 segments
1009
+ emotion_text += f"- Segment {i+1}: {segment['start']:.1f}s to {segment['end']:.1f}s, "
1010
+ emotion_text += f"pattern: {segment['syllable_template']}\n"
1011
+
1012
+ if len(flexible["segments"]) > 5:
1013
+ emotion_text += f" (+ {len(flexible['segments']) - 5} more segments)\n"
1014
+
1015
  except Exception as e:
1016
  print(f"Error displaying song structure: {str(e)}")
1017
  # Continue without showing structure details
 
1046
  2. The system will classify the genre using the dima806/music_genres_classification model
1047
  3. The system will analyze the musical emotion and theme using advanced audio processing
1048
  4. The system will identify the song structure, beats, and timing patterns
1049
+ 5. The system will create syllable templates that precisely match the rhythm of the music
1050
+ 6. Based on the detected genre, emotion, and syllable templates, it will generate lyrics that align perfectly with the beats
1051
+ 7. The system verifies syllable counts to ensure the generated lyrics can be sung naturally with the music
1052
  """)
1053
 
1054
  # Launch the app
requirements.txt CHANGED
@@ -11,4 +11,5 @@ sentencepiece>=0.1.99
11
  safetensors>=0.4.1
12
  scipy>=1.12.0
13
  soundfile>=0.12.1
14
- matplotlib>=3.7.0
 
 
11
  safetensors>=0.4.1
12
  scipy>=1.12.0
13
  soundfile>=0.12.1
14
+ matplotlib>=3.7.0
15
+ pronouncing>=0.2.0
utils.py CHANGED
@@ -37,39 +37,53 @@ def extract_mfcc_features(y, sr, n_mfcc=20):
37
  # Return a fallback feature vector if extraction fails
38
  return np.zeros(n_mfcc)
39
 
40
- def calculate_lyrics_length(duration):
41
- """
42
- Calculate appropriate lyrics length based on audio duration.
43
- Uses a more conservative calculation that generates shorter lyrics:
44
- - Average words per line (8-10 words)
45
- - Reduced words per minute (90 words instead of 135)
46
- - Simplified song structure
47
- """
48
- # Convert duration to minutes
49
- duration_minutes = duration / 60
50
 
51
- # Calculate total words based on duration
52
- # Using 90 words per minute (reduced from 135)
53
- total_words = int(duration_minutes * 90)
54
 
55
- # Calculate number of lines
56
- # Assuming 8-10 words per line
57
- words_per_line = 9 # average
58
- total_lines = total_words // words_per_line
 
59
 
60
- # Adjust for song structure with shorter lengths
61
- if total_lines < 6:
62
- # Very short song - keep it simple
63
- return max(2, total_lines)
64
- elif total_lines < 10:
65
- # Short song - one verse and chorus
66
- return min(6, total_lines)
67
- elif total_lines < 15:
68
- # Medium song - two verses and chorus
69
- return min(10, total_lines)
 
70
  else:
71
- # Longer song - two verses, chorus, and bridge
72
- return min(15, total_lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  def format_genre_results(top_genres):
75
  """Format genre classification results for display."""
 
37
  # Return a fallback feature vector if extraction fails
38
  return np.zeros(n_mfcc)
39
 
40
def calculate_lyrics_length(duration, tempo=100, time_signature=4):
    """Calculate an appropriate lyrics structure for a piece of audio.

    Args:
        duration: Length of the audio in seconds.
        tempo: Tempo in beats per minute.
        time_signature: Number of beats per measure.

    Returns:
        Normally a dict with the keys ``total_measures`` (int),
        ``lines_count`` (int) and ``sections`` (a list of dicts, each with
        ``type``, ``lines`` and ``measures``).

        When ``tempo`` or ``time_signature`` is not an ``int``/``float``
        (for example ``None``), or ``time_signature`` is not positive, only
        the legacy integer line count is returned. Note that a call passing
        just ``duration`` uses the numeric defaults and therefore returns
        the dict, not the integer.
    """
    # Legacy estimate: one line per ten seconds of audio, never fewer than four.
    lines_count = max(4, int(duration / 10))

    # Fall back to the plain line count when the musical parameters are
    # unusable. A non-positive time signature would otherwise divide by zero
    # or produce negative measure counts.
    numeric_types = (int, float)
    if (
        not isinstance(tempo, numeric_types)
        or not isinstance(time_signature, numeric_types)
        or time_signature <= 0
    ):
        return lines_count

    # Convert the duration into beats, then into measures.
    beats_per_second = tempo / 60
    total_beats = duration * beats_per_second
    total_measures = total_beats / time_signature

    # Longer songs get longer verses and, eventually, a bridge.
    if lines_count <= 6:
        verse_lines, chorus_lines, bridge_lines = 2, 2, 0
    elif lines_count <= 10:
        verse_lines, chorus_lines, bridge_lines = 3, 2, 0
    else:
        verse_lines, chorus_lines, bridge_lines = 3, 2, 2

    song_structure = {
        "total_measures": int(total_measures),
        "lines_count": lines_count,  # Include the original line count
        "sections": [
            {
                "type": "verse",
                "lines": verse_lines,
                "measures": int(total_measures * 0.4),
            },
            {
                "type": "chorus",
                "lines": chorus_lines,
                "measures": int(total_measures * 0.3),
            },
        ],
    }

    # The bridge is only present for songs long enough to warrant one.
    if bridge_lines > 0:
        song_structure["sections"].append(
            {
                "type": "bridge",
                "lines": bridge_lines,
                "measures": int(total_measures * 0.2),
            }
        )

    return song_structure
87
 
88
  def format_genre_results(top_genres):
89
  """Format genre classification results for display."""