root committed
Commit 38b696f · 1 parent: ab30d87
music confidence

app.py CHANGED
@@ -22,6 +22,7 @@ from utils import (
     preprocess_audio_for_model
 )
 from emotionanalysis import MusicAnalyzer
+import librosa
 
 # Login to Hugging Face Hub if token is provided
 if "HF_TOKEN" in os.environ:

@@ -173,34 +174,219 @@ def classify_genre(audio_data):
     # Fallback: return a default genre if everything fails
     return [("rock", 1.0)]
 
[old lines 176-203 removed; their content was not captured in this view]
+def detect_music(audio_data):
+    """Detect if the audio is music using the MIT AST model."""
+    try:
+        # First attempt: Try using the pipeline if available
+        if 'music_detector' in globals():
+            results = music_detector(audio_data["path"])
+            # Look for music-related classes in the results
+            music_confidence = 0.0
+            for result in results:
+                label = result["label"].lower()
+                if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]):
+                    music_confidence = max(music_confidence, result["score"])
+            return music_confidence >= 0.2, results
+
+        # Second attempt: Use manually loaded model components
+        elif 'music_processor' in globals() and 'music_model' in globals():
+            # Process audio input with feature extractor
+            inputs = music_processor(
+                audio_data["waveform"],
+                sampling_rate=audio_data["sample_rate"],
+                return_tensors="pt"
+            )
+
+            with torch.no_grad():
+                outputs = music_model(**inputs)
+                predictions = outputs.logits.softmax(dim=-1)
+
+            # Get the top predictions
+            values, indices = torch.topk(predictions, 5)
+
+            # Map indices to labels
+            labels = music_model.config.id2label
+
+            # Check for music-related classes
+            music_confidence = 0.0
+            results = []
+
+            for i, (value, index) in enumerate(zip(values[0], indices[0])):
+                label = labels[index.item()].lower()
+                score = value.item()
+                results.append({"label": label, "score": score})
+
+                if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]):
+                    music_confidence = max(music_confidence, score)
+
+            return music_confidence >= 0.2, results
+
+        else:
+            raise ValueError("No music detection model available")
+
+    except Exception as e:
+        print(f"Error in music detection: {str(e)}")
+        return False, []
+
+def detect_beats(y, sr):
+    """Detect beats in the audio using librosa."""
+    # Get tempo and beat frames
+    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
+
+    # Convert beat frames to time in seconds
+    beat_times = librosa.frames_to_time(beat_frames, sr=sr)
+
+    return {
+        "tempo": tempo,
+        "beat_frames": beat_frames,
+        "beat_times": beat_times,
+        "beat_count": len(beat_times)
+    }
+
+def detect_sections(y, sr):
+    """Detect sections (verse, chorus, etc.) in the audio."""
+    # Compute the spectral contrast
+    S = np.abs(librosa.stft(y))
+    contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
+
+    # Compute the chroma features
+    chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
+
+    # Use a combination of contrast and chroma to find segment boundaries
+    # Average over frequency axis to get time series
+    contrast_avg = np.mean(contrast, axis=0)
+    chroma_avg = np.mean(chroma, axis=0)
+
+    # Normalize
+    contrast_avg = (contrast_avg - np.mean(contrast_avg)) / np.std(contrast_avg)
+    chroma_avg = (chroma_avg - np.mean(chroma_avg)) / np.std(chroma_avg)
+
+    # Combine features
+    combined = contrast_avg + chroma_avg
+
+    # Detect structural boundaries
+    bounds = librosa.segment.agglomerative(combined, 3)  # Adjust for typical song structures
+
+    # Convert to time in seconds
+    bound_times = librosa.frames_to_time(bounds, sr=sr)
+
+    # Estimate section types based on position and length
+    sections = []
+    for i in range(len(bound_times) - 1):
+        start = bound_times[i]
+        end = bound_times[i+1]
+        duration = end - start
+
+        # Simple heuristic to label sections
+        if i == 0:
+            section_type = "intro"
+        elif i == len(bound_times) - 2:
+            section_type = "outro"
+        elif i % 2 == 1:  # Alternating verse/chorus pattern
+            section_type = "chorus"
+        else:
+            section_type = "verse"
+
+        # If we have a short section in the middle, it might be a bridge
+        if 0 < i < len(bound_times) - 2 and duration < 20:
+            section_type = "bridge"
+
+        sections.append({
+            "type": section_type,
+            "start": start,
+            "end": end,
+            "duration": duration
+        })
+
+    return sections
+
+def estimate_syllables_per_section(beats_info, sections):
+    """Estimate the number of syllables needed for each section based on beats."""
+    syllables_per_section = []
+
+    for section in sections:
+        # Find beats that fall within this section
+        section_beats = [
+            beat for beat in beats_info["beat_times"]
+            if section["start"] <= beat < section["end"]
+        ]
+
+        # Calculate syllables based on section type and beat count
+        beat_count = len(section_beats)
+
+        # Adjust syllable count based on section type and genre conventions
+        if section["type"] == "verse":
+            # Verses typically have more syllables per beat (more words)
+            syllable_count = beat_count * 1.2
+        elif section["type"] == "chorus":
+            # Choruses often have fewer syllables per beat (more sustained notes)
+            syllable_count = beat_count * 0.9
+        elif section["type"] == "bridge":
+            syllable_count = beat_count * 1.0
+        else:  # intro, outro
+            syllable_count = beat_count * 0.5  # Often instrumental or sparse lyrics
+
+        syllables_per_section.append({
+            "type": section["type"],
+            "start": section["start"],
+            "end": section["end"],
+            "duration": section["duration"],
+            "beat_count": beat_count,
+            "syllable_count": int(syllable_count)
+        })
+
+    return syllables_per_section
+
+def calculate_detailed_song_structure(audio_data):
+    """Calculate detailed song structure for better lyrics generation."""
+    y = audio_data["waveform"]
+    sr = audio_data["sample_rate"]
+
+    # Detect beats
+    beats_info = detect_beats(y, sr)
+
+    # Detect sections
+    sections = detect_sections(y, sr)
+
+    # Estimate syllables per section
+    syllables_info = estimate_syllables_per_section(beats_info, sections)
+
+    return {
+        "beats": beats_info,
+        "sections": sections,
+        "syllables": syllables_info
+    }
+
+def generate_lyrics(genre, duration, emotion_results, song_structure=None):
+    """Generate lyrics based on genre, duration, emotion, and detailed song structure."""
+    # If no song structure is provided, fall back to the original approach
+    if song_structure is None:
+        # Calculate appropriate lyrics length based on audio duration
+        lines_count = calculate_lyrics_length(duration)
+
+        # Calculate approximate number of verses and chorus
+        if lines_count <= 6:
+            # Very short song - one verse and chorus
+            verse_lines = 2
+            chorus_lines = 2
+        elif lines_count <= 10:
+            # Medium song - two verses and chorus
+            verse_lines = 3
+            chorus_lines = 2
+        else:
+            # Longer song - two verses, chorus, and bridge
+            verse_lines = 3
+            chorus_lines = 2
+
+        # Extract emotion and theme data from analysis results
+        primary_emotion = emotion_results["emotion_analysis"]["primary_emotion"]
+        primary_theme = emotion_results["theme_analysis"]["primary_theme"]
+        tempo = emotion_results["rhythm_analysis"]["tempo"]
+        key = emotion_results["tonal_analysis"]["key"]
+        mode = emotion_results["tonal_analysis"]["mode"]
+
+        # Create prompt for the LLM
+        prompt = f"""
 You are a talented songwriter who specializes in {genre} music.
 Write original {genre} song lyrics for a song that is {duration:.1f} seconds long.
 

@@ -223,6 +409,60 @@ The lyrics should:
 - Match the song duration of {duration:.1f} seconds
 - Keep each line concise and impactful
 
+Your lyrics:
+"""
+
+    else:
+        # Extract emotion and theme data from analysis results
+        primary_emotion = emotion_results["emotion_analysis"]["primary_emotion"]
+        primary_theme = emotion_results["theme_analysis"]["primary_theme"]
+        tempo = emotion_results["rhythm_analysis"]["tempo"]
+        key = emotion_results["tonal_analysis"]["key"]
+        mode = emotion_results["tonal_analysis"]["mode"]
+
+        # Create detailed structure instructions for the LLM
+        structure_instructions = "Follow this exact song structure with specified syllable counts:\n"
+
+        for section in song_structure["syllables"]:
+            section_type = section["type"].capitalize()
+            start_time = f"{section['start']:.1f}"
+            end_time = f"{section['end']:.1f}"
+            duration = f"{section['duration']:.1f}"
+            beat_count = section["beat_count"]
+            syllable_count = section["syllable_count"]
+
+            structure_instructions += f"* {section_type} ({start_time}s - {end_time}s, {duration}s duration):\n"
+            structure_instructions += f"  - {beat_count} beats\n"
+            structure_instructions += f"  - Approximately {syllable_count} syllables\n"
+
+        # Calculate approximate total number of lines based on syllables
+        total_syllables = sum(section["syllable_count"] for section in song_structure["syllables"])
+        estimated_lines = max(4, int(total_syllables / 8))  # Rough estimate: average 8 syllables per line
+
+        # Create prompt for the LLM
+        prompt = f"""
+You are a talented songwriter who specializes in {genre} music.
+Write original {genre} song lyrics for a song that is {duration:.1f} seconds long.
+
+Music analysis has detected the following qualities in the music:
+- Tempo: {tempo:.1f} BPM
+- Key: {key} {mode}
+- Primary emotion: {primary_emotion}
+- Primary theme: {primary_theme}
+
+{structure_instructions}
+
+The lyrics should:
+- Perfectly capture the essence and style of {genre} music
+- Express the {primary_emotion} emotion and {primary_theme} theme
+- Have approximately {estimated_lines} lines total, distributed across sections
+- For each line, include a syllable count that matches the beats in that section
+- Include timestamps [MM:SS] at the beginning of each section
+- Be completely original
+- Match the exact song structure provided above
+
+Important: Each section should have lyrics with syllable counts matching the beats!
+
 Your lyrics:
 """
 

@@ -239,76 +479,51 @@ Your lyrics:
     # Extract and clean generated lyrics
     lyrics = response[0]["generated_text"].strip()
 
-    # Add section labels if they're not present
-    if "Verse" not in lyrics and "Chorus" not in lyrics:
+    # Add section labels if they're not present (in fallback mode)
+    if song_structure is None and "Verse" not in lyrics and "Chorus" not in lyrics:
         lines = lyrics.split('\n')
         formatted_lyrics = []
         current_section = "Verse"
+        verse_count = 0
+
         for i, line in enumerate(lines):
             if i == 0:
                 formatted_lyrics.append("[Verse]")
+                verse_count = 1
             elif i == verse_lines:
                 formatted_lyrics.append("\n[Chorus]")
             elif i == verse_lines + chorus_lines and lines_count > 10:
                 formatted_lyrics.append("\n[Bridge]")
+            elif i == verse_lines + chorus_lines + (2 if lines_count > 10 else 0):
+                formatted_lyrics.append("\n[Verse]")
+                verse_count = 2
             formatted_lyrics.append(line)
+
         lyrics = '\n'.join(formatted_lyrics)
 
[old lines 257-263 removed; content not captured in this view]
-            results = music_detector(audio_data["path"])
-            # Look for music-related classes in the results
-            music_confidence = 0.0
-            for result in results:
-                label = result["label"].lower()
-                if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]):
-                    music_confidence = max(music_confidence, result["score"])
-            return music_confidence >= 0.2, results
-
-        # Second attempt: Use manually loaded model components
-        elif 'music_processor' in globals() and 'music_model' in globals():
-            # Process audio input with feature extractor
-            inputs = music_processor(
-                audio_data["waveform"],
-                sampling_rate=audio_data["sample_rate"],
-                return_tensors="pt"
-            )
-
-            with torch.no_grad():
-                outputs = music_model(**inputs)
-                predictions = outputs.logits.softmax(dim=-1)
-
-            # Get the top predictions
-            values, indices = torch.topk(predictions, 5)
-
-            # Map indices to labels
-            labels = music_model.config.id2label
-
-            # Check for music-related classes
-            music_confidence = 0.0
-            results = []
[old lines 295-308 removed; content not captured in this view]
-    except Exception as e:
-        print(f"Error in music detection: {str(e)}")
-        return False, []
+    # Add timestamps in detailed mode if needed
+    elif song_structure is not None:
+        # Ensure the lyrics have proper section headings with timestamps
+        for section in song_structure["syllables"]:
+            section_type = section["type"].capitalize()
+            start_time_str = f"{int(section['start']) // 60:02d}:{int(section['start']) % 60:02d}"
+            section_header = f"[{start_time_str}] {section_type}"
+
+            # Check if this section header is missing and add it if needed
+            if section_header not in lyrics and section["type"] not in ["intro", "outro"]:
+                # Find where this section might be based on timestamp
+                time_matches = [
+                    idx for idx, line in enumerate(lyrics.split('\n'))
+                    if f"{int(section['start']) // 60:02d}:{int(section['start']) % 60:02d}" in line
+                ]
+
+                if time_matches:
+                    lines = lyrics.split('\n')
+                    line_idx = time_matches[0]
+                    lines[line_idx] = section_header
+                    lyrics = '\n'.join(lines)
+
+    return lyrics
 
 def process_audio(audio_file):
     """Main function to process audio file, classify genre, and generate lyrics."""

@@ -344,12 +559,25 @@ def process_audio(audio_file):
     except Exception as e:
         print(f"Error in emotion analysis: {str(e)}")
         # Continue even if emotion analysis fails
-        emotion_results = {
+        emotion_results = {
+            "emotion_analysis": {"primary_emotion": "Unknown"},
+            "theme_analysis": {"primary_theme": "Unknown"},
+            "rhythm_analysis": {"tempo": 0},
+            "tonal_analysis": {"key": "Unknown", "mode": ""}
+        }
 
-    #
+    # Calculate detailed song structure for better lyrics alignment
+    try:
+        song_structure = calculate_detailed_song_structure(audio_data)
+    except Exception as e:
+        print(f"Error analyzing song structure: {str(e)}")
+        # Continue with a simpler approach if this fails
+        song_structure = None
+
+    # Generate lyrics based on top genre, emotion analysis, and song structure
     try:
         primary_genre, _ = top_genres[0]
-        lyrics = generate_lyrics(primary_genre, audio_data["duration"], emotion_results)
+        lyrics = generate_lyrics(primary_genre, audio_data["duration"], emotion_results, song_structure)
     except Exception as e:
         print(f"Error generating lyrics: {str(e)}")
         lyrics = f"Error generating lyrics: {str(e)}"

@@ -396,6 +624,20 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
             emotion_text += f"Key: {emotion_results['summary']['key']} {emotion_results['summary']['mode']}\n"
             emotion_text += f"Primary Emotion: {emotion_results['summary']['primary_emotion']}\n"
             emotion_text += f"Primary Theme: {emotion_results['summary']['primary_theme']}"
+
+            # Add detailed song structure information if available
+            try:
+                audio_data = extract_audio_features(audio_file)
+                song_structure = calculate_detailed_song_structure(audio_data)
+
+                emotion_text += "\n\nSong Structure:\n"
+                for section in song_structure["syllables"]:
+                    emotion_text += f"- {section['type'].capitalize()}: {section['start']:.1f}s to {section['end']:.1f}s "
+                    emotion_text += f"({section['duration']:.1f}s, {section['beat_count']} beats, ~{section['syllable_count']} syllables)\n"
+            except Exception as e:
+                print(f"Error displaying song structure: {str(e)}")
+                # Continue without showing structure details
+
         except Exception as e:
             print(f"Error in emotion analysis: {str(e)}")
             emotion_text = f"Error in emotion analysis: {str(e)}"

@@ -425,9 +667,10 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
     1. Upload an audio file of your choice
    2. The system will classify the genre using the dima806/music_genres_classification model
    3. The system will analyze the musical emotion and theme using advanced audio processing
-    4.
-    5.
+    4. The system will identify the song structure, beats, and timing patterns
+    5. Based on the detected genre, emotion, and structure, it will generate lyrics that match the beats, sections, and flow of the music
+    6. The lyrics will include appropriate section markings and syllable counts to align with the music
     """)
 
 # Launch the app
-demo.launch()
+demo.launch()
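The headline change (per the commit title) is the confidence-based music check: detect_music scans the classifier output for music-related labels and returns a (bool, results) pair, treating the input as music once any such label scores at least 0.2. A minimal sketch of how a caller might consume that return value follows; the gating logic and message are illustrative only and are not part of this commit.

# Illustrative only, not code from this commit.
# Assumes audio_data has already been built with "path", "waveform", and "sample_rate".
is_music, detection_results = detect_music(audio_data)
if not is_music:
    top = max(detection_results, key=lambda r: r["score"], default=None)
    top_label = top["label"] if top else "unknown"
    print(f"Input does not look like music (top class: {top_label}); skipping lyrics generation.")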
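For local testing, the new structure-analysis helpers can also be exercised on their own. The snippet below is a sketch rather than code from the commit: it assumes librosa and numpy are installed, that "song.wav" is a real file, and it builds the audio_data dict in the shape these helpers read (path, waveform, sample_rate, duration).

import librosa

# Load audio in the form the helpers consume: a mono waveform plus its sample rate.
y, sr = librosa.load("song.wav", sr=22050, mono=True)
audio_data = {
    "path": "song.wav",
    "waveform": y,
    "sample_rate": sr,
    "duration": len(y) / sr,
}

structure = calculate_detailed_song_structure(audio_data)

# Recent librosa releases may return tempo as a one-element array, so coerce to float.
print(f"Tempo: {float(structure['beats']['tempo']):.1f} BPM, {structure['beats']['beat_count']} beats")
for sec in structure["syllables"]:
    print(f"{sec['type']:>7}: {sec['start']:.1f}s to {sec['end']:.1f}s, "
          f"{sec['beat_count']} beats, ~{sec['syllable_count']} syllables")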