# Constants
GENRE_MODEL_NAME = "dima806/music_genres_classification"
MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593"
LLM_MODEL_NAME = "Qwen/Qwen3-14B"
SAMPLE_RATE = 22050  # Standard sample rate for audio processing


def _beat_regularity(beat_frames, sr):
    """Score how evenly spaced a beat sequence is (1.0 = perfectly steady, 0.0 = unusable)."""
    if len(beat_frames) <= 1:
        return 0.0
    gaps = np.diff(librosa.frames_to_time(beat_frames, sr=sr))
    mean_gap = np.mean(gaps)
    if mean_gap <= 0:
        return 0.0
    # Inverse of (1 + coefficient of variation): lower spread -> higher score
    return float(1.0 / (1.0 + np.std(gaps) / mean_gap))


def _estimate_time_signature(beat_strengths):
    """Guess beats-per-measure from periodicity in the beat strengths (defaults to 4)."""
    time_signature = 4
    if len(beat_strengths) <= 8:
        # Too few beats for either detector below to say anything reliable
        return time_signature

    strengths = np.asarray(beat_strengths, dtype=float)

    # Detector 1: first autocorrelation peak of the normalized strengths = cycle length
    peak = np.max(strengths)
    normalized = strengths / peak if peak > 0 else strengths
    ac = librosa.autocorrelate(normalized, max_size=len(normalized) // 2)
    if len(ac) > 3:
        # Lag 0 is always the global maximum, so pick peaks after it
        peaks = librosa.util.peak_pick(
            ac[1:], pre_max=1, post_max=1, pre_avg=1, post_avg=1, delta=0.1, wait=1
        )
        if len(peaks) > 0:
            cycle = int(peaks[0]) + 1  # +1 compensates for the removed lag 0
            if cycle in (2, 3, 5, 7):
                time_signature = cycle  # direct mapping, incl. odd meters (5/4, 7/8)
            elif cycle == 6:
                time_signature = 3  # 6/8 or 3/4 with subdivisions
            elif cycle == 8:
                time_signature = 4  # 4/4 with subdivisions
            # any other cycle length keeps the default of 4

    # Detector 2: explicit strong-weak-weak test on z-scored strengths (overrides to 3)
    spread = np.std(strengths)
    if spread > 0:
        z = (strengths - np.mean(strengths)) / spread
        waltz_hits = [
            1 if (z[i] > 1 and z[i + 1] < 0.5 and z[i + 2] < 0.5) else 0
            for i in range(0, len(z) - 2, 3)
        ]
        if len(waltz_hits) >= 3 and sum(waltz_hits) / len(waltz_hits) > 0.6:
            time_signature = 3

    return time_signature


def _group_beats_into_phrases(beat_times, beat_strengths, intervals, time_signature):
    """Split beat indices into phrases at strong beats, long gaps, or measure boundaries."""
    n_beats = len(beat_times)
    phrases = []

    if n_beats > 0:
        if len(beat_strengths) > 4:
            # Top 25% of strengths count as "strong"
            strong_threshold = np.percentile(beat_strengths, 75)
            if intervals:
                mean_gap = np.mean(intervals)
                gap_spread = np.std(intervals)
                # A significant gap is > 1.5 standard deviations above the mean
                significant_gap = (
                    mean_gap + 1.5 * gap_spread if gap_spread > 0 else mean_gap * 1.3
                )
            else:
                significant_gap = float("inf")
        else:
            strong_threshold = np.max(beat_strengths) * 0.8 if beat_strengths else 1.0
            # With this little data a gap threshold cannot be estimated. Using 0 here
            # would make *every* gap "long" and cut a phrase every two beats.
            significant_gap = float("inf")

        current = []
        for i in range(n_beats):
            current.append(i)
            if i >= n_beats - 1:
                continue

            is_stronger_next = (
                i < len(beat_strengths) - 1
                and beat_strengths[i + 1] > strong_threshold
                and beat_strengths[i + 1] > beat_strengths[i] * 1.1
            )
            is_longer_gap = i < len(intervals) and intervals[i] > significant_gap
            is_measure_boundary = (i + 1) % time_signature == 0 and i > 0

            if ((is_stronger_next or is_longer_gap) and len(current) >= 2) or (
                is_measure_boundary and len(current) >= time_signature
            ):
                phrases.append(current)
                current = []

        # Keep the trailing phrase only if it has at least two beats
        if len(current) >= 2:
            phrases.append(current)

    # Ensure at least one phrase: fall back to plain measure-sized groups
    if not phrases and n_beats >= 2:
        for start in range(0, n_beats, time_signature):
            end = min(start + time_signature, n_beats)
            if end - start >= 2:
                phrases.append(list(range(start, end)))

    return phrases


def detect_beats(y, sr):
    """
    Detect beats and build a rhythmic map (tempo, beat strengths, meter, phrases).

    Parameters:
        y: Audio time series (1-D array of samples)
        sr: Sample rate

    Returns:
        Dictionary with the tempo (float BPM), beat frames/times, per-beat
        strengths, inter-beat intervals, estimated time signature (int) and
        phrases (lists of beat indices). Empty audio yields an empty map with
        tempo 0.0 and time signature 4.
    """
    # Sanitize non-finite samples only. The waveform is signed, so it must NOT be
    # clipped to a positive floor (that would erase every negative sample).
    y = np.nan_to_num(np.asarray(y), nan=0.0, posinf=0.0, neginf=0.0)

    if y.size == 0:
        return {
            "tempo": 0.0,
            "beat_frames": np.array([], dtype=int),
            "beat_times": np.array([], dtype=float),
            "beat_count": 0,
            "beat_strengths": [],
            "intervals": [],
            "time_signature": 4,
            "phrases": [],
        }

    # STEP 1: Onset envelopes from the full mix and from the percussive component
    _, y_percussive = librosa.effects.hpss(y)
    # Floor the envelopes so silent stretches cannot produce zeros/NaN downstream
    onset_env_full = np.maximum(librosa.onset.onset_strength(y=y, sr=sr), 1e-6)
    onset_env_perc = np.maximum(librosa.onset.onset_strength(y=y_percussive, sr=sr), 1e-6)
    combined_onset = onset_env_full * 0.3 + onset_env_perc * 0.7

    # STEP 2: Multi-strategy tempo and beat detection
    candidates = [
        # Strategy 1: standard detection
        librosa.beat.beat_track(onset_envelope=combined_onset, sr=sr, tightness=100),
        # Strategy 2: lower starting BPM to catch slower pulses / other meters.
        # (beat_track has no `std_bpm` parameter; passing it raises TypeError.)
        librosa.beat.beat_track(
            onset_envelope=combined_onset, sr=sr, tightness=100, start_bpm=60
        ),
    ]
    regularity = [_beat_regularity(frames, sr) for _, frames in candidates]
    tempo, beat_frames = candidates[int(np.argmax(regularity))]

    # Newer librosa versions return tempo as a 1-element array; expose a plain float
    tempo_values = np.atleast_1d(np.asarray(tempo, dtype=float)).ravel()
    tempo = float(tempo_values[0]) if tempo_values.size else 0.0
    beat_frames = np.asarray(beat_frames, dtype=int)

    # STEP 3: Beat times and per-beat strengths
    beat_times = librosa.frames_to_time(beat_frames, sr=sr)
    in_range = beat_frames < len(combined_onset)
    beat_strengths = combined_onset[beat_frames[in_range]].tolist()
    # Beats past the end of the envelope get the average strength (1.0 if none valid)
    fill_strength = float(np.mean(beat_strengths)) if beat_strengths else 1.0
    beat_strengths.extend([fill_strength] * (len(beat_times) - len(beat_strengths)))

    # STEP 4: Intervals between consecutive beats
    intervals = np.diff(beat_times).tolist() if len(beat_times) > 1 else []

    # STEP 5: Time signature
    time_signature = _estimate_time_signature(beat_strengths)

    # STEP 6: Phrases
    phrases = _group_beats_into_phrases(beat_times, beat_strengths, intervals, time_signature)

    # NOTE(review): the keys between "beat_frames" and the closing brace are elided in
    # the visible diff; they are reconstructed here from the values computed above and
    # the keys create_flexible_syllable_templates reads - confirm against the full file.
    return {
        "tempo": tempo,
        "beat_frames": beat_frames,
        "beat_times": beat_times,
        "beat_count": len(beat_times),
        "beat_strengths": beat_strengths,
        "intervals": intervals,
        "time_signature": time_signature,
        "phrases": phrases,
    }
def detect_sections(y, sr):
    """
    Detect musical sections (intro, verse, chorus, bridge, outro) in the audio.

    Parameters:
        y: Audio time series
        sr: Sample rate

    Returns:
        A list of section dictionaries with type, start time, end time, and duration
        (times in seconds). Empty audio yields an empty list.
    """
    y = np.asarray(y)
    if y.size == 0:
        return []

    from sklearn.cluster import AgglomerativeClustering
    from sklearn.decomposition import PCA
    from sklearn.feature_extraction.image import grid_to_graph
    from sklearn.metrics import silhouette_score

    # Step 1: Extract features
    # ----------------------------------------------------------------------
    hop_length = 512  # Shared hop so every feature uses the same frame grid

    S = np.abs(librosa.stft(y, hop_length=hop_length))
    contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
    chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)
    rms = librosa.feature.rms(y=y, hop_length=hop_length)

    y_harmonic, y_percussive = librosa.effects.hpss(y)
    percussive_rms = librosa.feature.rms(y=y_percussive, hop_length=hop_length)

    # Defensive: trim to a common frame count so the features can be stacked
    n_frames = min(m.shape[1] for m in (contrast, chroma, mfcc, rms, percussive_rms))
    contrast = contrast[:, :n_frames]
    chroma = chroma[:, :n_frames]
    mfcc = mfcc[:, :n_frames]
    rms = rms[:, :n_frames]
    percussive_rms = percussive_rms[:, :n_frames]

    song_duration = librosa.get_duration(y=y, sr=sr)

    if n_frames < 2:
        # Not enough frames to segment: report the whole clip as one section
        return [{
            "type": "intro",
            "start": 0.0,
            "end": float(song_duration),
            "duration": float(song_duration)
        }]

    # Step 2: Stack features, each row scaled over TIME (axis=1).
    # The default axis=0 would scale each frame across features instead, which
    # turns the single-row RMS feature into a constant row of ones.
    # ----------------------------------------------------------------------
    feature_stack = np.vstack([
        librosa.util.normalize(contrast, axis=1),
        librosa.util.normalize(chroma, axis=1),
        librosa.util.normalize(mfcc, axis=1),
        librosa.util.normalize(rms, axis=1)
    ])
    feature_matrix = feature_stack.T  # time as first dimension

    # Step 3: Feature fusion using PCA
    # ----------------------------------------------------------------------
    n_components = min(8, feature_matrix.shape[0], feature_matrix.shape[1])

    if feature_matrix.shape[0] > n_components and feature_matrix.shape[1] > 0:
        try:
            reduced_features = PCA(n_components=n_components).fit_transform(feature_matrix)
        except Exception as e:
            print(f"PCA failed, falling back to original features: {e}")
            reduced_features = feature_matrix
    else:
        # Not enough data for PCA
        reduced_features = feature_matrix

    # Step 4: Choose the segment count by silhouette score
    # ----------------------------------------------------------------------
    min_segments = max(2, int(song_duration / 60))  # roughly 1 per minute
    max_segments = min(10, int(song_duration / 20))  # roughly 1 per 20 seconds
    min_segments = max(2, min(min_segments, 4))
    max_segments = max(min_segments + 1, min(max_segments, 8))

    # Only temporally adjacent frames may merge, so every cluster is one contiguous
    # time span. Without this, labels flip frame-to-frame and "boundaries" fragment.
    connectivity = grid_to_graph(n_x=reduced_features.shape[0], n_y=1, n_z=1)

    best_segments = min_segments
    best_score = -1

    if reduced_features.shape[0] > max_segments:
        for n_segments in range(min_segments, max_segments + 1):
            try:
                labels = AgglomerativeClustering(
                    n_clusters=n_segments, connectivity=connectivity
                ).fit_predict(reduced_features)

                if len(np.unique(labels)) > 1 and len(labels) > n_segments + 1:
                    score = silhouette_score(reduced_features, labels)
                    if score > best_score:
                        best_score = score
                        best_segments = n_segments
            except Exception as e:
                print(f"Clustering with {n_segments} segments failed: {e}")
                continue

    n_segments = best_segments

    # Step 5: Final segmentation -> boundary frames [0, ..., n_frames]
    # ----------------------------------------------------------------------
    try:
        labels = AgglomerativeClustering(
            n_clusters=n_segments, connectivity=connectivity
        ).fit_predict(reduced_features)
        change_points = np.flatnonzero(np.diff(labels)) + 1
        bounds_frames = np.concatenate(([0], change_points, [len(labels)]))
    except Exception as e:
        print(f"Final clustering failed: {e}")
        # librosa returns only LEFT boundaries; append the end frame so the last
        # segment is not lost when boundaries are paired up below.
        bounds_frames = librosa.segment.agglomerative(feature_stack, n_segments)
        bounds_frames = np.append(bounds_frames, feature_stack.shape[1])

    # Step 6: Frames with a large jump in tonal centroid (candidate key/harmony changes)
    # ----------------------------------------------------------------------
    tonnetz = librosa.feature.tonnetz(y=y_harmonic, sr=sr)
    harmonic_changes = np.array([], dtype=int)

    if tonnetz.shape[1] > 1:
        tonnetz_diff = np.sum(np.abs(np.diff(tonnetz, axis=1)), axis=0)
        if np.max(tonnetz_diff) > 0:
            tonnetz_diff = tonnetz_diff / np.max(tonnetz_diff)
        # NOTE(review): this flags ~10% of ALL frames, so nearly every multi-second
        # section contains at least one - consider a per-section density test.
        threshold = np.percentile(tonnetz_diff, 90)
        harmonic_changes = np.flatnonzero(tonnetz_diff > threshold)

    # Step 7: Convert boundaries to time and classify each section
    # ----------------------------------------------------------------------
    bounds_times = librosa.frames_to_time(bounds_frames, sr=sr, hop_length=hop_length)

    # Song-level statistics used as reference points (loop-invariant)
    mean_rms = np.mean(rms)
    mean_percussive_rms = np.mean(percussive_rms)
    mean_chroma_var = np.mean(np.var(chroma, axis=1))
    # NOTE(review): compares per-coefficient section variance against a percentile
    # taken ACROSS coefficients; high-variance coefficients will usually exceed it.
    mfcc_var_cutoff = np.percentile(np.var(mfcc, axis=1), 75)

    last_index = len(bounds_times) - 2
    sections = []

    for i in range(len(bounds_times) - 1):
        start = float(bounds_times[i])
        end = float(bounds_times[i + 1])
        section_duration = end - start

        # Skip extremely short interior sections
        if section_duration < 4 and 0 < i < last_index:
            continue

        start_idx = int(bounds_frames[i])
        end_idx = int(bounds_frames[i + 1])

        # Positional default
        if i == 0:
            section_type = "intro"
        elif i == last_index:
            section_type = "outro"
        else:
            section_type = "chorus" if i % 2 == 1 else "verse"

        is_interior = section_type not in ("intro", "outro")

        if end_idx > start_idx:
            energy = np.mean(rms[0, start_idx:end_idx])
            rhythm_intensity = np.mean(percussive_rms[0, start_idx:end_idx])
            chroma_var = np.var(chroma[:, start_idx:end_idx])
            mfcc_var = np.var(mfcc[:, start_idx:end_idx], axis=1)

            has_harmonic_change = bool(
                np.any((harmonic_changes >= start_idx) & (harmonic_changes < end_idx))
            )

            # Relative to the whole song; neutral 1.0 for silent audio avoids NaN/inf
            relative_energy = energy / mean_rms if mean_rms > 0 else 1.0
            relative_rhythm = (
                rhythm_intensity / mean_percussive_rms if mean_percussive_rms > 0 else 1.0
            )

            # Chorus: high energy and strong rhythm
            if relative_energy > 1.1 and relative_rhythm > 1.1 and is_interior:
                section_type = "chorus"
            # Verse: moderate energy with above-average harmonic variation
            elif (0.8 <= relative_energy <= 1.1 and chroma_var > mean_chroma_var
                  and is_interior):
                section_type = "verse"

            # Bridge: harmonic change, a short energy dip, or unusual timbre
            if is_interior and (
                has_harmonic_change
                or (0.5 <= relative_energy <= 0.9 and section_duration < 30)
                or np.any(mfcc_var > mfcc_var_cutoff)
            ):
                section_type = "bridge"

        sections.append({
            "type": section_type,
            "start": start,
            "end": end,
            "duration": section_duration
        })

    # Post-processing: short interior verse/chorus sections take the label of the
    # section before them (relabel only - time spans are not merged).
    for i in range(1, len(sections) - 1):
        if sections[i]["duration"] < 8 and sections[i]["type"] not in ["intro", "outro", "bridge"]:
            sections[i]["type"] = sections[i - 1]["type"]

    # Drop any remaining very short sections, but always keep intro/outro
    sections = [s for s in sections if s["duration"] >= 5 or
                s["type"] == "intro" or s["type"] == "outro"]

    return sections
def _tempo_to_syllable_base(tempo):
    """Map tempo (BPM) to a base syllables-per-beat value; slower music gets more."""
    if tempo > 180:
        return 1.0
    if tempo > 140:
        return 1.0 + (180 - tempo) * 0.02  # Gradual increase 1.0 -> 1.8
    if tempo > 100:
        return 1.8 + (140 - tempo) * 0.01  # Gradual increase 1.8 -> 2.2
    if tempo > 70:
        return 2.2 + (100 - tempo) * 0.02  # Gradual increase 2.2 -> 2.8
    return 2.8 + max(0, (70 - tempo) * 0.04)  # Keeps increasing for very slow tempos


def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='default'):
    """
    Create syllable templates from beat patterns.

    Each beat becomes a token "<stress>(<strength>):<syllables>" where stress is
    S (strong), m (medium) or w (weak). Beats in a phrase are joined with "-",
    phrases with "|".

    Parameters:
        beats_info: Dictionary containing beat analysis data ("beat_times",
            "beat_strengths", "tempo", "time_signature", "phrases")
        genre: Optional genre name; rap/hip-hop raises and folk/country/ballad
            lowers the syllable density
        phrase_mode: 'default' uses provided phrases, 'auto' forces recalculation

    Returns:
        String of syllable templates with embedded strength values
    """
    import numpy as np

    # Extract basic beat information
    beat_times = beats_info.get("beat_times", [])
    n_beats = len(beat_times)

    # Early return for insufficient data
    if n_beats < 2:
        return "S(1.0):1-w(0.5):1|S(1.0):1-w(0.5):1"  # Default fallback pattern

    raw_strengths = beats_info.get("beat_strengths")
    if raw_strengths is None or len(raw_strengths) == 0:
        # Missing or empty strengths: treat every beat as equally strong
        beat_strengths = [1.0] * n_beats
    else:
        beat_strengths = [float(s) for s in raw_strengths]

    # librosa may report tempo as a 1-element array; formatting an array into the
    # template would yield tokens like ":[2.5]", so reduce it to a plain float.
    tempo_values = np.atleast_1d(np.asarray(beats_info.get("tempo", 120), dtype=float)).ravel()
    tempo = float(tempo_values[0]) if tempo_values.size else 120.0

    time_signature = int(beats_info.get("time_signature", 4))
    if time_signature < 1:
        time_signature = 4  # guard the modulo operations below

    # Step 1: Adaptive strong/medium thresholds
    # ----------------------------------------------------------------------
    if len(beat_strengths) >= 6:  # Need enough data points for clustering
        # Imported lazily: only this branch needs scikit-learn
        from sklearn.cluster import KMeans

        X = np.array(beat_strengths).reshape(-1, 1)
        kmeans = KMeans(n_clusters=3, random_state=0, n_init=10).fit(X)
        centroids = sorted(float(c[0]) for c in kmeans.cluster_centers_)

        # Thresholds sit at the midpoints between neighbouring centroids
        medium_threshold = (centroids[0] + centroids[1]) / 2
        strong_threshold = (centroids[1] + centroids[2]) / 2
    else:
        # For limited data, use percentile-based approach
        medium_threshold = np.percentile(beat_strengths, 33)
        strong_threshold = np.percentile(beat_strengths, 66)

    # Step 2: Create or refine phrases based on mode
    # ----------------------------------------------------------------------
    phrases = beats_info.get("phrases", [])

    if phrase_mode == 'auto' or not phrases:
        # One phrase per measure; phrases need at least two beats
        phrases = []
        current_phrase = []
        for i in range(n_beats):
            current_phrase.append(i)
            if (i + 1) % time_signature == 0 or i == n_beats - 1:
                if len(current_phrase) >= 2:
                    phrases.append(current_phrase)
                    current_phrase = []
        if len(current_phrase) >= 2:
            phrases.append(current_phrase)

    # Step 3: Per-call constants (identical for every beat, so computed once)
    # ----------------------------------------------------------------------
    base_syllables = _tempo_to_syllable_base(tempo)

    genre_factor = 1.0
    if genre:
        genre_name = genre.lower()
        if any(term in genre_name for term in ["rap", "hip hop", "hip-hop"]):
            genre_factor = 1.4  # Much higher syllable density for rap
        elif any(term in genre_name for term in ["folk", "country", "ballad"]):
            genre_factor = 0.8  # Lower density for folk styles

    stress_factors = {"S": 1.2, "m": 1.0, "w": 0.8}

    # Step 4: Generate templates
    # ----------------------------------------------------------------------
    syllable_templates = []

    for phrase in phrases:
        if not phrase:
            continue

        phrase_strengths = [beat_strengths[i] for i in phrase if i < len(beat_strengths)]
        if not phrase_strengths:
            phrase_strengths = [1.0] * len(phrase)

        detailed_template = []
        for position, strength in enumerate(phrase_strengths):
            metrical_position = position % time_signature

            # Downbeat gets a boost; beat 3 of 4/4 gets a smaller secondary boost
            position_boost = 0.15 if metrical_position == 0 else 0
            if time_signature == 4 and metrical_position == 2:
                position_boost = 0.08
            effective_strength = strength + position_boost

            if effective_strength >= strong_threshold:
                stress_type = "S"
            elif effective_strength >= medium_threshold:
                stress_type = "m"
            else:
                stress_type = "w"

            raw_count = base_syllables * stress_factors[stress_type] * genre_factor
            # Half-syllable precision, limited to 0.5..4
            rounded_count = round(raw_count * 2) / 2
            syllable_count = max(0.5, min(4, rounded_count))

            # round() instead of int(x * 100) / 100: truncation turns 0.58 into 0.57
            strength_label = round(effective_strength, 2)
            detailed_template.append(f"{stress_type}({strength_label}):{syllable_count}")

        syllable_templates.append("-".join(detailed_template))

    # Step 5: Ensure valid output with reasonable defaults
    # ----------------------------------------------------------------------
    if not syllable_templates:
        if time_signature == 3:
            syllable_templates = ["S(0.95):2-w(0.4):1-w(0.35):1"]  # 3/4 default
        else:
            syllable_templates = ["S(0.95):2-w(0.4):1-m(0.7):1.5-w(0.35):1"]  # 4/4 default

    return "|".join(syllable_templates)
+ + Parameters: + syllable_templates: String or list of templates + arrow: Symbol to use between beats (default: "→") + line_wrap: Number of beats before automatic line wrapping (0 = no wrapping) + structured_output: If True, return structured data instead of text + beat_types: Custom mapping for beat types (default: None, uses standard mapping) + + Returns: + Human-readable instructions or structured data depending on parameters + """ if not syllable_templates: - return "" + return {} if structured_output else "" + + # Define standard beat type mapping (extensible) + default_beat_types = { + "S": {"name": "STRONG", "description": "stressed syllable"}, + "m": {"name": "medium", "description": "medium-stressed syllable"}, + "w": {"name": "weak", "description": "unstressed syllable"}, + "X": {"name": "EXTRA", "description": "extra strong syllable"}, + "L": {"name": "legato", "description": "connected/tied syllable"} + } + + # Use custom mapping if provided, otherwise use default + beat_types = beat_types or default_beat_types + + # Initialize structured output if requested + structured_data = {"lines": [], "explanations": []} if structured_output else None - # Check if we're dealing with the enhanced format or the old format - if isinstance(syllable_templates, str) and "|" in syllable_templates: - # Enhanced format with stress patterns - phrases = syllable_templates.split("|") + # Improved format detection - more robust than just checking for "|" + is_enhanced_format = False + + # Check if it's a string with enhanced format patterns + if isinstance(syllable_templates, str): + # Look for enhanced format patterns - check for beat type indicators + if any(bt + "(" in syllable_templates or bt + ":" in syllable_templates or bt + "[" in syllable_templates + for bt in beat_types.keys()): + is_enhanced_format = True + # Secondary check for the "|" delimiter between phrases + elif "|" in syllable_templates: + is_enhanced_format = True + + # Initialize the output with a brief 
explanatory header + output = [] + + if is_enhanced_format: + # Split into individual phrase templates + phrases = syllable_templates.split("|") if "|" in syllable_templates else [syllable_templates] - instructions = [] + # Process each phrase into human-readable instructions for i, phrase in enumerate(phrases): + # Check for special annotations + has_swing = "(swing)" in phrase + if has_swing: + phrase = phrase.replace("(swing)", "") # Remove annotation for processing + beats = phrase.split("-") beat_instructions = [] - for beat in beats: - if beat.startswith("S"): - # Strong beat - count = beat[1:] - beat_instructions.append(f"STRONG({count})") - elif beat.startswith("m"): - # Medium beat - count = beat[1:] - beat_instructions.append(f"medium({count})") - elif beat.startswith("w"): - # Weak beat + # Process each beat in the phrase + for j, beat in enumerate(beats): + # Extract beat type and information + beat_info = {"original": beat, "type": None, "count": None, "strength": None} + + # Handle enhanced format with embedded strength values: S(0.95):2 + if "(" in beat and ")" in beat and ":" in beat: + parts = beat.split(":") + beat_type = parts[0].split("(")[0] # Extract beat type + strength = parts[0].split("(")[1].rstrip(")") # Extract strength value + count = parts[1] # Extract syllable count + + beat_info["type"] = beat_type + beat_info["count"] = count + beat_info["strength"] = strength + + # Handle simpler format: S2, m1, w1 + elif any(beat.startswith(bt) for bt in beat_types.keys()) and len(beat) > 1: + beat_type = beat[0] count = beat[1:] - beat_instructions.append(f"weak({count})") + + beat_info["type"] = beat_type + beat_info["count"] = count + + # Fallback for any other format + else: + beat_instructions.append(beat) + continue + + # Format the beat instruction based on type + if beat_info["type"] in beat_types: + type_name = beat_types[beat_info["type"]]["name"] + if beat_info["strength"]: + beat_instructions.append(f"{type_name}({beat_info['count']}) 
[{beat_info['strength']}]") + else: + beat_instructions.append(f"{type_name}({beat_info['count']})") else: - # Fallback for old format + # Unknown beat type, use as-is beat_instructions.append(beat) - line_desc = " → ".join(beat_instructions) - instructions.append(f"Line {i+1}: {line_desc}") + # Handle line wrapping for readability + if line_wrap > 0 and len(beat_instructions) > line_wrap: + wrapped_instructions = [] + for k in range(0, len(beat_instructions), line_wrap): + section = beat_instructions[k:k+line_wrap] + wrapped_instructions.append(f"{arrow} ".join(section)) + line_desc = f"\n {arrow} ".join(wrapped_instructions) + else: + line_desc = f" {arrow} ".join(beat_instructions) + + # Add swing notation if present + if has_swing: + line_desc += " [with swing feel]" + + # Add to output + line_output = f"Line {i+1}: {line_desc}" + output.append(line_output) + + if structured_output: + structured_data["lines"].append({ + "line_number": i+1, + "beats": [{"original": beats[j], + "type": beat_info.get("type"), + "count": beat_info.get("count"), + "strength": beat_info.get("strength")} + for j, beat_info in enumerate([b for b in beats if isinstance(b, dict)])], + "has_swing": has_swing + }) + + # Add explanation of notation after the lines + explanation = [ + "\n📝 UNDERSTANDING THE NOTATION:" + ] + + # Add descriptions for each beat type that was actually used + used_beat_types = set() + for phrase in phrases: + for beat in phrase.split("-"): + for bt in beat_types.keys(): + if beat.startswith(bt): + used_beat_types.add(bt) + + for bt in used_beat_types: + if bt in beat_types: + name = beat_types[bt]["name"] + desc = beat_types[bt]["description"] + explanation.append(f"- {name}(n): Place a {desc} here, plus (n-1) unstressed syllables") + + explanation.extend([ + f"- {arrow}: Indicates flow from one beat to the next", + "- [0.xx]: Beat strength value (higher = more emphasis needed)" + ]) + + output.extend(explanation) + + if structured_output: + 
structured_data["explanations"] = explanation + + # Add examples for half-syllable values if they appear in the templates + has_half_syllables = any((".5" in beat) for phrase in phrases for beat in phrase.split("-")) + if has_half_syllables: + half_syllable_examples = [ + "\n🎵 HALF-SYLLABLE EXAMPLES:", + "- STRONG(1.5): One stressed syllable followed by an unstressed half-syllable", + " Example: \"LOVE you\" where \"LOVE\" is stressed and \"you\" is quick", + "- medium(2.5): One medium syllable plus one-and-a-half unstressed syllables", + " Example: \"Wait for the\" where \"Wait\" is medium-stressed and \"for the\" is quick" + ] + output.extend(half_syllable_examples) + + if structured_output: + structured_data["half_syllable_examples"] = half_syllable_examples - return "\n".join(instructions) + # Add swing explanation if needed + if any("swing" in phrase for phrase in phrases): + swing_guide = [ + "\n🎶 SWING RHYTHM GUIDE:", + "- In swing, syllables should be unevenly timed (long-short pattern)", + "- Example: \"SUM-mer TIME\" in swing feels like \"SUM...mer-TIME\" with delay" + ] + output.extend(swing_guide) + + if structured_output: + structured_data["swing_guide"] = swing_guide + + # Handle the original format or segment dictionaries else: - # Handle the original format or segment dictionaries formatted_lines = [] if isinstance(syllable_templates, list): for i, template in enumerate(syllable_templates): if isinstance(template, dict) and "syllable_template" in template: - formatted_lines.append(f"Line {i+1}: {template['syllable_template']} syllables") + line = f"Line {i+1}: {template['syllable_template']} syllables" + formatted_lines.append(line) + + if structured_output: + structured_data["lines"].append({ + "line_number": i+1, + "syllable_count": template["syllable_template"] + }) elif isinstance(template, str): - formatted_lines.append(f"Line {i+1}: {template} syllables") + line = f"Line {i+1}: {template} syllables" + formatted_lines.append(line) + + if 
structured_output: + structured_data["lines"].append({ + "line_number": i+1, + "syllable_count": template + }) - return "\n".join(formatted_lines) - - return str(syllable_templates) + output = formatted_lines + else: + output = [str(syllable_templates)] + + if structured_output: + structured_data["raw_content"] = str(syllable_templates) + + # Add general application advice + application_tips = [ + "\n💡 APPLICATION TIPS:", + "1. Strong beats need naturally stressed syllables (like the START of \"RE-mem-ber\")", + "2. Place important words on strong beats for natural emphasis", + "3. Vowel sounds work best for sustained or emphasized syllables", + "4. Keep consonant clusters (like \"str\" or \"thr\") on weak beats" + ] + output.extend(application_tips) + + if structured_output: + structured_data["application_tips"] = application_tips + return structured_data + + return "\n".join(output) -# Enhanced verification function to check syllable counts and stress patterns def verify_flexible_syllable_counts(lyrics, templates): - """Verify that the generated lyrics match the required syllable counts and stress patterns.""" + """ + Enhanced verification of syllable counts and stress patterns with precise alignment analysis + and detailed feedback for all phrases in a template. 
+ """ + import re + import pronouncing + import numpy as np + import functools + from itertools import chain + + # Apply caching to improve performance for repeated word lookups + @functools.lru_cache(maxsize=512) + def cached_phones_for_word(word): + return pronouncing.phones_for_word(word) + + @functools.lru_cache(maxsize=512) + def count_syllables_for_word(word): + """Count syllables in a single word with caching for performance.""" + # Try using pronouncing library first + pronunciations = cached_phones_for_word(word.lower()) + if pronunciations: + return pronouncing.syllable_count(pronunciations[0]) + + # Fallback method for words not in the pronouncing dictionary + vowels = "aeiouy" + word = word.lower() + count = 0 + prev_is_vowel = False + + for char in word: + is_vowel = char in vowels + if is_vowel and not prev_is_vowel: + count += 1 + prev_is_vowel = is_vowel + + # Handle special cases + if word.endswith('e') and not word.endswith('le'): + count -= 1 + if word.endswith('le') and len(word) > 2 and word[-3] not in vowels: + count += 1 + if count == 0: + count = 1 + + return count + + @functools.lru_cache(maxsize=512) + def get_word_stress(word): + """Get the stress pattern for a word with improved fallback handling.""" + pronunciations = cached_phones_for_word(word.lower()) + if pronunciations: + return pronouncing.stresses(pronunciations[0]) + + # Enhanced fallback for words not in the dictionary + syllables = count_syllables_for_word(word) + + # Common English stress patterns by word length + if syllables == 1: + return "1" # Single syllable words are stressed + elif syllables == 2: + # Most 2-syllable nouns and adjectives stress first syllable + # Common endings that indicate second-syllable stress + second_syllable_stress = ["ing", "er", "or", "ize", "ise", "ate", "ect", "end", "ure"] + if any(word.endswith(ending) for ending in second_syllable_stress): + return "01" + else: + return "10" # Default for 2-syllable words + elif syllables == 3: + # Common 
endings for specific stress patterns in 3-syllable words + if any(word.endswith(ending) for ending in ["ity", "ety", "ify", "ogy", "graphy"]): + return "100" # First syllable stress + elif any(word.endswith(ending) for ending in ["ation", "ious", "itis"]): + return "010" # Middle syllable stress + else: + return "100" # Default for 3-syllable words + else: + # For longer words, use common English patterns + return "1" + "0" * (syllables - 1) + # Split lyrics into lines lines = [line.strip() for line in lyrics.split("\n") if line.strip()] - # Check syllable counts for each line + # Initialize tracking variables verification_notes = [] + detailed_analysis = [] + stress_misalignments = [] + total_mismatch_count = 0 + # Process each lyric line against its template for i, line in enumerate(lines): if i >= len(templates): break template = templates[i] - # Handle different template formats + # Extract the template string from different possible formats if isinstance(template, dict) and "syllable_template" in template: template_str = template["syllable_template"] elif isinstance(template, str): @@ -557,95 +1271,394 @@ def verify_flexible_syllable_counts(lyrics, templates): else: continue - # Parse the enhanced template format if present + # Handle multiple phrases in template - process ALL phrases, not just the first + template_phrases = [template_str] if "|" in template_str: - # This is a phrase, take just the first part for now - template_str = template_str.split("|")[0] - - # Count expected syllables - total_expected = 0 - - # Handle the enhanced format with stress patterns - if "-" in template_str and any(x in template_str for x in ["S", "m", "w"]): - beats = template_str.split("-") - expected_counts = [] - - for beat in beats: - if beat.startswith(("S", "m", "w")): - try: - count = int(beat[1:]) - expected_counts.append(count) - total_expected += count - except ValueError: - expected_counts.append(1) - total_expected += 1 - else: - try: - count = int(beat) - 
expected_counts.append(count) - total_expected += count - except ValueError: - expected_counts.append(1) - total_expected += 1 - else: - # Old format - simple numbers separated by hyphens - try: - expected_counts = [int(count) for count in template_str.split("-")] - total_expected = sum(expected_counts) - except ValueError: - # Fallback if we can't parse the template - expected_counts = [] - total_expected = 0 - - # Count actual syllables + template_phrases = template_str.split("|") + + # Check against all phrases and find the best match + best_match_diff = float('inf') + best_match_phrase = None + best_phrase_beats = None actual_count = count_syllables(line) - # Calculate difference - if total_expected > 0 and abs(actual_count - total_expected) > 2: # Allow small differences - verification_notes.append(f"Line {i+1}: Expected {total_expected} syllables, got {actual_count}") + for phrase_idx, phrase in enumerate(template_phrases): + # Extract beat patterns and expected syllable counts from template + beats_info = [] + total_expected = 0 - # Additionally check if stressed syllables align with strong beats - words = re.findall(r'\b[a-zA-Z]+\b', line.lower()) - if words and expected_counts and "S" in template_str: - # Try to find strong beats in the template - strong_beat_positions = [] - current_pos = 0 + # Enhanced template parsing + if "-" in phrase: + beat_templates = phrase.split("-") - for j, beat in enumerate(template_str.split("-")): - if beat.startswith("S"): - beat_count = int(beat[1:]) if len(beat) > 1 else 1 - strong_beat_positions.append(current_pos) - current_pos += beat_count + # Parse each beat template + for beat in beat_templates: + beat_info = {"original": beat, "type": None, "count": 1, "strength": None} + + # Handle templates with embedded strength values: S(0.95):2 + if "(" in beat and ")" in beat and ":" in beat: + parts = beat.split(":") + beat_type = parts[0].split("(")[0] + try: + strength = float(parts[0].split("(")[1].rstrip(")")) + except 
ValueError: + strength = 1.0 + + # Handle potential float syllable counts + try: + count = float(parts[1]) + # Convert to int if it's a whole number + if count == int(count): + count = int(count) + except ValueError: + count = 1 + + beat_info.update({ + "type": beat_type, + "count": count, + "strength": strength + }) + + # Handle simple format: S2, m1, w1 + elif any(beat.startswith(x) for x in ["S", "m", "w", "X", "L"]): + beat_type = beat[0] + + # Extract count, supporting float values + try: + count_str = beat[1:] + count = float(count_str) + if count == int(count): + count = int(count) + except ValueError: + count = 1 + + beat_info.update({ + "type": beat_type, + "count": count + }) + + # Legacy format - just numbers else: - beat_count = int(beat[1:]) if len(beat) > 1 else 1 - current_pos += beat_count + try: + count = float(beat) + if count == int(count): + count = int(count) + beat_info["count"] = count + except ValueError: + pass + + beats_info.append(beat_info) + total_expected += beat_info["count"] + + # Compare this phrase to actual syllable count + phrase_diff = abs(actual_count - total_expected) + + # Adaptive threshold based on expected syllables + expected_ratio = 0.15 if total_expected > 10 else 0.25 + phrase_threshold = max(1, round(total_expected * expected_ratio)) + + # If this is the best match so far, store it + if phrase_diff < best_match_diff: + best_match_diff = phrase_diff + best_match_phrase = phrase + best_phrase_beats = beats_info + + # For very simple templates without "-" + else: + try: + total_expected = float(phrase) + phrase_diff = abs(actual_count - total_expected) + if phrase_diff < best_match_diff: + best_match_diff = phrase_diff + best_match_phrase = phrase + best_phrase_beats = [{"count": total_expected}] + except ValueError: + pass + + # If we found a reasonable match, proceed with analysis + if best_match_phrase and best_phrase_beats: + total_expected = sum(beat["count"] for beat in best_phrase_beats) + + # Calculate adaptive 
threshold based on expected syllables + expected_ratio = 0.15 if total_expected > 10 else 0.25 + threshold = max(1, round(total_expected * expected_ratio)) + + # Check if total syllable count is significantly off + if total_expected > 0 and best_match_diff > threshold: + verification_notes.append(f"Line {i+1}: Expected {total_expected} syllables, got {actual_count}") + total_mismatch_count += 1 + + # Extract words and perform detailed alignment analysis + words = re.findall(r'\b[a-zA-Z]+\b', line.lower()) + + # Get syllable count and stress for each word + word_analysis = [] + cumulative_syllables = 0 - # Try to get pronunciations for words to check stress alignment - word_stresses = [] for word in words: - pronunciations = pronouncing.phones_for_word(word) - if pronunciations: - stress_pattern = pronouncing.stresses(pronunciations[0]) - word_stresses.append(stress_pattern) + syllable_count = count_syllables_for_word(word) + + # Get stress pattern + stress_pattern = get_word_stress(word) + + word_analysis.append({ + "word": word, + "syllables": syllable_count, + "stress_pattern": stress_pattern, + "position": cumulative_syllables + }) + + cumulative_syllables += syllable_count - # Add note about stress alignment if we have enough information - if word_stresses and strong_beat_positions and len(word_stresses) >= len(strong_beat_positions): - verification_notes.append(f" → Check stress alignment on words with strong beats") + # Analyze alignment with beats - only if there are beat types + if best_phrase_beats and any(b.get("type") == "S" for b in best_phrase_beats if "type" in b): + # Identify positions where strong syllables should fall + strong_positions = [] + current_pos = 0 + + for beat in best_phrase_beats: + if beat.get("type") == "S": + strong_positions.append(current_pos) + current_pos += beat.get("count", 1) + + # Check if strong syllables align with strong beats + alignment_issues = [] + + for pos in strong_positions: + # Find which word contains this 
position + misaligned_word = None + + for word_info in word_analysis: + word_start = word_info["position"] + word_end = word_start + word_info["syllables"] + + if word_start <= pos < word_end: + # Check if a stressed syllable falls on this position + syllable_in_word = pos - word_start + + # Get stress pattern for this word + stress = word_info["stress_pattern"] + + # If we have stress information and this syllable isn't stressed + if stress and syllable_in_word < len(stress) and stress[syllable_in_word] != '1': + misaligned_word = word_info["word"] + alignment_issues.append(f"'{word_info['word']}' (unstressed syllable on strong beat)") + stress_misalignments.append({ + "line": i+1, + "word": word_info["word"], + "position": pos, + "suggestion": get_stress_aligned_alternatives(word_info["word"], syllable_in_word) + }) + break + + if alignment_issues: + verification_notes.append(f" → Stress misalignments: {', '.join(alignment_issues)}") + + # Generate a visual alignment map for better understanding + alignment_map = generate_alignment_visualization(line, best_phrase_beats, word_analysis) + if alignment_map: + detailed_analysis.append(f"Line {i+1} Alignment Analysis:\n{alignment_map}") + else: + # If no matching template was found + verification_notes.append(f"Line {i+1}: Unable to find matching template pattern") - # If we found issues, add them as notes at the end of the lyrics + # Only add detailed analysis if we have rhythm mismatches if verification_notes: - lyrics += "\n\n[Note: Potential rhythm mismatches in these lines:]\n" + lyrics += "\n\n[Note: Potential rhythm mismatches detected in these lines:]\n" lyrics += "\n".join(verification_notes) - lyrics += "\n\n[To fix mismatches:]\n" - lyrics += "1. Make sure stressed syllables fall on STRONG beats\n" - lyrics += "2. Adjust syllable counts to match the template\n" - lyrics += "3. 
Try using words with naturally aligned stress patterns" + + if detailed_analysis: + lyrics += "\n\n[Detailed Alignment Analysis:]\n" + lyrics += "\n\n".join(detailed_analysis) + + lyrics += "\n\n[How to fix rhythm mismatches:]\n" + lyrics += "1. Make sure stressed syllables (like 'LO' in 'LOV-er') fall on STRONG beats\n" + lyrics += "2. Adjust syllable counts to match the template (add/remove words or use different words)\n" + lyrics += "3. Try using words where natural stress aligns with musical rhythm\n" + + # Add specific word substitution suggestions if we found stress misalignments + if stress_misalignments: + lyrics += "\n[Specific word replacement suggestions:]\n" + for issue in stress_misalignments[:5]: # Limit to first 5 issues + if issue["suggestion"]: + lyrics += f"Line {issue['line']}: Consider replacing '{issue['word']}' with: {issue['suggestion']}\n" return lyrics -# Modified generate_lyrics function +def generate_alignment_visualization(line, beats_info, word_analysis): + """Generate a visual representation of syllable alignment with beats.""" + if not beats_info or not word_analysis: + return None + + # Create a syllable breakdown with stress information + syllable_breakdown = [] + syllable_stresses = [] + + for word_info in word_analysis: + word = word_info["word"] + syllables = word_info["syllables"] + stress = word_info["stress_pattern"] or "" + + # Extend stress pattern if needed + while len(stress) < syllables: + stress += "0" + + # Get syllable breakdown + parts = naive_syllable_split(word, syllables) + + for i, part in enumerate(parts): + syllable_breakdown.append(part) + if i < len(stress): + syllable_stresses.append(stress[i]) + else: + syllable_stresses.append("0") + + # Create beat pattern + beat_types = [] + current_pos = 0 + + for beat in beats_info: + beat_type = beat.get("type", "-") + count = beat.get("count", 1) + + # Handle whole numbers and half syllables + if isinstance(count, int): + beat_types.extend([beat_type] * count) + 
else: + # For half syllables, round up and use markers + whole_part = int(count) + frac_part = count - whole_part + + if whole_part > 0: + beat_types.extend([beat_type] * whole_part) + + if frac_part > 0: + beat_types.append(f"{beat_type}½") + + # Ensure we have enough beat types + while len(beat_types) < len(syllable_breakdown): + beat_types.append("-") + + # Trim beat types if too many + beat_types = beat_types[:len(syllable_breakdown)] + + # Generate the visualization with highlighted misalignments + result = [] + + # First line: syllable breakdown with stress indicators + syllable_display = [] + for i, syllable in enumerate(syllable_breakdown): + if i < len(syllable_stresses) and syllable_stresses[i] == "1": + syllable_display.append(syllable.upper()) # Uppercase for stressed syllables + else: + syllable_display.append(syllable.lower()) # Lowercase for unstressed + + result.append(" - ".join(syllable_display)) + + # Second line: beat indicators with highlighting for misalignments + beat_indicators = [] + for i, (syllable, beat_type) in enumerate(zip(syllable_stresses, beat_types)): + if beat_type == "S" or beat_type.startswith("S"): + if syllable == "1": + beat_indicators.append("↑") # Aligned strong beat + else: + beat_indicators.append("❌") # Misaligned strong beat + elif beat_type == "m" or beat_type.startswith("m"): + beat_indicators.append("•") # Medium beat + elif beat_type == "w" or beat_type.startswith("w"): + beat_indicators.append("·") # Weak beat + else: + beat_indicators.append(" ") + + result.append(" ".join(beat_indicators)) + + # Third line: beat types + result.append(" - ".join(beat_types)) + + return "\n".join(result) + +@functools.lru_cache(maxsize=256) +def naive_syllable_split(word, syllable_count): + """Naively split a word into the specified number of syllables, with caching for performance.""" + if syllable_count <= 1: + return [word] + + # Common syllable break patterns + vowels = "aeiouy" + consonants = "bcdfghjklmnpqrstvwxz" + + # Find 
potential split points + splits = [] + for i in range(1, len(word) - 1): + if word[i] in consonants and word[i-1] in vowels: + splits.append(i) + elif word[i] in vowels and word[i-1] in consonants and word[i+1] in consonants: + splits.append(i+1) + + # Ensure we have enough split points + while len(splits) < syllable_count - 1: + for i in range(1, len(word)): + if i not in splits: + splits.append(i) + break + + # Sort and limit + splits.sort() + splits = splits[:syllable_count - 1] + + # Split the word + result = [] + prev = 0 + for pos in splits: + result.append(word[prev:pos]) + prev = pos + + result.append(word[prev:]) + return result + +def get_stress_aligned_alternatives(word, position_to_stress): + """Suggest alternative words with proper stress at the required position.""" + # This would ideally use a more sophisticated dictionary lookup, + # but here's a simple implementation with common word patterns + syllable_count = count_syllables_for_word(word) + + # Common synonyms/replacements by syllable count with stress position + if syllable_count == 2: + if position_to_stress == 0: # Need stress on first syllable + first_stress = ["love-ly", "won-der", "beau-ty", "danc-ing", "dream-ing", + "heart-beat", "sun-light", "moon-light", "star-light"] + return ", ".join(first_stress[:3]) + else: # Need stress on second syllable + second_stress = ["be-LIEVE", "a-BOVE", "a-ROUND", "to-DAY", "a-LIVE", + "a-LONE", "be-HOLD", "re-TURN", "de-LIGHT"] + return ", ".join(second_stress[:3]) + elif syllable_count == 3: + if position_to_stress == 0: # First syllable stress + return "MEM-o-ry, WON-der-ful, BEAU-ti-ful" + elif position_to_stress == 1: # Second syllable stress + return "a-MAZE-ing, to-GE-ther, for-EV-er" + else: # Third syllable stress + return "un-der-STAND, o-ver-COME, ne-ver-MORE" + + # For other cases, just provide general guidance + return f"a word with stress on syllable {position_to_stress + 1}" + def generate_lyrics(genre, duration, emotion_results, 
song_structure=None): - """Generate lyrics based on the genre, emotion, and structure analysis with enhanced rhythmic alignment.""" + """ + Generate lyrics based on the genre, emotion, and structure analysis with enhanced rhythmic alignment. + + This improved version uses advanced template creation, better formatting, and verification with + potential refinement for lyrics that perfectly match the musical rhythm patterns. + + Parameters: + genre: Musical genre of the audio + duration: Duration of the audio in seconds + emotion_results: Dictionary containing emotional analysis results + song_structure: Optional dictionary containing song structure analysis + + Returns: + Generated lyrics aligned with the rhythm patterns of the music + """ # Extract emotion and theme data from analysis results primary_emotion = emotion_results["emotion_analysis"]["primary_emotion"] primary_theme = emotion_results["theme_analysis"]["primary_theme"] @@ -705,22 +1718,25 @@ def generate_lyrics(genre, duration, emotion_results, song_structure=None): # Create a phrase structure for this segment segment_beats_info["phrases"] = [segment_beats] - # Generate enhanced template - enhanced_template = create_flexible_syllable_templates(segment_beats_info) + # Generate enhanced template with genre awareness and auto phrasing + enhanced_template = create_flexible_syllable_templates( + segment_beats_info, + genre=genre, + phrase_mode='auto' if i == 0 else 'default' + ) enhanced_templates.append(enhanced_template) templates_for_verification.append(enhanced_template) - # Format templates for the prompt + # Format templates with improved formatting syllable_guidance = "CRITICAL RHYTHM INSTRUCTIONS:\n" syllable_guidance += "Match each line exactly to this rhythm pattern (STRONG beats need stressed syllables):\n\n" - syllable_guidance += format_syllable_templates_for_prompt(enhanced_templates) + syllable_guidance += format_syllable_templates_for_prompt( + enhanced_templates, + arrow="→", + line_wrap=8 + ) 
- # Add explanation of notation - syllable_guidance += "\n\nWhere:\n" - syllable_guidance += "- STRONG(n): Place a STRESSED syllable here, followed by (n-1) unstressed syllables\n" - syllable_guidance += "- medium(n): Place a medium-stressed or unstressed syllable here, followed by (n-1) unstressed syllables\n" - syllable_guidance += "- weak(n): Place unstressed syllables here\n" - syllable_guidance += "- →: Indicates flow from one beat to the next within a line\n" + # Note: The enhanced formatter now automatically includes explanations # Fallback to traditional sections if needed elif "syllables" in song_structure and song_structure["syllables"]: @@ -746,11 +1762,19 @@ def generate_lyrics(genre, duration, emotion_results, song_structure=None): # Create a phrase structure for this section section_beats_info["phrases"] = [list(range(len(section_beats_info["beat_times"])))] - # Generate enhanced template - enhanced_template = create_flexible_syllable_templates(section_beats_info) + # Generate enhanced template with genre awareness + enhanced_template = create_flexible_syllable_templates( + section_beats_info, + genre=genre, + phrase_mode='auto' if section['type'] == 'verse' else 'default' + ) syllable_guidance += f"[{section['type'].capitalize()}]:\n" - syllable_guidance += format_syllable_templates_for_prompt(enhanced_template) + "\n\n" + syllable_guidance += format_syllable_templates_for_prompt( + enhanced_template, + arrow="→", + line_wrap=6 + ) + "\n\n" templates_for_verification.append(section) elif "syllable_count" in section: syllable_guidance += f"[{section['type'].capitalize()}]: ~{section['syllable_count']} syllables total\n" @@ -765,23 +1789,49 @@ def generate_lyrics(genre, duration, emotion_results, song_structure=None): syllable_guidance += " - Medium tempo (90-120 BPM): 6-8 syllables per line\n" syllable_guidance += " - Slow tempo (<90 BPM): 8-10 syllables per line\n" - # Add examples of syllable-beat alignment with stress patterns + # Add examples of 
syllable-beat alignment with enhanced format syllable_guidance += "\nEXAMPLES OF PERFECT RHYTHM ALIGNMENT:\n" - syllable_guidance += "Pattern: STRONG(1) → weak(1) → medium(1) → weak(1)\n" + syllable_guidance += "Pattern: S(0.95):1 → w(0.4):1 → m(0.7):1 → w(0.3):1\n" syllable_guidance += "Lyric: 'HEAR the MU-sic PLAY'\n" syllable_guidance += " ↑ ↑ ↑ ↑\n" syllable_guidance += " S w m w <- BEAT TYPE\n\n" - syllable_guidance += "Pattern: STRONG(2) → weak(1) → STRONG(1) → weak(2)\n" + syllable_guidance += "Pattern: S(0.9):2 → w(0.3):1 → S(0.85):1 → w(0.4):2\n" syllable_guidance += "Lyric: 'DANC-ing TO the RHYTHM of LOVE'\n" syllable_guidance += " ↑ ↑ ↑ ↑ ↑ ↑\n" syllable_guidance += " S S w S w w <- BEAT TYPE\n\n" - syllable_guidance += "Pattern: STRONG(1) → medium(2) → STRONG(1) → weak(1)\n" + syllable_guidance += "Pattern: S(0.92):1 → m(0.65):2 → S(0.88):1 → w(0.35):1\n" syllable_guidance += "Lyric: 'TIME keeps FLOW-ing ON and ON'\n" syllable_guidance += " ↑ ↑ ↑ ↑ ↑ ↑\n" syllable_guidance += " S m m S w w <- BEAT TYPE\n\n" + # Add genre-specific guidance based on the detected genre + genre_guidance = "" + if any(term in genre.lower() for term in ["rap", "hip-hop", "hip hop"]): + genre_guidance += "\nSPECIFIC GUIDANCE FOR RAP/HIP-HOP RHYTHMS:\n" + genre_guidance += "- Use more syllables per beat for rapid-fire sections\n" + genre_guidance += "- Create internal rhymes within lines, not just at line endings\n" + genre_guidance += "- Emphasize the first beat of each bar with strong consonants\n" + elif any(term in genre.lower() for term in ["electronic", "edm", "techno", "house", "dance"]): + genre_guidance += "\nSPECIFIC GUIDANCE FOR ELECTRONIC MUSIC RHYTHMS:\n" + genre_guidance += "- Use repetitive phrases that build and release tension\n" + genre_guidance += "- Match syllables precisely to the beat grid\n" + genre_guidance += "- Use short, percussive words on strong beats\n" + elif any(term in genre.lower() for term in ["rock", "metal", "punk", "alternative"]): + 
genre_guidance += "\nSPECIFIC GUIDANCE FOR ROCK RHYTHMS:\n" + genre_guidance += "- Use powerful, emotive words on downbeats\n" + genre_guidance += "- Create contrast between verse and chorus energy levels\n" + genre_guidance += "- Emphasize hooks with simple, memorable phrases\n" + elif any(term in genre.lower() for term in ["folk", "country", "acoustic", "ballad"]): + genre_guidance += "\nSPECIFIC GUIDANCE FOR FOLK/ACOUSTIC RHYTHMS:\n" + genre_guidance += "- Focus on storytelling with clear narrative flow\n" + genre_guidance += "- Use natural speech patterns that flow conversationally\n" + genre_guidance += "- Place important words at the start of phrases\n" + + # Add genre guidance to the main guidance + syllable_guidance += genre_guidance + # Determine if we should use traditional sections or not use_sections = True if song_structure and "flexible_structure" in song_structure and song_structure["flexible_structure"]: @@ -861,7 +1911,7 @@ def generate_lyrics(genre, duration, emotion_results, song_structure=None): # Create enhanced prompt with better rhythm alignment instructions if use_sections: # Traditional approach with sections - prompt = f""" + content = f""" You are a talented songwriter who specializes in {genre} music. Write original {genre} song lyrics for a song that is {duration:.1f} seconds long. @@ -879,6 +1929,14 @@ CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT: 3. Line breaks should occur at phrase endings for natural breathing 4. Consonant clusters should be avoided on fast notes and strong beats 5. Open vowels (a, e, o) work better for sustained notes and syllables +6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis) +7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels + +Think step by step about how to match words to the rhythm pattern: +1. First, identify the strong beats in each line pattern +2. 
Choose words where stressed syllables naturally fall on strong beats +3. Count syllables carefully to ensure they match the pattern precisely +4. Test your line against the pattern by mapping each syllable The lyrics should: - Perfectly capture the essence and style of {genre} music @@ -895,7 +1953,7 @@ Your lyrics: """ else: # Flexible approach without traditional sections - prompt = f""" + content = f""" You are a talented songwriter who specializes in {genre} music. Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long. @@ -913,6 +1971,14 @@ CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT: 3. Line breaks should occur at phrase endings for natural breathing 4. Consonant clusters should be avoided on fast notes and strong beats 5. Open vowels (a, e, o) work better for sustained notes and syllables +6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis) +7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels + +Think step by step about how to match words to the rhythm pattern: +1. First, identify the strong beats in each line pattern +2. Choose words where stressed syllables naturally fall on strong beats +3. Count syllables carefully to ensure they match the pattern precisely +4. 
Test your line against the pattern by mapping each syllable For perfect alignment examples: - "FEEL the RHY-thm in your SOUL" – stressed syllables on strong beats @@ -932,22 +1998,162 @@ Instead, write lyrics that flow naturally and match the music's rhythm precisely Your lyrics: """ + # Format as a chat message for the LLM + messages = [ + {"role": "user", "content": content} + ] + + # Apply chat template with thinking enabled + try: + # Try using the model-specific template with thinking enabled + text = llm_tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + enable_thinking=True # Only works with models that support thinking mode + ) + except Exception as e: + # Fallback to standard template if thinking mode not supported + print(f"Thinking mode not supported, using standard template: {str(e)}") + text = llm_tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True + ) + # Generate lyrics using the LLM - response = llm_pipeline( - prompt, - do_sample=True, - temperature=0.7, - top_p=0.9, - repetition_penalty=1.1, - return_full_text=False + model_inputs = llm_tokenizer([text], return_tensors="pt").to(llm_model.device) + + # Configure generation parameters based on model capability + generation_params = { + "do_sample": True, + "temperature": 0.6, # Lower for more consistent rhythm alignment + "top_p": 0.95, + "top_k": 20, + "repetition_penalty": 1.2, + "max_new_tokens": 1024 # Allow more tokens for comprehensive lyrics + } + + # Generate output + generated_ids = llm_model.generate( + **model_inputs, + **generation_params ) - # Extract and clean generated lyrics - lyrics = response[0]["generated_text"].strip() + # Extract output tokens + output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() - # Verify syllable counts if we have templates + # Try to find token to separate thinking from final answer if the model supports it + try: + # Look for thinking mode tokens - check 
model-specific token IDs + # For Qwen3, the token ID is 151668 + think_end_tokens = { + "qwen": 151668, # Qwen token + "claude": 42, # Example for Claude (placeholder) + "llama": 128001 # Example for Llama (placeholder) + } + + # Try to find a known token + found_token = None + token_position = 0 + + for model_name, token_id in think_end_tokens.items(): + if token_id in output_ids: + found_token = token_id + token_position = len(output_ids) - output_ids[::-1].index(token_id) + break + + # Use the position of the thinking token if found + if found_token: + lyrics = llm_tokenizer.decode(output_ids[token_position:], skip_special_tokens=True).strip() + else: + lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip() + except (ValueError, IndexError, AttributeError) as e: + print(f"Error processing thinking output: {str(e)}") + # Default behavior if thinking mode processing fails + lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip() + + # Verify syllable counts with enhanced verification if templates_for_verification: - lyrics = verify_flexible_syllable_counts(lyrics, templates_for_verification) + verified_lyrics = verify_flexible_syllable_counts(lyrics, templates_for_verification) + + # Check if significant issues were detected + if "[Note: Potential rhythm mismatches" in verified_lyrics and "Detailed Alignment Analysis" in verified_lyrics: + # Extract the original lyrics (before the notes section) + original_lyrics = lyrics.split("[Note:")[0].strip() + + # Extract the analysis + analysis = verified_lyrics.split("[Note:")[1] + + # If we have serious alignment issues, consider a refinement step + if "stress misalignments" in analysis and len(templates_for_verification) > 0: + # Add a refinement prompt with the specific analysis + refinement_prompt = f""" +You need to fix rhythm issues in these lyrics. 
Here's the analysis of the problems: + +{analysis} + +Revise the lyrics to perfectly match the rhythm pattern while maintaining the theme. +Focus on fixing the stress misalignments by placing stressed syllables on STRONG beats. + +Original lyrics: +{original_lyrics} + +Improved lyrics with fixed rhythm: +""" + # Format as a chat message for refinement + refinement_messages = [ + {"role": "user", "content": refinement_prompt} + ] + + # Use standard template for refinement (no thinking mode needed) + refinement_text = llm_tokenizer.apply_chat_template( + refinement_messages, + tokenize=False, + add_generation_prompt=True + ) + + try: + # Generate refined lyrics with more focus on rhythm alignment + refinement_inputs = llm_tokenizer([refinement_text], return_tensors="pt").to(llm_model.device) + + # Use stricter parameters for refinement + refinement_params = { + "do_sample": True, + "temperature": 0.4, # Lower temperature for more precise refinement + "top_p": 0.9, + "repetition_penalty": 1.3, + "max_new_tokens": 1024 + } + + refined_ids = llm_model.generate( + **refinement_inputs, + **refinement_params + ) + + # Extract refined lyrics + refined_output_ids = refined_ids[0][len(refinement_inputs.input_ids[0]):].tolist() + refined_lyrics = llm_tokenizer.decode(refined_output_ids, skip_special_tokens=True).strip() + + # Verify the refined lyrics + refined_verified_lyrics = verify_flexible_syllable_counts(refined_lyrics, templates_for_verification) + + # Only use refined lyrics if they're better (fewer notes) + if "[Note: Potential rhythm mismatches" not in refined_verified_lyrics: + lyrics = refined_lyrics + elif refined_verified_lyrics.count("misalignments") < verified_lyrics.count("misalignments"): + lyrics = refined_verified_lyrics + else: + lyrics = verified_lyrics + except Exception as e: + print(f"Error in lyrics refinement: {str(e)}") + lyrics = verified_lyrics + else: + # Minor issues, just use the verification notes + lyrics = verified_lyrics + else: + # No 
significant issues detected + lyrics = verified_lyrics # Add section labels if they're not present and we're using the traditional approach if use_sections and "Verse" not in lyrics and "Chorus" not in lyrics: @@ -972,17 +2178,29 @@ Your lyrics: lyrics = '\n'.join(formatted_lyrics) + # Clean up the output if there are analytical notes + if "[Note: Potential rhythm mismatches" in lyrics and "[How to fix rhythm mismatches" in lyrics: + # Optionally separate the analysis from the final lyrics for cleaner display + clean_lyrics = lyrics.split("[Note:")[0].strip() + analysis_notes = lyrics.split("[Note:")[1] + + # For now, keep the full output with notes for debugging + # In a production system, you might want to handle this differently + lyrics = lyrics + return lyrics def process_audio(audio_file): - """Main function to process audio file, classify genre, and generate lyrics.""" + """Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis.""" if audio_file is None: return "Please upload an audio file.", None, None try: + print("Step 1/5: Extracting audio features...") # Extract audio features audio_data = extract_audio_features(audio_file) + print("Step 2/5: Verifying audio contains music...") # First check if it's music try: is_music, ast_results = detect_music(audio_data) @@ -993,6 +2211,7 @@ def process_audio(audio_file): if not is_music: return "The uploaded audio does not appear to be music. 
Please upload a music file.", None, ast_results + print("Step 3/5: Classifying music genre...") # Classify genre try: top_genres = classify_genre(audio_data) @@ -1002,6 +2221,7 @@ def process_audio(audio_file): print(f"Error in genre classification: {str(e)}") return f"Error in genre classification: {str(e)}", None, ast_results + print("Step 4/5: Analyzing music emotions, themes, and structure...") # Analyze music emotions and themes try: emotion_results = music_analyzer.analyze_music(audio_file) @@ -1024,6 +2244,7 @@ def process_audio(audio_file): # Continue with a simpler approach if this fails song_structure = None + print("Step 5/5: Generating rhythmically aligned lyrics...") # Generate lyrics based on top genre, emotion analysis, and song structure try: primary_genre, _ = top_genres[0] @@ -1032,40 +2253,104 @@ def process_audio(audio_file): print(f"Error generating lyrics: {str(e)}") lyrics = f"Error generating lyrics: {str(e)}" - return genre_results, lyrics, ast_results + # Prepare results dictionary with additional rhythm analysis + results = { + "genre_results": genre_results, + "lyrics": lyrics, + "ast_results": ast_results + } + + # Extract rhythm analysis if present in the lyrics + if isinstance(lyrics, str) and "[Note: Potential rhythm mismatches" in lyrics: + clean_lyrics = lyrics.split("[Note:")[0].strip() + rhythm_analysis = "[Note:" + lyrics.split("[Note:")[1] + results["clean_lyrics"] = clean_lyrics + results["rhythm_analysis"] = rhythm_analysis + + return results except Exception as e: error_msg = f"Error processing audio: {str(e)}" print(error_msg) return error_msg, None, [] -# Create Gradio interface +# Create enhanced Gradio interface with tabs for better organization with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo: gr.Markdown("# Music Genre Classifier & Lyrics Generator") - gr.Markdown("Upload a music file to classify its genre, analyze its emotions, and generate matching lyrics.") + gr.Markdown("Upload a music file 
to classify its genre, analyze its emotions, and generate perfectly aligned lyrics.") with gr.Row(): - with gr.Column(): + with gr.Column(scale=1): audio_input = gr.Audio(label="Upload Music", type="filepath") - submit_btn = gr.Button("Analyze & Generate") + submit_btn = gr.Button("Analyze & Generate", variant="primary") + + # Add genre info box + with gr.Accordion("About Music Genres", open=False): + gr.Markdown(""" + The system recognizes various music genres including: + - Pop, Rock, Hip-Hop, R&B + - Electronic, Dance, Techno, House + - Jazz, Blues, Classical + - Folk, Country, Acoustic + - Metal, Punk, Alternative + - And many others! + + For best results, use high-quality audio files (MP3, WAV, FLAC) with at least 10 seconds of music. + """) - with gr.Column(): - genre_output = gr.Textbox(label="Detected Genres", lines=5) - emotion_output = gr.Textbox(label="Emotion Analysis", lines=5) - ast_output = gr.Textbox(label="Audio Classification Results (AST)", lines=5) - lyrics_output = gr.Textbox(label="Generated Lyrics", lines=15) + with gr.Column(scale=2): + # Use tabs for better organization of outputs + with gr.Tabs(): + with gr.TabItem("Analysis Results"): + genre_output = gr.Textbox(label="Detected Genres", lines=4) + + # Create 2 columns for emotion and audio classification + with gr.Row(): + with gr.Column(): + emotion_output = gr.Textbox(label="Emotion & Structure Analysis", lines=8) + with gr.Column(): + ast_output = gr.Textbox(label="Audio Classification", lines=8) + + with gr.TabItem("Generated Lyrics"): + lyrics_output = gr.Textbox(label="Lyrics", lines=18) + + with gr.TabItem("Rhythm Analysis"): + rhythm_analysis_output = gr.Textbox(label="Syllable-Beat Alignment Analysis", lines=16) + # Processing function with better handling of results def display_results(audio_file): if audio_file is None: - return "Please upload an audio file.", "No emotion analysis available.", "No audio classification available.", None + return "Please upload an audio file.", 
"No emotion analysis available.", "No audio classification available.", "No lyrics generated.", "No rhythm analysis available." try: - # Process audio and get genre, lyrics, and AST results - genre_results, lyrics, ast_results = process_audio(audio_file) + # Process audio and get results + results = process_audio(audio_file) # Check if we got an error message instead of results - if isinstance(genre_results, str) and genre_results.startswith("Error"): - return genre_results, "Error in emotion analysis", "Error in audio classification", None + if isinstance(results, str) and "Error" in results: + return results, "Error in analysis", "Error in classification", "No lyrics generated", "No rhythm analysis available" + elif isinstance(results, tuple) and isinstance(results[0], str) and "Error" in results[0]: + return results[0], "Error in analysis", "Error in classification", "No lyrics generated", "No rhythm analysis available" + + # For backwards compatibility, handle both dictionary and tuple returns + if isinstance(results, dict): + genre_results = results.get("genre_results", "Genre classification failed") + lyrics = results.get("lyrics", "Lyrics generation failed") + ast_results = results.get("ast_results", []) + + # Use clean lyrics if available + clean_lyrics = results.get("clean_lyrics", lyrics) + rhythm_analysis = results.get("rhythm_analysis", "No detailed rhythm analysis available") + else: + # Handle the old tuple return format + genre_results, lyrics, ast_results = results + clean_lyrics = lyrics + + # Extract rhythm analysis if present + rhythm_analysis = "No detailed rhythm analysis available" + if isinstance(lyrics, str) and "[Note: Potential rhythm mismatches" in lyrics: + clean_lyrics = lyrics.split("[Note:")[0].strip() + rhythm_analysis = "[Note:" + lyrics.split("[Note:")[1] # Format emotion analysis results try: @@ -1097,7 +2382,7 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo: emotion_text += "\nDetailed Rhythm 
Analysis:\n" for i, segment in enumerate(flexible["segments"][:5]): # Show first 5 segments emotion_text += f"- Segment {i+1}: {segment['start']:.1f}s to {segment['end']:.1f}s, " - emotion_text += f"pattern: {segment['syllable_template']}\n" + emotion_text += f"pattern: {segment.get('syllable_template', 'N/A')}\n" if len(flexible["segments"]) > 5: emotion_text += f" (+ {len(flexible['segments']) - 5} more segments)\n" @@ -1112,34 +2397,63 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo: # Format AST classification results if ast_results and isinstance(ast_results, list): - ast_text = "Audio Classification Results (AST Model):\n" + ast_text = "Audio Classification Results:\n" for result in ast_results[:5]: # Show top 5 results ast_text += f"{result['label']}: {result['score']*100:.2f}%\n" else: ast_text = "No valid audio classification results available." - return genre_results, emotion_text, ast_text, lyrics + # Return all results for the tabbed interface + return genre_results, emotion_text, ast_text, clean_lyrics, rhythm_analysis + except Exception as e: error_msg = f"Error: {str(e)}" print(error_msg) - return error_msg, "Error in emotion analysis", "Error in audio classification", None + return error_msg, "Error in emotion analysis", "Error in audio classification", "No lyrics generated", "No rhythm analysis available" + # Connect the button to the display function submit_btn.click( fn=display_results, inputs=[audio_input], - outputs=[genre_output, emotion_output, ast_output, lyrics_output] + outputs=[genre_output, emotion_output, ast_output, lyrics_output, rhythm_analysis_output] ) - gr.Markdown("### How it works") - gr.Markdown(""" - 1. Upload an audio file of your choice - 2. The system will classify the genre using the dima806/music_genres_classification model - 3. The system will analyze the musical emotion and theme using advanced audio processing - 4. The system will identify the song structure, beats, and timing patterns - 5. 
The system will create syllable templates that precisely match the rhythm of the music - 6. Based on the detected genre, emotion, and syllable templates, it will generate lyrics that align perfectly with the beats - 7. The system verifies syllable counts to ensure the generated lyrics can be sung naturally with the music - """) + # Enhanced explanation of how the system works + with gr.Accordion("How it works", open=False): + gr.Markdown(""" + ## Advanced Lyrics Generation Process + + 1. **Audio Analysis**: The system analyzes your uploaded music file using multiple machine learning models. + + 2. **Genre Classification**: A specialized neural network identifies the musical genre, detecting subtle patterns in the audio. + + 3. **Emotional Analysis**: The system examines harmonic, rhythmic, and timbral features to determine the emotional qualities of the music. + + 4. **Rhythm Mapping**: Advanced beat detection algorithms create a detailed rhythmic map of the music, identifying: + - Strong and weak beats + - Natural phrase boundaries + - Time signature and tempo variations + + 5. **Syllable Template Creation**: For each musical phrase, the system generates precise syllable templates that reflect: + - Beat stress patterns (strong, medium, weak) + - Appropriate syllable counts based on tempo + - Genre-specific rhythmic qualities + + 6. **Lyrics Generation**: Using the detected genre, emotion, and rhythm patterns, a large language model generates lyrics that: + - Match the emotional quality of the music + - Follow the precise syllable templates + - Align stressed syllables with strong beats + - Maintain genre-appropriate style and themes + + 7. **Rhythm Verification**: The system verifies the generated lyrics, analyzing: + - Syllable count accuracy + - Stress alignment with strong beats + - Word stress patterns + + 8. **Refinement**: If significant rhythm mismatches are detected, the system can automatically refine the lyrics for better alignment. 
+ + This multi-step process creates lyrics that feel naturally connected to the music, as if they were written specifically for it. + """) # Launch the app demo.launch() \ No newline at end of file