|
import os |
|
import io |
|
import gradio as gr |
|
import torch |
|
import numpy as np |
|
import re |
|
import pronouncing |
|
import functools |
|
from transformers import ( |
|
AutoModelForAudioClassification, |
|
AutoFeatureExtractor, |
|
AutoTokenizer, |
|
pipeline, |
|
AutoModelForCausalLM, |
|
BitsAndBytesConfig |
|
) |
|
from huggingface_hub import login |
|
from utils import ( |
|
load_audio, |
|
extract_audio_duration, |
|
extract_mfcc_features, |
|
calculate_lyrics_length, |
|
format_genre_results, |
|
ensure_cuda_availability, |
|
preprocess_audio_for_model |
|
) |
|
from emotionanalysis import MusicAnalyzer |
|
import librosa |
|
|
|
|
|
if "HF_TOKEN" in os.environ: |
|
login(token=os.environ["HF_TOKEN"]) |
|
|
|
|
|
GENRE_MODEL_NAME = "dima806/music_genres_classification" |
|
MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593" |
|
LLM_MODEL_NAME = "Qwen/Qwen3-14B" |
|
SAMPLE_RATE = 22050 |
|
|
|
|
|
CUDA_AVAILABLE = ensure_cuda_availability() |
|
|
|
|
|
print(f"Loading music detection model: {MUSIC_DETECTION_MODEL}") |
|
try: |
|
music_detector = pipeline( |
|
"audio-classification", |
|
model=MUSIC_DETECTION_MODEL, |
|
device=0 if CUDA_AVAILABLE else -1 |
|
) |
|
print("Successfully loaded music detection pipeline") |
|
except Exception as e: |
|
print(f"Error creating music detection pipeline: {str(e)}") |
|
|
|
try: |
|
music_processor = AutoFeatureExtractor.from_pretrained(MUSIC_DETECTION_MODEL) |
|
music_model = AutoModelForAudioClassification.from_pretrained(MUSIC_DETECTION_MODEL) |
|
print("Successfully loaded music detection model and feature extractor") |
|
except Exception as e2: |
|
print(f"Error loading music detection model components: {str(e2)}") |
|
raise RuntimeError(f"Could not load music detection model: {str(e2)}") |
|
|
|
|
|
print(f"Loading audio classification model: {GENRE_MODEL_NAME}") |
|
try: |
|
genre_classifier = pipeline( |
|
"audio-classification", |
|
model=GENRE_MODEL_NAME, |
|
device=0 if CUDA_AVAILABLE else -1 |
|
) |
|
print("Successfully loaded audio classification pipeline") |
|
except Exception as e: |
|
print(f"Error creating pipeline: {str(e)}") |
|
|
|
try: |
|
genre_processor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME) |
|
genre_model = AutoModelForAudioClassification.from_pretrained(GENRE_MODEL_NAME) |
|
print("Successfully loaded audio classification model and feature extractor") |
|
except Exception as e2: |
|
print(f"Error loading model components: {str(e2)}") |
|
raise RuntimeError(f"Could not load genre classification model: {str(e2)}") |
|
|
|
|
|
bnb_config = BitsAndBytesConfig( |
|
load_in_4bit=True, |
|
bnb_4bit_quant_type="nf4", |
|
bnb_4bit_compute_dtype=torch.float16, |
|
) |
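# NF4 4-bit quantization with float16 compute keeps the 14B model within a
# single-GPU memory budget. Rough illustration (assumed, not measured):
# 14B params at ~0.5 bytes each is roughly 7 GB of weights, versus ~28 GB
# at plain float16.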
|
|
|
llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME) |
|
llm_model = AutoModelForCausalLM.from_pretrained( |
|
LLM_MODEL_NAME, |
|
device_map="auto", |
|
quantization_config=bnb_config, |
|
torch_dtype=torch.float16, |
|
) |
|
|
|
|
|
llm_pipeline = pipeline( |
|
"text-generation", |
|
model=llm_model, |
|
tokenizer=llm_tokenizer, |
|
max_new_tokens=512, |
|
) |
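# Note (assumption): 512 new tokens may truncate longer outputs, since the
# prompt below asks for full lyrics plus a [RHYTHM_ANALYSIS_SECTION];
# raise max_new_tokens if generations come back cut off.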
|
|
|
|
|
music_analyzer = MusicAnalyzer() |
|
|
|
|
|
def count_syllables(text):
    """Count syllables in a text using the pronouncing (CMU dictionary)
    library, with a vowel-group heuristic fallback for words the
    dictionary does not cover."""
|
words = re.findall(r'\b[a-zA-Z]+\b', text.lower()) |
|
syllable_count = 0 |
|
|
|
for word in words: |
|
|
|
pronunciations = pronouncing.phones_for_word(word) |
|
if pronunciations: |
|
|
|
syllable_count += pronouncing.syllable_count(pronunciations[0]) |
|
else: |
|
|
|
vowels = "aeiouy" |
|
count = 0 |
|
prev_is_vowel = False |
|
|
|
for char in word: |
|
is_vowel = char.lower() in vowels |
|
if is_vowel and not prev_is_vowel: |
|
count += 1 |
|
prev_is_vowel = is_vowel |
|
|
|
if word.endswith('e'): |
|
count -= 1 |
|
if word.endswith('le') and len(word) > 2 and word[-3] not in vowels: |
|
count += 1 |
|
if count == 0: |
|
count = 1 |
|
|
|
syllable_count += count |
|
|
|
return syllable_count |
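# Illustrative examples (not executed):
#   count_syllables("hello world")     -> 3  ("hel-lo" + "world")
#   count_syllables("beautiful music") -> 5  ("beau-ti-ful" + "mu-sic")
# The vowel-group heuristic only applies to words missing from the CMU
# dictionary used by `pronouncing`.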
|
|
|
def extract_audio_features(audio_file): |
|
"""Extract audio features from an audio file.""" |
|
try: |
|
|
|
y, sr = load_audio(audio_file, SAMPLE_RATE) |
|
|
|
if y is None or sr is None: |
|
raise ValueError("Failed to load audio data") |
|
|
|
|
|
duration = extract_audio_duration(y, sr) |
|
|
|
|
|
mfccs_mean = extract_mfcc_features(y, sr, n_mfcc=20) |
|
|
|
return { |
|
"features": mfccs_mean, |
|
"duration": duration, |
|
"waveform": y, |
|
"sample_rate": sr, |
|
"path": audio_file |
|
} |
|
except Exception as e: |
|
print(f"Error extracting audio features: {str(e)}") |
|
raise ValueError(f"Failed to extract audio features: {str(e)}") |
|
|
|
def classify_genre(audio_data): |
|
"""Classify the genre of the audio using the loaded model.""" |
|
try: |
|
|
|
if 'genre_classifier' in globals(): |
|
results = genre_classifier(audio_data["path"]) |
|
|
|
top_genres = [(result["label"], result["score"]) for result in results[:3]] |
|
return top_genres |
|
|
|
|
|
elif 'genre_processor' in globals() and 'genre_model' in globals(): |
|
|
|
inputs = genre_processor( |
|
audio_data["waveform"], |
|
sampling_rate=audio_data["sample_rate"], |
|
return_tensors="pt" |
|
) |
|
|
|
with torch.no_grad(): |
|
outputs = genre_model(**inputs) |
|
predictions = outputs.logits.softmax(dim=-1) |
|
|
|
|
|
values, indices = torch.topk(predictions, 3) |
|
|
|
|
|
genre_labels = genre_model.config.id2label |
|
|
|
top_genres = [] |
|
for i, (value, index) in enumerate(zip(values[0], indices[0])): |
|
genre = genre_labels[index.item()] |
|
confidence = value.item() |
|
top_genres.append((genre, confidence)) |
|
|
|
return top_genres |
|
|
|
else: |
|
raise ValueError("No genre classification model available") |
|
|
|
except Exception as e: |
|
print(f"Error in genre classification: {str(e)}") |
|
|
|
return [("rock", 1.0)] |
|
|
|
def detect_music(audio_data): |
|
"""Detect if the audio is music using the MIT AST model.""" |
|
try: |
|
|
|
if 'music_detector' in globals(): |
|
results = music_detector(audio_data["path"]) |
|
|
|
music_confidence = 0.0 |
|
for result in results: |
|
label = result["label"].lower() |
|
if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]): |
|
music_confidence = max(music_confidence, result["score"]) |
|
return music_confidence >= 0.2, results |
|
|
|
|
|
elif 'music_processor' in globals() and 'music_model' in globals(): |
|
|
|
inputs = music_processor( |
|
audio_data["waveform"], |
|
sampling_rate=audio_data["sample_rate"], |
|
return_tensors="pt" |
|
) |
|
|
|
with torch.no_grad(): |
|
outputs = music_model(**inputs) |
|
predictions = outputs.logits.softmax(dim=-1) |
|
|
|
|
|
values, indices = torch.topk(predictions, 5) |
|
|
|
|
|
labels = music_model.config.id2label |
|
|
|
|
|
music_confidence = 0.0 |
|
results = [] |
|
|
|
for i, (value, index) in enumerate(zip(values[0], indices[0])): |
|
label = labels[index.item()].lower() |
|
score = value.item() |
|
results.append({"label": label, "score": score}) |
|
|
|
if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]): |
|
music_confidence = max(music_confidence, score) |
|
|
|
return music_confidence >= 0.2, results |
|
|
|
else: |
|
raise ValueError("No music detection model available") |
|
|
|
except Exception as e: |
|
print(f"Error in music detection: {str(e)}") |
|
return False, [] |
|
|
|
def detect_beats(y, sr): |
|
"""Enhanced beat detection with adaptive threshold analysis and improved time signature detection.""" |
|
|
|
|
|
    # Sanitize NaNs/infs without rectifying the waveform (np.clip with a
    # positive floor would discard every negative sample)
    y = np.nan_to_num(y)
|
|
|
|
|
y_harmonic, y_percussive = librosa.effects.hpss(y) |
|
|
|
|
|
onset_env_full = librosa.onset.onset_strength(y=y, sr=sr) |
|
onset_env_perc = librosa.onset.onset_strength(y=y_percussive, sr=sr) |
|
|
|
|
|
onset_env_full = np.maximum(onset_env_full, 1e-6) |
|
onset_env_perc = np.maximum(onset_env_perc, 1e-6) |
|
|
|
|
|
combined_onset = onset_env_full * 0.3 + onset_env_perc * 0.7 |
|
|
|
|
|
tempo_candidates = [] |
|
beat_candidates = [] |
|
|
|
|
|
tempo1, beats1 = librosa.beat.beat_track( |
|
onset_envelope=combined_onset, |
|
sr=sr, |
|
tightness=100 |
|
) |
|
tempo_candidates.append(tempo1) |
|
beat_candidates.append(beats1) |
|
|
|
|
|
    tempo2, beats2 = librosa.beat.beat_track(
        onset_envelope=combined_onset,
        sr=sr,
        tightness=100,
        start_bpm=60  # bias toward slower tempi; beat_track has no std_bpm parameter
    )
|
tempo_candidates.append(tempo2) |
|
beat_candidates.append(beats2) |
|
|
|
|
|
beat_consistency = [] |
|
for beats in beat_candidates: |
|
if len(beats) <= 1: |
|
beat_consistency.append(0) |
|
continue |
|
|
|
times = librosa.frames_to_time(beats, sr=sr) |
|
intervals = np.diff(times) |
|
|
|
|
|
if np.mean(intervals) > 0: |
|
consistency = 1.0 / (1.0 + np.std(intervals)/np.mean(intervals)) |
|
beat_consistency.append(consistency) |
|
else: |
|
beat_consistency.append(0) |
|
|
|
best_idx = np.argmax(beat_consistency) if beat_consistency else 0 |
|
tempo = tempo_candidates[best_idx] |
|
beat_frames = beat_candidates[best_idx] |
|
|
|
|
|
beat_times = librosa.frames_to_time(beat_frames, sr=sr) |
|
|
|
|
|
beat_strengths = [] |
|
if len(beat_frames) > 0: |
|
|
|
valid_frames = [frame for frame in beat_frames if frame < len(combined_onset)] |
|
if valid_frames: |
|
|
|
beat_strengths = combined_onset[valid_frames].tolist() |
|
|
|
|
|
avg_strength = np.mean(beat_strengths) if beat_strengths else 1.0 |
|
beat_strengths.extend([avg_strength] * (len(beat_times) - len(beat_strengths))) |
|
else: |
|
beat_strengths = [1.0] * len(beat_times) |
|
else: |
|
beat_strengths = [1.0] * len(beat_times) |
|
|
|
|
|
intervals = np.diff(beat_times).tolist() if len(beat_times) > 1 else [] |
|
|
|
|
|
|
|
time_signature = 4 |
|
|
|
if len(beat_strengths) > 8: |
|
|
|
if len(beat_strengths) > 4: |
|
|
|
norm_strengths = np.array(beat_strengths) |
|
if np.max(norm_strengths) > 0: |
|
norm_strengths = norm_strengths / np.max(norm_strengths) |
|
|
|
|
|
ac = librosa.autocorrelate(norm_strengths, max_size=len(norm_strengths)//2) |
|
|
|
|
|
if len(ac) > 3: |
|
|
|
peaks = librosa.util.peak_pick(ac[1:], pre_max=1, post_max=1, pre_avg=1, post_avg=1, delta=0.1, wait=1) |
|
peaks = peaks + 1 |
|
|
|
if len(peaks) > 0: |
|
|
|
N = peaks[0] |
|
|
|
|
|
if 2 <= N <= 3: |
|
time_signature = N |
|
elif N == 6: |
|
time_signature = 3 |
|
elif N == 8: |
|
time_signature = 4 |
|
elif N == 5 or N == 7: |
|
time_signature = N |
|
|
|
|
|
|
|
if len(beat_strengths) > 3: |
|
|
|
strengths_array = np.array(beat_strengths) |
|
mean_strength = np.mean(strengths_array) |
|
std_strength = np.std(strengths_array) |
|
|
|
if std_strength > 0: |
|
z_scores = (strengths_array - mean_strength) / std_strength |
|
|
|
|
|
strong_beat_pattern = [] |
|
for i in range(0, len(z_scores) - 2, 3): |
|
|
|
|
|
if z_scores[i] > 1 and z_scores[i+1] < 0.5 and z_scores[i+2] < 0.5: |
|
strong_beat_pattern.append(1) |
|
else: |
|
strong_beat_pattern.append(0) |
|
|
|
|
|
if strong_beat_pattern and len(strong_beat_pattern) >= 3: |
|
three_pattern_probability = sum(strong_beat_pattern) / len(strong_beat_pattern) |
|
if three_pattern_probability > 0.6: |
|
time_signature = 3 |
|
|
|
|
|
phrases = [] |
|
current_phrase = [] |
|
|
|
if len(beat_times) > 0: |
|
|
|
if len(beat_strengths) > 4: |
|
|
|
strong_threshold = np.percentile(beat_strengths, 75) |
|
|
|
if intervals: |
|
mean_interval = np.mean(intervals) |
|
std_interval = np.std(intervals) |
|
|
|
significant_gap = mean_interval + (1.5 * std_interval) if std_interval > 0 else mean_interval * 1.3 |
|
else: |
|
significant_gap = 0 |
|
else: |
|
|
|
strong_threshold = np.max(beat_strengths) * 0.8 if beat_strengths else 1.0 |
|
significant_gap = 0 |
|
|
|
|
|
for i in range(len(beat_times)): |
|
current_phrase.append(i) |
|
|
|
|
|
if i < len(beat_times) - 1: |
|
|
|
is_stronger_next = False |
|
if i < len(beat_strengths) - 1: |
|
is_stronger_next = beat_strengths[i+1] > strong_threshold and beat_strengths[i+1] > beat_strengths[i] * 1.1 |
|
|
|
|
|
is_longer_gap = False |
|
if i < len(beat_times) - 1 and intervals and i < len(intervals): |
|
is_longer_gap = intervals[i] > significant_gap |
|
|
|
|
|
is_measure_boundary = (i + 1) % time_signature == 0 and i > 0 |
|
|
|
|
|
if ((is_stronger_next or is_longer_gap) and len(current_phrase) >= 2) or \ |
|
(is_measure_boundary and len(current_phrase) >= time_signature): |
|
phrases.append(current_phrase) |
|
current_phrase = [] |
|
|
|
|
|
if current_phrase and len(current_phrase) >= 2: |
|
phrases.append(current_phrase) |
|
|
|
|
|
if not phrases and len(beat_times) >= 2: |
|
|
|
for i in range(0, len(beat_times), time_signature): |
|
end = min(i + time_signature, len(beat_times)) |
|
if end - i >= 2: |
|
phrases.append(list(range(i, end))) |
|
|
|
|
|
return { |
|
"tempo": tempo, |
|
"beat_frames": beat_frames, |
|
"beat_times": beat_times, |
|
"beat_count": len(beat_times), |
|
"beat_strengths": beat_strengths, |
|
"intervals": intervals, |
|
"time_signature": time_signature, |
|
"phrases": phrases |
|
} |
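# Minimal usage sketch (file path and values are illustrative):
#   y, sr = load_audio("song.wav", SAMPLE_RATE)
#   info = detect_beats(y, sr)
#   info["tempo"]           # e.g. 120.3 (BPM)
#   info["time_signature"]  # e.g. 4
#   info["phrases"]         # e.g. [[0, 1, 2, 3], [4, 5, 6, 7], ...]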
|
|
|
def detect_sections(y, sr): |
|
""" |
|
Advanced detection of musical sections with adaptive segmentation and improved classification. |
|
|
|
Parameters: |
|
y: Audio time series |
|
sr: Sample rate |
|
|
|
Returns: |
|
A list of section dictionaries with type, start time, end time, and duration |
|
""" |
|
|
|
|
|
hop_length = 512 |
|
|
|
|
|
S = np.abs(librosa.stft(y, hop_length=hop_length)) |
|
contrast = librosa.feature.spectral_contrast(S=S, sr=sr) |
|
|
|
|
|
chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length) |
|
|
|
|
|
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length) |
|
|
|
|
|
rms = librosa.feature.rms(y=y, hop_length=hop_length) |
|
|
|
|
|
y_harmonic, y_percussive = librosa.effects.hpss(y) |
|
percussive_rms = librosa.feature.rms(y=y_percussive, hop_length=hop_length) |
|
|
|
|
|
|
|
duration = librosa.get_duration(y=y, sr=sr) |
|
|
|
|
|
|
|
feature_stack = np.vstack([ |
|
librosa.util.normalize(contrast), |
|
librosa.util.normalize(chroma), |
|
librosa.util.normalize(mfcc), |
|
librosa.util.normalize(rms) |
|
]) |
|
|
|
|
|
feature_matrix = feature_stack.T |
|
|
|
|
|
|
|
|
|
|
|
from sklearn.decomposition import PCA |
|
|
|
|
|
n_components = min(8, feature_matrix.shape[0], feature_matrix.shape[1]) |
|
|
|
if feature_matrix.shape[0] > n_components and feature_matrix.shape[1] > 0: |
|
try: |
|
pca = PCA(n_components=n_components) |
|
reduced_features = pca.fit_transform(feature_matrix) |
|
except Exception as e: |
|
print(f"PCA failed, falling back to original features: {e}") |
|
|
|
reduced_features = feature_matrix |
|
else: |
|
|
|
reduced_features = feature_matrix |
|
|
|
|
|
|
|
|
|
|
|
min_segments = max(2, int(duration / 60)) |
|
max_segments = min(10, int(duration / 20)) |
|
|
|
|
|
min_segments = max(2, min(min_segments, 4)) |
|
max_segments = max(min_segments + 1, min(max_segments, 8)) |
|
|
|
|
|
best_segments = min_segments |
|
best_score = -1 |
|
|
|
from sklearn.metrics import silhouette_score |
|
from sklearn.cluster import AgglomerativeClustering |
|
|
|
|
|
if reduced_features.shape[0] > max_segments: |
|
for n_segments in range(min_segments, max_segments + 1): |
|
try: |
|
|
|
clustering = AgglomerativeClustering(n_clusters=n_segments) |
|
labels = clustering.fit_predict(reduced_features) |
|
|
|
|
|
if len(np.unique(labels)) > 1 and len(labels) > n_segments + 1: |
|
score = silhouette_score(reduced_features, labels) |
|
|
|
if score > best_score: |
|
best_score = score |
|
best_segments = n_segments |
|
except Exception as e: |
|
print(f"Clustering with {n_segments} segments failed: {e}") |
|
continue |
|
|
|
|
|
n_segments = best_segments |
|
|
|
|
|
|
|
|
|
|
|
try: |
|
clustering = AgglomerativeClustering(n_clusters=n_segments) |
|
labels = clustering.fit_predict(reduced_features) |
|
|
|
|
|
boundaries = [0] |
|
|
|
for i in range(1, len(labels)): |
|
if labels[i] != labels[i-1]: |
|
boundaries.append(i) |
|
|
|
boundaries.append(len(labels)) |
|
|
|
|
|
bounds_frames = np.array(boundaries) |
|
|
|
except Exception as e: |
|
print(f"Final clustering failed: {e}") |
|
|
|
bounds_frames = librosa.segment.agglomerative(feature_stack, n_segments) |
|
|
|
|
|
|
|
|
|
|
|
tonnetz = librosa.feature.tonnetz(y=y_harmonic, sr=sr) |
|
|
|
|
|
harmonic_changes = [] |
|
|
|
if tonnetz.shape[1] > 1: |
|
tonnetz_diff = np.sum(np.abs(np.diff(tonnetz, axis=1)), axis=0) |
|
|
|
if np.max(tonnetz_diff) > 0: |
|
tonnetz_diff = tonnetz_diff / np.max(tonnetz_diff) |
|
|
|
|
|
threshold = np.percentile(tonnetz_diff, 90) |
|
for i in range(len(tonnetz_diff)): |
|
if tonnetz_diff[i] > threshold: |
|
harmonic_changes.append(i) |
|
|
|
|
|
|
|
bounds_times = librosa.frames_to_time(bounds_frames, sr=sr, hop_length=hop_length) |
|
|
|
|
|
sections = [] |
|
|
|
for i in range(len(bounds_times) - 1): |
|
start = bounds_times[i] |
|
end = bounds_times[i+1] |
|
duration = end - start |
|
|
|
|
|
if duration < 4 and i > 0 and i < len(bounds_times) - 2: |
|
continue |
|
|
|
|
|
|
|
|
|
|
|
start_idx = bounds_frames[i] |
|
end_idx = bounds_frames[i+1] |
|
|
|
|
|
if i == 0: |
|
section_type = "intro" |
|
elif i == len(bounds_times) - 2: |
|
section_type = "outro" |
|
else: |
|
|
|
section_type = "chorus" if i % 2 == 1 else "verse" |
|
|
|
|
|
if end_idx > start_idx: |
|
|
|
|
|
|
|
energy = np.mean(rms[0, start_idx:end_idx]) |
|
|
|
|
|
rhythm_intensity = np.mean(percussive_rms[0, start_idx:end_idx]) |
|
|
|
|
|
if chroma.shape[1] > 0: |
|
chroma_var = np.var(chroma[:, start_idx:end_idx]) |
|
else: |
|
chroma_var = 0 |
|
|
|
|
|
if mfcc.shape[1] > 0: |
|
mfcc_mean = np.mean(mfcc[:, start_idx:end_idx], axis=1) |
|
mfcc_var = np.var(mfcc[:, start_idx:end_idx], axis=1) |
|
else: |
|
mfcc_mean = np.zeros(mfcc.shape[0]) |
|
mfcc_var = np.zeros(mfcc.shape[0]) |
|
|
|
|
|
has_harmonic_change = False |
|
for change_idx in harmonic_changes: |
|
if start_idx <= change_idx < end_idx: |
|
has_harmonic_change = True |
|
break |
|
|
|
|
|
relative_energy = energy / np.mean(rms) |
|
relative_rhythm = rhythm_intensity / np.mean(percussive_rms) |
|
|
|
|
|
|
|
|
|
if (relative_energy > 1.1 and relative_rhythm > 1.1 and |
|
section_type != "intro" and section_type != "outro"): |
|
section_type = "chorus" |
|
|
|
|
|
elif (0.8 <= relative_energy <= 1.1 and chroma_var > np.mean(np.var(chroma, axis=1)) and |
|
section_type != "intro" and section_type != "outro"): |
|
section_type = "verse" |
|
|
|
|
|
if (section_type not in ["intro", "outro"] and |
|
(has_harmonic_change or |
|
(0.5 <= relative_energy <= 0.9 and duration < 30) or |
|
np.any(mfcc_var > np.percentile(np.var(mfcc, axis=1), 75)))): |
|
section_type = "bridge" |
|
|
|
|
|
sections.append({ |
|
"type": section_type, |
|
"start": start, |
|
"end": end, |
|
"duration": duration |
|
}) |
|
|
|
|
|
    for i in range(1, len(sections) - 1):
        # Absorb very short interior sections into the preceding section's type
        if sections[i]["duration"] < 8 and sections[i]["type"] not in ["intro", "outro", "bridge"]:
            sections[i]["type"] = sections[i-1]["type"]
|
|
|
|
|
sections = [s for s in sections if s["duration"] >= 5 or |
|
s["type"] == "intro" or s["type"] == "outro"] |
|
|
|
return sections |
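# Shape of the returned list (illustrative values):
#   [{"type": "intro",  "start": 0.0,  "end": 12.4, "duration": 12.4},
#    {"type": "verse",  "start": 12.4, "end": 41.0, "duration": 28.6},
#    {"type": "chorus", "start": 41.0, "end": 72.3, "duration": 31.3}, ...]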
|
|
|
def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='default'): |
|
""" |
|
Create enhanced syllable templates based on beat patterns with improved musical intelligence. |
|
|
|
Parameters: |
|
beats_info: Dictionary containing beat analysis data |
|
genre: Optional genre to influence template creation |
|
phrase_mode: 'default' uses provided phrases, 'auto' forces recalculation |
|
|
|
Returns: |
|
String of syllable templates with embedded strength values and flexible timing |
|
""" |
|
    from sklearn.cluster import KMeans  # numpy is already imported at module level as np
|
|
|
|
|
beat_times = beats_info.get("beat_times", []) |
|
beat_strengths = beats_info.get("beat_strengths", [1.0] * len(beat_times)) |
|
tempo = beats_info.get("tempo", 120) |
|
time_signature = beats_info.get("time_signature", 4) |
|
|
|
|
|
if len(beat_times) < 2: |
|
return "S(1.0):1-w(0.5):1|S(1.0):1-w(0.5):1" |
|
|
|
|
|
|
|
if len(beat_strengths) >= 6: |
|
|
|
X = np.array(beat_strengths).reshape(-1, 1) |
|
|
|
|
|
kmeans = KMeans(n_clusters=3, random_state=0, n_init=10).fit(X) |
|
|
|
|
|
centroids = sorted([float(c[0]) for c in kmeans.cluster_centers_]) |
|
|
|
|
|
if len(centroids) >= 3: |
|
medium_threshold = (centroids[0] + centroids[1]) / 2 |
|
strong_threshold = (centroids[1] + centroids[2]) / 2 |
|
else: |
|
|
|
medium_threshold = np.percentile(beat_strengths, 33) |
|
strong_threshold = np.percentile(beat_strengths, 66) |
|
else: |
|
|
|
medium_threshold = np.percentile(beat_strengths, 33) |
|
strong_threshold = np.percentile(beat_strengths, 66) |
|
|
|
|
|
|
|
phrases = beats_info.get("phrases", []) |
|
|
|
if phrase_mode == 'auto' or not phrases: |
|
|
|
phrases = [] |
|
current_phrase = [] |
|
|
|
for i in range(len(beat_times)): |
|
current_phrase.append(i) |
|
|
|
|
|
if (i + 1) % time_signature == 0 or i == len(beat_times) - 1: |
|
if len(current_phrase) >= 2: |
|
phrases.append(current_phrase) |
|
current_phrase = [] |
|
|
|
|
|
if current_phrase and len(current_phrase) >= 2: |
|
phrases.append(current_phrase) |
|
|
|
|
|
|
|
def tempo_to_syllable_base(tempo): |
|
"""Continuous function mapping tempo to syllable base count""" |
|
|
|
if tempo > 180: |
|
return 1.0 |
|
elif tempo > 140: |
|
return 1.0 + (180 - tempo) * 0.02 |
|
elif tempo > 100: |
|
return 1.8 + (140 - tempo) * 0.01 |
|
elif tempo > 70: |
|
return 2.2 + (100 - tempo) * 0.02 |
|
else: |
|
return 2.8 + max(0, (70 - tempo) * 0.04) |
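    # The piecewise map is continuous at its breakpoints, which is worth
    # re-checking when tuning the constants: f(180)=1.0, f(140)=1.8,
    # f(100)=2.2, f(70)=2.8.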
|
|
|
|
|
|
|
syllable_templates = [] |
|
|
|
for phrase in phrases: |
|
|
|
if not phrase: |
|
continue |
|
|
|
|
|
phrase_strengths = [beat_strengths[i] for i in phrase if i < len(beat_strengths)] |
|
if not phrase_strengths: |
|
phrase_strengths = [1.0] * len(phrase) |
|
|
|
|
|
stress_pattern = [] |
|
for i, strength in enumerate(phrase_strengths): |
|
|
|
metrical_position = i % time_signature |
|
|
|
|
|
position_boost = 0.15 if metrical_position == 0 else 0 |
|
|
|
if time_signature == 4 and metrical_position == 2: |
|
position_boost = 0.08 |
|
|
|
effective_strength = strength + position_boost |
|
|
|
if effective_strength >= strong_threshold: |
|
stress_pattern.append(("S", effective_strength)) |
|
elif effective_strength >= medium_threshold: |
|
stress_pattern.append(("m", effective_strength)) |
|
else: |
|
stress_pattern.append(("w", effective_strength)) |
|
|
|
|
|
|
|
detailed_template = [] |
|
|
|
for i, (stress_type, strength) in enumerate(stress_pattern): |
|
|
|
base_syllables = tempo_to_syllable_base(tempo) |
|
|
|
|
|
if stress_type == "S": |
|
syllable_factor = 1.2 |
|
elif stress_type == "m": |
|
syllable_factor = 1.0 |
|
else: |
|
syllable_factor = 0.8 |
|
|
|
|
|
genre_factor = 1.0 |
|
if genre: |
|
genre = genre.lower() |
|
if any(term in genre for term in ["rap", "hip hop", "hip-hop"]): |
|
genre_factor = 1.4 |
|
elif any(term in genre for term in ["folk", "country", "ballad"]): |
|
genre_factor = 0.8 |
|
|
|
|
|
raw_count = base_syllables * syllable_factor * genre_factor |
|
|
|
|
|
|
|
rounded_count = round(raw_count * 2) / 2 |
|
|
|
|
|
syllable_count = max(0.5, min(4, rounded_count)) |
|
|
|
|
|
|
|
            # Round strength to two decimals for a compact template token
            strength_rounded = int(strength * 100) / 100
            detailed_template.append(f"{stress_type}({strength_rounded}):{syllable_count}")
|
|
|
|
|
phrase_template = "-".join(detailed_template) |
|
syllable_templates.append(phrase_template) |
|
|
|
|
|
|
|
if not syllable_templates: |
|
|
|
if time_signature == 3: |
|
syllable_templates = ["S(0.95):2-w(0.4):1-w(0.35):1"] |
|
else: |
|
syllable_templates = ["S(0.95):2-w(0.4):1-m(0.7):1.5-w(0.35):1"] |
|
|
|
|
|
return "|".join(syllable_templates) |
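# Template notation, decoded (illustrative):
#   "S(0.95):2-w(0.4):1|m(0.7):1.5-w(0.35):1"
#   S/m/w      = strong / medium / weak stress class
#   (0.95)     = beat strength in [0, 1]
#   :2 / :1.5  = syllables for that beat (half counts allowed)
#   "-"        = next beat within the phrase; "|" = next phrase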
|
|
|
def format_syllable_templates_for_prompt(syllable_templates, arrow="→", line_wrap=10, |
|
structured_output=False, beat_types=None): |
|
""" |
|
Convert technical syllable templates into clear, human-readable instructions with |
|
enhanced flexibility and customization options. |
|
|
|
Parameters: |
|
syllable_templates: String or list of templates |
|
arrow: Symbol to use between beats (default: "→") |
|
line_wrap: Number of beats before automatic line wrapping (0 = no wrapping) |
|
structured_output: If True, return structured data instead of text |
|
beat_types: Custom mapping for beat types (default: None, uses standard mapping) |
|
|
|
Returns: |
|
Human-readable instructions or structured data depending on parameters |
|
""" |
|
if not syllable_templates: |
|
return {} if structured_output else "" |
|
|
|
|
|
default_beat_types = { |
|
"S": {"name": "STRONG", "description": "stressed syllable"}, |
|
"m": {"name": "medium", "description": "medium-stressed syllable"}, |
|
"w": {"name": "weak", "description": "unstressed syllable"}, |
|
"X": {"name": "EXTRA", "description": "extra strong syllable"}, |
|
"L": {"name": "legato", "description": "connected/tied syllable"} |
|
} |
|
|
|
|
|
beat_types = beat_types or default_beat_types |
|
|
|
|
|
structured_data = {"lines": [], "explanations": []} if structured_output else None |
|
|
|
|
|
is_enhanced_format = False |
|
|
|
|
|
if isinstance(syllable_templates, str): |
|
|
|
if any(bt + "(" in syllable_templates or bt + ":" in syllable_templates or bt + "[" in syllable_templates |
|
for bt in beat_types.keys()): |
|
is_enhanced_format = True |
|
|
|
elif "|" in syllable_templates: |
|
is_enhanced_format = True |
|
|
|
|
|
output = [] |
|
|
|
if is_enhanced_format: |
|
|
|
phrases = syllable_templates.split("|") if "|" in syllable_templates else [syllable_templates] |
|
|
|
|
|
for i, phrase in enumerate(phrases): |
|
|
|
has_swing = "(swing)" in phrase |
|
if has_swing: |
|
phrase = phrase.replace("(swing)", "") |
|
|
|
beats = phrase.split("-") |
|
            beat_instructions = []
            parsed_beats = []  # collect parsed beat dicts for structured output
|
|
|
|
|
for j, beat in enumerate(beats): |
|
|
|
beat_info = {"original": beat, "type": None, "count": None, "strength": None} |
|
|
|
|
|
if "(" in beat and ")" in beat and ":" in beat: |
|
parts = beat.split(":") |
|
beat_type = parts[0].split("(")[0] |
|
strength = parts[0].split("(")[1].rstrip(")") |
|
count = parts[1] |
|
|
|
beat_info["type"] = beat_type |
|
beat_info["count"] = count |
|
beat_info["strength"] = strength |
|
|
|
|
|
elif any(beat.startswith(bt) for bt in beat_types.keys()) and len(beat) > 1: |
|
beat_type = beat[0] |
|
count = beat[1:] |
|
|
|
beat_info["type"] = beat_type |
|
beat_info["count"] = count |
|
|
|
|
|
else: |
|
beat_instructions.append(beat) |
|
continue |
|
|
|
|
|
                # Keep the parsed beat for structured output
                parsed_beats.append(beat_info)
                if beat_info["type"] in beat_types:
|
type_name = beat_types[beat_info["type"]]["name"] |
|
if beat_info["strength"]: |
|
beat_instructions.append(f"{type_name}({beat_info['count']}) [{beat_info['strength']}]") |
|
else: |
|
beat_instructions.append(f"{type_name}({beat_info['count']})") |
|
else: |
|
|
|
beat_instructions.append(beat) |
|
|
|
|
|
if line_wrap > 0 and len(beat_instructions) > line_wrap: |
|
wrapped_instructions = [] |
|
for k in range(0, len(beat_instructions), line_wrap): |
|
section = beat_instructions[k:k+line_wrap] |
|
wrapped_instructions.append(f"{arrow} ".join(section)) |
|
line_desc = f"\n {arrow} ".join(wrapped_instructions) |
|
else: |
|
line_desc = f" {arrow} ".join(beat_instructions) |
|
|
|
|
|
if has_swing: |
|
line_desc += " [with swing feel]" |
|
|
|
|
|
line_output = f"Line {i+1}: {line_desc}" |
|
output.append(line_output) |
|
|
|
            if structured_output:
                structured_data["lines"].append({
                    "line_number": i+1,
                    "beats": [{"original": b["original"],
                               "type": b.get("type"),
                               "count": b.get("count"),
                               "strength": b.get("strength")}
                              for b in parsed_beats],
                    "has_swing": has_swing
                })
|
|
|
|
|
explanation = [ |
|
"\n📝 UNDERSTANDING THE NOTATION:" |
|
] |
|
|
|
|
|
used_beat_types = set() |
|
for phrase in phrases: |
|
for beat in phrase.split("-"): |
|
for bt in beat_types.keys(): |
|
if beat.startswith(bt): |
|
used_beat_types.add(bt) |
|
|
|
for bt in used_beat_types: |
|
if bt in beat_types: |
|
name = beat_types[bt]["name"] |
|
desc = beat_types[bt]["description"] |
|
explanation.append(f"- {name}(n): Place a {desc} here, plus (n-1) unstressed syllables") |
|
|
|
explanation.extend([ |
|
f"- {arrow}: Indicates flow from one beat to the next", |
|
"- [0.xx]: Beat strength value (higher = more emphasis needed)" |
|
]) |
|
|
|
output.extend(explanation) |
|
|
|
if structured_output: |
|
structured_data["explanations"] = explanation |
|
|
|
|
|
has_half_syllables = any((".5" in beat) for phrase in phrases for beat in phrase.split("-")) |
|
if has_half_syllables: |
|
half_syllable_examples = [ |
|
"\n🎵 HALF-SYLLABLE EXAMPLES:", |
|
"- STRONG(1.5): One stressed syllable followed by an unstressed half-syllable", |
|
" Example: \"LOVE you\" where \"LOVE\" is stressed and \"you\" is quick", |
|
"- medium(2.5): One medium syllable plus one-and-a-half unstressed syllables", |
|
" Example: \"Wait for the\" where \"Wait\" is medium-stressed and \"for the\" is quick" |
|
] |
|
output.extend(half_syllable_examples) |
|
|
|
if structured_output: |
|
structured_data["half_syllable_examples"] = half_syllable_examples |
|
|
|
|
|
if any("swing" in phrase for phrase in phrases): |
|
swing_guide = [ |
|
"\n🎶 SWING RHYTHM GUIDE:", |
|
"- In swing, syllables should be unevenly timed (long-short pattern)", |
|
"- Example: \"SUM-mer TIME\" in swing feels like \"SUM...mer-TIME\" with delay" |
|
] |
|
output.extend(swing_guide) |
|
|
|
if structured_output: |
|
structured_data["swing_guide"] = swing_guide |
|
|
|
|
|
else: |
|
formatted_lines = [] |
|
|
|
if isinstance(syllable_templates, list): |
|
for i, template in enumerate(syllable_templates): |
|
if isinstance(template, dict) and "syllable_template" in template: |
|
line = f"Line {i+1}: {template['syllable_template']} syllables" |
|
formatted_lines.append(line) |
|
|
|
if structured_output: |
|
structured_data["lines"].append({ |
|
"line_number": i+1, |
|
"syllable_count": template["syllable_template"] |
|
}) |
|
elif isinstance(template, str): |
|
line = f"Line {i+1}: {template} syllables" |
|
formatted_lines.append(line) |
|
|
|
if structured_output: |
|
structured_data["lines"].append({ |
|
"line_number": i+1, |
|
"syllable_count": template |
|
}) |
|
|
|
output = formatted_lines |
|
else: |
|
output = [str(syllable_templates)] |
|
|
|
if structured_output: |
|
structured_data["raw_content"] = str(syllable_templates) |
|
|
|
|
|
application_tips = [ |
|
"\n💡 APPLICATION TIPS:", |
|
"1. Strong beats need naturally stressed syllables (like the START of \"RE-mem-ber\")", |
|
"2. Place important words on strong beats for natural emphasis", |
|
"3. Vowel sounds work best for sustained or emphasized syllables", |
|
"4. Keep consonant clusters (like \"str\" or \"thr\") on weak beats" |
|
] |
|
output.extend(application_tips) |
|
|
|
if structured_output: |
|
structured_data["application_tips"] = application_tips |
|
return structured_data |
|
|
|
return "\n".join(output) |
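# Illustrative call (output abridged):
#   format_syllable_templates_for_prompt("S(0.9):2-w(0.4):1")
#   -> "Line 1: STRONG(2) [0.9] → weak(1) [0.4]"
#      followed by the notation legend and application tips.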
|
|
|
def verify_flexible_syllable_counts(lyrics, templates): |
|
""" |
|
Enhanced verification of syllable counts and stress patterns with precise alignment analysis |
|
and detailed feedback for all phrases in a template. |
|
""" |
|
    # re, pronouncing, numpy and functools are already imported at module
    # level; no local re-imports are needed
|
|
|
|
|
@functools.lru_cache(maxsize=512) |
|
def cached_phones_for_word(word): |
|
return pronouncing.phones_for_word(word) |
|
|
|
@functools.lru_cache(maxsize=512) |
|
def count_syllables_for_word(word): |
|
"""Count syllables in a single word with caching for performance.""" |
|
|
|
pronunciations = cached_phones_for_word(word.lower()) |
|
if pronunciations: |
|
return pronouncing.syllable_count(pronunciations[0]) |
|
|
|
|
|
vowels = "aeiouy" |
|
word = word.lower() |
|
count = 0 |
|
prev_is_vowel = False |
|
|
|
for char in word: |
|
is_vowel = char in vowels |
|
if is_vowel and not prev_is_vowel: |
|
count += 1 |
|
prev_is_vowel = is_vowel |
|
|
|
|
|
if word.endswith('e') and not word.endswith('le'): |
|
count -= 1 |
|
if word.endswith('le') and len(word) > 2 and word[-3] not in vowels: |
|
count += 1 |
|
if count == 0: |
|
count = 1 |
|
|
|
return count |
|
|
|
@functools.lru_cache(maxsize=512) |
|
def get_word_stress(word): |
|
"""Get the stress pattern for a word with improved fallback handling.""" |
|
pronunciations = cached_phones_for_word(word.lower()) |
|
if pronunciations: |
|
return pronouncing.stresses(pronunciations[0]) |
|
|
|
|
|
syllables = count_syllables_for_word(word) |
|
|
|
|
|
if syllables == 1: |
|
return "1" |
|
elif syllables == 2: |
|
|
|
|
|
            # Suffixes that typically pull stress onto the second syllable in
            # two-syllable words (e.g., "cre-ATE", "e-LECT", "en-DURE");
            # "-ing"/"-er"/"-or" words usually stress the FIRST syllable,
            # so they take the "10" default below
            second_syllable_stress = ["ize", "ise", "ate", "ect", "end", "ure"]
|
if any(word.endswith(ending) for ending in second_syllable_stress): |
|
return "01" |
|
else: |
|
return "10" |
|
elif syllables == 3: |
|
|
|
if any(word.endswith(ending) for ending in ["ity", "ety", "ify", "ogy", "graphy"]): |
|
return "100" |
|
elif any(word.endswith(ending) for ending in ["ation", "ious", "itis"]): |
|
return "010" |
|
else: |
|
return "100" |
|
else: |
|
|
|
return "1" + "0" * (syllables - 1) |
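    # Stress strings follow the CMU convention: "1" = stressed,
    # "0" = unstressed (e.g., get_word_stress("believe") -> "01").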
|
|
|
|
|
lines = [line.strip() for line in lyrics.split("\n") if line.strip()] |
|
|
|
|
|
verification_notes = [] |
|
detailed_analysis = [] |
|
stress_misalignments = [] |
|
total_mismatch_count = 0 |
|
|
|
|
|
for i, line in enumerate(lines): |
|
if i >= len(templates): |
|
break |
|
|
|
template = templates[i] |
|
|
|
|
|
if isinstance(template, dict) and "syllable_template" in template: |
|
template_str = template["syllable_template"] |
|
elif isinstance(template, str): |
|
template_str = template |
|
else: |
|
continue |
|
|
|
|
|
template_phrases = [template_str] |
|
if "|" in template_str: |
|
template_phrases = template_str.split("|") |
|
|
|
|
|
best_match_diff = float('inf') |
|
best_match_phrase = None |
|
best_phrase_beats = None |
|
actual_count = count_syllables(line) |
|
|
|
for phrase_idx, phrase in enumerate(template_phrases): |
|
|
|
beats_info = [] |
|
total_expected = 0 |
|
|
|
|
|
if "-" in phrase: |
|
beat_templates = phrase.split("-") |
|
|
|
|
|
for beat in beat_templates: |
|
beat_info = {"original": beat, "type": None, "count": 1, "strength": None} |
|
|
|
|
|
if "(" in beat and ")" in beat and ":" in beat: |
|
parts = beat.split(":") |
|
beat_type = parts[0].split("(")[0] |
|
try: |
|
strength = float(parts[0].split("(")[1].rstrip(")")) |
|
except ValueError: |
|
strength = 1.0 |
|
|
|
|
|
try: |
|
count = float(parts[1]) |
|
|
|
if count == int(count): |
|
count = int(count) |
|
except ValueError: |
|
count = 1 |
|
|
|
beat_info.update({ |
|
"type": beat_type, |
|
"count": count, |
|
"strength": strength |
|
}) |
|
|
|
|
|
elif any(beat.startswith(x) for x in ["S", "m", "w", "X", "L"]): |
|
beat_type = beat[0] |
|
|
|
|
|
try: |
|
count_str = beat[1:] |
|
count = float(count_str) |
|
if count == int(count): |
|
count = int(count) |
|
except ValueError: |
|
count = 1 |
|
|
|
beat_info.update({ |
|
"type": beat_type, |
|
"count": count |
|
}) |
|
|
|
|
|
else: |
|
try: |
|
count = float(beat) |
|
if count == int(count): |
|
count = int(count) |
|
beat_info["count"] = count |
|
except ValueError: |
|
pass |
|
|
|
beats_info.append(beat_info) |
|
total_expected += beat_info["count"] |
|
|
|
|
|
phrase_diff = abs(actual_count - total_expected) |
|
|
|
|
|
|
|
|
|
|
if phrase_diff < best_match_diff: |
|
best_match_diff = phrase_diff |
|
best_match_phrase = phrase |
|
best_phrase_beats = beats_info |
|
|
|
|
|
else: |
|
try: |
|
total_expected = float(phrase) |
|
phrase_diff = abs(actual_count - total_expected) |
|
if phrase_diff < best_match_diff: |
|
best_match_diff = phrase_diff |
|
best_match_phrase = phrase |
|
best_phrase_beats = [{"count": total_expected}] |
|
except ValueError: |
|
pass |
|
|
|
|
|
if best_match_phrase and best_phrase_beats: |
|
total_expected = sum(beat["count"] for beat in best_phrase_beats) |
|
|
|
|
|
expected_ratio = 0.15 if total_expected > 10 else 0.25 |
|
threshold = max(1, round(total_expected * expected_ratio)) |
|
|
|
|
|
if total_expected > 0 and best_match_diff > threshold: |
|
verification_notes.append(f"Line {i+1}: Expected {total_expected} syllables, got {actual_count}") |
|
total_mismatch_count += 1 |
|
|
|
|
|
words = re.findall(r'\b[a-zA-Z]+\b', line.lower()) |
|
|
|
|
|
word_analysis = [] |
|
cumulative_syllables = 0 |
|
|
|
for word in words: |
|
syllable_count = count_syllables_for_word(word) |
|
|
|
|
|
stress_pattern = get_word_stress(word) |
|
|
|
word_analysis.append({ |
|
"word": word, |
|
"syllables": syllable_count, |
|
"stress_pattern": stress_pattern, |
|
"position": cumulative_syllables |
|
}) |
|
|
|
cumulative_syllables += syllable_count |
|
|
|
|
|
if best_phrase_beats and any(b.get("type") == "S" for b in best_phrase_beats if "type" in b): |
|
|
|
strong_positions = [] |
|
current_pos = 0 |
|
|
|
for beat in best_phrase_beats: |
|
if beat.get("type") == "S": |
|
strong_positions.append(current_pos) |
|
current_pos += beat.get("count", 1) |
|
|
|
|
|
alignment_issues = [] |
|
|
|
for pos in strong_positions: |
|
|
|
misaligned_word = None |
|
|
|
for word_info in word_analysis: |
|
word_start = word_info["position"] |
|
word_end = word_start + word_info["syllables"] |
|
|
|
if word_start <= pos < word_end: |
|
|
|
syllable_in_word = pos - word_start |
|
|
|
|
|
stress = word_info["stress_pattern"] |
|
|
|
|
|
if stress and syllable_in_word < len(stress) and stress[syllable_in_word] != '1': |
|
misaligned_word = word_info["word"] |
|
alignment_issues.append(f"'{word_info['word']}' (unstressed syllable on strong beat)") |
|
stress_misalignments.append({ |
|
"line": i+1, |
|
"word": word_info["word"], |
|
"position": pos, |
|
"suggestion": get_stress_aligned_alternatives(word_info["word"], syllable_in_word) |
|
}) |
|
break |
|
|
|
if alignment_issues: |
|
verification_notes.append(f" → Stress misalignments: {', '.join(alignment_issues)}") |
|
|
|
|
|
alignment_map = generate_alignment_visualization(line, best_phrase_beats, word_analysis) |
|
if alignment_map: |
|
detailed_analysis.append(f"Line {i+1} Alignment Analysis:\n{alignment_map}") |
|
else: |
|
|
|
verification_notes.append(f"Line {i+1}: Unable to find matching template pattern") |
|
|
|
|
|
if verification_notes: |
|
lyrics += "\n\n[Note: Potential rhythm mismatches detected in these lines:]\n" |
|
lyrics += "\n".join(verification_notes) |
|
|
|
if detailed_analysis: |
|
lyrics += "\n\n[Detailed Alignment Analysis:]\n" |
|
lyrics += "\n\n".join(detailed_analysis) |
|
|
|
lyrics += "\n\n[How to fix rhythm mismatches:]\n" |
|
lyrics += "1. Make sure stressed syllables (like 'LO' in 'LOV-er') fall on STRONG beats\n" |
|
lyrics += "2. Adjust syllable counts to match the template (add/remove words or use different words)\n" |
|
lyrics += "3. Try using words where natural stress aligns with musical rhythm\n" |
|
|
|
|
|
if stress_misalignments: |
|
lyrics += "\n[Specific word replacement suggestions:]\n" |
|
for issue in stress_misalignments[:5]: |
|
if issue["suggestion"]: |
|
lyrics += f"Line {issue['line']}: Consider replacing '{issue['word']}' with: {issue['suggestion']}\n" |
|
|
|
return lyrics |
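# Illustrative round trip (hypothetical template):
#   verify_flexible_syllable_counts(
#       "Hear the music play",
#       ["S(0.9):1-w(0.4):1-m(0.7):1-w(0.3):1"])
#   returns the lyrics unchanged when counts and stress align, or with an
#   appended "[Note: Potential rhythm mismatches...]" report otherwise.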
|
|
|
def generate_alignment_visualization(line, beats_info, word_analysis): |
|
"""Generate a visual representation of syllable alignment with beats.""" |
|
if not beats_info or not word_analysis: |
|
return None |
|
|
|
|
|
syllable_breakdown = [] |
|
syllable_stresses = [] |
|
|
|
for word_info in word_analysis: |
|
word = word_info["word"] |
|
syllables = word_info["syllables"] |
|
stress = word_info["stress_pattern"] or "" |
|
|
|
|
|
while len(stress) < syllables: |
|
stress += "0" |
|
|
|
|
|
parts = naive_syllable_split(word, syllables) |
|
|
|
for i, part in enumerate(parts): |
|
syllable_breakdown.append(part) |
|
if i < len(stress): |
|
syllable_stresses.append(stress[i]) |
|
else: |
|
syllable_stresses.append("0") |
|
|
|
|
|
beat_types = [] |
|
current_pos = 0 |
|
|
|
for beat in beats_info: |
|
beat_type = beat.get("type", "-") |
|
count = beat.get("count", 1) |
|
|
|
|
|
if isinstance(count, int): |
|
beat_types.extend([beat_type] * count) |
|
else: |
|
|
|
whole_part = int(count) |
|
frac_part = count - whole_part |
|
|
|
if whole_part > 0: |
|
beat_types.extend([beat_type] * whole_part) |
|
|
|
if frac_part > 0: |
|
beat_types.append(f"{beat_type}½") |
|
|
|
|
|
while len(beat_types) < len(syllable_breakdown): |
|
beat_types.append("-") |
|
|
|
|
|
beat_types = beat_types[:len(syllable_breakdown)] |
|
|
|
|
|
result = [] |
|
|
|
|
|
syllable_display = [] |
|
for i, syllable in enumerate(syllable_breakdown): |
|
if i < len(syllable_stresses) and syllable_stresses[i] == "1": |
|
syllable_display.append(syllable.upper()) |
|
else: |
|
syllable_display.append(syllable.lower()) |
|
|
|
result.append(" - ".join(syllable_display)) |
|
|
|
|
|
beat_indicators = [] |
|
for i, (syllable, beat_type) in enumerate(zip(syllable_stresses, beat_types)): |
|
if beat_type == "S" or beat_type.startswith("S"): |
|
if syllable == "1": |
|
beat_indicators.append("↑") |
|
else: |
|
beat_indicators.append("❌") |
|
elif beat_type == "m" or beat_type.startswith("m"): |
|
beat_indicators.append("•") |
|
elif beat_type == "w" or beat_type.startswith("w"): |
|
beat_indicators.append("·") |
|
else: |
|
beat_indicators.append(" ") |
|
|
|
result.append(" ".join(beat_indicators)) |
|
|
|
|
|
result.append(" - ".join(beat_types)) |
|
|
|
return "\n".join(result) |
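# Example output for "HEAR the MU-sic" over beats S-w-S-w (illustrative):
#   HEAR - the - MU - sic     <- syllables, stressed ones uppercased
#   ↑  ·  ↑  ·                <- alignment marks (❌ flags an unstressed
#   S - w - S - w                syllable landing on a STRONG beat)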
|
|
|
@functools.lru_cache(maxsize=256) |
|
def naive_syllable_split(word, syllable_count): |
|
"""Naively split a word into the specified number of syllables, with caching for performance.""" |
|
if syllable_count <= 1: |
|
return [word] |
|
|
|
|
|
vowels = "aeiouy" |
|
consonants = "bcdfghjklmnpqrstvwxz" |
|
|
|
|
|
splits = [] |
|
for i in range(1, len(word) - 1): |
|
if word[i] in consonants and word[i-1] in vowels: |
|
splits.append(i) |
|
elif word[i] in vowels and word[i-1] in consonants and word[i+1] in consonants: |
|
splits.append(i+1) |
|
|
|
|
|
    # Deduplicate the candidate split points found above
    splits = sorted(set(splits))

    # If the heuristics found too few split points, force extra ones,
    # bailing out when the word has no unused positions left (the
    # original loop could spin forever on very short words)
    while len(splits) < syllable_count - 1:
        added = False
        for i in range(1, len(word)):
            if i not in splits:
                splits.append(i)
                added = True
                break
        if not added:
            break

    splits = sorted(splits)[:syllable_count - 1]
|
|
|
|
|
result = [] |
|
prev = 0 |
|
for pos in splits: |
|
result.append(word[prev:pos]) |
|
prev = pos |
|
|
|
result.append(word[prev:]) |
|
return result |
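# Illustrative split (heuristic, not phonetically exact):
#   naive_syllable_split("remember", 3) -> ["re", "me", "mber"]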
|
|
|
def get_stress_aligned_alternatives(word, position_to_stress): |
|
"""Suggest alternative words with proper stress at the required position.""" |
|
|
|
|
|
    # Use the module-level count_syllables here; the cached per-word helper
    # is defined inside verify_flexible_syllable_counts and is not in scope
    syllable_count = count_syllables(word)
|
|
|
|
|
if syllable_count == 2: |
|
if position_to_stress == 0: |
|
first_stress = ["love-ly", "won-der", "beau-ty", "danc-ing", "dream-ing", |
|
"heart-beat", "sun-light", "moon-light", "star-light"] |
|
return ", ".join(first_stress[:3]) |
|
else: |
|
second_stress = ["be-LIEVE", "a-BOVE", "a-ROUND", "to-DAY", "a-LIVE", |
|
"a-LONE", "be-HOLD", "re-TURN", "de-LIGHT"] |
|
return ", ".join(second_stress[:3]) |
|
elif syllable_count == 3: |
|
if position_to_stress == 0: |
|
return "MEM-o-ry, WON-der-ful, BEAU-ti-ful" |
|
elif position_to_stress == 1: |
|
return "a-MAZE-ing, to-GE-ther, for-EV-er" |
|
else: |
|
return "un-der-STAND, o-ver-COME, ne-ver-MORE" |
|
|
|
|
|
return f"a word with stress on syllable {position_to_stress + 1}" |
|
|
|
def generate_lyrics(genre, duration, emotion_results, song_structure=None): |
|
""" |
|
Generate lyrics based on the genre, emotion, and structure analysis with enhanced rhythmic alignment. |
|
|
|
This improved version uses advanced template creation, better formatting, and verification with |
|
potential refinement for lyrics that perfectly match the musical rhythm patterns. |
|
|
|
Parameters: |
|
genre: Musical genre of the audio |
|
duration: Duration of the audio in seconds |
|
emotion_results: Dictionary containing emotional analysis results |
|
song_structure: Optional dictionary containing song structure analysis |
|
|
|
Returns: |
|
Generated lyrics aligned with the rhythm patterns of the music |
|
""" |
|
|
|
primary_emotion = emotion_results["emotion_analysis"]["primary_emotion"] |
|
primary_theme = emotion_results["theme_analysis"]["primary_theme"] |
|
|
|
|
|
try: |
|
tempo = float(emotion_results["rhythm_analysis"]["tempo"]) |
|
except (KeyError, ValueError, TypeError): |
|
tempo = 0.0 |
|
|
|
key = emotion_results["tonal_analysis"]["key"] |
|
mode = emotion_results["tonal_analysis"]["mode"] |
|
|
|
|
|
syllable_guidance = "" |
|
templates_for_verification = [] |
|
|
|
|
|
structure_visualization = "=== MUSIC-LYRICS STRUCTURE MATCHING ===\n\n" |
|
structure_visualization += f"Song Duration: {duration:.1f} seconds\n" |
|
structure_visualization += f"Tempo: {tempo:.1f} BPM\n\n" |
|
|
|
if song_structure: |
|
|
|
if "flexible_structure" in song_structure and song_structure["flexible_structure"]: |
|
flexible = song_structure["flexible_structure"] |
|
if "segments" in flexible and flexible["segments"]: |
|
|
|
segments = flexible["segments"] |
|
|
|
|
|
structure_visualization += f"Total segments: {len(segments)}\n" |
|
structure_visualization += "Each segment represents one musical phrase for which you should write ONE line of lyrics.\n\n" |
|
|
|
|
|
enhanced_templates = [] |
|
|
|
for i, segment in enumerate(segments): |
|
if i < 30: |
|
|
|
segment_start = segment["start"] |
|
segment_end = segment["end"] |
|
|
|
|
|
structure_visualization += f"Segment {i+1}: {segment_start:.1f}s - {segment_end:.1f}s (duration: {segment_end-segment_start:.1f}s)\n" |
|
|
|
|
|
segment_beats = [] |
|
beat_times = flexible["beats"]["beat_times"] |
|
beat_strengths = flexible["beats"].get("beat_strengths", []) |
|
|
|
for j, beat_time in enumerate(beat_times): |
|
if segment_start <= beat_time < segment_end: |
|
|
|
segment_beats.append(j) |
|
|
|
|
|
segment_beats_info = { |
|
"beat_times": [beat_times[j] for j in segment_beats], |
|
"tempo": flexible["beats"].get("tempo", 120) |
|
} |
|
|
|
if beat_strengths: |
|
segment_beats_info["beat_strengths"] = [ |
|
beat_strengths[j] for j in segment_beats |
|
if j < len(beat_strengths) |
|
] |
|
|
|
|
|
segment_beats_info["phrases"] = [segment_beats] |
|
|
|
|
|
enhanced_template = create_flexible_syllable_templates( |
|
segment_beats_info, |
|
genre=genre, |
|
phrase_mode='auto' if i == 0 else 'default' |
|
) |
|
enhanced_templates.append(enhanced_template) |
|
templates_for_verification.append(enhanced_template) |
|
|
|
|
|
structure_visualization += f" Template: {enhanced_template}\n" |
|
|
|
|
|
|
|
section_types = [] |
|
pattern_groups = {} |
|
|
|
for i, template in enumerate(enhanced_templates): |
|
|
|
simple_pattern = template.replace("(", "").replace(")", "").replace(":", "") |
|
|
|
|
|
found_match = False |
|
for group, patterns in pattern_groups.items(): |
|
if any(simple_pattern == p.replace("(", "").replace(")", "").replace(":", "") for p in patterns): |
|
pattern_groups[group].append(template) |
|
section_types.append(group) |
|
found_match = True |
|
break |
|
|
|
if not found_match: |
|
|
|
group_name = f"Group_{len(pattern_groups) + 1}" |
|
pattern_groups[group_name] = [template] |
|
section_types.append(group_name) |
|
|
|
|
|
section_mapping = {} |
|
if len(pattern_groups) >= 1: |
|
|
|
most_common = max(pattern_groups.items(), key=lambda x: len(x[1]))[0] |
|
section_mapping[most_common] = "verse" |
|
|
|
if len(pattern_groups) >= 2: |
|
|
|
sorted_groups = sorted(pattern_groups.items(), key=lambda x: len(x[1]), reverse=True) |
|
if len(sorted_groups) > 1: |
|
section_mapping[sorted_groups[1][0]] = "chorus" |
|
|
|
if len(pattern_groups) >= 3: |
|
|
|
sorted_groups = sorted(pattern_groups.items(), key=lambda x: len(x[1]), reverse=True) |
|
if len(sorted_groups) > 2: |
|
section_mapping[sorted_groups[2][0]] = "bridge" |
|
|
|
|
|
mapped_section_types = [] |
|
for section_type in section_types: |
|
if section_type in section_mapping: |
|
mapped_section_types.append(section_mapping[section_type]) |
|
else: |
|
mapped_section_types.append("verse") |
|
|
|
|
|
structure_visualization += "\nPredicted Song Structure:\n" |
|
for i, section_type in enumerate(mapped_section_types): |
|
if i < len(enhanced_templates): |
|
structure_visualization += f"Line {i+1}: [{section_type.upper()}] {enhanced_templates[i]}\n" |
|
|
|
|
|
total_lines = len(enhanced_templates) |
|
verse_lines = mapped_section_types.count("verse") |
|
chorus_lines = mapped_section_types.count("chorus") |
|
bridge_lines = mapped_section_types.count("bridge") |
|
|
|
|
|
structure_visualization += f"\nTotal Lines Required: {total_lines}\n" |
|
structure_visualization += f"Verse Lines: {verse_lines}\n" |
|
structure_visualization += f"Chorus Lines: {chorus_lines}\n" |
|
structure_visualization += f"Bridge Lines: {bridge_lines}\n" |
|
|
|
|
|
syllable_guidance = "CRITICAL RHYTHM INSTRUCTIONS:\n" |
|
syllable_guidance += "Each line of lyrics MUST match exactly with one musical phrase/segment.\n" |
|
syllable_guidance += "Follow these rhythm patterns for each line (STRONG beats need stressed syllables):\n\n" |
|
|
|
|
|
formatted_templates = [] |
|
for i, template in enumerate(enhanced_templates): |
|
if i < len(mapped_section_types): |
|
section_type = mapped_section_types[i].upper() |
|
if i > 0 and mapped_section_types[i] != mapped_section_types[i-1]: |
|
|
|
formatted_templates.append(f"\n[{section_type}]") |
|
elif i == 0: |
|
|
|
formatted_templates.append(f"[{section_type}]") |
|
formatted_templates.append(format_syllable_templates_for_prompt([template], arrow="→", line_wrap=8)) |
|
|
|
syllable_guidance += "\n".join(formatted_templates) |
|
|
|
|
|
use_sections = True |
|
|
|
|
|
if verse_lines > 0: |
|
verse_lines = min(verse_lines, total_lines // 2) |
|
else: |
|
verse_lines = total_lines // 2 |
|
|
|
if chorus_lines > 0: |
|
chorus_lines = min(chorus_lines, total_lines // 3) |
|
else: |
|
chorus_lines = total_lines // 3 |
|
|
|
if bridge_lines > 0: |
|
bridge_lines = min(bridge_lines, total_lines // 6) |
|
else: |
|
bridge_lines = 0 |
|
|
|
|
|
elif "syllables" in song_structure and song_structure["syllables"]: |
|
syllable_guidance = "RHYTHM PATTERN INSTRUCTIONS:\n" |
|
syllable_guidance += "Follow these syllable patterns for each section. Each line should match ONE phrase:\n\n" |
|
|
|
|
|
section_counts = {"verse": 0, "chorus": 0, "bridge": 0, "intro": 0, "outro": 0} |
|
|
|
for section in song_structure["syllables"]: |
|
section_counts[section["type"]] = section_counts.get(section["type"], 0) + 1 |
|
|
|
if "syllable_template" in section: |
|
|
|
section_beats_info = { |
|
"beat_times": [beat for beat in song_structure["beats"]["beat_times"] |
|
if section["start"] <= beat < section["end"]], |
|
"tempo": song_structure["beats"].get("tempo", 120) |
|
} |
|
|
|
if "beat_strengths" in song_structure["beats"]: |
|
section_beats_info["beat_strengths"] = [ |
|
strength for i, strength in enumerate(song_structure["beats"]["beat_strengths"]) |
|
if i < len(song_structure["beats"]["beat_times"]) and |
|
section["start"] <= song_structure["beats"]["beat_times"][i] < section["end"] |
|
] |
|
|
|
|
|
section_beats_info["phrases"] = [list(range(len(section_beats_info["beat_times"])))] |
|
|
|
|
|
enhanced_template = create_flexible_syllable_templates( |
|
section_beats_info, |
|
genre=genre, |
|
phrase_mode='auto' if section['type'] == 'verse' else 'default' |
|
) |
|
|
|
syllable_guidance += f"[{section['type'].capitalize()}]:\n" |
|
syllable_guidance += format_syllable_templates_for_prompt( |
|
enhanced_template, |
|
arrow="→", |
|
line_wrap=6 |
|
) + "\n\n" |
|
templates_for_verification.append(section) |
|
elif "syllable_count" in section: |
|
syllable_guidance += f"[{section['type'].capitalize()}]: ~{section['syllable_count']} syllables total\n" |
|
|
|
|
|
structure_visualization += "Using traditional section-based structure:\n" |
|
for section_type, count in section_counts.items(): |
|
if count > 0: |
|
structure_visualization += f"{section_type.capitalize()}: {count} sections\n" |
|
|
|
|
|
verse_lines = max(2, section_counts.get("verse", 0) * 4) |
|
chorus_lines = max(2, section_counts.get("chorus", 0) * 4) |
|
bridge_lines = max(0, section_counts.get("bridge", 0) * 2) |
|
|
|
|
|
use_sections = True |
|
|
|
|
|
if not syllable_guidance: |
|
syllable_guidance = "RHYTHM ALIGNMENT INSTRUCTIONS:\n\n" |
|
syllable_guidance += "1. Align stressed syllables with strong beats (usually beats 1 and 3 in 4/4 time)\n" |
|
syllable_guidance += "2. Use unstressed syllables on weak beats (usually beats 2 and 4 in 4/4 time)\n" |
|
syllable_guidance += "3. Use appropriate syllable counts based on tempo:\n" |
|
syllable_guidance += " - Fast tempo (>120 BPM): 4-6 syllables per line\n" |
|
syllable_guidance += " - Medium tempo (90-120 BPM): 6-8 syllables per line\n" |
|
syllable_guidance += " - Slow tempo (<90 BPM): 8-10 syllables per line\n" |
|
|
|
|
|
structure_visualization += "Using estimated structure (no detailed analysis available):\n" |
|
|
|
|
|
estimated_lines = max(8, int(duration / 10)) |
|
structure_visualization += f"Estimated total lines: {estimated_lines}\n" |
|
|
|
|
|
verse_lines = estimated_lines // 2 |
|
chorus_lines = estimated_lines // 3 |
|
bridge_lines = estimated_lines // 6 if estimated_lines > 12 else 0 |
|
|
|
|
|
use_sections = True |
|
|
|
|
|
    # Add examples of syllable-to-beat alignment (counts verified against the patterns)
    syllable_guidance += "\nEXAMPLES OF PERFECT RHYTHM ALIGNMENT:\n"
    syllable_guidance += "Pattern: S(0.95):1 → w(0.4):1 → m(0.7):1 → w(0.3):1\n"
    syllable_guidance += "Lyric: 'HEAR the MU-sic'\n"
    syllable_guidance += "        ↑    ↑   ↑  ↑\n"
    syllable_guidance += "        S    w   m  w  <- BEAT TYPE\n\n"

    syllable_guidance += "Pattern: S(0.9):2 → w(0.3):1 → S(0.85):1 → w(0.4):2\n"
    syllable_guidance += "Lyric: 'DANC-ing all NIGHT with you'\n"
    syllable_guidance += "        ↑    ↑   ↑   ↑    ↑    ↑\n"
    syllable_guidance += "        S    S   w   S    w    w  <- BEAT TYPE\n\n"

    syllable_guidance += "Pattern: S(0.92):1 → w(0.35):1 → S(0.88):2 → w(0.4):1\n"
    syllable_guidance += "Lyric: 'TIME keeps FLOW-ing on'\n"
    syllable_guidance += "        ↑    ↑     ↑    ↑   ↑\n"
    syllable_guidance += "        S    w     S    S   w  <- BEAT TYPE\n\n"
|
|
|
|
|
genre_guidance = "" |
|
if any(term in genre.lower() for term in ["rap", "hip-hop", "hip hop"]): |
|
genre_guidance += "\nSPECIFIC GUIDANCE FOR RAP/HIP-HOP RHYTHMS:\n" |
|
genre_guidance += "- Use more syllables per beat for rapid-fire sections\n" |
|
genre_guidance += "- Create internal rhymes within lines, not just at line endings\n" |
|
genre_guidance += "- Emphasize the first beat of each bar with strong consonants\n" |
|
elif any(term in genre.lower() for term in ["electronic", "edm", "techno", "house", "dance"]): |
|
genre_guidance += "\nSPECIFIC GUIDANCE FOR ELECTRONIC MUSIC RHYTHMS:\n" |
|
genre_guidance += "- Use repetitive phrases that build and release tension\n" |
|
genre_guidance += "- Match syllables precisely to the beat grid\n" |
|
genre_guidance += "- Use short, percussive words on strong beats\n" |
|
elif any(term in genre.lower() for term in ["rock", "metal", "punk", "alternative"]): |
|
genre_guidance += "\nSPECIFIC GUIDANCE FOR ROCK RHYTHMS:\n" |
|
genre_guidance += "- Use powerful, emotive words on downbeats\n" |
|
genre_guidance += "- Create contrast between verse and chorus energy levels\n" |
|
genre_guidance += "- Emphasize hooks with simple, memorable phrases\n" |
|
elif any(term in genre.lower() for term in ["folk", "country", "acoustic", "ballad"]): |
|
genre_guidance += "\nSPECIFIC GUIDANCE FOR FOLK/ACOUSTIC RHYTHMS:\n" |
|
genre_guidance += "- Focus on storytelling with clear narrative flow\n" |
|
genre_guidance += "- Use natural speech patterns that flow conversationally\n" |
|
genre_guidance += "- Place important words at the start of phrases\n" |
|
|
|
|
|
syllable_guidance += genre_guidance |
|
|
|
|
|
syllable_guidance_text = syllable_guidance |
|
|
|
|
|
if song_structure and "flexible_structure" in song_structure and song_structure["flexible_structure"]: |
|
|
|
if "segments" in song_structure["flexible_structure"]: |
|
segments = song_structure["flexible_structure"]["segments"] |
|
if len(segments) > 4: |
|
use_sections = False |
|
|
|
|
|
if use_sections: |
|
|
|
content = f""" |
|
You are a talented songwriter who specializes in {genre} music. |
|
Write original {genre} song lyrics for a song that is {duration:.1f} seconds long. |
|
|
|
Music analysis has detected the following qualities in the music: |
|
- Tempo: {tempo:.1f} BPM |
|
- Key: {key} {mode} |
|
- Primary emotion: {primary_emotion} |
|
- Primary theme: {primary_theme} |
|
|
|
{syllable_guidance} |
|
|
|
CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT: |
|
1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern) |
|
2. Natural word stress patterns must match the beat strength (strong words on strong beats) |
|
3. Line breaks should occur at phrase endings for natural breathing |
|
4. Consonant clusters should be avoided on fast notes and strong beats |
|
5. Open vowels (a, e, o) work better for sustained notes and syllables |
|
6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis) |
|
7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels |
|
|
|
Think step by step about how to match words to the rhythm pattern: |
|
1. First, identify the strong beats in each line pattern |
|
2. Choose words where stressed syllables naturally fall on strong beats |
|
3. Count syllables carefully to ensure they match the pattern precisely |
|
4. Test your line against the pattern by mapping each syllable |
|
|
|
IMPORTANT: Each line of lyrics must match exactly to ONE musical phrase/segment. |
|
|
|
The lyrics should: |
|
- Perfectly capture the essence and style of {genre} music |
|
- Express the {primary_emotion} emotion and {primary_theme} theme |
|
- Follow the structure patterns provided above |
|
- Be completely original |
|
- Match the song duration of {duration:.1f} seconds |
|
|
|
IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]" |
|
where you analyze how well the lyrics align with the musical rhythm. This section MUST appear |
|
even if there are no rhythm issues. Include the following in your analysis: |
|
1. Syllable counts for each line and how they match the rhythm pattern |
|
2. Where stressed syllables align with strong beats |
|
3. Any potential misalignments or improvements |
|
|
|
Your lyrics: |
|
""" |
|
else: |
|
|
|
content = f""" |
|
You are a talented songwriter who specializes in {genre} music. |
|
Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long. |
|
|
|
Music analysis has detected the following qualities: |
|
- Tempo: {tempo:.1f} BPM |
|
- Key: {key} {mode} |
|
- Primary emotion: {primary_emotion} |
|
- Primary theme: {primary_theme} |
|
|
|
{syllable_guidance} |
|
|
|
CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT: |
|
1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern) |
|
2. Natural word stress patterns must match the beat strength (strong words on strong beats) |
|
3. Line breaks should occur at phrase endings for natural breathing |
|
4. Consonant clusters should be avoided on fast notes and strong beats |
|
5. Open vowels (a, e, o) work better for sustained notes and syllables |
|
6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis) |
|
7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels |
|
|
|
Think step by step about how to match words to the rhythm pattern: |
|
1. First, identify the strong beats in each line pattern |
|
2. Choose words where stressed syllables naturally fall on strong beats |
|
3. Count syllables carefully to ensure they match the pattern precisely |
|
4. Test your line against the pattern by mapping each syllable |
|
|
|
CRITICAL: Each line of lyrics must match exactly to ONE musical phrase/segment. |
|
|
|
For perfect alignment examples: |
|
- "FEEL the RHY-thm in your SOUL" – stressed syllables on strong beats |
|
- "to-DAY we DANCE a-LONG" – natural speech stress matches musical stress |
|
- "WAIT-ing FOR the SUN to RISE" – syllable emphasis aligns with beat emphasis |
|
|
|
The lyrics should: |
|
- Perfectly capture the essence and style of {genre} music |
|
- Express the {primary_emotion} emotion and {primary_theme} theme |
|
- Be completely original |
|
- Maintain a consistent theme throughout |
|
- Match the audio segment duration of {duration:.1f} seconds |
|
|
|
Include any section labels like [Verse] or [Chorus] as indicated in the rhythm patterns above. |
|
Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY. |
|
|
|
IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]" |
|
where you analyze how well the lyrics align with the musical rhythm. This section MUST appear |
|
even if there are no rhythm issues. Include the following in your analysis: |
|
1. Syllable counts for each line and how they match the rhythm pattern |
|
2. Where stressed syllables align with strong beats |
|
3. Any potential misalignments or improvements |
|
|
|
Your lyrics: |
|
""" |
|
|
|
|
|
messages = [ |
|
{"role": "user", "content": content} |
|
] |
|
|
|
|
|
text = llm_tokenizer.apply_chat_template( |
|
messages, |
|
tokenize=False, |
|
add_generation_prompt=True |
|
) |
|
|
|
|
|
model_inputs = llm_tokenizer([text], return_tensors="pt").to(llm_model.device) |
|
|
|
|
|
generation_params = { |
|
"do_sample": True, |
|
"temperature": 0.6, |
|
"top_p": 0.95, |
|
"top_k": 50, |
|
"repetition_penalty": 1.2, |
|
"max_new_tokens": 2048 |
|
} |
|
|
|
|
|
generated_ids = llm_model.generate( |
|
**model_inputs, |
|
**generation_params |
|
) |
|
|
|
|
|
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() |
|
|
|
|
|
lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip() |
|
|
|
|
|
if "<thinking>" in lyrics and "</thinking>" in lyrics: |
|
lyrics = lyrics.split("</thinking>")[1].strip() |
|
|
|
|
|
thinking_markers = ["<think>", "</think>", "[thinking]", "[/thinking]", "I'll think step by step:"] |
|
for marker in thinking_markers: |
|
if marker in lyrics: |
|
parts = lyrics.split(marker) |
|
if len(parts) > 1: |
|
lyrics = parts[-1].strip() |
|
|
|
|
|
if templates_for_verification: |
|
verified_lyrics = verify_flexible_syllable_counts(lyrics, templates_for_verification) |
|
|
|
|
|
if "[Note: Potential rhythm mismatches" in verified_lyrics and "Detailed Alignment Analysis" in verified_lyrics: |
|
|
|
original_lyrics = lyrics.split("[Note:")[0].strip() |
|
|
|
|
|
analysis = verified_lyrics.split("[Note:")[1] |
|
|
|
|
|
if "stress misalignments" in analysis and len(templates_for_verification) > 0: |
|
|
|
refinement_prompt = f""" |
|
You need to fix rhythm issues in these lyrics. Here's the analysis of the problems: |
|
|
|
{analysis} |
|
|
|
Revise the lyrics to perfectly match the rhythm pattern while maintaining the theme. |
|
Focus on fixing the stress misalignments by placing stressed syllables on STRONG beats. |
|
|
|
Original lyrics: |
|
{original_lyrics} |
|
|
|
Improved lyrics with fixed rhythm: |
|
""" |
|
|
|
refinement_messages = [ |
|
{"role": "user", "content": refinement_prompt} |
|
] |
|
|
|
|
|
refinement_text = llm_tokenizer.apply_chat_template( |
|
refinement_messages, |
|
tokenize=False, |
|
add_generation_prompt=True |
|
) |
|
|
|
try: |
|
|
|
refinement_inputs = llm_tokenizer([refinement_text], return_tensors="pt").to(llm_model.device) |
|
|
|
|
|
refinement_params = { |
|
"do_sample": True, |
|
"temperature": 0.4, |
|
"top_p": 0.9, |
|
"repetition_penalty": 1.3, |
|
"max_new_tokens": 1024 |
|
} |
|
|
|
refined_ids = llm_model.generate( |
|
**refinement_inputs, |
|
**refinement_params |
|
) |
|
|
|
|
|
refined_output_ids = refined_ids[0][len(refinement_inputs.input_ids[0]):].tolist() |
|
refined_lyrics = llm_tokenizer.decode(refined_output_ids, skip_special_tokens=True).strip() |
|
|
|
|
|
refined_verified_lyrics = verify_flexible_syllable_counts(refined_lyrics, templates_for_verification) |
|
|
|
|
|
if "[Note: Potential rhythm mismatches" not in refined_verified_lyrics: |
|
lyrics = refined_lyrics |
|
elif refined_verified_lyrics.count("misalignments") < verified_lyrics.count("misalignments"): |
|
lyrics = refined_verified_lyrics |
|
else: |
|
lyrics = verified_lyrics |
|
except Exception as e: |
|
print(f"Error in lyrics refinement: {str(e)}") |
|
lyrics = verified_lyrics |
|
else: |
|
|
|
lyrics = verified_lyrics |
|
else: |
|
|
|
lyrics = verified_lyrics |
|
|
|
|
|
if "[RHYTHM_ANALYSIS_SECTION]" in lyrics: |
|
|
|
parts = lyrics.split("[RHYTHM_ANALYSIS_SECTION]") |
|
clean_lyrics = parts[0].strip() |
|
rhythm_analysis = parts[1].strip() |
|
|
|
|
|
lyrics = clean_lyrics + "\n\n[Note: Rhythm Analysis]\n" + rhythm_analysis |
|
|
|
|
|
elif "[Note: Potential rhythm mismatches" in lyrics: |
|
|
|
pass |
|
else: |
|
|
|
lyrics = lyrics + "\n\n[Note: Rhythm Analysis]\nNo rhythm issues detected. All syllables align well with the beat pattern." |
|
|
|
|
|
if isinstance(lyrics, str): |
|
|
|
if "[Note: Rhythm Analysis]" in lyrics: |
|
clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip() |
|
rhythm_analysis = lyrics.split("[Note: Rhythm Analysis]")[1] |
|
elif "[Note: Potential rhythm mismatches" in lyrics: |
|
clean_lyrics = lyrics.split("[Note:")[0].strip() |
|
rhythm_analysis = "[Note:" + lyrics.split("[Note:")[1] |
|
else: |
|
clean_lyrics = lyrics |
|
rhythm_analysis = "No rhythm analysis available" |
|
|
|
|
|
syllable_analysis = "=== SYLLABLE ANALYSIS ===\n\n" |
|
if templates_for_verification: |
|
syllable_analysis += "Template Analysis:\n" |
|
for i, template in enumerate(templates_for_verification): |
|
if i < min(len(templates_for_verification), 30): |
|
syllable_analysis += f"Line {i+1}:\n" |
|
if isinstance(template, dict): |
|
if "syllable_template" in template: |
|
syllable_analysis += f" Template: {template['syllable_template']}\n" |
|
if "syllable_count" in template: |
|
syllable_analysis += f" Expected syllables: {template['syllable_count']}\n" |
|
elif isinstance(template, str): |
|
syllable_analysis += f" Template: {template}\n" |
|
syllable_analysis += "\n" |
|
|
|
if len(templates_for_verification) > 30: |
|
syllable_analysis += f"... and {len(templates_for_verification) - 30} more lines\n\n" |
|
|
|
|
|
syllable_analysis += "\n" + structure_visualization |
|
|
|
|
|
prompt_template = "=== PROMPT TEMPLATE ===\n\n" |
|
prompt_template += "Genre: " + genre + "\n" |
|
prompt_template += f"Duration: {duration:.1f} seconds\n" |
|
prompt_template += f"Tempo: {tempo:.1f} BPM\n" |
|
prompt_template += f"Key: {key} {mode}\n" |
|
prompt_template += f"Primary Emotion: {primary_emotion}\n" |
|
prompt_template += f"Primary Theme: {primary_theme}\n\n" |
|
prompt_template += "Syllable Guidance:\n" + syllable_guidance_text |
|
|
|
|
|
return { |
|
"lyrics": clean_lyrics, |
|
"rhythm_analysis": rhythm_analysis, |
|
"syllable_analysis": syllable_analysis, |
|
"prompt_template": prompt_template |
|
} |
|
|
|
return lyrics |
|
|
|
def process_audio(audio_file): |
|
"""Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis.""" |
|
if audio_file is None: |
|
return "Please upload an audio file.", None, None |
|
|
|
try: |
|
print("Step 1/5: Extracting audio features...") |
|
|
|
audio_data = extract_audio_features(audio_file) |
|
|
|
print("Step 2/5: Verifying audio contains music...") |
|
|
|
try: |
|
is_music, ast_results = detect_music(audio_data) |
|
except Exception as e: |
|
print(f"Error in music detection: {str(e)}") |
|
return f"Error in music detection: {str(e)}", None, ast_results |
|
|
|
if not is_music: |
|
return "The uploaded audio does not appear to be music. Please upload a music file.", None, ast_results |
|
|
|
print("Step 3/5: Classifying music genre...") |
|
|
|
try: |
|
top_genres = classify_genre(audio_data) |
|
|
|
genre_results = format_genre_results(top_genres) |
|
except Exception as e: |
|
print(f"Error in genre classification: {str(e)}") |
|
return f"Error in genre classification: {str(e)}", None, ast_results |
|
|
|
print("Step 4/5: Analyzing music emotions, themes, and structure...") |
|
|
|
try: |
|
emotion_results = music_analyzer.analyze_music(audio_file) |
|
except Exception as e: |
|
print(f"Error in emotion analysis: {str(e)}") |
|
|
|
emotion_results = { |
|
"emotion_analysis": {"primary_emotion": "Unknown"}, |
|
"theme_analysis": {"primary_theme": "Unknown"}, |
|
"rhythm_analysis": {"tempo": 0}, |
|
"tonal_analysis": {"key": "Unknown", "mode": ""}, |
|
"summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"} |
|
} |
|
|
|
|
|
try: |
|
|
|
y, sr = load_audio(audio_file, SAMPLE_RATE) |
|
|
|
|
|
beats_info = detect_beats(y, sr) |
|
sections_info = detect_sections(y, sr) |
|
|
|
|
|
segments = [] |
|
|
|
|
|
|
|
if sections_info and len(sections_info) > 1: |
|
min_segment_duration = 1.5 |
|
|
|
for section in sections_info: |
|
section_start = section["start"] |
|
section_end = section["end"] |
|
section_duration = section["duration"] |
|
|
|
|
|
if section_duration < min_segment_duration * 1.5: |
|
segments.append({ |
|
"start": section_start, |
|
"end": section_end |
|
}) |
|
else: |
|
|
|
|
|
ideal_segment_duration = 3.0 |
|
segment_count = max(1, int(section_duration / ideal_segment_duration)) |
|
|
|
|
|
segment_duration = section_duration / segment_count |
|
for i in range(segment_count): |
|
segment_start = section_start + i * segment_duration |
|
segment_end = segment_start + segment_duration |
|
segments.append({ |
|
"start": segment_start, |
|
"end": segment_end |
|
}) |
|
|
|
elif beats_info and len(beats_info["beat_times"]) > 4: |
|
beats = beats_info["beat_times"] |
|
time_signature = beats_info.get("time_signature", 4) |
|
|
|
|
|
measure_size = time_signature |
|
for i in range(0, len(beats), measure_size): |
|
if i + 1 < len(beats): |
|
measure_start = beats[i] |
|
|
|
if i + measure_size < len(beats): |
|
measure_end = beats[i + measure_size] |
|
else: |
|
|
|
if i > 0: |
|
beat_interval = beats[i] - beats[i-1] |
|
measure_end = beats[-1] + (beat_interval * (measure_size - (len(beats) - i))) |
|
else: |
|
measure_end = audio_data["duration"] |
|
|
|
segments.append({ |
|
"start": measure_start, |
|
"end": measure_end |
|
}) |
|
|
|
else: |
|
|
|
segment_duration = 3.0 |
|
total_segments = max(4, int(audio_data["duration"] / segment_duration)) |
|
segment_duration = audio_data["duration"] / total_segments |
|
|
|
for i in range(total_segments): |
|
segment_start = i * segment_duration |
|
segment_end = segment_start + segment_duration |
|
segments.append({ |
|
"start": segment_start, |
|
"end": segment_end |
|
}) |
|
|
|
|
|
flexible_structure = { |
|
"beats": beats_info, |
|
"segments": segments |
|
} |
|
|
|
|
|
song_structure = { |
|
"beats": beats_info, |
|
"sections": sections_info, |
|
"flexible_structure": flexible_structure |
|
} |
|
|
|
|
|
song_structure["syllables"] = [] |
|
for section in sections_info: |
|
|
|
section_beats_info = { |
|
"beat_times": [beat for beat in beats_info["beat_times"] |
|
if section["start"] <= beat < section["end"]], |
|
"tempo": beats_info.get("tempo", 120) |
|
} |
|
if "beat_strengths" in beats_info: |
|
section_beats_info["beat_strengths"] = [ |
|
strength for i, strength in enumerate(beats_info["beat_strengths"]) |
|
if i < len(beats_info["beat_times"]) and |
|
section["start"] <= beats_info["beat_times"][i] < section["end"] |
|
] |
|
|
|
|
|
syllable_count = int(section["duration"] * (beats_info.get("tempo", 120) / 60) * 1.5) |
|
|
|
section_info = { |
|
"type": section["type"], |
|
"start": section["start"], |
|
"end": section["end"], |
|
"duration": section["duration"], |
|
"syllable_count": syllable_count, |
|
"beat_count": len(section_beats_info["beat_times"]) |
|
} |
|
|
|
|
|
if len(section_beats_info["beat_times"]) >= 2: |
|
section_info["syllable_template"] = create_flexible_syllable_templates( |
|
section_beats_info, |
|
genre=top_genres[0][0] |
|
) |
|
|
|
song_structure["syllables"].append(section_info) |
|
|
|
print(f"Successfully analyzed song structure with {len(segments)} segments") |
|
|
|
except Exception as e: |
|
print(f"Error analyzing song structure: {str(e)}") |
|
|
|
song_structure = None |
|
|
|
print("Step 5/5: Generating rhythmically aligned lyrics...") |
|
|
|
try: |
|
primary_genre, _ = top_genres[0] |
|
lyrics_result = generate_lyrics(primary_genre, audio_data["duration"], emotion_results, song_structure) |
|
|
|
|
|
if isinstance(lyrics_result, dict): |
|
lyrics = lyrics_result["lyrics"] |
|
rhythm_analysis = lyrics_result["rhythm_analysis"] |
|
syllable_analysis = lyrics_result["syllable_analysis"] |
|
prompt_template = lyrics_result["prompt_template"] |
|
else: |
|
lyrics = lyrics_result |
|
rhythm_analysis = "No detailed rhythm analysis available" |
|
syllable_analysis = "No syllable analysis available" |
|
prompt_template = "No prompt template available" |
|
|
|
except Exception as e: |
|
print(f"Error generating lyrics: {str(e)}") |
|
lyrics = f"Error generating lyrics: {str(e)}" |
|
rhythm_analysis = "No rhythm analysis available" |
|
syllable_analysis = "No syllable analysis available" |
|
prompt_template = "No prompt template available" |
|
|
|
|
|
results = { |
|
"genre_results": genre_results, |
|
"lyrics": lyrics, |
|
"rhythm_analysis": rhythm_analysis, |
|
"syllable_analysis": syllable_analysis, |
|
"prompt_template": prompt_template, |
|
"ast_results": ast_results |
|
} |
|
|
|
return results |
|
|
|
except Exception as e: |
|
error_msg = f"Error processing audio: {str(e)}" |
|
print(error_msg) |
|
return error_msg, None, [] |
|
|
|
|
|
with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo: |
|
gr.Markdown("# Music Genre Classifier & Lyrics Generator") |
|
gr.Markdown("Upload a music file to classify its genre, analyze its emotions, and generate perfectly aligned lyrics.") |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
audio_input = gr.Audio(label="Upload Music", type="filepath") |
|
submit_btn = gr.Button("Analyze & Generate", variant="primary") |
|
|
|
|
|
with gr.Accordion("About Music Genres", open=False): |
|
gr.Markdown(""" |
|
The system recognizes various music genres including: |
|
- Pop, Rock, Hip-Hop, R&B |
|
- Electronic, Dance, Techno, House |
|
- Jazz, Blues, Classical |
|
- Folk, Country, Acoustic |
|
- Metal, Punk, Alternative |
|
- And many others! |
|
|
|
For best results, use high-quality audio files (MP3, WAV, FLAC) with at least 10 seconds of music. |
|
""") |
|
|
|
with gr.Column(scale=2): |
|
|
|
with gr.Tabs(): |
|
with gr.TabItem("Analysis Results"): |
|
genre_output = gr.Textbox(label="Detected Genres", lines=4) |
|
|
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
emotion_output = gr.Textbox(label="Emotion & Structure Analysis", lines=8) |
|
with gr.Column(): |
|
ast_output = gr.Textbox(label="Audio Classification", lines=8) |
|
|
|
with gr.TabItem("Generated Lyrics"): |
|
lyrics_output = gr.Textbox(label="Lyrics", lines=18) |
|
|
|
with gr.TabItem("Rhythm Analysis"): |
|
rhythm_analysis_output = gr.Textbox(label="Syllable-Beat Alignment Analysis", lines=16) |
|
|
|
with gr.TabItem("Syllable Analysis"): |
|
syllable_analysis_output = gr.Textbox(label="Detailed Syllable Analysis", lines=16) |
|
prompt_template_output = gr.Textbox(label="Prompt Template", lines=16) |
|
|
|
|
|
def display_results(audio_file): |
|
if audio_file is None: |
|
return "Please upload an audio file.", "No emotion analysis available.", "No audio classification available.", "No lyrics generated.", "No rhythm analysis available.", "No syllable analysis available.", "No prompt template available." |
|
|
|
try: |
|
|
|
results = process_audio(audio_file) |
|
|
|
|
|
if isinstance(results, str) and "Error" in results: |
|
return results, "Error in analysis", "Error in classification", "No lyrics generated", "No rhythm analysis available", "No syllable analysis available", "No prompt template available" |
|
elif isinstance(results, tuple) and isinstance(results[0], str) and "Error" in results[0]: |
|
return results[0], "Error in analysis", "Error in classification", "No lyrics generated", "No rhythm analysis available", "No syllable analysis available", "No prompt template available" |
|
|
|
|
|
if isinstance(results, dict): |
|
genre_results = results.get("genre_results", "Genre classification failed") |
|
lyrics = results.get("lyrics", "Lyrics generation failed") |
|
ast_results = results.get("ast_results", []) |
|
|
|
|
|
clean_lyrics = results.get("clean_lyrics", lyrics) |
|
rhythm_analysis = results.get("rhythm_analysis", "No detailed rhythm analysis available") |
|
|
|
|
|
syllable_analysis = results.get("syllable_analysis", "No syllable analysis available") |
|
prompt_template = results.get("prompt_template", "No prompt template available") |
|
else: |
|
|
|
genre_results, lyrics, ast_results = results |
|
clean_lyrics = lyrics |
|
|
|
|
|
rhythm_analysis = "No detailed rhythm analysis available" |
|
if isinstance(lyrics, str): |
|
|
|
if "[Note: Rhythm Analysis]" in lyrics: |
|
clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip() |
|
rhythm_analysis = lyrics.split("[Note: Rhythm Analysis]")[1] |
|
|
|
elif "[Note: Potential rhythm mismatches" in lyrics: |
|
clean_lyrics = lyrics.split("[Note:")[0].strip() |
|
rhythm_analysis = "[Note:" + lyrics.split("[Note:")[1] |
|
|
|
|
|
syllable_analysis = "No syllable analysis available" |
|
prompt_template = "No prompt template available" |
|
|
|
|
|
try: |
|
emotion_results = music_analyzer.analyze_music(audio_file) |
|
emotion_text = f"Tempo: {emotion_results['summary']['tempo']:.1f} BPM\n" |
|
emotion_text += f"Key: {emotion_results['summary']['key']} {emotion_results['summary']['mode']}\n" |
|
emotion_text += f"Primary Emotion: {emotion_results['summary']['primary_emotion']}\n" |
|
emotion_text += f"Primary Theme: {emotion_results['summary']['primary_theme']}" |
|
|
|
|
|
try: |
|
audio_data = extract_audio_features(audio_file) |
|
song_structure = calculate_detailed_song_structure(audio_data) |
|
|
|
emotion_text += "\n\nSong Structure:\n" |
|
for section in song_structure["syllables"]: |
|
emotion_text += f"- {section['type'].capitalize()}: {section['start']:.1f}s to {section['end']:.1f}s " |
|
emotion_text += f"({section['duration']:.1f}s, {section['beat_count']} beats, " |
|
|
|
if "syllable_template" in section: |
|
emotion_text += f"template: {section['syllable_template']})\n" |
|
else: |
|
emotion_text += f"~{section['syllable_count']} syllables)\n" |
|
|
|
|
|
if "flexible_structure" in song_structure and song_structure["flexible_structure"]: |
|
flexible = song_structure["flexible_structure"] |
|
if "segments" in flexible and flexible["segments"]: |
|
emotion_text += "\nDetailed Rhythm Analysis:\n" |
|
for i, segment in enumerate(flexible["segments"][:5]): |
|
emotion_text += f"- Segment {i+1}: {segment['start']:.1f}s to {segment['end']:.1f}s, " |
|
emotion_text += f"pattern: {segment.get('syllable_template', 'N/A')}\n" |
|
|
|
if len(flexible["segments"]) > 5: |
|
emotion_text += f" (+ {len(flexible['segments']) - 5} more segments)\n" |
|
|
|
except Exception as e: |
|
print(f"Error displaying song structure: {str(e)}") |
|
|
|
|
|
except Exception as e: |
|
print(f"Error in emotion analysis: {str(e)}") |
|
emotion_text = f"Error in emotion analysis: {str(e)}" |
|
|
|
|
|
if ast_results and isinstance(ast_results, list): |
|
ast_text = "Audio Classification Results:\n" |
|
for result in ast_results[:5]: |
|
ast_text += f"{result['label']}: {result['score']*100:.2f}%\n" |
|
else: |
|
ast_text = "No valid audio classification results available." |
|
|
|
|
|
return genre_results, emotion_text, ast_text, clean_lyrics, rhythm_analysis, syllable_analysis, prompt_template |
|
|
|
except Exception as e: |
|
error_msg = f"Error: {str(e)}" |
|
print(error_msg) |
|
return error_msg, "Error in emotion analysis", "Error in audio classification", "No lyrics generated", "No rhythm analysis available", "No syllable analysis available", "No prompt template available" |
|
|
|
|
|
submit_btn.click( |
|
fn=display_results, |
|
inputs=[audio_input], |
|
outputs=[genre_output, emotion_output, ast_output, lyrics_output, rhythm_analysis_output, syllable_analysis_output, prompt_template_output] |
|
) |
|
|
|
|
|
with gr.Accordion("How it works", open=False): |
|
gr.Markdown(""" |
|
## Advanced Lyrics Generation Process |
|
|
|
1. **Audio Analysis**: The system analyzes your uploaded music file using multiple machine learning models. |
|
|
|
2. **Genre Classification**: A specialized neural network identifies the musical genre, detecting subtle patterns in the audio. |
|
|
|
3. **Emotional Analysis**: The system examines harmonic, rhythmic, and timbral features to determine the emotional qualities of the music. |
|
|
|
4. **Rhythm Mapping**: Advanced beat detection algorithms create a detailed rhythmic map of the music, identifying: |
|
- Strong and weak beats |
|
- Natural phrase boundaries |
|
- Time signature and tempo variations |
|
|
|
5. **Syllable Template Creation**: For each musical phrase, the system generates precise syllable templates that reflect: |
|
- Beat stress patterns (strong, medium, weak) |
|
- Appropriate syllable counts based on tempo |
|
- Genre-specific rhythmic qualities |
|
|
|
6. **Lyrics Generation**: Using the detected genre, emotion, and rhythm patterns, a large language model generates lyrics that: |
|
- Match the emotional quality of the music |
|
- Follow the precise syllable templates |
|
- Align stressed syllables with strong beats |
|
- Maintain genre-appropriate style and themes |
|
|
|
7. **Rhythm Verification**: The system verifies the generated lyrics, analyzing: |
|
- Syllable count accuracy |
|
- Stress alignment with strong beats |
|
- Word stress patterns |
|
|
|
8. **Refinement**: If significant rhythm mismatches are detected, the system can automatically refine the lyrics for better alignment. |
|
|
|
This multi-step process creates lyrics that feel naturally connected to the music, as if they were written specifically for it. |
|
""") |
|
|
|
|
|
demo.launch() |