root committed
Commit 38b696f · 1 parent: ab30d87
music confidence

app.py CHANGED
@@ -22,6 +22,7 @@ from utils import (
     preprocess_audio_for_model
 )
 from emotionanalysis import MusicAnalyzer
+import librosa
 
 # Login to Hugging Face Hub if token is provided
 if "HF_TOKEN" in os.environ:

@@ -173,34 +174,219 @@ def classify_genre(audio_data):
     # Fallback: return a default genre if everything fails
     return [("rock", 1.0)]
 
[old lines 176-203 removed; their content was not captured in this view]
+def detect_music(audio_data):
+    """Detect if the audio is music using the MIT AST model."""
+    try:
+        # First attempt: Try using the pipeline if available
+        if 'music_detector' in globals():
+            results = music_detector(audio_data["path"])
+            # Look for music-related classes in the results
+            music_confidence = 0.0
+            for result in results:
+                label = result["label"].lower()
+                if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]):
+                    music_confidence = max(music_confidence, result["score"])
+            return music_confidence >= 0.2, results
+
+        # Second attempt: Use manually loaded model components
+        elif 'music_processor' in globals() and 'music_model' in globals():
+            # Process audio input with feature extractor
+            inputs = music_processor(
+                audio_data["waveform"],
+                sampling_rate=audio_data["sample_rate"],
+                return_tensors="pt"
+            )
+
+            with torch.no_grad():
+                outputs = music_model(**inputs)
+                predictions = outputs.logits.softmax(dim=-1)
+
+            # Get the top predictions
+            values, indices = torch.topk(predictions, 5)
+
+            # Map indices to labels
+            labels = music_model.config.id2label
+
+            # Check for music-related classes
+            music_confidence = 0.0
+            results = []
+
+            for i, (value, index) in enumerate(zip(values[0], indices[0])):
+                label = labels[index.item()].lower()
+                score = value.item()
+                results.append({"label": label, "score": score})
+
+                if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]):
+                    music_confidence = max(music_confidence, score)
+
+            return music_confidence >= 0.2, results
+
+        else:
+            raise ValueError("No music detection model available")
+
+    except Exception as e:
+        print(f"Error in music detection: {str(e)}")
+        return False, []
+
+def detect_beats(y, sr):
+    """Detect beats in the audio using librosa."""
+    # Get tempo and beat frames
+    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
+
+    # Convert beat frames to time in seconds
+    beat_times = librosa.frames_to_time(beat_frames, sr=sr)
+
+    return {
+        "tempo": tempo,
+        "beat_frames": beat_frames,
+        "beat_times": beat_times,
+        "beat_count": len(beat_times)
+    }
+
+def detect_sections(y, sr):
+    """Detect sections (verse, chorus, etc.) in the audio."""
+    # Compute the spectral contrast
+    S = np.abs(librosa.stft(y))
+    contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
+
+    # Compute the chroma features
+    chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
+
+    # Use a combination of contrast and chroma to find segment boundaries
+    # Average over frequency axis to get time series
+    contrast_avg = np.mean(contrast, axis=0)
+    chroma_avg = np.mean(chroma, axis=0)
+
+    # Normalize
+    contrast_avg = (contrast_avg - np.mean(contrast_avg)) / np.std(contrast_avg)
+    chroma_avg = (chroma_avg - np.mean(chroma_avg)) / np.std(chroma_avg)
+
+    # Combine features
+    combined = contrast_avg + chroma_avg
+
+    # Detect structural boundaries
+    bounds = librosa.segment.agglomerative(combined, 3)  # Adjust for typical song structures
+
+    # Convert to time in seconds
+    bound_times = librosa.frames_to_time(bounds, sr=sr)
+
+    # Estimate section types based on position and length
+    sections = []
+    for i in range(len(bound_times) - 1):
+        start = bound_times[i]
+        end = bound_times[i+1]
+        duration = end - start
+
+        # Simple heuristic to label sections
+        if i == 0:
+            section_type = "intro"
+        elif i == len(bound_times) - 2:
+            section_type = "outro"
+        elif i % 2 == 1:  # Alternating verse/chorus pattern
+            section_type = "chorus"
+        else:
+            section_type = "verse"
+
+        # If we have a short section in the middle, it might be a bridge
+        if 0 < i < len(bound_times) - 2 and duration < 20:
+            section_type = "bridge"
+
+        sections.append({
+            "type": section_type,
+            "start": start,
+            "end": end,
+            "duration": duration
+        })
+
+    return sections
+
+def estimate_syllables_per_section(beats_info, sections):
+    """Estimate the number of syllables needed for each section based on beats."""
+    syllables_per_section = []
+
+    for section in sections:
+        # Find beats that fall within this section
+        section_beats = [
+            beat for beat in beats_info["beat_times"]
+            if section["start"] <= beat < section["end"]
+        ]
+
+        # Calculate syllables based on section type and beat count
+        beat_count = len(section_beats)
+
+        # Adjust syllable count based on section type and genre conventions
+        if section["type"] == "verse":
+            # Verses typically have more syllables per beat (more words)
+            syllable_count = beat_count * 1.2
+        elif section["type"] == "chorus":
+            # Choruses often have fewer syllables per beat (more sustained notes)
+            syllable_count = beat_count * 0.9
+        elif section["type"] == "bridge":
+            syllable_count = beat_count * 1.0
+        else:  # intro, outro
+            syllable_count = beat_count * 0.5  # Often instrumental or sparse lyrics
+
+        syllables_per_section.append({
+            "type": section["type"],
+            "start": section["start"],
+            "end": section["end"],
+            "duration": section["duration"],
+            "beat_count": beat_count,
+            "syllable_count": int(syllable_count)
+        })
+
+    return syllables_per_section
+
+def calculate_detailed_song_structure(audio_data):
+    """Calculate detailed song structure for better lyrics generation."""
+    y = audio_data["waveform"]
+    sr = audio_data["sample_rate"]
+
+    # Detect beats
+    beats_info = detect_beats(y, sr)
+
+    # Detect sections
+    sections = detect_sections(y, sr)
+
+    # Estimate syllables per section
+    syllables_info = estimate_syllables_per_section(beats_info, sections)
+
+    return {
+        "beats": beats_info,
+        "sections": sections,
+        "syllables": syllables_info
+    }
+
+def generate_lyrics(genre, duration, emotion_results, song_structure=None):
+    """Generate lyrics based on genre, duration, emotion, and detailed song structure."""
+    # If no song structure is provided, fall back to the original approach
+    if song_structure is None:
+        # Calculate appropriate lyrics length based on audio duration
+        lines_count = calculate_lyrics_length(duration)
+
+        # Calculate approximate number of verses and chorus
+        if lines_count <= 6:
+            # Very short song - one verse and chorus
+            verse_lines = 2
+            chorus_lines = 2
+        elif lines_count <= 10:
+            # Medium song - two verses and chorus
+            verse_lines = 3
+            chorus_lines = 2
+        else:
+            # Longer song - two verses, chorus, and bridge
+            verse_lines = 3
+            chorus_lines = 2
+
+        # Extract emotion and theme data from analysis results
+        primary_emotion = emotion_results["emotion_analysis"]["primary_emotion"]
+        primary_theme = emotion_results["theme_analysis"]["primary_theme"]
+        tempo = emotion_results["rhythm_analysis"]["tempo"]
+        key = emotion_results["tonal_analysis"]["key"]
+        mode = emotion_results["tonal_analysis"]["mode"]
+
+        # Create prompt for the LLM
+        prompt = f"""
 You are a talented songwriter who specializes in {genre} music.
 Write original {genre} song lyrics for a song that is {duration:.1f} seconds long.
 

@@ -223,6 +409,60 @@ The lyrics should:
 - Match the song duration of {duration:.1f} seconds
 - Keep each line concise and impactful
 
+Your lyrics:
+"""
+
+    else:
+        # Extract emotion and theme data from analysis results
+        primary_emotion = emotion_results["emotion_analysis"]["primary_emotion"]
+        primary_theme = emotion_results["theme_analysis"]["primary_theme"]
+        tempo = emotion_results["rhythm_analysis"]["tempo"]
+        key = emotion_results["tonal_analysis"]["key"]
+        mode = emotion_results["tonal_analysis"]["mode"]
+
+        # Create detailed structure instructions for the LLM
+        structure_instructions = "Follow this exact song structure with specified syllable counts:\n"
+
+        for section in song_structure["syllables"]:
+            section_type = section["type"].capitalize()
+            start_time = f"{section['start']:.1f}"
+            end_time = f"{section['end']:.1f}"
+            duration = f"{section['duration']:.1f}"
+            beat_count = section["beat_count"]
+            syllable_count = section["syllable_count"]
+
+            structure_instructions += f"* {section_type} ({start_time}s - {end_time}s, {duration}s duration):\n"
+            structure_instructions += f"  - {beat_count} beats\n"
+            structure_instructions += f"  - Approximately {syllable_count} syllables\n"
+
+        # Calculate approximate total number of lines based on syllables
+        total_syllables = sum(section["syllable_count"] for section in song_structure["syllables"])
+        estimated_lines = max(4, int(total_syllables / 8))  # Rough estimate: average 8 syllables per line
+
+        # Create prompt for the LLM
+        prompt = f"""
+You are a talented songwriter who specializes in {genre} music.
+Write original {genre} song lyrics for a song that is {duration:.1f} seconds long.
+
+Music analysis has detected the following qualities in the music:
+- Tempo: {tempo:.1f} BPM
+- Key: {key} {mode}
+- Primary emotion: {primary_emotion}
+- Primary theme: {primary_theme}
+
+{structure_instructions}
+
+The lyrics should:
+- Perfectly capture the essence and style of {genre} music
+- Express the {primary_emotion} emotion and {primary_theme} theme
+- Have approximately {estimated_lines} lines total, distributed across sections
+- For each line, include a syllable count that matches the beats in that section
+- Include timestamps [MM:SS] at the beginning of each section
+- Be completely original
+- Match the exact song structure provided above
+
+Important: Each section should have lyrics with syllable counts matching the beats!
+
 Your lyrics:
 """
 

@@ -239,76 +479,51 @@ Your lyrics:
     # Extract and clean generated lyrics
     lyrics = response[0]["generated_text"].strip()
 
-    # Add section labels if they're not present
-    if "Verse" not in lyrics and "Chorus" not in lyrics:
+    # Add section labels if they're not present (in fallback mode)
+    if song_structure is None and "Verse" not in lyrics and "Chorus" not in lyrics:
         lines = lyrics.split('\n')
         formatted_lyrics = []
         current_section = "Verse"
+        verse_count = 0
+
         for i, line in enumerate(lines):
             if i == 0:
                 formatted_lyrics.append("[Verse]")
+                verse_count = 1
             elif i == verse_lines:
                 formatted_lyrics.append("\n[Chorus]")
             elif i == verse_lines + chorus_lines and lines_count > 10:
                 formatted_lyrics.append("\n[Bridge]")
+            elif i == verse_lines + chorus_lines + (2 if lines_count > 10 else 0):
+                formatted_lyrics.append("\n[Verse]")
+                verse_count = 2
             formatted_lyrics.append(line)
+
         lyrics = '\n'.join(formatted_lyrics)
 
[old lines 257-263 removed; content not captured in this view]
-            results = music_detector(audio_data["path"])
-            # Look for music-related classes in the results
-            music_confidence = 0.0
-            for result in results:
-                label = result["label"].lower()
-                if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]):
-                    music_confidence = max(music_confidence, result["score"])
-            return music_confidence >= 0.2, results
-
-        # Second attempt: Use manually loaded model components
-        elif 'music_processor' in globals() and 'music_model' in globals():
-            # Process audio input with feature extractor
-            inputs = music_processor(
-                audio_data["waveform"],
-                sampling_rate=audio_data["sample_rate"],
-                return_tensors="pt"
-            )
-
-            with torch.no_grad():
-                outputs = music_model(**inputs)
-                predictions = outputs.logits.softmax(dim=-1)
-
-            # Get the top predictions
-            values, indices = torch.topk(predictions, 5)
-
-            # Map indices to labels
-            labels = music_model.config.id2label
-
-            # Check for music-related classes
-            music_confidence = 0.0
-            results = []
[old lines 295-308 removed; content not captured in this view]
-    except Exception as e:
-        print(f"Error in music detection: {str(e)}")
-        return False, []
+    # Add timestamps in detailed mode if needed
+    elif song_structure is not None:
+        # Ensure the lyrics have proper section headings with timestamps
+        for section in song_structure["syllables"]:
+            section_type = section["type"].capitalize()
+            start_time_str = f"{int(section['start']) // 60:02d}:{int(section['start']) % 60:02d}"
+            section_header = f"[{start_time_str}] {section_type}"
+
+            # Check if this section header is missing and add it if needed
+            if section_header not in lyrics and section["type"] not in ["intro", "outro"]:
+                # Find where this section might be based on timestamp
+                time_matches = [
+                    idx for idx, line in enumerate(lyrics.split('\n'))
+                    if f"{int(section['start']) // 60:02d}:{int(section['start']) % 60:02d}" in line
+                ]
+
+                if time_matches:
+                    lines = lyrics.split('\n')
+                    line_idx = time_matches[0]
+                    lines[line_idx] = section_header
+                    lyrics = '\n'.join(lines)
+
+    return lyrics
 
 def process_audio(audio_file):
     """Main function to process audio file, classify genre, and generate lyrics."""

@@ -344,12 +559,25 @@ def process_audio(audio_file):
     except Exception as e:
         print(f"Error in emotion analysis: {str(e)}")
         # Continue even if emotion analysis fails
-        emotion_results = {
+        emotion_results = {
+            "emotion_analysis": {"primary_emotion": "Unknown"},
+            "theme_analysis": {"primary_theme": "Unknown"},
+            "rhythm_analysis": {"tempo": 0},
+            "tonal_analysis": {"key": "Unknown", "mode": ""}
+        }
 
-    #
+    # Calculate detailed song structure for better lyrics alignment
+    try:
+        song_structure = calculate_detailed_song_structure(audio_data)
+    except Exception as e:
+        print(f"Error analyzing song structure: {str(e)}")
+        # Continue with a simpler approach if this fails
+        song_structure = None
+
+    # Generate lyrics based on top genre, emotion analysis, and song structure
     try:
         primary_genre, _ = top_genres[0]
-        lyrics = generate_lyrics(primary_genre, audio_data["duration"], emotion_results)
+        lyrics = generate_lyrics(primary_genre, audio_data["duration"], emotion_results, song_structure)
     except Exception as e:
         print(f"Error generating lyrics: {str(e)}")
         lyrics = f"Error generating lyrics: {str(e)}"

@@ -396,6 +624,20 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
             emotion_text += f"Key: {emotion_results['summary']['key']} {emotion_results['summary']['mode']}\n"
             emotion_text += f"Primary Emotion: {emotion_results['summary']['primary_emotion']}\n"
             emotion_text += f"Primary Theme: {emotion_results['summary']['primary_theme']}"
+
+            # Add detailed song structure information if available
+            try:
+                audio_data = extract_audio_features(audio_file)
+                song_structure = calculate_detailed_song_structure(audio_data)
+
+                emotion_text += "\n\nSong Structure:\n"
+                for section in song_structure["syllables"]:
+                    emotion_text += f"- {section['type'].capitalize()}: {section['start']:.1f}s to {section['end']:.1f}s "
+                    emotion_text += f"({section['duration']:.1f}s, {section['beat_count']} beats, ~{section['syllable_count']} syllables)\n"
+            except Exception as e:
+                print(f"Error displaying song structure: {str(e)}")
+                # Continue without showing structure details
+
         except Exception as e:
             print(f"Error in emotion analysis: {str(e)}")
             emotion_text = f"Error in emotion analysis: {str(e)}"

@@ -425,9 +667,10 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
     1. Upload an audio file of your choice
    2. The system will classify the genre using the dima806/music_genres_classification model
    3. The system will analyze the musical emotion and theme using advanced audio processing
-    4.
-    5.
+    4. The system will identify the song structure, beats, and timing patterns
+    5. Based on the detected genre, emotion, and structure, it will generate lyrics that match the beats, sections, and flow of the music
+    6. The lyrics will include appropriate section markings and syllable counts to align with the music
     """)
 
 # Launch the app
-demo.launch()
+demo.launch()
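The headline change (per the commit title) is the confidence-based music check: detect_music scans the classifier output for music-related labels and returns a (bool, results) pair, treating the input as music once any such label scores at least 0.2. A minimal sketch of how a caller might consume that return value follows; the gating logic and message are illustrative only and are not part of this commit.

# Illustrative only, not code from this commit.
# Assumes audio_data has already been built with "path", "waveform", and "sample_rate".
is_music, detection_results = detect_music(audio_data)
if not is_music:
    top = max(detection_results, key=lambda r: r["score"], default=None)
    top_label = top["label"] if top else "unknown"
    print(f"Input does not look like music (top class: {top_label}); skipping lyrics generation.")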
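For local testing, the new structure-analysis helpers can also be exercised on their own. The snippet below is a sketch rather than code from the commit: it assumes librosa and numpy are installed, that "song.wav" is a real file, and it builds the audio_data dict in the shape these helpers read (path, waveform, sample_rate, duration).

import librosa

# Load audio in the form the helpers consume: a mono waveform plus its sample rate.
y, sr = librosa.load("song.wav", sr=22050, mono=True)
audio_data = {
    "path": "song.wav",
    "waveform": y,
    "sample_rate": sr,
    "duration": len(y) / sr,
}

structure = calculate_detailed_song_structure(audio_data)

# Recent librosa releases may return tempo as a one-element array, so coerce to float.
print(f"Tempo: {float(structure['beats']['tempo']):.1f} BPM, {structure['beats']['beat_count']} beats")
for sec in structure["syllables"]:
    print(f"{sec['type']:>7}: {sec['start']:.1f}s to {sec['end']:.1f}s, "
          f"{sec['beat_count']} beats, ~{sec['syllable_count']} syllables")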