|
import os |
|
import io |
|
import gradio as gr |
|
import torch |
|
import numpy as np |
|
from transformers import ( |
|
AutoModelForAudioClassification, |
|
AutoFeatureExtractor, |
|
AutoTokenizer, |
|
pipeline, |
|
AutoModelForCausalLM, |
|
BitsAndBytesConfig |
|
) |
|
from huggingface_hub import login |
|
from utils import ( |
|
load_audio, |
|
extract_audio_duration, |
|
extract_mfcc_features, |
|
calculate_lyrics_length, |
|
format_genre_results, |
|
ensure_cuda_availability, |
|
preprocess_audio_for_model |
|
) |
|
from emotionanalysis import MusicAnalyzer |
|
|
|
|
|
# Log in to the Hugging Face Hub only when a token is provided through the
# environment; otherwise the login step is skipped entirely.
if os.getenv("HF_TOKEN") is not None:
    login(token=os.environ["HF_TOKEN"])

# Hugging Face model identifiers used by this app.
GENRE_MODEL_NAME = "dima806/music_genres_classification"  # genre classifier
MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593"  # music / non-music check
LLM_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"  # lyrics generator

# Rate handed to load_audio() when reading an upload (presumably in Hz).
SAMPLE_RATE = 22050

# Outcome of the project's CUDA check; decides which device the
# audio-classification pipelines are created on.
CUDA_AVAILABLE = ensure_cuda_availability()
|
|
|
|
|
# Load the music detector. The ready-made pipeline is preferred; when it
# cannot be created, fall back to the bare feature extractor + model pair,
# which detect_music() uses whenever `music_detector` is not defined.
print(f"Loading music detection model: {MUSIC_DETECTION_MODEL}")
try:
    music_detector = pipeline(
        "audio-classification",
        model=MUSIC_DETECTION_MODEL,
        device=-1 if not CUDA_AVAILABLE else 0,
    )
except Exception as pipeline_error:
    print(f"Error creating music detection pipeline: {pipeline_error!s}")

    try:
        music_processor = AutoFeatureExtractor.from_pretrained(MUSIC_DETECTION_MODEL)
        music_model = AutoModelForAudioClassification.from_pretrained(MUSIC_DETECTION_MODEL)
    except Exception as fallback_error:
        # Neither loading strategy worked: abort start-up.
        print(f"Error loading music detection model components: {fallback_error!s}")
        raise RuntimeError(f"Could not load music detection model: {fallback_error!s}")
    else:
        print("Successfully loaded music detection model and feature extractor")
else:
    print("Successfully loaded music detection pipeline")
|
|
|
|
|
# Load the genre classifier. The ready-made pipeline is preferred; when it
# cannot be created, fall back to the bare feature extractor + model pair,
# which classify_genre() uses whenever `genre_classifier` is not defined.
print(f"Loading audio classification model: {GENRE_MODEL_NAME}")
try:
    genre_classifier = pipeline(
        "audio-classification",
        model=GENRE_MODEL_NAME,
        device=-1 if not CUDA_AVAILABLE else 0,
    )
except Exception as pipeline_error:
    print(f"Error creating pipeline: {pipeline_error!s}")

    try:
        genre_processor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME)
        genre_model = AutoModelForAudioClassification.from_pretrained(GENRE_MODEL_NAME)
    except Exception as fallback_error:
        # Neither loading strategy worked: abort start-up.
        print(f"Error loading model components: {fallback_error!s}")
        raise RuntimeError(f"Could not load genre classification model: {fallback_error!s}")
    else:
        print("Successfully loaded audio classification model and feature extractor")
else:
    print("Successfully loaded audio classification pipeline")
|
|
|
|
|
# Quantization settings for the lyrics LLM: 4-bit weights using the "nf4"
# quantization type, with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Tokenizer and quantized causal LM for LLM_MODEL_NAME.
llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
llm_model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_NAME,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map="auto",
)

# Text-generation pipeline called by generate_lyrics(); created with a
# 512-token cap on newly generated text.
llm_pipeline = pipeline(
    "text-generation",
    tokenizer=llm_tokenizer,
    model=llm_model,
    max_new_tokens=512,
)

# Shared analyzer whose analyze_music() results provide the tempo, key,
# emotion and theme shown in the UI and fed into the lyrics prompt.
music_analyzer = MusicAnalyzer()
|
|
|
def extract_audio_features(audio_file):
    """Load an audio file and gather the data the rest of the app works on.

    Args:
        audio_file: Location of the audio; forwarded to ``load_audio`` and
            echoed back under the ``"path"`` key.

    Returns:
        dict with the keys ``"features"`` (result of
        ``extract_mfcc_features`` with ``n_mfcc=20``), ``"duration"``
        (result of ``extract_audio_duration``), ``"waveform"``,
        ``"sample_rate"`` and ``"path"``.

    Raises:
        ValueError: If loading the audio or any extraction step fails; the
            underlying error text is embedded in the message.
    """
    try:
        waveform, sample_rate = load_audio(audio_file, SAMPLE_RATE)

        # load_audio signals failure by handing back None for either value.
        nothing_loaded = waveform is None or sample_rate is None
        if nothing_loaded:
            raise ValueError("Failed to load audio data")

        length = extract_audio_duration(waveform, sample_rate)
        mfcc_summary = extract_mfcc_features(waveform, sample_rate, n_mfcc=20)

        return dict(
            features=mfcc_summary,
            duration=length,
            waveform=waveform,
            sample_rate=sample_rate,
            path=audio_file,
        )
    except Exception as err:
        # Every failure (including the ValueError above) is reported and
        # re-raised as a ValueError with a common prefix.
        print(f"Error extracting audio features: {err!s}")
        raise ValueError(f"Failed to extract audio features: {err!s}")
|
|
|
def classify_genre(audio_data):
    """Return up to three ``(genre, confidence)`` pairs for the audio.

    Uses the ``genre_classifier`` pipeline when that module-level name
    exists (it is given ``audio_data["path"]``); otherwise runs the
    ``genre_processor`` / ``genre_model`` pair on ``audio_data["waveform"]``
    at ``audio_data["sample_rate"]``.

    Any failure, including the absence of both model variants, is printed
    and answered with the placeholder ``[("rock", 1.0)]``.
    """
    try:
        module_names = globals()

        # Preferred path: the high-level pipeline works straight from the file.
        if 'genre_classifier' in module_names:
            predictions = genre_classifier(audio_data["path"])
            return [(item["label"], item["score"]) for item in predictions[:3]]

        # Fallback path: run the feature extractor and model by hand.
        if 'genre_processor' in module_names and 'genre_model' in module_names:
            encoded = genre_processor(
                audio_data["waveform"],
                sampling_rate=audio_data["sample_rate"],
                return_tensors="pt"
            )

            with torch.no_grad():
                logits = genre_model(**encoded).logits
                probabilities = logits.softmax(dim=-1)

            top_scores, top_ids = torch.topk(probabilities, 3)
            id_to_genre = genre_model.config.id2label

            # Row 0 is read because a single clip is classified per call.
            return [
                (id_to_genre[class_id.item()], score.item())
                for score, class_id in zip(top_scores[0], top_ids[0])
            ]

        raise ValueError("No genre classification model available")

    except Exception as err:
        print(f"Error in genre classification: {err!s}")
        # Best-effort default so the rest of the flow can continue.
        return [("rock", 1.0)]
|
|
|
def _analysis_value(emotion_results, section, field, default):
    """Fetch one analysis field, tolerating partial analyzer results.

    Looks in ``emotion_results[section]`` first, then in
    ``emotion_results["summary"]`` (which carries the same field names),
    and finally falls back to *default*.
    """
    for container in (section, "summary"):
        values = emotion_results.get(container)
        if isinstance(values, dict) and field in values:
            return values[field]
    return default


def generate_lyrics(genre, duration, emotion_results):
    """Generate song lyrics for a genre, sized to the audio duration.

    Args:
        genre: Genre name inserted into the prompt.
        duration: Song length; passed to ``calculate_lyrics_length`` and
            formatted with one decimal as seconds in the prompt.
        emotion_results: Mapping of analysis results. ``primary_emotion``,
            ``primary_theme``, ``tempo``, ``key`` and ``mode`` are read from
            the ``emotion_analysis`` / ``theme_analysis`` /
            ``rhythm_analysis`` / ``tonal_analysis`` sections; a missing
            section or field is looked up in ``"summary"`` instead, and
            placeholder values are used as a last resort, so a summary-only
            mapping no longer raises ``KeyError``.

    Returns:
        The generated lyrics as a string. When the model output mentions
        neither "Verse" nor "Chorus", ``[Verse]`` / ``[Chorus]`` (and
        ``[Bridge]`` for songs longer than 10 lines) headers are inserted
        at fixed line positions.
    """
    lines_count = calculate_lyrics_length(duration)

    # Only very short songs get a two-line verse; the chorus is always two
    # lines, and a bridge is requested only for songs longer than 10 lines.
    verse_lines = 2 if lines_count <= 6 else 3
    chorus_lines = 2
    include_bridge = lines_count > 10

    primary_emotion = _analysis_value(
        emotion_results, "emotion_analysis", "primary_emotion", "Unknown")
    primary_theme = _analysis_value(
        emotion_results, "theme_analysis", "primary_theme", "Unknown")
    tempo = _analysis_value(emotion_results, "rhythm_analysis", "tempo", 0)
    key = _analysis_value(emotion_results, "tonal_analysis", "key", "Unknown")
    mode = _analysis_value(emotion_results, "tonal_analysis", "mode", "")

    # Build the structure bullets explicitly so that no empty "* " bullet is
    # left in the prompt when there is no bridge.
    structure = [
        f"  * Verse: {verse_lines} lines",
        f"  * Chorus: {chorus_lines} lines",
    ]
    if include_bridge:
        structure.append("  * Bridge: 2 lines")

    prompt = "\n".join([
        "",
        f"You are a talented songwriter who specializes in {genre} music.",
        f"Write original {genre} song lyrics for a song that is {duration:.1f} seconds long.",
        "",
        "Music analysis has detected the following qualities in the music:",
        f"- Tempo: {tempo:.1f} BPM",
        f"- Key: {key} {mode}",
        f"- Primary emotion: {primary_emotion}",
        f"- Primary theme: {primary_theme}",
        "",
        "The lyrics should:",
        f"- Perfectly capture the essence and style of {genre} music",
        f"- Express the {primary_emotion} emotion and {primary_theme} theme",
        f"- Be approximately {lines_count} lines long",
        "- Have a coherent theme and flow",
        "- Follow this structure:",
        *structure,
        "- Be completely original",
        f"- Match the song duration of {duration:.1f} seconds",
        "- Keep each line concise and impactful",
        "",
        "Your lyrics:",
        "",
    ])

    response = llm_pipeline(
        prompt,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        return_full_text=False
    )

    lyrics = response[0]["generated_text"].strip()

    # If the model produced no section markers at all, insert them at the
    # line positions implied by the requested structure.
    if "Verse" not in lyrics and "Chorus" not in lyrics:
        section_starts = {0: "[Verse]", verse_lines: "\n[Chorus]"}
        if include_bridge:
            section_starts[verse_lines + chorus_lines] = "\n[Bridge]"

        formatted_lyrics = []
        for i, line in enumerate(lyrics.split('\n')):
            header = section_starts.get(i)
            if header is not None:
                formatted_lyrics.append(header)
            formatted_lyrics.append(line)
        lyrics = '\n'.join(formatted_lyrics)

    return lyrics
|
|
|
def detect_music(audio_data):
    """Decide whether the audio looks like music.

    Uses the ``music_detector`` pipeline when that module-level name exists
    (it is given ``audio_data["path"]``); otherwise runs the
    ``music_processor`` / ``music_model`` pair on ``audio_data["waveform"]``
    and keeps the five highest-scoring labels.

    Returns:
        ``(is_music, results)``. ``is_music`` is True when the best score
        among labels containing "music", "song", "singing" or "instrument"
        is at least 0.5. ``results`` is the list of label/score dicts that
        was inspected. On any error the function prints it and returns
        ``(False, [])``.
    """
    music_terms = ("music", "song", "singing", "instrument")

    def mentions_music(lowered_label):
        return any(term in lowered_label for term in music_terms)

    try:
        module_names = globals()

        # Preferred path: the high-level pipeline works straight from the file.
        if 'music_detector' in module_names:
            results = music_detector(audio_data["path"])
            matching_scores = [
                entry["score"]
                for entry in results
                if mentions_music(entry["label"].lower())
            ]
            music_confidence = max([0.0] + matching_scores)
            return music_confidence >= 0.5, results

        # Fallback path: run the feature extractor and model by hand.
        if 'music_processor' in module_names and 'music_model' in module_names:
            encoded = music_processor(
                audio_data["waveform"],
                sampling_rate=audio_data["sample_rate"],
                return_tensors="pt"
            )

            with torch.no_grad():
                logits = music_model(**encoded).logits
                probabilities = logits.softmax(dim=-1)

            top_scores, top_ids = torch.topk(probabilities, 5)
            id_to_label = music_model.config.id2label

            # Labels are lower-cased in the reported results on this path.
            results = [
                {"label": id_to_label[class_id.item()].lower(), "score": score.item()}
                for score, class_id in zip(top_scores[0], top_ids[0])
            ]
            matching_scores = [
                entry["score"] for entry in results if mentions_music(entry["label"])
            ]
            music_confidence = max([0.0] + matching_scores)
            return music_confidence >= 0.5, results

        raise ValueError("No music detection model available")

    except Exception as err:
        print(f"Error in music detection: {err!s}")
        return False, []
|
|
|
def process_audio(audio_file):
    """Run the full analysis flow for one uploaded audio file.

    Steps: extract audio features, check that the audio is music, classify
    the genre, analyze emotion, and generate lyrics for the top genre.

    Args:
        audio_file: Path of the uploaded audio, or None when nothing was
            uploaded.

    Returns:
        A 3-tuple ``(genre_results, lyrics, ast_results)``:

        * ``genre_results`` - formatted genre text on success, otherwise a
          user-facing message (upload prompt, "not music" notice or an
          error message starting with "Error").
        * ``lyrics`` - generated lyrics, an error message if lyrics
          generation failed, or None when the flow stopped earlier.
        * ``ast_results`` - results returned by ``detect_music``; None when
          no file was given and ``[]`` when processing failed early.
    """
    if audio_file is None:
        return "Please upload an audio file.", None, None

    try:
        audio_data = extract_audio_features(audio_file)

        # Step 1: refuse non-music audio before doing any expensive work.
        try:
            is_music, ast_results = detect_music(audio_data)
        except Exception as e:
            print(f"Error in music detection: {str(e)}")
            return f"Error in music detection: {str(e)}", None, []

        if not is_music:
            return "The uploaded audio does not appear to be music. Please upload a music file.", None, ast_results

        # Step 2: genre classification and its display text.
        try:
            top_genres = classify_genre(audio_data)
            genre_results = format_genre_results(top_genres)
        except Exception as e:
            print(f"Error in genre classification: {str(e)}")
            return f"Error in genre classification: {str(e)}", None, ast_results

        # Step 3: emotion analysis. A failure here is not fatal; lyrics are
        # still generated from placeholder values.
        try:
            emotion_results = music_analyzer.analyze_music(audio_file)
        except Exception as e:
            print(f"Error in emotion analysis: {str(e)}")
            # The placeholder must contain every section generate_lyrics()
            # reads (emotion/theme/rhythm/tonal analysis), not only the
            # summary; otherwise lyrics generation fails with a KeyError.
            emotion_results = {
                "summary": {
                    "tempo": 0,
                    "key": "Unknown",
                    "mode": "",
                    "primary_emotion": "Unknown",
                    "primary_theme": "Unknown",
                },
                "emotion_analysis": {"primary_emotion": "Unknown"},
                "theme_analysis": {"primary_theme": "Unknown"},
                "rhythm_analysis": {"tempo": 0},
                "tonal_analysis": {"key": "Unknown", "mode": ""},
            }

        # Step 4: lyrics for the most likely genre. Errors are reported in
        # place of the lyrics so the other results are still shown.
        try:
            primary_genre, _ = top_genres[0]
            lyrics = generate_lyrics(primary_genre, audio_data["duration"], emotion_results)
        except Exception as e:
            print(f"Error generating lyrics: {str(e)}")
            lyrics = f"Error generating lyrics: {str(e)}"

        return genre_results, lyrics, ast_results

    except Exception as e:
        error_msg = f"Error processing audio: {str(e)}"
        print(error_msg)
        return error_msg, None, []
|
|
|
|
|
# Gradio UI: one upload column, one results column, and a short explanation.
with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
    gr.Markdown("# Music Genre Classifier & Lyrics Generator")
    gr.Markdown("Upload a music file to classify its genre, analyze its emotions, and generate matching lyrics.")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Upload Music", type="filepath")
            submit_btn = gr.Button("Analyze & Generate")

        with gr.Column():
            genre_output = gr.Textbox(label="Detected Genres", lines=5)
            emotion_output = gr.Textbox(label="Emotion Analysis", lines=5)
            ast_output = gr.Textbox(label="Audio Classification Results (AST)", lines=5)
            lyrics_output = gr.Textbox(label="Generated Lyrics", lines=15)

    def display_results(audio_file):
        """Run the analysis and turn its results into the four textbox values.

        Returns a 4-tuple in the order of the click handler's outputs:
        genre text, emotion text, AST classification text, lyrics.
        """
        if audio_file is None:
            return "Please upload an audio file.", "No emotion analysis available.", "No audio classification available.", None

        try:
            genre_results, lyrics, ast_results = process_audio(audio_file)

            # process_audio reports failures as a message starting with "Error".
            failed = isinstance(genre_results, str) and genre_results.startswith("Error")
            if failed:
                return genre_results, "Error in emotion analysis", "Error in audio classification", None

            # Emotion summary shown to the user (the analyzer is run again here).
            try:
                summary = music_analyzer.analyze_music(audio_file)['summary']
                emotion_text = "\n".join([
                    f"Tempo: {summary['tempo']:.1f} BPM",
                    f"Key: {summary['key']} {summary['mode']}",
                    f"Primary Emotion: {summary['primary_emotion']}",
                    f"Primary Theme: {summary['primary_theme']}",
                ])
            except Exception as err:
                print(f"Error in emotion analysis: {err!s}")
                emotion_text = f"Error in emotion analysis: {err!s}"

            # Show at most the first five AST label/score pairs as percentages.
            if ast_results and isinstance(ast_results, list):
                ast_text = "Audio Classification Results (AST Model):\n" + "".join(
                    f"{entry['label']}: {entry['score']*100:.2f}%\n"
                    for entry in ast_results[:5]
                )
            else:
                ast_text = "No valid audio classification results available."

            return genre_results, emotion_text, ast_text, lyrics
        except Exception as err:
            error_msg = f"Error: {err!s}"
            print(error_msg)
            return error_msg, "Error in emotion analysis", "Error in audio classification", None

    submit_btn.click(
        fn=display_results,
        inputs=[audio_input],
        outputs=[genre_output, emotion_output, ast_output, lyrics_output]
    )

    gr.Markdown("### How it works")
    gr.Markdown("""
    1. Upload an audio file of your choice
    2. The system will classify the genre using the dima806/music_genres_classification model
    3. The system will analyze the musical emotion and theme using advanced audio processing
    4. Based on the detected genre and emotion, it will generate appropriate lyrics using Llama-3.1-8B-Instruct
    5. The lyrics length is automatically adjusted based on your audio duration
    """)

# Start the web app.
demo.launch()
|
|