import torch
import numpy as np
import librosa


def load_audio(audio_file, sr=22050):
    """Load an audio file and convert to mono if needed."""
    try:
        y, sr = librosa.load(audio_file, sr=sr, mono=True)
        return y, sr
    except Exception as e:
        print(f"Error loading audio with librosa: {str(e)}")
        # Fall back to soundfile, which reads some formats librosa cannot.
        import soundfile as sf
        try:
            # NOTE: soundfile returns the file's native sample rate; unlike
            # librosa.load, it does not resample to the requested sr.
            y, sr = sf.read(audio_file)
            # Average the channels to mono if the file is multi-channel.
            if len(y.shape) > 1:
                y = y.mean(axis=1)
            return y, sr
        except Exception as e2:
            print(f"Error loading audio with soundfile: {str(e2)}")
            raise ValueError(f"Could not load audio file: {audio_file}")


def extract_audio_duration(y, sr):
    """Get the duration of the audio in seconds."""
    return len(y) / sr


def extract_mfcc_features(y, sr, n_mfcc=20):
    """Extract MFCC features from audio."""
    try:
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        # Average across time frames to get one fixed-length feature vector.
        mfccs_mean = np.mean(mfccs.T, axis=0)
        return mfccs_mean
    except Exception as e:
        print(f"Error extracting MFCCs: {str(e)}")
        # Return a zero vector of the expected shape so callers can proceed.
        return np.zeros(n_mfcc)


def calculate_lyrics_length(duration, tempo=100, time_signature=4):
    """Derive a lyrics structure (verse/chorus/bridge line counts and
    measure allocations) from duration, tempo, and time signature."""
    # Rough heuristic: one line per ~10 seconds, with a floor of 4 lines.
    lines_count = max(4, int(duration / 10))

    # NOTE: with an unusable tempo or time signature this falls back to the
    # plain line count (an int), not the structure dict returned below.
    if not isinstance(tempo, (int, float)) or not isinstance(time_signature, (int, float)):
        return lines_count

    # Convert tempo and duration into a total measure count.
    beats_per_second = tempo / 60
    total_beats = duration * beats_per_second
    total_measures = total_beats / time_signature

    # Allocate lines to sections; longer clips earn a bridge.
    if lines_count <= 6:
        verse_lines, chorus_lines, bridge_lines = 2, 2, 0
    elif lines_count <= 10:
        verse_lines, chorus_lines, bridge_lines = 3, 2, 0
    else:
        verse_lines, chorus_lines, bridge_lines = 3, 2, 2

    # Verse gets ~40% of the measures, chorus ~30%, bridge ~20%; the
    # remainder is left unassigned.
    song_structure = {
        "total_measures": int(total_measures),
        "lines_count": lines_count,
        "sections": [
            {"type": "verse", "lines": verse_lines, "measures": int(total_measures * 0.4)},
            {"type": "chorus", "lines": chorus_lines, "measures": int(total_measures * 0.3)},
        ],
    }

    if bridge_lines > 0:
        song_structure["sections"].append(
            {"type": "bridge", "lines": bridge_lines, "measures": int(total_measures * 0.2)}
        )

    return song_structure
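
# Worked example for calculate_lyrics_length (hypothetical numbers): a 180 s
# clip at tempo=100 in 4/4 gives 100/60 ≈ 1.67 beats/s, 180 * (100/60) = 300
# beats, and 300 / 4 = 75 measures. lines_count = max(4, int(180 / 10)) = 18,
# so the structure gets verse=3, chorus=2, and bridge=2 lines.
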
def format_genre_results(top_genres):
    """Format genre classification results for display."""
    result = "Top Detected Genres:\n"
    for genre, confidence in top_genres:
        result += f"- {genre}: {confidence * 100:.2f}%\n"
    return result


def ensure_cuda_availability():
    """Check and report CUDA availability for informational purposes."""
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        device_count = torch.cuda.device_count()
        device_name = torch.cuda.get_device_name(0) if device_count > 0 else "Unknown"
        print(f"CUDA is available with {device_count} device(s). Using: {device_name}")
    else:
        print("CUDA is not available. Using CPU for inference.")
    return cuda_available


def preprocess_audio_for_model(waveform, sample_rate, target_sample_rate=16000, max_length=16000):
    """Preprocess audio for model input (resample, then pad or trim)."""
    # Resample to the rate the model expects.
    if sample_rate != target_sample_rate:
        waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=target_sample_rate)

    # Trim or zero-pad to a fixed length (16000 samples = 1 s at 16 kHz).
    if len(waveform) > max_length:
        waveform = waveform[:max_length]
    elif len(waveform) < max_length:
        padding = max_length - len(waveform)
        waveform = np.pad(waveform, (0, padding), 'constant')

    return waveform
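

# Minimal usage sketch tying the helpers together. The file name
# "example.wav" and tempo=100 are hypothetical placeholders, not values
# taken from this module.
if __name__ == "__main__":
    ensure_cuda_availability()
    y, sr = load_audio("example.wav")  # hypothetical input file
    duration = extract_audio_duration(y, sr)
    mfccs = extract_mfcc_features(y, sr)
    structure = calculate_lyrics_length(duration, tempo=100, time_signature=4)
    model_input = preprocess_audio_for_model(y, sr)
    print(f"Duration: {duration:.2f} s, MFCC vector shape: {mfccs.shape}")
    print(f"Suggested structure: {structure}")
    print(f"Model input length: {len(model_input)} samples")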