import torch
import numpy as np
import librosa


def load_audio(audio_file, sr=22050):
    """Load an audio file and convert to mono if needed.

    Tries librosa first (which resamples to ``sr``); if that fails,
    falls back to soundfile with a manual stereo->mono mixdown. Note
    the fallback returns the file's native sample rate, not ``sr``.

    Args:
        audio_file: Path to the audio file.
        sr: Target sample rate for the librosa path.

    Returns:
        Tuple of (1-D waveform array, sample rate).

    Raises:
        ValueError: If neither backend can read the file.
    """
    try:
        # Try to load audio with librosa
        y, sr = librosa.load(audio_file, sr=sr, mono=True)
        return y, sr
    except Exception as e:
        print(f"Error loading audio with librosa: {str(e)}")
        # Fallback to basic loading if necessary; imported lazily so the
        # module works without soundfile when librosa succeeds.
        import soundfile as sf
        try:
            y, sr = sf.read(audio_file)
            # Convert to mono if stereo
            if len(y.shape) > 1:
                y = y.mean(axis=1)
            return y, sr
        except Exception as e2:
            print(f"Error loading audio with soundfile: {str(e2)}")
            # Chain the underlying error so the root cause is visible.
            raise ValueError(f"Could not load audio file: {audio_file}") from e2


def extract_audio_duration(y, sr):
    """Get the duration of audio in seconds.

    Args:
        y: 1-D waveform array.
        sr: Sample rate in Hz (must be non-zero).

    Returns:
        Duration in seconds as a float.
    """
    return len(y) / sr


def extract_mfcc_features(y, sr, n_mfcc=20):
    """Extract MFCC features from audio, averaged over time.

    Args:
        y: 1-D waveform array.
        sr: Sample rate in Hz.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        1-D array of length ``n_mfcc`` (time-averaged MFCCs), or a
        zero vector of the same length if extraction fails.
    """
    try:
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        # Mean over the time axis -> one value per coefficient.
        mfccs_mean = np.mean(mfccs.T, axis=0)
        return mfccs_mean
    except Exception as e:
        print(f"Error extracting MFCCs: {str(e)}")
        # Return a fallback feature vector if extraction fails
        return np.zeros(n_mfcc)


def calculate_lyrics_length(duration, tempo=100, time_signature=4):
    """Calculate appropriate lyrics structure based on musical principles.

    Args:
        duration: Audio duration in seconds.
        tempo: Tempo in beats per minute.
        time_signature: Beats per measure.

    Returns:
        If ``tempo`` or ``time_signature`` is not numeric (legacy call
        pattern), just the line count as an int. Otherwise a dict with
        ``total_measures``, ``lines_count`` and a ``sections`` list of
        verse/chorus (and optionally bridge) entries.
    """
    # Legacy behavior - simple calculation based on duration
    lines_count = max(4, int(duration / 10))

    # If only duration was provided (original usage), return just the integer
    if not isinstance(tempo, (int, float)) or not isinstance(time_signature, (int, float)):
        return lines_count

    # Enhanced calculation: convert duration to musical measures.
    beats_per_minute = tempo
    beats_per_second = beats_per_minute / 60
    total_beats = duration * beats_per_second
    total_measures = total_beats / time_signature

    # Determine section distributions
    verse_lines = 0
    chorus_lines = 0
    bridge_lines = 0
    if lines_count <= 6:
        verse_lines = 2
        chorus_lines = 2
    elif lines_count <= 10:
        verse_lines = 3
        chorus_lines = 2
    else:
        verse_lines = 3
        chorus_lines = 2
        bridge_lines = 2

    # Create structured output. NOTE(review): the measure fractions
    # (0.4 + 0.3 [+ 0.2]) intentionally do not sum to 1.0 — presumably
    # the remainder is left for intro/outro; confirm with callers.
    song_structure = {
        "total_measures": int(total_measures),
        "lines_count": lines_count,  # Include the original line count
        "sections": [
            {"type": "verse", "lines": verse_lines, "measures": int(total_measures * 0.4)},
            {"type": "chorus", "lines": chorus_lines, "measures": int(total_measures * 0.3)}
        ]
    }
    if bridge_lines > 0:
        song_structure["sections"].append(
            {"type": "bridge", "lines": bridge_lines, "measures": int(total_measures * 0.2)}
        )
    return song_structure


def format_genre_results(top_genres):
    """Format genre classification results for display.

    Args:
        top_genres: Iterable of (genre, confidence) pairs where
            confidence is a fraction in [0, 1].

    Returns:
        Multi-line display string, one genre per line.
    """
    result = "Top Detected Genres:\n"
    for genre, confidence in top_genres:
        result += f"- {genre}: {confidence*100:.2f}%\n"
    return result


def ensure_cuda_availability():
    """Check and report CUDA availability for informational purposes.

    Returns:
        True if CUDA is available, False otherwise.
    """
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        device_count = torch.cuda.device_count()
        device_name = torch.cuda.get_device_name(0) if device_count > 0 else "Unknown"
        print(f"CUDA is available with {device_count} device(s). Using: {device_name}")
    else:
        print("CUDA is not available. Using CPU for inference.")
    return cuda_available


def preprocess_audio_for_model(waveform, sample_rate, target_sample_rate=16000, max_length=16000):
    """Preprocess audio for model input (resample, pad/trim).

    Args:
        waveform: 1-D waveform array.
        sample_rate: Sample rate of ``waveform`` in Hz.
        target_sample_rate: Sample rate expected by the model.
        max_length: Fixed output length in samples.

    Returns:
        Waveform resampled to ``target_sample_rate`` and exactly
        ``max_length`` samples long (truncated or zero-padded).
    """
    # Resample if needed
    if sample_rate != target_sample_rate:
        waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=target_sample_rate)

    # Trim or pad to expected length
    if len(waveform) > max_length:
        waveform = waveform[:max_length]
    elif len(waveform) < max_length:
        padding = max_length - len(waveform)
        waveform = np.pad(waveform, (0, padding), 'constant')

    return waveform