# Audio utility helpers: loading, duration/MFCC feature extraction,
# lyrics-length heuristics, CUDA reporting, and model preprocessing.
import torch
import numpy as np
import librosa
def load_audio(audio_file, sr=22050):
    """Load an audio file as a mono waveform.

    Parameters
    ----------
    audio_file : str or path-like
        Path to the audio file.
    sr : int, optional
        Target sample rate for librosa (default 22050).

    Returns
    -------
    tuple
        ``(waveform, sample_rate)`` where waveform is a 1-D array.

    Raises
    ------
    ValueError
        If neither librosa nor the soundfile fallback can read the file.
    """
    try:
        # Primary path: librosa resamples to `sr` and downmixes to mono.
        y, sr = librosa.load(audio_file, sr=sr, mono=True)
        return y, sr
    except Exception as e:
        print(f"Error loading audio with librosa: {str(e)}")
        # Fallback to basic loading if necessary.
        # NOTE(review): soundfile reads at the file's native rate, so the
        # returned sample rate may differ from the requested `sr` here.
        import soundfile as sf
        try:
            y, sr = sf.read(audio_file)
            # Average channels to mono if the file is stereo/multichannel.
            if len(y.shape) > 1:
                y = y.mean(axis=1)
            return y, sr
        except Exception as e2:
            print(f"Error loading audio with soundfile: {str(e2)}")
            # Chain the underlying cause so debugging keeps the full context.
            raise ValueError(f"Could not load audio file: {audio_file}") from e2
def extract_audio_duration(y, sr):
    """Return the duration of waveform ``y`` in seconds (samples / rate)."""
    n_samples = len(y)
    return n_samples / sr
def extract_mfcc_features(y, sr, n_mfcc=20):
    """Extract time-averaged MFCC features from audio.

    Returns a length-``n_mfcc`` vector: each MFCC coefficient averaged
    across time frames, or all zeros if extraction fails.
    """
    try:
        coeffs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        # Collapse the time axis so each coefficient becomes one scalar.
        return np.mean(coeffs.T, axis=0)
    except Exception as e:
        print(f"Error extracting MFCCs: {str(e)}")
        # Best-effort fallback: a zero vector keeps downstream shapes valid.
        return np.zeros(n_mfcc)
def calculate_lyrics_length(duration, tempo=100, time_signature=4):
    """Calculate appropriate lyrics structure based on musical principles.

    Parameters
    ----------
    duration : float
        Clip length in seconds.
    tempo : int or float, optional
        Beats per minute; a non-numeric value triggers legacy behavior.
    time_signature : int or float, optional
        Beats per measure; a non-numeric value triggers legacy behavior.

    Returns
    -------
    dict or int
        A song-structure dict, or just the line count (legacy behavior)
        when tempo/time_signature are not numeric.
    """
    # Legacy heuristic: roughly one line per 10 seconds, never fewer than 4.
    lines_count = max(4, int(duration / 10))

    # Non-numeric tempo/time signature signals the original call style:
    # return only the integer line count.
    numeric = (int, float)
    if not (isinstance(tempo, numeric) and isinstance(time_signature, numeric)):
        return lines_count

    # Enhanced calculation: tempo -> beats over the clip -> measures.
    beats_per_second = tempo / 60
    total_beats = duration * beats_per_second
    total_measures = total_beats / time_signature

    # Distribute lines across sections according to overall song length.
    if lines_count <= 6:
        verse_lines, chorus_lines, bridge_lines = 2, 2, 0
    elif lines_count <= 10:
        verse_lines, chorus_lines, bridge_lines = 3, 2, 0
    else:
        verse_lines, chorus_lines, bridge_lines = 3, 2, 2

    # Section measures are fixed fractions (40% verse, 30% chorus, 20% bridge).
    sections = [
        {"type": "verse", "lines": verse_lines, "measures": int(total_measures * 0.4)},
        {"type": "chorus", "lines": chorus_lines, "measures": int(total_measures * 0.3)},
    ]
    if bridge_lines:
        sections.append(
            {"type": "bridge", "lines": bridge_lines, "measures": int(total_measures * 0.2)}
        )

    return {
        "total_measures": int(total_measures),
        "lines_count": lines_count,  # Include the original line count
        "sections": sections,
    }
def format_genre_results(top_genres):
    """Format genre classification results for display.

    Parameters
    ----------
    top_genres : iterable of (str, float)
        Genre labels paired with confidence values in [0, 1].

    Returns
    -------
    str
        A header line followed by one "- genre: NN.NN%" line per entry.
    """
    # Build all lines then join once — linear instead of quadratic +=.
    lines = [f"- {genre}: {confidence*100:.2f}%\n" for genre, confidence in top_genres]
    return "Top Detected Genres:\n" + "".join(lines)
def ensure_cuda_availability():
    """Check and report CUDA availability for informational purposes."""
    available = torch.cuda.is_available()
    if not available:
        print("CUDA is not available. Using CPU for inference.")
        return available
    count = torch.cuda.device_count()
    # Guard against a zero-device report even when CUDA claims availability.
    name = torch.cuda.get_device_name(0) if count > 0 else "Unknown"
    print(f"CUDA is available with {count} device(s). Using: {name}")
    return available
def preprocess_audio_for_model(waveform, sample_rate, target_sample_rate=16000, max_length=16000):
    """Preprocess audio for model input (resample, then pad or trim).

    Returns a waveform of exactly ``max_length`` samples at
    ``target_sample_rate``.
    """
    # Resample only when the source rate differs from the target.
    if sample_rate != target_sample_rate:
        waveform = librosa.resample(
            waveform, orig_sr=sample_rate, target_sr=target_sample_rate
        )

    current = len(waveform)
    if current > max_length:
        # Too long: keep the leading max_length samples.
        return waveform[:max_length]
    if current < max_length:
        # Too short: zero-pad on the right to reach max_length.
        return np.pad(waveform, (0, max_length - current), 'constant')
    return waveform