import torch
import torchaudio
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

MODEL_PATH = "D:/SER MiniProj/wav2vec2_model/"
TARGET_SAMPLE_RATE = 16000  # Model requires 16kHz audio

# Load feature extractor and model
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_PATH)
model = AutoModelForAudioClassification.from_pretrained(MODEL_PATH)
print("Feature extractor and model loaded successfully!")

# Load an audio file
audio_file = "D:/SER MiniProj/temp_audio.wav"
waveform, sample_rate = torchaudio.load(audio_file)

# Convert to mono if needed
if waveform.shape[0] > 1:
    waveform = torch.mean(waveform, dim=0, keepdim=True)

# Resample if the sample rate is not 16kHz
if sample_rate != TARGET_SAMPLE_RATE:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)
    waveform = resampler(waveform)
    sample_rate = TARGET_SAMPLE_RATE  # Update sample rate

# Process the audio for the model
inputs = feature_extractor(waveform.squeeze(0), sampling_rate=sample_rate, return_tensors="pt")

# Perform inference
with torch.no_grad():
    logits = model(**inputs).logits

# Get the predicted emotion
predicted_label = torch.argmax(logits, dim=-1).item()

# Print the output
print(f"Predicted Emotion Class: {predicted_label}")
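
# The script above prints only the raw class index. A minimal optional
# extension, sketched below, maps that index to an emotion name and reports
# a softmax confidence. It assumes the fine-tuned checkpoint's config ships
# with an id2label mapping, as most Hugging Face classification models do;
# the getattr guard skips the name lookup if that assumption does not hold.
probs = torch.softmax(logits, dim=-1)          # convert logits to class probabilities
confidence = probs[0, predicted_label].item()  # probability of the top class
id2label = getattr(model.config, "id2label", None)
if id2label:
    print(f"Predicted Emotion: {id2label[predicted_label]} (confidence: {confidence:.2f})")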