import gradio as gr
import numpy as np
import librosa
import joblib
import json
import tensorflow as tf
from huggingface_hub import hf_hub_download

# Location of the trained artifacts on the Hugging Face Hub.
REPO_ID = "hriteshMaikap/languageClassifier"
MODEL_FILENAME = "indic_language_classifier_mtm.keras"
SCALER_FILENAME = "audio_feature_scaler_mtm.pkl"
CONFIG_FILENAME = "config_mtm.json"

# Fetch each artifact; the returned values are used below as local file paths.
model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME)
scaler_path = hf_hub_download(repo_id=REPO_ID, filename=SCALER_FILENAME)
config_path = hf_hub_download(repo_id=REPO_ID, filename=CONFIG_FILENAME)

# Pin the encoding: without it open() falls back to the platform locale,
# which can mis-decode non-ASCII text (e.g. class labels) in the JSON file.
with open(config_path, "r", encoding="utf-8") as f:
    config = json.load(f)

# Feature-extraction settings stored alongside the model.
n_mfcc = config["n_mfcc"]
max_pad_len = config["max_pad_len"]
feature_type = config["feature_type"]
class_labels = config["class_labels"]
# Sample rate audio is resampled to at inference time; it is hard-coded here
# rather than read from the config, so it must match the rate used in training.
sr = 22050

# NOTE(security): joblib.load unpickles the file, and unpickling can execute
# arbitrary code. Only point REPO_ID at a repository you trust.
scaler = joblib.load(scaler_path)
model = tf.keras.models.load_model(model_path)

def extract_features(file_path, n_mfcc, max_pad_len, feature_type, sr):
    """Load an audio file and turn it into a fixed-length MFCC feature matrix.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients to compute.
        max_pad_len: Exact length of the second axis of the feature matrix
            before it is transposed; longer inputs are cut, shorter ones are
            zero-padded at the end.
        feature_type: ``'mfcc_delta'`` stacks the MFCCs with their first- and
            second-order deltas; any other value yields the plain MFCCs.
        sr: Sample rate the audio is resampled to and the MFCCs are computed at.

    Returns:
        The transposed feature matrix, whose first axis has length
        ``max_pad_len``.
    """
    signal, _ = librosa.load(file_path, sr=sr, mono=True, res_type='kaiser_fast')
    base = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)

    if feature_type == 'mfcc_delta':
        first_order = librosa.feature.delta(base)
        second_order = librosa.feature.delta(base, order=2)
        stacked = np.concatenate((base, first_order, second_order), axis=0)
    else:
        # 'mfcc' and every unrecognised feature type use the raw coefficients.
        stacked = base

    # Positive: columns still needed; negative: columns to drop.
    shortfall = max_pad_len - stacked.shape[1]
    if shortfall < 0:
        stacked = stacked[:, :max_pad_len]
    elif shortfall > 0:
        stacked = np.pad(stacked, pad_width=((0, 0), (0, shortfall)), mode='constant')

    return stacked.T

def predict_language(audio_filepath):
    """Classify the language of an audio clip and report it as Markdown.

    Args:
        audio_filepath: Path to the audio file to classify. May be ``None``
            or empty when no audio was supplied.

    Returns:
        A Markdown string with the predicted label, its confidence and a
        bullet list of per-class probabilities. If no audio was supplied or
        processing fails, a string starting with ``"Error processing audio:"``
        is returned instead; no exception propagates to the caller.
    """
    if not audio_filepath:
        return "Error processing audio: no audio provided. Please upload or record a clip."

    try:
        features = extract_features(audio_filepath, n_mfcc, max_pad_len, feature_type, sr)
        # Scale the 2-D feature matrix first, then add a leading axis so the
        # model receives a batch containing this single clip.
        batch = scaler.transform(features)[np.newaxis, :, :]
        # verbose=0 keeps a progress bar out of the server log on every request.
        pred_probs = model.predict(batch, verbose=0)
        clip_probs = pred_probs[0]
        pred_idx = int(np.argmax(clip_probs))
        pred_lang = class_labels[pred_idx]
        confidence = float(clip_probs[pred_idx])

        # Markdown collapses single newlines into one line, so separate the
        # sections with blank lines and render the probabilities as a list.
        prob_lines = "\n".join(
            f"- {label}: {float(prob):.3f}"
            for label, prob in zip(class_labels, clip_probs)
        )
        return (
            f"**Prediction:** {pred_lang}\n\n"
            f"**Confidence:** {confidence:.2%}\n\n"
            f"**Class Probabilities:**\n\n{prob_lines}"
        )
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        return f"Error processing audio: {e}"

# Input: an audio widget that passes the clip to the callback as a file path.
_audio_input = gr.Audio(type="filepath", label="Upload or record audio (.wav or .mp3)")
# Output: the Markdown report returned by predict_language.
_report_output = gr.Markdown()

# Wire the classifier into a single-function web interface.
demo = gr.Interface(
    fn=predict_language,
    inputs=_audio_input,
    outputs=_report_output,
    title="Indic Language Classifier (Marathi, Telugu, Malayalam)",
    description="Record or upload an audio sample. The model predicts the language (Marathi, Telugu, or Malayalam).",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(show_error=True)