import zipfile
import json

import pandas as pd
import requests
import torch
from datasets import Dataset
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader

from .config import TAG_NAMES, DEVICE, SPACE_URL

# Lazily initialized by _evaluate_local to avoid slow startup.
local_model = None
local_tokenizer = None


def load_data(test_data_path):
    """Load problem records from a zip archive of JSON files into a DataFrame."""
    zip_file = zipfile.ZipFile(test_data_path)
    names = zip_file.namelist()

    features = ["prob_desc_description", "prob_desc_input_spec", "prob_desc_output_spec"]
    cols = features + ["tags"]

    data = []
    # names[0] is the archive's root entry, so it is skipped; every other
    # entry is one JSON problem description.
    for name in names[1:]:
        with zip_file.open(name) as f:
            d = json.loads(f.read())
        data.append([d[c] for c in cols])

    return pd.DataFrame(data, columns=cols)


def preprocessing(df):
    """Filter tags to the supported set and one-hot encode them."""
    tags_to_encode = ['math', 'graphs', 'strings', 'number theory',
                      'trees', 'geometry', 'games', 'probabilities']
    # Pin the class order to TAG_NAMES (the eight tags above plus the
    # 'other' fallback) so every expected label column exists even when
    # some tag never occurs in this batch.
    mlb = MultiLabelBinarizer(classes=TAG_NAMES)

    # Keep only supported tags; problems left with no tags fall back to 'other'.
    df['tags_filtered'] = [[tag for tag in tags if tag in tags_to_encode]
                           for tags in df["tags"]]
    empty = df['tags_filtered'].apply(len) == 0
    df.loc[empty, 'tags_filtered'] = df.loc[empty, 'tags_filtered'].apply(lambda x: ['other'])

    # One-hot encode and attach the label columns.
    encoded_tags = mlb.fit_transform(df['tags_filtered'])
    encoded_df = pd.DataFrame(encoded_tags, columns=mlb.classes_)
    df = pd.concat([df, encoded_df], axis=1)

    texts = df["prob_desc_description"].values.tolist()
    labels = df[TAG_NAMES].values.tolist()

    # Resulting shape:
    #   texts  = ["text1", "text2", ...]              # list of texts
    #   labels = [[0, 1, 0, 0, 1, 0, 1, 1, 0], ...]   # one 0/1 vector per text
    return pd.DataFrame({'text': texts, 'labels': labels})
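
# Minimal sanity-check sketch for preprocessing() (illustrative only, not
# part of the evaluation pipeline). The sample rows are made up, and it
# assumes TAG_NAMES holds the eight encoded tags plus 'other'.
def _demo_preprocessing():
    sample = pd.DataFrame({
        "prob_desc_description": ["Count the divisors of n.", "Reverse the string s."],
        "prob_desc_input_spec": ["An integer n.", "A string s."],
        "prob_desc_output_spec": ["One integer.", "One string."],
        # 'dp' and 'implementation' are outside tags_to_encode and get dropped;
        # a row left with no supported tag would fall back to ['other'].
        "tags": [["math", "number theory", "dp"], ["strings", "implementation"]],
    })
    out = preprocessing(sample)
    # Each labels entry is a 0/1 vector aligned with TAG_NAMES.
    print(out["labels"].tolist())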
def evaluate_batch(file_path, hf_repo, backend="local", hf_token=None):
    """Evaluate either locally or through the hosted HF Space API."""
    if backend == "local":
        return _evaluate_local(file_path, hf_repo)
    elif backend == "hf":
        return _evaluate_hf_api(file_path, hf_token)
    else:
        raise ValueError(f"Unknown backend: {backend}")


def _evaluate_local(test_data_path, hf_repo):
    global local_model, local_tokenizer

    # Lazy-load the model and tokenizer to avoid slow startup.
    if local_model is None:
        from .model import QwenClassifier
        from transformers import AutoTokenizer
        local_model = QwenClassifier.from_pretrained(hf_repo).eval()
        local_tokenizer = AutoTokenizer.from_pretrained(hf_repo)

    df = load_data(test_data_path)
    df = preprocessing(df)
    hf_dataset = Dataset.from_pandas(df)

    def tokenize_function(examples):
        return local_tokenizer(examples["text"], padding="max_length",
                               truncation=True, max_length=512)

    dataset = hf_dataset.map(tokenize_function, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    # No shuffling: ordering has no effect on the metrics.
    dataloader = DataLoader(dataset, batch_size=8, shuffle=False)

    local_model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            labels = batch["labels"].type(torch.float32)
            logits = local_model(batch["input_ids"], batch["attention_mask"])
            # Multi-label prediction: an independent sigmoid per tag,
            # thresholded at 0.5.
            preds = torch.sigmoid(logits).cpu().numpy() > 0.5
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # Aggregate and per-class metrics, keyed by name.
    metrics = {
        "accuracy": accuracy_score(all_labels, all_preds),
        "precision_macro": precision_score(all_labels, all_preds, average='macro', zero_division=0),
        "recall_macro": recall_score(all_labels, all_preds, average='macro', zero_division=0),
        "f1_macro": f1_score(all_labels, all_preds, average='macro', zero_division=0),
        "precision_per_class": precision_score(all_labels, all_preds, average=None, zero_division=0),
        "recall_per_class": recall_score(all_labels, all_preds, average=None, zero_division=0),
        "f1_per_class": f1_score(all_labels, all_preds, average=None, zero_division=0),
    }
    report = classification_report(all_labels, all_preds,
                                   target_names=TAG_NAMES, zero_division=0)
    return metrics, report


def _evaluate_hf_api(file_path, hf_token=None):
    headers = {"Content-Type": "application/json"}
    if hf_token:
        headers["Authorization"] = f"Bearer {hf_token}"
    try:
        response = requests.post(
            f"{SPACE_URL}/evaluate",
            json={"file_path": file_path},  # matches the server's Pydantic model
            headers=headers,
            timeout=10,
        )
        response.raise_for_status()  # surface HTTP errors
        return response.json()
    except requests.exceptions.RequestException as e:
        body = e.response.text if getattr(e, 'response', None) is not None else ''
        raise ValueError(f"API Error: {e}\nResponse: {body}") from e
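
# Example invocation (a sketch: the zip path and repo id below are
# placeholders, not artifacts shipped with this project). Run with
# `python -m <package>.<this_module>` so the relative imports resolve.
if __name__ == "__main__":
    metrics, report = evaluate_batch(
        "data/test_problems.zip",             # zip of JSON problem files
        "your-username/qwen-tag-classifier",  # HF repo with model + tokenizer
        backend="local",
    )
    print(metrics)
    print(report)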