import zipfile
import json

import pandas as pd
import requests
import torch
from datasets import Dataset
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader

from .config import TAG_NAMES, DEVICE, SPACE_URL

# Lazily initialized by _evaluate_local to avoid slow startup.
local_model = None
local_tokenizer = None


def load_data(test_data_path):
    """Load problem records from a zip archive of JSON files into a DataFrame."""
    zip_file = zipfile.ZipFile(test_data_path)
    names = zip_file.namelist()

    features = ["prob_desc_description", "prob_desc_input_spec", "prob_desc_output_spec"]
    cols = features + ["tags"]

    data = []
    # names[0] is the archive's root entry, so it is skipped; every other
    # entry is one JSON problem description.
    for name in names[1:]:
        with zip_file.open(name) as f:
            d = json.loads(f.read())
        data.append([d[c] for c in cols])

    return pd.DataFrame(data, columns=cols)


def preprocessing(df):
    """Filter tags to the supported set and one-hot encode them."""
    tags_to_encode = ['math', 'graphs', 'strings', 'number theory',
                      'trees', 'geometry', 'games', 'probabilities']
    # Pin the class order to TAG_NAMES (the eight tags above plus the
    # 'other' fallback) so every expected label column exists even when
    # some tag never occurs in this batch.
    mlb = MultiLabelBinarizer(classes=TAG_NAMES)

    # Keep only supported tags; problems left with no tags fall back to 'other'.
    df['tags_filtered'] = [[tag for tag in tags if tag in tags_to_encode]
                           for tags in df["tags"]]
    empty = df['tags_filtered'].apply(len) == 0
    df.loc[empty, 'tags_filtered'] = df.loc[empty, 'tags_filtered'].apply(lambda x: ['other'])

    # One-hot encode and attach the label columns.
    encoded_tags = mlb.fit_transform(df['tags_filtered'])
    encoded_df = pd.DataFrame(encoded_tags, columns=mlb.classes_)
    df = pd.concat([df, encoded_df], axis=1)

    texts = df["prob_desc_description"].values.tolist()
    labels = df[TAG_NAMES].values.tolist()

    # Resulting shape:
    #   texts  = ["text1", "text2", ...]              # list of texts
    #   labels = [[0, 1, 0, 0, 1, 0, 1, 1, 0], ...]   # one 0/1 vector per text
    return pd.DataFrame({'text': texts, 'labels': labels})
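
# Minimal sanity-check sketch for preprocessing() (illustrative only, not
# part of the evaluation pipeline). The sample rows are made up, and it
# assumes TAG_NAMES holds the eight encoded tags plus 'other'.
def _demo_preprocessing():
    sample = pd.DataFrame({
        "prob_desc_description": ["Count the divisors of n.", "Reverse the string s."],
        "prob_desc_input_spec": ["An integer n.", "A string s."],
        "prob_desc_output_spec": ["One integer.", "One string."],
        # 'dp' and 'implementation' are outside tags_to_encode and get dropped;
        # a row left with no supported tag would fall back to ['other'].
        "tags": [["math", "number theory", "dp"], ["strings", "implementation"]],
    })
    out = preprocessing(sample)
    # Each labels entry is a 0/1 vector aligned with TAG_NAMES.
    print(out["labels"].tolist())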
def evaluate_batch(file_path, hf_repo, backend="local", hf_token=None):
    """Evaluate either locally or through the hosted HF Space API."""
    if backend == "local":
        return _evaluate_local(file_path, hf_repo)
    elif backend == "hf":
        return _evaluate_hf_api(file_path, hf_token)
    else:
        raise ValueError(f"Unknown backend: {backend}")


def _evaluate_local(test_data_path, hf_repo):
    global local_model, local_tokenizer

    # Lazy-load the model and tokenizer to avoid slow startup.
    if local_model is None:
        from .model import QwenClassifier
        from transformers import AutoTokenizer
        local_model = QwenClassifier.from_pretrained(hf_repo).eval()
        local_tokenizer = AutoTokenizer.from_pretrained(hf_repo)

    df = load_data(test_data_path)
    df = preprocessing(df)
    hf_dataset = Dataset.from_pandas(df)

    def tokenize_function(examples):
        return local_tokenizer(examples["text"], padding="max_length",
                               truncation=True, max_length=512)

    dataset = hf_dataset.map(tokenize_function, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    # No shuffling: ordering has no effect on the metrics.
    dataloader = DataLoader(dataset, batch_size=8, shuffle=False)

    local_model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            labels = batch["labels"].type(torch.float32)
            logits = local_model(batch["input_ids"], batch["attention_mask"])
            # Multi-label prediction: an independent sigmoid per tag,
            # thresholded at 0.5.
            preds = torch.sigmoid(logits).cpu().numpy() > 0.5
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # Aggregate and per-class metrics, keyed by name.
    metrics = {
        "accuracy": accuracy_score(all_labels, all_preds),
        "precision_macro": precision_score(all_labels, all_preds, average='macro', zero_division=0),
        "recall_macro": recall_score(all_labels, all_preds, average='macro', zero_division=0),
        "f1_macro": f1_score(all_labels, all_preds, average='macro', zero_division=0),
        "precision_per_class": precision_score(all_labels, all_preds, average=None, zero_division=0),
        "recall_per_class": recall_score(all_labels, all_preds, average=None, zero_division=0),
        "f1_per_class": f1_score(all_labels, all_preds, average=None, zero_division=0),
    }
    report = classification_report(all_labels, all_preds,
                                   target_names=TAG_NAMES, zero_division=0)
    return metrics, report


def _evaluate_hf_api(file_path, hf_token=None):
    headers = {"Content-Type": "application/json"}
    if hf_token:
        headers["Authorization"] = f"Bearer {hf_token}"
    try:
        response = requests.post(
            f"{SPACE_URL}/evaluate",
            json={"file_path": file_path},  # matches the server's Pydantic model
            headers=headers,
            timeout=10,
        )
        response.raise_for_status()  # surface HTTP errors
        return response.json()
    except requests.exceptions.RequestException as e:
        body = e.response.text if getattr(e, 'response', None) is not None else ''
        raise ValueError(f"API Error: {e}\nResponse: {body}") from e
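
# Example invocation (a sketch: the zip path and repo id below are
# placeholders, not artifacts shipped with this project). Run with
# `python -m <package>.<this_module>` so the relative imports resolve.
if __name__ == "__main__":
    metrics, report = evaluate_batch(
        "data/test_problems.zip",             # zip of JSON problem files
        "your-username/qwen-tag-classifier",  # HF repo with model + tokenizer
        backend="local",
    )
    print(metrics)
    print(report)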