Spaces:
Sleeping
Sleeping
File size: 2,892 Bytes
cc95177 2d33816 256b96b 8d7f108 498ba1c 256b96b cc95177 88242b4 cc95177 88242b4 cc95177 8fc44c2 e6a0876 90d2350 e6a0876 90d2350 3f764f8 2403602 3f764f8 2403602 0010524 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
from pathlib import Path
import io
import json
import math
import statistics
import sys
import time
from datasets import concatenate_datasets, Dataset
from datasets import load_dataset
from huggingface_hub import hf_hub_url
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from evaluate import load
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
# GLUE/MRPC metric (loaded once at import time); compute_metrics below reads
# this module-level object rather than taking the metric as a parameter.
metric = load("glue", "mrpc")
# FastAPI application object; routes and middleware are registered on it below.
app = FastAPI()
def preprocess_function(examples):
    """Tokenize a batch of MRPC sentence pairs for sequence classification.

    Intended for ``Dataset.map(..., batched=True)``: *examples* is a dict of
    column-name -> list containing "sentence1" and "sentence2"; the tokenizer
    output (input ids, attention masks, ...) is merged back as new columns.

    NOTE(review): the tokenizer checkpoint is hard-coded, so it may not match
    the model being evaluated elsewhere in this file — confirm callers only
    evaluate models compatible with the "sgugger/glue-mrpc" vocabulary.
    """
    # Memoize the tokenizer on the function object: the original re-downloaded
    # and re-instantiated it from the hub for every mapped batch.
    tokenizer = getattr(preprocess_function, "_tokenizer", None)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("sgugger/glue-mrpc")
        preprocess_function._tokenizer = tokenizer
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)
def compute_metrics(eval_pred):
    """Score model predictions with the module-level GLUE/MRPC metric.

    Parameters
    ----------
    eval_pred : tuple
        ``(logits, labels)`` pair as supplied by ``Trainer`` during evaluation.

    Returns
    -------
    dict
        The values produced by ``metric.compute`` for MRPC.
    """
    logits, labels = eval_pred
    # Collapse per-class logits to predicted class ids before scoring.
    class_ids = logits.argmax(axis=1)
    return metric.compute(predictions=class_ids, references=labels)
def compute_model_card_evaluation_results(model_checkpoint, raw_datasets, metric, batch_size=16):
    """Evaluate *model_checkpoint* on the validation split of an MRPC-style dataset.

    No training is performed: a ``Trainer`` is built around the pretrained
    checkpoint and ``trainer.evaluate()`` is run on the "validation" split.

    Parameters
    ----------
    model_checkpoint : str
        Hub id or local path of a sequence-classification checkpoint.
    raw_datasets : DatasetDict
        Must contain "train" and "validation" splits with "sentence1" and
        "sentence2" columns.
    metric : object
        Unused inside this function (kept for interface compatibility;
        ``compute_metrics`` reads the module-level ``metric`` instead).
    batch_size : int, optional
        Per-device batch size; defaults to 16, the value previously hard-coded.

    Returns
    -------
    dict
        Evaluation results as returned by ``Trainer.evaluate()``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    # Tokenize with the checkpoint's own tokenizer. The original mapped the
    # module-level preprocess_function, whose tokenizer is hard-coded to
    # "sgugger/glue-mrpc" and could disagree with model_checkpoint.
    def _tokenize(examples):
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

    tokenized_datasets = raw_datasets.map(_tokenize, batched=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    args = TrainingArguments(
        "test-glue",
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        seed=42,
        lr_scheduler_type="linear",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=False,  # metric_for_best_model is inert while this is False
        metric_for_best_model="accuracy",
        report_to="none",  # suppress wandb/tensorboard logging inside the API
    )
    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    return trainer.evaluate()
# Permit cross-origin requests so browser front-ends hosted on other domains
# can call this API.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is a
# maximally permissive CORS policy — confirm this is intended for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
@app.get("/", tags=["Home"])
def api_home():
    """Landing endpoint: returns a static welcome payload (doubles as a health check)."""
    welcome = {"detail": "Welcome to Bastions Model evaluation!"}
    return welcome
@app.post("/api/evaluate/{model_checkpoint, dataset_name}", summary="Input dataset and model identifiers", tags=["Test API"])
def return_output(model_checkpoint, dataset_name):
    """Evaluate a model checkpoint on the MRPC configuration of a dataset.

    Parameters
    ----------
    model_checkpoint : str
        Hub id of the model to evaluate (e.g. "sgugger/glue-mrpc").
    dataset_name : str
        Hub id of the dataset (e.g. "nyu-mll/glue"); the "mrpc" configuration
        is always loaded.

    Returns
    -------
    dict
        Evaluation results, serialized as the JSON response body.

    NOTE(review): "{model_checkpoint, dataset_name}" is not a valid FastAPI
    path-parameter template (each parameter needs its own {braces}), so both
    arguments are in practice resolved as query parameters. The route string
    is left unchanged here to preserve the existing URL — confirm the intended
    URL shape before reworking it.
    """
    raw_datasets = load_dataset(dataset_name, "mrpc")
    return compute_model_card_evaluation_results(model_checkpoint, raw_datasets, metric)