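"""Streamlit demo that evaluates a Hugging Face model checkpoint on GLUE MRPC
and compares the freshly computed metrics against the evaluation results
reported in the model's Hub model card.

Run locally with: streamlit run <this_file>.py
"""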
import streamlit as st
import json
import numpy as np

# Hugging Face stack: dataset loading, model-card access, model/trainer
# classes, and the `evaluate` library for the GLUE metric.
from datasets import load_dataset
from huggingface_hub import ModelCard
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from evaluate import load

def preprocess_function(examples):
    # Tokenize the MRPC sentence pairs; relies on the module-level `tokenizer`
    # defined in the __main__ block below.
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

def compute_metrics(eval_pred):
    # Convert logits to class predictions and score them with the module-level
    # GLUE `metric` defined in the __main__ block below.
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

def compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric):
    # Tokenize the dataset and load the checkpoint as a binary sequence classifier.
    tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

    batch_size = 16
    args = TrainingArguments(
        "test-glue",
        eval_strategy="epoch",
        learning_rate=5e-5,
        seed=42,
        lr_scheduler_type="linear",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=False,
        metric_for_best_model="accuracy",
        report_to="none",
    )
    # Only evaluation is run here (no training), so the training hyperparameters
    # above merely satisfy TrainingArguments; they do not affect the result.
    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    result = trainer.evaluate()
    return result

if __name__ == "__main__":

    st.title("Hugging Face Model Evaluation Demo")

    with st.form("my_st_form"):

        # Input text boxes, e.g. "nyu-mll/glue" and "sgugger/glue-mrpc".
        dataset_name = st.text_input("Enter dataset identifier", "")
        model_checkpoint = st.text_input("Enter model identifier", "")
        # Every form must have a submit button.
        submitted = st.form_submit_button("Submit")

        if submitted:
            # Log the submitted identifiers to the console.
            print(dataset_name, model_checkpoint)

            # The GLUE MRPC task is hardcoded: both the metric and the dataset
            # configuration assume sentence-pair classification.
            metric = load("glue", "mrpc")
            tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
            raw_datasets = load_dataset(dataset_name, "mrpc")

            output = compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric)
            print(json.dumps(output))
            st.header("Self-generated Evaluation Results:")
            st.json(output, expanded=True)

            # Pull the model card from the Hub and show its reported results
            # next to the metrics computed above.
            card = ModelCard.load(model_checkpoint)
            st.header("Model Card Evaluation Results:")
            st.json(card.data.eval_results, expanded=True)