import streamlit as st
from datasets import load_dataset_builder
from datasets import get_dataset_config_names
from os import listdir
from datasets import load_dataset, Dataset
from datasets_sql import query
import plotly.express as px
import numpy as np
import statistics
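# Basic page setup for the Streamlit app.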
st.set_page_config(
    page_title="Evaluation Buddy",
    page_icon="./robot.png",
    layout="wide",
)
st.title("Hugging Face Evaluation Buddy")
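# Datasets offered in the dropdown: a hand-picked list of popular Hub datasets.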
top_datasets = ['glue', 'super_glue', 'wikitext', 'imdb', 'squad', 'squad_es',
                'paws', 'librispeech_asr', 'wmt16', 'xnli', 'snli', 'ag_news',
                'anli', 'amazon_polarity', 'squad_v2', 'conll2003', 'red_caps',
                'common_voice', 'stsb_multi_mt', 'trec', 'tweet_eval', 'cosmos_qa',
                'sick', 'xsum', 'wikiann', 'yelp_polarity', 'hellaswag', 'piqa',
                'race', 'winogrande']
tasks = ['text-classification', 'question-answering-extractive', 'automatic-speech-recognition']
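# Sidebar controls: dataset, configuration, split, and the balance threshold.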
with st.sidebar.expander("Datasets", expanded=True):
    dataset_name = st.selectbox(
        "Choose a dataset to evaluate on:",
        sorted(top_datasets))
    configs = get_dataset_config_names(dataset_name)
    dataset_config = st.selectbox(
        "Choose a configuration of your dataset:",
        configs)
    dataset_builder = load_dataset_builder(dataset_name, dataset_config)
    splits = [s for s in dataset_builder.info.splits]
    dataset_split = st.selectbox(
        "Choose a dataset split:",
        splits)
    balanced_stdev = st.slider("Choose a standard deviation threshold for determining whether a dataset is balanced or not:", 0.00, 1.00, 0.20)
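# Main panel: show the dataset card information pulled from the dataset builder.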
st.markdown("## Here is some information about your dataset:")
st.markdown("### Description")
st.markdown(dataset_builder.info.description)
st.markdown("For more information about this dataset, check out [its website](https://huggingface.co/datasets/"+dataset_name+")")
st.markdown("### Dataset-Specific Metrics")
if dataset_name in listdir('../datasets/metrics/'):
    st.markdown("Great news! Your dataset has a dedicated metric! You can use it like this:")
    code = '''from datasets import load_metric
metric = load_metric("''' + dataset_name + '''", "''' + dataset_config + '''")'''
    st.code(code, language='python')
    dedicated_metric = True
else:
    st.markdown("Your dataset doesn't have a dedicated metric, but that's ok!")
    dedicated_metric = False
st.markdown("### Task-Specific Metrics")
try:
    task = dataset_builder.info.task_templates[0].task
    st.markdown("The task associated with it is: " + task)
    if task == 'automatic-speech-recognition':
        st.markdown('Automatic Speech Recognition has some dedicated metrics such as:')
        st.markdown('[Word Error Rate](https://huggingface.co/metrics/wer)')
        st.markdown('[Character Error Rate](https://huggingface.co/metrics/cer)')
    else:
        st.markdown("The task for your dataset doesn't have any dedicated metrics, but you can still use general ones!")
except:
    # No task template is defined for this dataset.
    st.markdown("The task for your dataset doesn't have any dedicated metrics, but you can still use general ones!")
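# If the dataset has a `label` feature, inspect the class distribution to choose
# between accuracy and F1; otherwise fall back to unsupervised metrics.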
try:
    num_classes = dataset_builder.info.features['label'].num_classes
    dataset = load_dataset(dataset_name, dataset_config, split=dataset_split)
    # ORDER BY keeps the counts aligned with the label names assigned below.
    labels = query("SELECT COUNT(*) FROM dataset GROUP BY label ORDER BY label").to_pandas()
    labels = labels.rename(columns={"count_star()": "count"})
    labels.index = dataset_builder.info.features['label'].names
    st.markdown("### Labelled Metrics")
    st.markdown("Your dataset has " + str(num_classes) + " labels: " + ', '.join(dataset_builder.info.features['label'].names))
    # Pie chart of the label distribution.
    st.plotly_chart(px.pie(labels, values="count", names=labels.index, width=800, height=400))
    total = sum(c for c in labels['count'])
    proportion = [c / total for c in labels['count']]
    stdev_dataset = statistics.stdev(proportion)
    if stdev_dataset <= balanced_stdev:
        st.markdown("Since your dataset is well-balanced, you can look at using:")
        st.markdown('[Accuracy](https://huggingface.co/metrics/accuracy)')
        accuracy_code = '''from datasets import load_metric
metric = load_metric("accuracy")'''
        st.code(accuracy_code, language='python')
    else:
        st.markdown("Since your dataset is not well-balanced, you can look at using:")
        st.markdown('[F1 Score](https://huggingface.co/metrics/f1)')
        f1_code = '''from datasets import load_metric
metric = load_metric("f1")'''
        st.code(f1_code, language='python')
        st.markdown('F1 takes into account both precision and recall, which works well for evaluating model performance on minority classes.')
except:
    st.markdown("### Unsupervised Metrics")
    st.markdown("Since your dataset doesn't have any labels, the metrics that you can use for evaluation are:")
    st.markdown('[Perplexity](https://huggingface.co/metrics/perplexity)')
    perplexity_code = '''from datasets import load_metric
metric = load_metric("perplexity")'''
    st.code(perplexity_code, language='python')
    st.markdown('If you choose a model that was trained on **' + dataset_name + '** and use it to compute the perplexity of text generated by your model, this can help determine how similar the two are.')