from ast import literal_eval
from itertools import combinations
from pathlib import Path
from typing import List, Optional, Union

import gradio as gr
import numpy as np
import pandas as pd
from cytoolz import concat, frequencies, unique
from datasets import load_dataset

pd.options.plotting.backend = "plotly"


def download_dataset():
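    """Download the open-source-metrics/model-repos-stats dataset from the Hugging Face hub."""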
    return load_dataset(
        "open-source-metrics/model-repos-stats",
        split="train",
        ignore_verifications=True,
    )


def _clean_tags(tags: Optional[Union[str, List[str]]]):
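    """Parse a raw tags value into a list of strings, returning [] for anything malformed."""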
    try:
        tags = literal_eval(tags)
        if isinstance(tags, str):
            return [tags]
        if isinstance(tags, list):
            return [tag for tag in tags if isinstance(tag, str)]
        return []
    except (ValueError, SyntaxError):
        return []


def _is_generated_from_tag(tags):
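    """Return True if any tag suggests the model card was auto-generated."""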
    return any("generated" in tag for tag in tags)


def _parse_tags_for_generated(tags):
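    """Return the first tag containing 'generated', or None if there is no such tag."""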
    for tag in tags:
        if "generated" in tag:
            return tag
    return None


def prep_dataset():
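    """Clean the raw dataset, derive metadata-coverage columns, and cache the result to parquet."""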
    ds = download_dataset()
    df = ds.to_pandas()
    df["languages"] = df["languages"].apply(_clean_tags)
    df["datasets"] = df["datasets"].apply(_clean_tags)
    df["tags"] = df["tags"].apply(_clean_tags)
    df["has_languages"] = df.languages.apply(len) > 0
    df["has_tags"] = df.tags.apply(len) > 0
    df["has_dataset"] = df.datasets.apply(len) > 0
    df["has_co2"] = df.co2.notnull()
    df["has_license"] = df.license.notnull()
    df["is_generated"] = df.tags.apply(_is_generated_from_tag)
    df = df.drop(columns=["Unnamed: 0"])
    df.to_parquet("data.parquet")
    return df


def load_data():
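    """Load the cached parquet file if it exists, otherwise build it with prep_dataset."""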
    return (
        pd.read_parquet("data.parquet")
        if Path("data.parquet").exists()
        else prep_dataset()
    )


def filter_df_by_library(library="transformers"):
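    """Return the rows for a single library, or the full dataset if no library is given."""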
    df = load_data()
    return df[df["library"] == library] if library else df


def get_library_choices(min_freq: int = 50):
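    """List the libraries which appear more than min_freq times in the dataset."""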
    df = load_data()
    library_counts = df.library.value_counts()
    return library_counts[library_counts > min_freq].index.to_list()


def get_all_tags():
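    """Flatten the per-model tag lists into a single list of tags."""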
    df = load_data()
    tags = df["tags"].to_list()
    return list(concat(tags))


def get_case_sensitive_duplicate_tags():
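    """Find pairs of distinct tags which are identical once lowercased."""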
    tags = get_all_tags()
    unique_tags = unique(tags)
    return [
        tag_combo
        for tag_combo in combinations(unique_tags, 2)
        if tag_combo[0].lower() == tag_combo[1].lower()
    ]


def display_case_sensitive_duplicate_tags():
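    """Wrap the case-sensitive duplicate tag pairs in a DataFrame for display in the UI."""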
    return pd.DataFrame(get_case_sensitive_duplicate_tags())


def get_number_of_tags(case_sensitive=True):
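    """Report the number of unique tags, optionally ignoring case."""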
    tags = set(get_all_tags())
    if case_sensitive:
        return f"Total number of case-sensitive tags: {len(tags)}"
    tags = {tag.lower() for tag in tags}
    return f"Total number of case-insensitive tags: {len(tags)}"


def tag_frequency(case_sensitive=True):
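    """Count how often each tag occurs across all models, optionally lowercasing first."""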
    tags = get_all_tags()
    if not case_sensitive:
        tags = (tag.lower() for tag in tags)
    tags_frequencies = dict(frequencies(tags))
    df = pd.DataFrame.from_dict(
        tags_frequencies, orient="index", columns=["Count"]
    ).sort_values(by="Count", ascending=False)
    return df.reset_index()


def tag_frequency_by_library(library_filter):
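    """Count how often each tag occurs within the models of a single library."""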
    df = filter_df_by_library(library_filter)
    tags = concat(df["tags"])
    tags = dict(frequencies(tags))
    df = pd.DataFrame.from_dict(tags, orient="index", columns=["Count"]).sort_values(
        by="Count", ascending=False
    )
    return df.reset_index()


def has_model_card_by_library(top_n):
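    """Plot the share of models with a model card for the top_n largest libraries."""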
    df = load_data()
    if top_n:
        top_libs = df.library.value_counts().head(int(top_n)).index.to_list()
        df = df[df.library.isin(top_libs)]
    return (
        df.groupby("library")["has_text"]
        .apply(lambda x: np.sum(x) / len(x))
        .sort_values()
        .plot.barh()
    )


def model_card_length_by_library(top_n):
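    """Summarise model card length for the top_n largest libraries."""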
    df = load_data()
    if top_n:
        top_libs = df.library.value_counts().head(int(top_n)).index.to_list()
        df = df[df.library.isin(top_libs)]
    return df.groupby("library")["text_length"].describe().round().reset_index()


def metadata_coverage_by_library(metadata_field):
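    """Plot mean coverage of one metadata field, grouped by library."""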
    df = load_data()
    return df.groupby("library")[metadata_field].mean().sort_values().plot.barh()


def metadata_coverage_autogenerated_vs_rest():
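    """Compare mean metadata coverage for models with and without autogenerated model cards."""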
    df = load_data()
    return (
        df.groupby("is_generated")[[c for c in df.columns if c.startswith("has")]]
        .mean()
        .transpose()
        .round(6)
        .reset_index()
        .rename(
            columns={
                True: "From autogenerated",
                False: "Not autogenerated",
                "index": "Metadata/tag field",
            }
        )
    )


def metadata_coverage_by_autogenerated(metadata_field):
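    """Plot mean coverage of one metadata field, grouped by the tag the card was generated from."""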
    df = load_data()
    subset_df = df[df["is_generated"]].copy(deep=True)
    subset_df = subset_df.reset_index(drop=True)
    subset_df["autogenerated-from"] = subset_df.tags.apply(_parse_tags_for_generated)
    return (
        subset_df.groupby("autogenerated-from")[metadata_field]
        .mean()
        .sort_values()
        .plot.barh()
    )


def model_card_length_by_autogenerated():
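    """Summarise model card length, grouped by the tag the card was generated from."""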
    df = load_data()
    subset_df = df[df["is_generated"]].copy(deep=True)
    subset_df = subset_df.reset_index(drop=True)
    subset_df["autogenerated-from"] = subset_df.tags.apply(_parse_tags_for_generated)
    return (
        subset_df.groupby("autogenerated-from")["text_length"]
        .describe()
        .round()
        .reset_index()
    )


_ABSTRACT = """
tl;dr: this dashboard aims to provide an overview of the metadata associated with models hosted on the Hugging Face hub.
\n
Each tab of this dashboard focuses on a different aspect of model metadata on the hub.
Many of the tabs pay particular attention to metadata coverage across the different libraries on the hub.
"""
df = load_data()
top_n = df.library.value_counts().shape[0]
libraries = [library for library in df.library.unique() if library]
metadata_coverage_columns = [c for c in df.columns if c.startswith("has")]

with gr.Blocks() as demo:
    gr.Markdown("# 🤗 Hub Metadata Explorer")
    gr.Markdown(_ABSTRACT)
with gr.Tab("Tag frequencies"): | |
gr.Markdown( | |
"Tags are one of the key ways in which users may identify models which are of interest. This tab provides " | |
"some visualizations of tags across *all* models (regardless of library)" | |
) | |
        with gr.Row():
            gr.Markdown(
                "The accordion below allows you to see the top tags for models on the hub (optionally making "
                "tags case insensitive)."
            )
        with gr.Row():
            case_sensitive = gr.Checkbox(
                True,
                label="case sensitive",
            )
            mk = gr.Markdown()
            case_sensitive.change(get_number_of_tags, [case_sensitive], mk, queue=False)
        with gr.Accordion("Tag Frequencies", open=False):
            df = gr.Dataframe()
            case_sensitive.change(tag_frequency, [case_sensitive], df, queue=False)
        with gr.Row():
            gr.Markdown(
                "Some tags are currently used in both cased and uncased forms, e.g. 'translation' vs 'Translation'."
            )
        with gr.Row():
            gr.Markdown(
                f"Number of tag pairs which differ only by case: {len(get_case_sensitive_duplicate_tags())}"
            )
        with gr.Row():
            with gr.Accordion("View case sensitive tag pairs", open=False):
                gr.Dataframe(display_case_sensitive_duplicate_tags())
with gr.Tab("Tags frequencies by library"): | |
gr.Markdown( | |
"The π€ hub hosts models from a wide range of machine learning libraries. These libraries use tags in " | |
"slightly different ways. The table below gives a breakdown of the most frequent tags for each library." | |
) | |
library_choice = gr.Dropdown(choices=libraries, label="select library") | |
df = gr.Dataframe() | |
library_choice.change( | |
tag_frequency_by_library, [library_choice], df, queue=False | |
) | |
with gr.Tab("Metadata coverage by library"): | |
gr.Markdown( | |
"Libraries hosting models on the Hugging Face hub take different approaches to " | |
"metadata i.e. some libraries automatically generate metadata for a model at the end of a " | |
"training run. These libraries may also have different types of users who take differing " | |
"approaches to creating metadata for models they share on the hub. The below chart allows you to " | |
"see which libraries have better coverage for key areas of model metadata. " | |
) | |
metadata_field = gr.Dropdown(choices=metadata_coverage_columns) | |
plot = gr.Plot() | |
metadata_field.change( | |
metadata_coverage_by_library, [metadata_field], plot, queue=False | |
) | |
with gr.Tab("Auto generated model cards"): | |
gr.Markdown( | |
"Some libraries/training frameworks automatically generate a model card when pushing models to " | |
"the hub. The below dataframe compares the metadata coverage across several tags for models " | |
"which are pushed with autogenerated model cards compared to those without. " | |
"" | |
"**Note** this " | |
"breakdown relies on tags with `autogenerated` in them." | |
"As a result some model cards might be in the wrong category. " | |
) | |
gr.Dataframe(metatadata_coverage_autogenerated_vs_test()) | |
        with gr.Row():
            metadata_field = gr.Dropdown(choices=metadata_coverage_columns)
            plot = gr.Plot()
            metadata_field.change(
                metadata_coverage_by_autogenerated, [metadata_field], plot, queue=False
            )
with gr.Tab("Model Cards"): | |
gr.Markdown( | |
"""Model cards are a key component of metadata for a model. Model cards can include both | |
information created by a human i.e. outlining the goals behind the creation of the model and information | |
created by a training framework. This automatically generated information can contain information about | |
number of epochs, learning rate, weight decay etc. """ | |
) | |
        min_lib_frequency = gr.Slider(
            minimum=1, maximum=top_n, value=10, label="filter by top n libraries"
        )
        with gr.Column():
            plot = gr.Plot()
            min_lib_frequency.change(
                has_model_card_by_library, [min_lib_frequency], plot, queue=False
            )
        with gr.Column():
            gr.Markdown("Mean length of model card by library")
            df = gr.Dataframe()
            min_lib_frequency.change(
                model_card_length_by_library, [min_lib_frequency], df, queue=False
            )

demo.launch()