# Gradio app that scores a Hugging Face Hub model's metadata completeness
# against task-specific recommendations.
import datetime
import os
from dataclasses import asdict, dataclass
from functools import lru_cache
from json import JSONDecodeError
from typing import List, Optional, Union

import gradio as gr
import requests
from huggingface_hub import (
    HfApi,
    ModelCard,
    hf_hub_url,
    list_models,
    list_repo_commits,
    logging,
    model_info,
)
from huggingface_hub.utils import EntryNotFoundError, disable_progress_bars
from tqdm.contrib.concurrent import thread_map

disable_progress_bars()
logging.set_verbosity_error()

token = os.getenv("HF_TOKEN")


def get_model_labels(model):
    """Return the labels defined in a model's config.json, or None if unavailable."""
    try:
        url = hf_hub_url(repo_id=model, filename="config.json")
        return list(requests.get(url).json()["label2id"].keys())
    except (KeyError, JSONDecodeError, AttributeError):
        return None


@dataclass
class EngagementStats:
    likes: int
    downloads: int
    created_at: datetime.datetime


def _get_engagement_stats(hub_id):
    api = HfApi(token=token)
    repo = api.repo_info(hub_id)
    return EngagementStats(
        likes=repo.likes,
        downloads=repo.downloads,
        # The oldest commit in the repo's history gives the creation date.
        created_at=list_repo_commits(hub_id, repo_type="model")[-1].created_at,
    )


def _try_load_model_card(hub_id):
    try:
        card_text = ModelCard.load(hub_id, token=token).text
        length = len(card_text)
    except EntryNotFoundError:
        card_text = None
        length = None
    return card_text, length


def _try_parse_card_data(hub_id):
    data = {}
    # Fetch the model info once rather than once per key.
    card_data = model_info(hub_id, token=token).cardData
    for key in ["license", "language", "datasets"]:
        try:
            data[key] = card_data[key]
        except (KeyError, AttributeError, TypeError):
            data[key] = None
    return data


@dataclass
class ModelMetadata:
    hub_id: str
    tags: Optional[List[str]]
    license: Optional[str]
    library_name: Optional[str]
    datasets: Optional[List[str]]
    pipeline_tag: Optional[str]
    labels: Optional[List[str]]
    languages: Optional[Union[str, List[str]]]
    engagement_stats: Optional[EngagementStats] = None
    model_card_text: Optional[str] = None
    model_card_length: Optional[int] = None

    @classmethod
    @lru_cache()
    def from_hub(cls, hub_id):
        model = model_info(hub_id, token=token)
        card_text, length = _try_load_model_card(hub_id)
        data = _try_parse_card_data(hub_id)
        try:
            library_name = model.library_name
        except AttributeError:
            library_name = None
        try:
            tags = model.tags
        except AttributeError:
            tags = None
        try:
            pipeline_tag = model.pipeline_tag
        except AttributeError:
            pipeline_tag = None
        return ModelMetadata(
            hub_id=hub_id,
            languages=data["language"],
            tags=tags,
            license=data["license"],
            library_name=library_name,
            datasets=data["datasets"],
            pipeline_tag=pipeline_tag,
            labels=get_model_labels(hub_id),
            engagement_stats=_get_engagement_stats(hub_id),
            model_card_text=card_text,
            model_card_length=length,
        )


# Metadata fields scored for every model, regardless of task.
COMMON_SCORES = {
    "license": {
        "required": True,
        "score": 2,
        "missing_recommendation": (
            "You have not added a license to your model's metadata"
        ),
    },
    "datasets": {
        "required": False,
        "score": 1,
        "missing_recommendation": (
            "You have not added any datasets to your model's metadata"
        ),
    },
    "model_card_text": {
        "required": True,
        "score": 3,
        "missing_recommendation": (
            "You haven't created a model card for your model. It is strongly"
            " recommended to have a model card.\nYou can create one for your model"
            " by clicking [here](https://huggingface.co/HUB_ID/edit/main/README.md)"
        ),
    },
}

# Tasks for which the `language` metadata field is recommended.
TASK_TYPES_WITH_LANGUAGES = {
    "text-classification",
    "token-classification",
    "table-question-answering",
    "question-answering",
    "zero-shot-classification",
    "translation",
    "summarization",
    "text-generation",
    "text2text-generation",
    "fill-mask",
    "sentence-similarity",
    "text-to-speech",
    "automatic-speech-recognition",
    "text-to-image",
    "image-to-text",
    "visual-question-answering",
    "document-question-answering",
}

# Tasks for which labels are expected in config.json.
LABELS_REQUIRED_TASKS = {
    "text-classification",
    "token-classification",
    "object-detection",
    "audio-classification",
    "image-classification",
    "tabular-classification",
}

ALL_PIPELINES = {
    "audio-classification",
    "audio-to-audio",
    "automatic-speech-recognition",
    "conversational",
    "depth-estimation",
    "document-question-answering",
    "feature-extraction",
    "fill-mask",
    "graph-ml",
    "image-classification",
    "image-segmentation",
    "image-to-image",
    "image-to-text",
    "object-detection",
    "question-answering",
    "reinforcement-learning",
    "robotics",
    "sentence-similarity",
    "summarization",
    "table-question-answering",
    "tabular-classification",
    "tabular-regression",
    "text-classification",
    "text-generation",
    "text-to-image",
    "text-to-speech",
    "text-to-video",
    "text2text-generation",
    "token-classification",
    "translation",
    "unconditional-image-generation",
    "video-classification",
    "visual-question-answering",
    "voice-activity-detection",
    "zero-shot-classification",
    "zero-shot-image-classification",
}


@lru_cache(maxsize=None)
def generate_task_scores_dict():
    """Build a per-task scoring dict: COMMON_SCORES plus any task-specific fields."""
    task_scores = {}
    for task in ALL_PIPELINES:
        task_dict = COMMON_SCORES.copy()
        if task in TASK_TYPES_WITH_LANGUAGES:
            task_dict = {
                **task_dict,
                "languages": {
                    "required": True,
                    "score": 2,
                    "missing_recommendation": (
                        "You haven't defined any languages in your metadata. This"
                        f" is usually recommended for the {task} task"
                    ),
                },
            }
        if task in LABELS_REQUIRED_TASKS:
            task_dict = {
                **task_dict,
                "labels": {
                    "required": True,
                    "score": 2,
                    "missing_recommendation": (
                        "You haven't defined any labels in the config.json file;"
                        f" these are usually recommended for {task}"
                    ),
                },
            }
        # Total achievable score for this task, stored under a private key.
        task_dict["_max_score"] = sum(value["score"] for value in task_dict.values())
        task_scores[task] = task_dict
    return task_scores


SCORES = generate_task_scores_dict()

@lru_cache(maxsize=None)
def _basic_check(hub_id):
    """Score a model's metadata against the recommendations for its pipeline task."""
    try:
        data = ModelMetadata.from_hub(hub_id)
        task = data.pipeline_tag
        data_dict = asdict(data)
        score = 0
        if task:
            task_scores = SCORES[task]
            to_fix = {}
            for k, v in task_scores.items():
                if k.startswith("_"):
                    continue
                if data_dict[k] is None:
                    to_fix[k] = v["missing_recommendation"]
                else:
                    score += v["score"]
            score = score / task_scores["_max_score"]
            score_summary = (
                f"Your model's metadata score is {round(score * 100)}% based on"
                f" suggested metadata for {task}"
            )
            if not to_fix:
                return score_summary
            recommendations = (
                "Here are some suggestions to improve your model's metadata for"
                f" {task}."
            )
            for v in to_fix.values():
                recommendations += f"\n- {v}"
            return score_summary + "\n" + recommendations
    except Exception as e:
        print(e)
        return None


def basic_check(hub_id):
    return _basic_check(hub_id)


# Optional pre-caching of scores for popular models (uses list_models and
# thread_map imported above):
# print("caching models...")
# print("getting top 5,000 models")
# models = list_models(sort="downloads", direction=-1, limit=5_000)
# model_ids = [model.modelId for model in models]
# print("calculating metadata scores...")
# thread_map(basic_check, model_ids)

gr.Interface(fn=basic_check, inputs="text", outputs="text").launch()