davanstrien HF Staff commited on
Commit
104a4ce
·
1 Parent(s): 016ee09
Files changed (1) hide show
  1. app.py +299 -0
app.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import os
3
+ from dataclasses import asdict, dataclass
4
+ from functools import lru_cache
5
+ from json import JSONDecodeError
6
+ from typing import List, Optional, Union
7
+
8
+ import gradio as gr
9
+ import requests
10
+ from huggingface_hub import (
11
+ HfApi,
12
+ ModelCard,
13
+ hf_hub_url,
14
+ list_models,
15
+ list_repo_commits,
16
+ logging,
17
+ model_info,
18
+ )
19
+ from huggingface_hub.utils import EntryNotFoundError, disable_progress_bars
20
+ from tqdm.contrib.concurrent import thread_map
21
+
22
# Hide huggingface_hub download progress bars: the cache-warming loop below
# touches thousands of repos and the bars would flood the logs.
disable_progress_bars()

# Only surface errors from huggingface_hub's own logger.
logging.set_verbosity_error()

# Hub API token read from the environment; may be None (anonymous access).
token = os.getenv("HF_TOKEN")
27
+
28
+
29
def get_model_labels(model):
    """Fetch the class labels declared in a model's ``config.json``.

    Args:
        model: Hub repo id of the model (e.g. ``"org/name"``).

    Returns:
        List of label names from the config's ``label2id`` mapping, or
        ``None`` when the config is missing, malformed, or has no labels.
    """
    try:
        url = hf_hub_url(repo_id=model, filename="config.json")
        # Bound the request so one slow/unreachable repo cannot hang the
        # cache-warming thread pool; also catch transport-level failures
        # (ConnectionError, Timeout, HTTP errors) instead of crashing.
        response = requests.get(url, timeout=10)
        return list(response.json()["label2id"].keys())
    except (KeyError, JSONDecodeError, AttributeError, requests.RequestException):
        return None
35
+
36
+
37
@dataclass
class EngagementStats:
    """Community-engagement numbers for a single Hub model repo."""

    likes: int  # like count reported by the Hub API
    downloads: int  # download count reported by the Hub API
    created_at: datetime.datetime  # timestamp of the repo's first commit
42
+
43
+
44
def _get_engagement_stats(hub_id):
    """Collect likes, downloads and creation date for a Hub model repo.

    Args:
        hub_id: Hub repo id of the model.

    Returns:
        EngagementStats built from the repo's like/download counts and the
        timestamp of its oldest commit (used as the creation date).
    """
    api = HfApi(token=token)
    repo = api.repo_info(hub_id)
    # Pass the token here as well so private/gated repos behave the same as
    # the authenticated repo_info call above.
    first_commit = list_repo_commits(hub_id, repo_type="model", token=token)[-1]
    return EngagementStats(
        likes=repo.likes,
        downloads=repo.downloads,
        created_at=first_commit.created_at,
    )
52
+
53
+
54
def _try_load_model_card(hub_id):
    """Load a repo's model card (README) text.

    Args:
        hub_id: Hub repo id of the model.

    Returns:
        Tuple ``(text, length)`` of the card body and its character count,
        or ``(None, None)`` when the repo has no model card.
    """
    try:
        text = ModelCard.load(hub_id, token=token).text
    except EntryNotFoundError:
        return None, None
    return text, len(text)
62
+
63
+
64
def _try_parse_card_data(hub_id):
    """Read selected ``cardData`` fields from a model's Hub metadata.

    Args:
        hub_id: Hub repo id of the model.

    Returns:
        Dict mapping each of ``"license"``, ``"language"`` and ``"datasets"``
        to its cardData value, or ``None`` when the field is absent.
    """
    data = {}
    # Fetch the model info ONCE instead of once per key — the original made
    # three identical API round-trips for a single repo.
    info = model_info(hub_id, token=token)
    for key in ["license", "language", "datasets"]:
        try:
            # TypeError covers cardData being None (repo with no card metadata).
            data[key] = info.cardData[key]
        except (KeyError, AttributeError, TypeError):
            data[key] = None
    return data
74
+
75
+
76
@dataclass
class ModelMetadata:
    """Everything we score about a model: hub metadata plus its model card."""

    hub_id: str  # Hub repo id, e.g. "org/name"
    tags: Optional[List[str]]  # repo tags from the Hub API
    license: Optional[str]  # license from cardData
    library_name: Optional[str]  # e.g. "transformers"
    datasets: Optional[List[str]]  # training datasets from cardData
    pipeline_tag: Optional[str]  # task tag, keys into SCORES
    labels: Optional[List[str]]  # class labels from config.json
    languages: Optional[Union[str, List[str]]]  # language(s) from cardData
    engagement_stats: Optional[EngagementStats] = None
    model_card_text: Optional[str] = None  # raw README body
    model_card_length: Optional[int] = None  # len(model_card_text)

    @classmethod
    @lru_cache()
    def from_hub(cls, hub_id):
        """Build a ModelMetadata by querying the Hub for *hub_id*.

        Results are cached per hub_id so repeated UI requests are free.
        """
        # Pass the token for consistency with the other helpers (private repos).
        model = model_info(hub_id, token=token)
        card_text, length = _try_load_model_card(hub_id)
        data = _try_parse_card_data(hub_id)
        return cls(
            hub_id=hub_id,
            languages=data["language"],
            # getattr replaces three copy-pasted try/except AttributeError blocks.
            tags=getattr(model, "tags", None),
            license=data["license"],
            library_name=getattr(model, "library_name", None),
            datasets=data["datasets"],
            pipeline_tag=getattr(model, "pipeline_tag", None),
            labels=get_model_labels(hub_id),
            engagement_stats=_get_engagement_stats(hub_id),
            model_card_text=card_text,
            model_card_length=length,
        )
121
+
122
+
123
# Scoring rubric applied to EVERY pipeline task. Each entry gives the points
# awarded when the metadata field is present and the recommendation shown to
# the user when it is missing. (Typos in the user-facing messages fixed.)
COMMON_SCORES = {
    "license": {
        "required": True,
        "score": 2,
        "missing_recommendation": (
            "You have not added a license to your model's metadata"
        ),
    },
    "datasets": {
        "required": False,
        "score": 1,
        "missing_recommendation": (
            "You have not added any datasets to your model's metadata"
        ),
    },
    "model_card_text": {
        "required": True,
        "score": 3,
        "missing_recommendation": """You haven't created a model card for your model. It is strongly recommended to have a model card for your model. \nYou can create one for your model by clicking [here](https://huggingface.co/HUB_ID/edit/main/README.md)""",
    },
}
144
+
145
+
146
# Pipeline tags for which declaring `language` metadata is expected; these
# tasks get an extra scored "languages" field in generate_task_scores_dict.
TASK_TYPES_WITH_LANGUAGES = {
    "text-classification",
    "token-classification",
    "table-question-answering",
    "question-answering",
    "zero-shot-classification",
    "translation",
    "summarization",
    "text-generation",
    "text2text-generation",
    "fill-mask",
    "sentence-similarity",
    "text-to-speech",
    "automatic-speech-recognition",
    "text-to-image",
    "image-to-text",
    "visual-question-answering",
    "document-question-answering",
}
165
+
166
# Classification-style tasks where config.json should declare class labels;
# these tasks get an extra scored "labels" field in generate_task_scores_dict.
LABELS_REQUIRED_TASKS = {
    "text-classification",
    "token-classification",
    "object-detection",
    "audio-classification",
    "image-classification",
    "tabular-classification",
}
174
# Every Hub pipeline tag we know how to score; generate_task_scores_dict
# builds one rubric per entry.
ALL_PIPELINES = {
    "audio-classification",
    "audio-to-audio",
    "automatic-speech-recognition",
    "conversational",
    "depth-estimation",
    "document-question-answering",
    "feature-extraction",
    "fill-mask",
    "graph-ml",
    "image-classification",
    "image-segmentation",
    "image-to-image",
    "image-to-text",
    "object-detection",
    "question-answering",
    "reinforcement-learning",
    "robotics",
    "sentence-similarity",
    "summarization",
    "table-question-answering",
    "tabular-classification",
    "tabular-regression",
    "text-classification",
    "text-generation",
    "text-to-image",
    "text-to-speech",
    "text-to-video",
    "text2text-generation",
    "token-classification",
    "translation",
    "unconditional-image-generation",
    "video-classification",
    "visual-question-answering",
    "voice-activity-detection",
    "zero-shot-classification",
    "zero-shot-image-classification",
}
212
+
213
+
214
@lru_cache(maxsize=None)
def generate_task_scores_dict():
    """Build the per-task scoring rubric.

    Every pipeline task starts from COMMON_SCORES; tasks in
    TASK_TYPES_WITH_LANGUAGES / LABELS_REQUIRED_TASKS gain extra scored
    fields. Each task dict also carries a ``_max_score`` bookkeeping entry
    (total achievable points) used to normalise a model's score.

    Returns:
        Dict mapping pipeline tag -> rubric dict.
    """
    task_scores = {}
    for task in ALL_PIPELINES:
        # Shallow copy is fine: the shared rubric entries are read-only.
        task_dict = COMMON_SCORES.copy()
        if task in TASK_TYPES_WITH_LANGUAGES:
            task_dict["languages"] = {
                "required": True,
                "score": 2,
                "missing_recommendation": (
                    "You haven't defined any languages in your metadata. This"
                    f" is usually recommended for the {task} task"
                ),
            }
        if task in LABELS_REQUIRED_TASKS:
            task_dict["labels"] = {
                "required": True,
                "score": 2,
                "missing_recommendation": (
                    "You haven't defined any labels in the config.json file,"
                    f" these are usually recommended for {task}"
                ),
            }
        # Keys starting with "_" are bookkeeping, not scored metadata fields.
        task_dict["_max_score"] = sum(value["score"] for value in task_dict.values())
        task_scores[task] = task_dict
    return task_scores


SCORES = generate_task_scores_dict()
254
+
255
+
256
@lru_cache(maxsize=None)
def basic_check(hub_id):
    """Score a model's metadata and return a human-readable report.

    Args:
        hub_id: Hub repo id of the model to score.

    Returns:
        A report string (score percentage plus recommendations for each
        missing field), or ``None`` when the model has no pipeline tag or
        any lookup fails.
    """
    try:
        data = ModelMetadata.from_hub(hub_id)
        task = data.pipeline_tag
        if not task:
            # No pipeline tag -> no rubric to score against (was an implicit
            # fall-through returning None).
            return None
        data_dict = asdict(data)
        task_scores = SCORES[task]
        score = 0
        to_fix = {}
        for key, rubric in task_scores.items():
            if key.startswith("_"):
                continue  # bookkeeping entries such as "_max_score"
            if data_dict[key] is None:
                to_fix[key] = rubric["missing_recommendation"]
            else:
                score += rubric["score"]
        score = score / task_scores["_max_score"]
        score_summary = (
            f"Your model's metadata score is {round(score*100)}% based on suggested"
            f" metadata for {task}"
        )
        recommendations = (
            "Here are some suggestions to improve your model's metadata for"
            f" {task}."
        )
        for msg in to_fix.values():
            recommendations += f"\n- {msg}"
        # Separate the two sections: the original concatenated them with no
        # whitespace, fusing "...for {task}Here are some suggestions...".
        return score_summary + "\n\n" + recommendations
    except Exception as e:
        # Broad boundary handler: any Hub/network failure is logged and the
        # UI simply shows no result. NOTE(review): consider returning the
        # error text so users are not left with a blank output.
        print(e)
        return None
289
+
290
+
291
# Warm basic_check's lru_cache for the most-downloaded models at startup so
# the first UI requests for popular models return instantly.
print("caching models...")
print("getting top 5,000 models")
models = list_models(sort="downloads", direction=-1, limit=5_000)
model_ids = [model.modelId for model in models]
print("calculating metadata scores...")
# thread_map runs basic_check concurrently purely for its caching side effect;
# the returned list is discarded.
thread_map(basic_check, model_ids)


# Minimal Gradio UI: paste a hub id, get the metadata score and suggestions.
gr.Interface(fn=basic_check, inputs="text", outputs="text").launch()