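"""Fetch metadata and README text for popular Hugging Face models.

For every model on the Hub with at least 10,000 downloads, download its
README (if one exists), strip markdown noise from it, and save one JSON file
per model (model_id, downloads, tags, cleaned description) under
model_data_json/. The raw model list returned by the Hub API is cached in a
local pickle file so repeated runs do not re-fetch it.

Gated or private repositories require an authenticated session (for example
via `huggingface-cli login`); READMEs that cannot be fetched are skipped.
"""
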
import os
from tqdm import tqdm
import time
import re
import json
from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.utils import RepositoryNotFoundError, EntryNotFoundError, HFValidationError
from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle  # used to cache the raw model list locally

# Create a directory to store JSON data
OUTPUT_DIR = "model_data_json"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Number of worker threads for parallel processing (kept small to limit request rate)
NUM_WORKERS = 4

# Add a delay between download attempts across threads
DOWNLOAD_DELAY_SECONDS = 0.2 # Adjust as needed

# --- README Cleaning ---
def clean_readme_content(text):
    """Basic cleaning of README markdown: remove code blocks, links."""
    if not text:
        return ""
    
    # Remove fenced code blocks (``` ... ```)
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
    # Remove inline code (`...`)
    text = re.sub(r'`[^`]+`', '', text)
    # Remove markdown images (![alt](url)) before links so their alt text is dropped too
    text = re.sub(r'!\[[^]]*\]\([^)]+\)', '', text)
    # Remove markdown links ([text](url)), keeping the link text
    text = re.sub(r'\[([^]]+)\]\([^)]+\)', r'\1', text)
    # Remove standalone URLs (simple version)
    text = re.sub(r'https?://\S+', '', text)
    # Replace multiple newlines/spaces with single ones
    text = ' '.join(text.split())
    return text
# ---

MODELS_CACHE_FILE = "models_list_cache.pkl" # File to cache the raw model list
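# Delete this cache file to force a fresh fetch of the model list from the API.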

def get_all_models_with_downloads(min_downloads=10000):
    """Fetch all models from Hugging Face with at least min_downloads, using a local cache for the list."""
    models_list = None
    
    # 1. Check for cache
    if os.path.exists(MODELS_CACHE_FILE):
        try:
            print(f"Loading cached model list from {MODELS_CACHE_FILE}...")
            with open(MODELS_CACHE_FILE, 'rb') as f:
                models_list = pickle.load(f)
            print(f"Loaded {len(models_list)} models from cache.")
        except Exception as e:
            print(f"Error loading cache file {MODELS_CACHE_FILE}: {e}. Fetching from API.")
            models_list = None # Ensure fetching if cache loading fails
    
    # 2. Fetch from API if cache doesn't exist or failed to load
    if models_list is None:
        print(f"Fetching all models with more than {min_downloads} downloads from API...")
        try:
            print("Initializing HfApi...")
            api = HfApi()
            print("HfApi initialized. Calling list_models...")
            # Fetch the iterator
            models_iterator = api.list_models(sort="downloads", direction=-1, fetch_config=False, cardData=True)
            print("list_models call returned. Converting iterator to list...")
            # Convert the iterator to a list so it can be cached
            models_list = list(models_iterator)
            print(f"Converted to list with {len(models_list)} models.")
            
            # Save to cache
            try:
                print(f"Saving model list to cache file: {MODELS_CACHE_FILE}...")
                with open(MODELS_CACHE_FILE, 'wb') as f:
                    pickle.dump(models_list, f)
                print("Model list saved to cache.")
            except Exception as e:
                print(f"Error saving cache file {MODELS_CACHE_FILE}: {e}")
            
        except Exception as e:
            print(f"Error during HfApi initialization or list_models call: {e}")
            return [] # Return empty list on error
    
    # 3. Filter the loaded/fetched list
    if not models_list:
        print("Model list is empty after fetching/loading.")
        return []
    
    qualifying_models = []
    print(f"Filtering {len(models_list)} models by download count...")
    for model in models_list: # Iterate through the list (from cache or API)
        # The list is already in memory, so this loop is fast and needs no per-model prints
        if not hasattr(model, 'downloads') or model.downloads is None:
            continue
        
        if model.downloads < min_downloads:
            # Since the list is sorted by downloads, we can stop
            break
        
        qualifying_models.append(model)
    
    print(f"Found {len(qualifying_models)} models with more than {min_downloads} downloads")
    return qualifying_models

def get_model_readme(model_id):
    """Get README.md content for a specific model using hf_hub_download. Returns None if not found or inaccessible."""
    filenames_to_try = ["README.md", "readme.md"]
    branches_to_try = ["main", "master"]
    
    for branch in branches_to_try:
        for filename in filenames_to_try:
            try:
                # print(f"Attempting download: repo={model_id}, branch={branch}, file={filename}") # Debug
                # Use hf_hub_download which uses stored token
                readme_path = hf_hub_download(
                    repo_id=model_id,
                    filename=filename,
                    revision=branch,
                    repo_type="model",
                    local_files_only=False, # Ensure it tries to download
                    # token=True # Often not needed if logged in via CLI, but can be explicit
                )
                
                # If download succeeded, read the content
                # print(f"Successfully downloaded {filename} from {branch} to {readme_path}") # Debug
                with open(readme_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                return content

            except RepositoryNotFoundError:
                print(f"Repository {model_id} not found.")
                return None # If repo doesn't exist, no point trying other files/branches
            except EntryNotFoundError: 
                # print(f"{filename} not found in branch {branch} for {model_id}. Trying next...") # Debug
                continue # File not found in this specific branch/filename combination, try next
            except HFValidationError as e: # Catch invalid repo ID or filename errors
                 print(f"Validation error for {model_id} (branch: {branch}, file: {filename}): {e}")
                 continue # Try next filename/branch
            except Exception as e: # Catch other potential errors (like 401 HfHubHTTPError, network issues)
                print(f"Error downloading {filename} from branch {branch} for {model_id}: {e}")
                # Check if it's a likely authentication error (401/403)
                if "401" in str(e) or "403" in str(e):
                    print(f"Authentication error (401/403) for {model_id}. Ensure you are logged in and accepted terms.")
                    return None # Don't try other files/branches if auth failed
                # For other errors, we continue to the next filename/branch attempt
                continue 
                
    # If all attempts failed
    print(f"Could not fetch README for {model_id} from any standard location.")
    return None

def get_filename_for_model(model_id):
    """Generate JSON filename for a model"""
    safe_id = model_id.replace("/", "_")
    return os.path.join(OUTPUT_DIR, f"{safe_id}.json")

def save_model_data(model_id, data):
    """Save model data (description, tags, downloads) to a JSON file."""
    filename = get_filename_for_model(model_id)
    try:
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        return filename
    except Exception as e:
        print(f"Error saving JSON for {model_id} to {filename}: {e}")
        return None

def file_exists_for_model(model_id):
    """Check if a JSON file already exists for this model"""
    filename = get_filename_for_model(model_id)
    return os.path.exists(filename)

def process_model(model):
    """Process a single model - fetch README, clean it, save as JSON."""
    model_id = model.modelId
    downloads = model.downloads
    tags = getattr(model, 'tags', []) # Get tags if available
    
    # Check if JSON file already exists
    if file_exists_for_model(model_id):
        return (model_id, downloads, None, "skipped")
    
    # --- Add Delay Before Download Attempt ---
    time.sleep(DOWNLOAD_DELAY_SECONDS) 
    # ---------------------------------------
    
    # Get model README content
    readme_content = get_model_readme(model_id)
    
    # If README is not available, skip saving this model
    if readme_content is None:
        return (model_id, downloads, None, "no_readme")
    
    # Clean the README
    cleaned_readme = clean_readme_content(readme_content)
    
    # Prepare data payload
    model_data = {
        "model_id": model_id,
        "downloads": downloads,
        "tags": tags,
        "description": cleaned_readme
    }
    
    # Save data as JSON
    filename = save_model_data(model_id, model_data)
    if filename:
        return (model_id, downloads, filename, "downloaded")
    else:
        return (model_id, downloads, None, "save_failed")

def main():
    qualifying_models = get_all_models_with_downloads(min_downloads=10000)
    if not qualifying_models:
        print("No qualifying models found")
        return
    
    print(f"Processing {len(qualifying_models)} models, saving to '{OUTPUT_DIR}'...")
    downloaded = 0
    skipped = 0
    no_readme = 0
    failed = 0
    
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        future_to_model = {executor.submit(process_model, model): model for model in qualifying_models}
        
        for future in tqdm(as_completed(future_to_model), total=len(qualifying_models)):
            try:
                model_id, downloads, filename, status = future.result()
                if status == "downloaded":
                    # Don't print every success to avoid clutter
                    # print(f"Saved data for {model_id} ({downloads} downloads) to {filename}")
                    downloaded += 1
                elif status == "skipped":
                    skipped += 1
                elif status == "no_readme":
                    no_readme += 1
                else: # save_failed or other errors
                    failed += 1
            except Exception as e:
                # Extract model_id for better error reporting if possible
                processed_model = future_to_model[future]
                print(f"Error processing model {getattr(processed_model, 'modelId', 'unknown')}: {e}")
                failed += 1
    
    print(f"\nCompleted! Downloaded: {downloaded}, Skipped existing: {skipped}, No README found: {no_readme}, Failed: {failed}")

if __name__ == "__main__":
    main()