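"""Fetch metadata and README text for popular Hugging Face models.

For every model on the Hub with at least 10,000 downloads, download its
README (if one exists), strip markdown noise from it, and save one JSON file
per model (model_id, downloads, tags, cleaned description) under
model_data_json/. The raw model list returned by the Hub API is cached in a
local pickle file so repeated runs do not re-fetch it.

Gated or private repositories require an authenticated session (for example
via `huggingface-cli login`); READMEs that cannot be fetched are skipped.
"""
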
import os
from tqdm import tqdm
import time
import re
import json
from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.utils import RepositoryNotFoundError, EntryNotFoundError, HFValidationError
from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle  # used to cache the raw model list locally

# Create a directory to store JSON data
OUTPUT_DIR = "model_data_json"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Number of worker threads for parallel processing (kept small to limit request rate)
NUM_WORKERS = 4

# Add a delay between download attempts across threads
DOWNLOAD_DELAY_SECONDS = 0.2 # Adjust as needed

# --- README Cleaning ---
def clean_readme_content(text):
    """Basic cleaning of README markdown: remove code blocks, links."""
    if not text:
        return ""
    
    # Remove fenced code blocks (``` ... ```)
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
    # Remove inline code (`...`)
    text = re.sub(r'`[^`]+`', '', text)
    # Remove markdown images (![alt](url)) before links so their alt text is dropped too
    text = re.sub(r'!\[[^]]*\]\([^)]+\)', '', text)
    # Remove markdown links ([text](url)), keeping the link text
    text = re.sub(r'\[([^]]+)\]\([^)]+\)', r'\1', text)
    # Remove standalone URLs (simple version)
    text = re.sub(r'https?://\S+', '', text)
    # Replace multiple newlines/spaces with single ones
    text = ' '.join(text.split())
    return text
# ---

MODELS_CACHE_FILE = "models_list_cache.pkl" # File to cache the raw model list
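# Delete this cache file to force a fresh fetch of the model list from the API.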

def get_all_models_with_downloads(min_downloads=10000):
    """Fetch all models from Hugging Face with at least min_downloads, using a local cache for the list."""
    models_list = None
    
    # 1. Check for cache
    if os.path.exists(MODELS_CACHE_FILE):
        try:
            print(f"Loading cached model list from {MODELS_CACHE_FILE}...")
            with open(MODELS_CACHE_FILE, 'rb') as f:
                models_list = pickle.load(f)
            print(f"Loaded {len(models_list)} models from cache.")
        except Exception as e:
            print(f"Error loading cache file {MODELS_CACHE_FILE}: {e}. Fetching from API.")
            models_list = None # Ensure fetching if cache loading fails
    
    # 2. Fetch from API if cache doesn't exist or failed to load
    if models_list is None:
        print(f"Fetching all models with more than {min_downloads} downloads from API...")
        try:
            print("Initializing HfApi...")
            api = HfApi()
            print("HfApi initialized. Calling list_models...")
            # Fetch the iterator
            models_iterator = api.list_models(sort="downloads", direction=-1, fetch_config=False, cardData=True)
            print("list_models call returned. Converting iterator to list...")
            # Convert the iterator to a list so it can be cached
            models_list = list(models_iterator)
            print(f"Converted to list with {len(models_list)} models.")
            
            # Save to cache
            try:
                print(f"Saving model list to cache file: {MODELS_CACHE_FILE}...")
                with open(MODELS_CACHE_FILE, 'wb') as f:
                    pickle.dump(models_list, f)
                print("Model list saved to cache.")
            except Exception as e:
                print(f"Error saving cache file {MODELS_CACHE_FILE}: {e}")
            
        except Exception as e:
            print(f"Error during HfApi initialization or list_models call: {e}")
            return [] # Return empty list on error
    
    # 3. Filter the loaded/fetched list
    if not models_list:
        print("Model list is empty after fetching/loading.")
        return []
    
    qualifying_models = []
    print(f"Filtering {len(models_list)} models by download count...")
    for model in models_list: # Iterate through the list (from cache or API)
        # The list is already in memory, so this loop is fast and needs no per-model prints
        if not hasattr(model, 'downloads') or model.downloads is None:
            continue
        
        if model.downloads < min_downloads:
            # Since the list is sorted by downloads, we can stop
            break
        
        qualifying_models.append(model)
    
    print(f"Found {len(qualifying_models)} models with more than {min_downloads} downloads")
    return qualifying_models

def get_model_readme(model_id):
    """Get README.md content for a specific model using hf_hub_download. Returns None if not found or inaccessible."""
    filenames_to_try = ["README.md", "readme.md"]
    branches_to_try = ["main", "master"]
    
    for branch in branches_to_try:
        for filename in filenames_to_try:
            try:
                # print(f"Attempting download: repo={model_id}, branch={branch}, file={filename}") # Debug
                # Use hf_hub_download which uses stored token
                readme_path = hf_hub_download(
                    repo_id=model_id,
                    filename=filename,
                    revision=branch,
                    repo_type="model",
                    local_files_only=False, # Ensure it tries to download
                    # token=True # Often not needed if logged in via CLI, but can be explicit
                )
                
                # If download succeeded, read the content
                # print(f"Successfully downloaded {filename} from {branch} to {readme_path}") # Debug
                with open(readme_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                return content

            except RepositoryNotFoundError:
                print(f"Repository {model_id} not found.")
                return None # If repo doesn't exist, no point trying other files/branches
            except EntryNotFoundError: 
                # print(f"{filename} not found in branch {branch} for {model_id}. Trying next...") # Debug
                continue # File not found in this specific branch/filename combination, try next
            except HFValidationError as e: # Catch invalid repo ID or filename errors
                 print(f"Validation error for {model_id} (branch: {branch}, file: {filename}): {e}")
                 continue # Try next filename/branch
            except Exception as e: # Catch other potential errors (like 401 HfHubHTTPError, network issues)
                print(f"Error downloading {filename} from branch {branch} for {model_id}: {e}")
                # Check if it's a likely authentication error (401/403)
                if "401" in str(e) or "403" in str(e):
                    print(f"Authentication error (401/403) for {model_id}. Ensure you are logged in and accepted terms.")
                    return None # Don't try other files/branches if auth failed
                # For other errors, we continue to the next filename/branch attempt
                continue 
                
    # If all attempts failed
    print(f"Could not fetch README for {model_id} from any standard location.")
    return None

def get_filename_for_model(model_id):
    """Generate JSON filename for a model"""
    safe_id = model_id.replace("/", "_")
    return os.path.join(OUTPUT_DIR, f"{safe_id}.json")

def save_model_data(model_id, data):
    """Save model data (description, tags, downloads) to a JSON file."""
    filename = get_filename_for_model(model_id)
    try:
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        return filename
    except Exception as e:
        print(f"Error saving JSON for {model_id} to {filename}: {e}")
        return None

def file_exists_for_model(model_id):
    """Check if a JSON file already exists for this model"""
    filename = get_filename_for_model(model_id)
    return os.path.exists(filename)

def process_model(model):
    """Process a single model - fetch README, clean it, save as JSON."""
    model_id = model.modelId
    downloads = model.downloads
    tags = getattr(model, 'tags', []) # Get tags if available
    
    # Check if JSON file already exists
    if file_exists_for_model(model_id):
        return (model_id, downloads, None, "skipped")
    
    # --- Add Delay Before Download Attempt ---
    time.sleep(DOWNLOAD_DELAY_SECONDS) 
    # ---------------------------------------
    
    # Get model README content
    readme_content = get_model_readme(model_id)
    
    # If README is not available, skip saving this model
    if readme_content is None:
        return (model_id, downloads, None, "no_readme")
    
    # Clean the README
    cleaned_readme = clean_readme_content(readme_content)
    
    # Prepare data payload
    model_data = {
        "model_id": model_id,
        "downloads": downloads,
        "tags": tags,
        "description": cleaned_readme
    }
    
    # Save data as JSON
    filename = save_model_data(model_id, model_data)
    if filename:
        return (model_id, downloads, filename, "downloaded")
    else:
        return (model_id, downloads, None, "save_failed")

def main():
    qualifying_models = get_all_models_with_downloads(min_downloads=10000)
    if not qualifying_models:
        print("No qualifying models found")
        return
    
    print(f"Processing {len(qualifying_models)} models, saving to '{OUTPUT_DIR}'...")
    downloaded = 0
    skipped = 0
    no_readme = 0
    failed = 0
    
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        future_to_model = {executor.submit(process_model, model): model for model in qualifying_models}
        
        for future in tqdm(as_completed(future_to_model), total=len(qualifying_models)):
            try:
                model_id, downloads, filename, status = future.result()
                if status == "downloaded":
                    # Don't print every success to avoid clutter
                    # print(f"Saved data for {model_id} ({downloads} downloads) to {filename}")
                    downloaded += 1
                elif status == "skipped":
                    skipped += 1
                elif status == "no_readme":
                    no_readme += 1
                else: # save_failed or other errors
                    failed += 1
            except Exception as e:
                # Extract model_id for better error reporting if possible
                processed_model = future_to_model[future]
                print(f"Error processing model {getattr(processed_model, 'modelId', 'unknown')}: {e}")
                failed += 1
    
    print(f"\nCompleted! Downloaded: {downloaded}, Skipped existing: {skipped}, No README found: {no_readme}, Failed: {failed}")

if __name__ == "__main__":
    main()