Upload 10 files
- add_model_explanations.py +253 -0
- app.py +82 -0
- build_index.py +140 -0
- daily_update.py +107 -0
- huggingface_model_descriptions.py +253 -0
- requirements.txt +3 -1
add_model_explanations.py
ADDED
@@ -0,0 +1,253 @@
import os
import json
from typing import Dict, Any, Optional
import logging
import time
# import google.generativeai as genai  # Remove Gemini import
from openai import OpenAI, APIError  # Add back OpenAI imports

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

MODEL_DATA_DIR = "model_data_json"
EXPLANATION_KEY = "model_explanation_gemini"
DESCRIPTION_KEY = "description"
MAX_RETRIES = 3          # Retries for API calls
RETRY_DELAY_SECONDS = 5  # Delay between retries

# --- DeepSeek API Configuration (Restored) ---
DEEPSEEK_API_KEY_ENV_VAR = "DEEPSEEK_API_KEY"  # Environment variable for the key
DEEPSEEK_BASE_URL = "https://api.deepseek.com"
DEEPSEEK_MODEL_NAME = "deepseek-chat"
# ---

# Remove Gemini configuration
# GEMINI_API_KEY_ENV_VAR = "GEMINI_API_KEY"
# GEMINI_MODEL_NAME = "gemini-1.5-flash-latest"

# Global client variable for the DeepSeek/OpenAI client
client: Optional[OpenAI] = None  # Use OpenAI client type
# gemini_model: Optional[genai.GenerativeModel] = None  # Remove Gemini model variable

def configure_llm_client():
    """Configures the OpenAI client for the DeepSeek API using the API key from environment variables."""
    global client
    # global gemini_model  # Remove
    api_key = os.getenv(DEEPSEEK_API_KEY_ENV_VAR)  # Use DeepSeek env var
    if not api_key:
        logging.error(f"Error: {DEEPSEEK_API_KEY_ENV_VAR} environment variable not set.")
        logging.error("Please set the environment variable with your DeepSeek API key before running the script.")
        return False
    try:
        # Configure OpenAI client for DeepSeek
        client = OpenAI(api_key=api_key, base_url=DEEPSEEK_BASE_URL)
        logging.info(f"DeepSeek API client configured successfully for model: {DEEPSEEK_MODEL_NAME}.")
        return True
    except Exception as e:
        logging.error(f"Failed to configure DeepSeek API client: {e}")
        client = None
        return False

# --- End DeepSeek API Configuration ---

def generate_explanation(model_id: str, description: str) -> Optional[str]:
    """
    Generates a short English explanation for the model based on its description
    by calling the DeepSeek API via the OpenAI library.

    Args:
        model_id: The ID of the model (for context).
        description: The model description text.

    Returns:
        A short English explanation string from DeepSeek, or None if generation fails.
    """
    global client  # Use OpenAI client
    # global gemini_model  # Remove
    if not client:
        logging.error(f"[{model_id}] DeepSeek client not configured. Cannot generate explanation.")
        return None

    if not description or not isinstance(description, str):
        logging.warning(f"[{model_id}] Description is empty or not a string. Skipping explanation generation.")
        return None

    # Truncate very long descriptions (adjust the limit if needed for DeepSeek)
    max_desc_length = 4000
    if len(description) > max_desc_length:
        logging.warning(f"[{model_id}] Description truncated to {max_desc_length} chars for API call.")
        description = description[:max_desc_length] + "... [truncated]"

    # Construct the messages for the DeepSeek API (restore original format)
    messages = [
        {"role": "system", "content": "You are an AI assistant tasked with summarizing Hugging Face model descriptions concisely."},
        {"role": "user", "content": (
            f"Analyze the following description for the Hugging Face model '{model_id}'. "
            f"Based **only** on this description, provide a concise, one-sentence explanation in English "
            f"summarizing what this model does and its primary purpose or task. "
            f"Focus on the core functionality mentioned. Avoid adding introductory phrases like 'This model is...' or 'The model...'."
            f"\n\n---\nModel Description:\n{description}\n---\n\nConcise Explanation:"
        )}
    ]

    # Remove Gemini prompt construction
    # prompt = (...)

    retries = 0
    while retries < MAX_RETRIES:
        try:
            logging.info(f"[{model_id}] Calling DeepSeek API (Attempt {retries + 1}/{MAX_RETRIES})...")
            # Use OpenAI client call format
            response = client.chat.completions.create(
                model=DEEPSEEK_MODEL_NAME,
                messages=messages,
                stream=False,
                max_tokens=100,   # Limit response length
                temperature=0.2   # Lower temperature for a more focused summary
            )

            # Remove Gemini response handling
            # if not response.candidates: ...

            explanation = response.choices[0].message.content.strip()  # Get explanation from OpenAI response structure
            logging.info(f"[{model_id}] Explanation received from DeepSeek: '{explanation}'")

            # Basic post-processing: remove surrounding quotes
            if explanation.startswith('"') and explanation.endswith('"'):
                explanation = explanation[1:-1]
            # Remove Gemini-specific post-processing
            # explanation = explanation.replace('**', '')
            return explanation

        # Restore specific APIError catch for the OpenAI client
        except APIError as e:
            retries += 1
            logging.error(f"[{model_id}] DeepSeek API Error (Attempt {retries}/{MAX_RETRIES}): {e}")
            if retries < MAX_RETRIES:
                logging.info(f"Retrying in {RETRY_DELAY_SECONDS} seconds...")
                time.sleep(RETRY_DELAY_SECONDS)
            else:
                logging.error(f"[{model_id}] Max retries reached. Failed to generate explanation via DeepSeek.")
                return None
        # Keep general Exception catch
        except Exception as e:
            retries += 1  # Consider retrying general errors too, or handle them differently
            logging.error(f"[{model_id}] Unexpected error during API call (Attempt {retries}/{MAX_RETRIES}): {e}")
            if retries < MAX_RETRIES:
                logging.info(f"Retrying in {RETRY_DELAY_SECONDS} seconds...")
                time.sleep(RETRY_DELAY_SECONDS)
            else:
                logging.error(f"[{model_id}] Max retries reached. Failed to generate explanation due to unexpected errors.")
                return None

    return None  # Should not be reached if the loop returns earlier

def process_json_file(filepath: str):
    """Reads, updates (only if the explanation is missing), and writes a single JSON file."""
    model_id = os.path.basename(filepath).replace('.json', '')
    logging.info(f"Processing {filepath}...")

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError:
        logging.error(f"[{model_id}] Invalid JSON format in {filepath}. Skipping.")
        return False  # Indicate failure/skip
    except FileNotFoundError:
        logging.error(f"[{model_id}] File not found: {filepath}. Skipping.")
        return False
    except Exception as e:
        logging.error(f"[{model_id}] Error reading {filepath}: {e}. Skipping.")
        return False

    if not isinstance(data, dict):
        logging.error(f"[{model_id}] Expected JSON object (dict) but got {type(data)} in {filepath}. Skipping.")
        return False

    # --- Check if explanation already exists ---
    if EXPLANATION_KEY in data and data[EXPLANATION_KEY]:  # Check if key exists AND has non-empty content
        logging.info(f"[{model_id}] Explanation already exists. Skipping generation.")
        return False  # Indicate no update was needed

    # --- Deletion logic REMOVED ---
    # if EXPLANATION_KEY in data: ...

    # --- Generation logic ---
    description = data.get(DESCRIPTION_KEY)
    if not description:
        logging.warning(f"[{model_id}] Description field is missing or empty. Cannot generate explanation.")
        return False  # Cannot generate, so no update possible

    explanation = generate_explanation(model_id, description)  # Try to generate a new one

    # --- Update and write logic ---
    if explanation:  # Only update if generation was successful
        data[EXPLANATION_KEY] = explanation
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=4)
            logging.info(f"[{model_id}] Successfully generated and updated {filepath} with new explanation.")
            return True  # Indicate success/update
        except IOError as e:
            logging.error(f"[{model_id}] Error writing updated data to {filepath}: {e}")
            return False
        except Exception as e:
            logging.error(f"[{model_id}] Unexpected error writing {filepath}: {e}")
            return False
    else:  # Explanation generation failed
        logging.warning(f"[{model_id}] Failed to generate new explanation for {filepath} via API. File not updated.")
        return False  # Indicate failure/no update


def main():
    """Main function to iterate through the directory and process files."""
    if not configure_llm_client():
        return  # Stop if the API key is not configured

    if not os.path.isdir(MODEL_DATA_DIR):
        logging.error(f"Directory not found: {MODEL_DATA_DIR}")
        return

    logging.info(f"Starting processing directory: {MODEL_DATA_DIR}")
    processed_files = 0
    updated_files = 0     # Count files actually updated
    skipped_existing = 0  # Count files skipped because an explanation already existed
    skipped_error = 0     # Count files skipped due to read/write/API errors or a missing description

    all_files = [f for f in os.listdir(MODEL_DATA_DIR) if f.lower().endswith(".json")]
    total_files = len(all_files)
    logging.info(f"Found {total_files} JSON files to process.")

    for i, filename in enumerate(all_files):
        filepath = os.path.join(MODEL_DATA_DIR, filename)
        logging.info(f"--- Processing file {i+1}/{total_files}: {filename} ---")
        try:
            # process_json_file returns True if updated, False otherwise
            updated = process_json_file(filepath)
            processed_files += 1
            if updated:
                updated_files += 1
            else:
                # Differentiating why it wasn't updated would require re-reading, which is inefficient.
                # Rely on the logs from process_json_file for now.
                # A better approach would be for process_json_file to return status codes.
                pass  # Logging within the function indicates the reason (existing explanation, API failure, etc.)

        except Exception as e:
            logging.error(f"Unexpected error processing file loop for {filename}: {e}")
            skipped_error += 1  # Count generic loop errors
        # Add a small delay between files to avoid hitting rate limits
        # Adjust the delay based on DeepSeek quota/limits (might need less than 0.5s)
        time.sleep(0.2)


    logging.info(f"--- Processing complete ---")
    logging.info(f"Total JSON files found: {total_files}")
    logging.info(f"Files processed (attempted): {processed_files}")
    logging.info(f"Files successfully updated with new explanation: {updated_files}")
    # Cannot precisely count skipped_existing vs skipped_error without better return values
    # logging.info(f"Files skipped (existing explanation, errors, or no description): {total_files - updated_files}")


if __name__ == "__main__":
    main()
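For a quick smoke test of the explanation pass, the two functions above can be driven directly from a Python shell. This is only an illustrative sketch, assuming DEEPSEEK_API_KEY is exported and model_data_json/ already contains files written by huggingface_model_descriptions.py; the example filename is hypothetical, not part of this commit.

import os
from add_model_explanations import configure_llm_client, process_json_file

assert os.getenv("DEEPSEEK_API_KEY"), "export DEEPSEEK_API_KEY first"
if configure_llm_client():
    # process_json_file returns True only when a new explanation was generated and written back
    updated = process_json_file("model_data_json/google_flan-t5-base.json")  # hypothetical file
    print("updated:", updated)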
app.py
CHANGED
@@ -5,8 +5,26 @@ from flask_cors import CORS
 import numpy as np
 import json
 import traceback
+import logging    # Added for background task logging
+import threading  # Added for background task
+import time       # Added for background task
+import schedule   # Added for background task
+
+# --- Import the daily update function ---
+try:
+    from daily_update import main as run_daily_update
+    # Set up logging for the daily_update module if it uses logging
+    # logging.getLogger('daily_update').setLevel(logging.INFO)  # Example
+except ImportError:
+    logging.error("Failed to import daily_update.py. The daily update task will not run.")
+    run_daily_update = None  # Define as None if import fails
+# ---
 
 app = Flask(__name__)  # Create app object FIRST
+
+# Configure Flask app logging (optional but recommended)
+# app.logger.setLevel(logging.INFO)
+
 # Allow requests from the Vercel frontend and localhost for development
 CORS(app, origins=["http://127.0.0.1:3000", "http://localhost:3000", "https://rag-huggingface.vercel.app"], supports_credentials=True)
 
@@ -94,6 +112,70 @@ def load_resources():
 load_resources()
 # ---
 
+# --- Background Update Task ---
+
+UPDATE_INTERVAL_HOURS = 24  # Check every 24 hours
+UPDATE_TIME = "02:00"       # Time to run the update (24-hour format)
+
+def run_update_task():
+    """Wrapper function to run the daily update and handle errors."""
+    if run_daily_update is None:
+        logging.warning("run_daily_update function not available (import failed). Skipping task.")
+        return
+
+    logging.info(f"Background task: Starting daily update check (scheduled for {UPDATE_TIME})...")
+    try:
+        # Make sure the DEEPSEEK_API_KEY is set before running
+        if not os.getenv("DEEPSEEK_API_KEY"):
+            logging.error("Background task: DEEPSEEK_API_KEY not set. Daily update cannot run.")
+            return  # Don't run if the key is missing
+
+        run_daily_update()  # Call the main function from daily_update.py
+        logging.info("Background task: Daily update process finished.")
+    except Exception as e:
+        logging.error(f"Background task: Error during daily update execution: {e}")
+        logging.error(traceback.format_exc())
+
+def background_scheduler():
+    """Runs the scheduler loop in a background thread."""
+    logging.info(f"Background scheduler started. Will run update task daily around {UPDATE_TIME}.")
+
+    if run_daily_update is None:
+        logging.error("Background scheduler: daily_update.py could not be imported. Scheduler will not run tasks.")
+        return  # Stop the thread if the core function isn't available
+
+    # Schedule the job
+    # schedule.every(UPDATE_INTERVAL_HOURS).hours.do(run_update_task)  # Alternative: run every X hours
+    schedule.every().day.at(UPDATE_TIME).do(run_update_task)
+    logging.info(f"Scheduled daily update task for {UPDATE_TIME}.")
+
+    # Run once immediately on startup? (Optional)
+    # logging.info("Running initial update task on startup...")
+    # run_update_task()
+    # logging.info("Initial update task finished.")
+
+    while True:
+        schedule.run_pending()
+        time.sleep(60)  # Check every 60 seconds whether a task is due
+
+# Start the background scheduler thread only if this is the main process.
+# This check helps prevent duplicate schedulers when using workers (like Gunicorn).
+# Note: this might not be perfectly reliable with all WSGI servers/configs.
+# Consider using a more robust method for ensuring single execution if needed (e.g., file lock, external process manager).
+if os.environ.get("WERKZEUG_RUN_MAIN") == "true" or os.environ.get("FLASK_ENV") != "development":
+    # Start only in the main Werkzeug process OR when not in Flask development mode (e.g., production with Gunicorn)
+    # Check that the function is available before starting the thread
+    if run_daily_update is not None:
+        scheduler_thread = threading.Thread(target=background_scheduler, daemon=True)
+        scheduler_thread.start()
+        logging.info("Background scheduler thread started.")
+    else:
+        logging.warning("Background scheduler thread NOT started because daily_update.py failed to import.")
+else:
+    logging.info("Skipping background scheduler start in Werkzeug reloader process.")
+
+# --- End Background Update Task ---
+
 @app.route('/search', methods=['POST'])
 def search():
     """Handles search requests, embedding the query and searching the FAISS index."""
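The WERKZEUG_RUN_MAIN/FLASK_ENV check above only reduces the chance of duplicate schedulers; with multiple Gunicorn workers each process would still start its own thread. A minimal sketch of the file-lock alternative mentioned in the comment, assuming a POSIX host; the lock path and helper name are illustrative and not part of this commit.

import fcntl

SCHEDULER_LOCK_PATH = "/tmp/rag_hf_scheduler.lock"  # illustrative path

def acquire_scheduler_lock():
    """Return a locked file handle if this process should own the scheduler, else None."""
    lock_file = open(SCHEDULER_LOCK_PATH, "w")
    try:
        # Non-blocking exclusive lock: only one worker process can hold it at a time
        fcntl.flock(lock_file, fcntl.LOCK_EX | fcntl.LOCK_NB)
        return lock_file  # keep the handle referenced for the lifetime of the process
    except OSError:
        lock_file.close()
        return None  # another worker already runs the scheduler

# Usage idea: start the scheduler thread only when acquire_scheduler_lock() returns a handle.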
build_index.py
ADDED
@@ -0,0 +1,140 @@
import os
os.environ['OMP_NUM_THREADS'] = '1'  # Limit OpenMP threads, might help prevent crashes
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np
import pickle
import json  # Import json module
from tqdm import tqdm

# --- Configuration ---
MODEL_DATA_DIR = "model_data_json"  # Path to downloaded JSON data
INDEX_FILE = "index.faiss"
MAP_FILE = "index_to_metadata.pkl"  # Changed filename to reflect content
EMBEDDING_MODEL = 'all-mpnet-base-v2'  # Efficient and good quality model
ENCODE_BATCH_SIZE = 32  # Process descriptions in smaller batches
# Tags to exclude from indexing text
COMMON_EXCLUDED_TAGS = {'transformers'}  # Add other common tags if needed
EXCLUDED_TAG_PREFIXES = ('arxiv:', 'base_model:', 'dataset:', 'diffusers:', 'license:')  # Add other prefixes if needed
MODEL_EXPLANATION_KEY = "model_explanation_gemini"  # Key for the new explanation field
# ---

def load_model_data(directory):
    """Loads model data, filters tags (by length, common words, prefixes), and combines relevant info for indexing."""
    all_texts = []     # Store combined text (model_id + description + filtered_tags)
    all_metadata = []  # Store dicts: {'model_id': ..., 'tags': ..., 'downloads': ...}
    print(f"Loading model data from JSON files in: {directory}")
    if not os.path.isdir(directory):
        print(f"Error: Directory not found: {directory}")
        return [], []

    filenames = [f for f in os.listdir(directory) if f.endswith(".json")]  # Look for .json files
    for filename in tqdm(filenames, desc="Reading JSON files"):
        filepath = os.path.join(directory, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Ensure required fields exist
            if 'description' in data and 'model_id' in data:
                description = data['description']
                model_id = data['model_id']  # Get model_id
                if description:  # Only index if description is not empty
                    original_tags = data.get('tags', [])
                    # Filter tags: remove short tags, common tags, and tags with specific prefixes
                    filtered_tags = [
                        str_tag for tag in original_tags
                        if (
                            tag and isinstance(tag, str) and  # Ensure tag exists and is a string
                            len(tag) > 3 and
                            (str_tag := str(tag)).lower() not in COMMON_EXCLUDED_TAGS and
                            not str_tag.lower().startswith(EXCLUDED_TAG_PREFIXES)  # Check for prefixes
                        )
                    ]
                    tag_string = " ".join(filtered_tags)
                    explanation = data.get(MODEL_EXPLANATION_KEY)  # Get the new explanation

                    # --- Construct combined text with priority weighting ---
                    text_parts = []
                    # 1. Add explanation (repeated for emphasis) if available
                    if explanation and isinstance(explanation, str):
                        text_parts.append(f"Summary: {explanation}")
                        text_parts.append(f"Summary: {explanation}")  # Repeat for higher weight
                    # 2. Add model name
                    text_parts.append(f"Model: {model_id}")
                    # 3. Add filtered tags if available
                    if tag_string:
                        text_parts.append(f"Tags: {tag_string}")
                    # 4. Add original description
                    text_parts.append(f"Description: {description}")

                    combined_text = " ".join(text_parts).strip()  # Join all parts
                    # --- End construction ---

                    all_texts.append(combined_text)
                    # Add explanation to metadata as well for potential display
                    metadata_entry = {
                        "model_id": model_id,
                        "tags": original_tags,  # Keep ORIGINAL tags in metadata
                        "downloads": data.get('downloads', 0)
                    }
                    if explanation and isinstance(explanation, str):
                        metadata_entry[MODEL_EXPLANATION_KEY] = explanation
                    all_metadata.append(metadata_entry)
            else:
                print(f"Warning: Skipping {filename}, missing 'description' or 'model_id' key.")
        except json.JSONDecodeError:
            print(f"Warning: Skipping {filename}, invalid JSON.")
        except Exception as e:
            print(f"Warning: Could not read or process {filename}: {e}")

    print(f"Loaded data for {len(all_texts)} models with valid descriptions after tag filtering.")
    return all_texts, all_metadata

def build_and_save_index(texts_to_index, metadata_list):
    """Builds and saves the FAISS index and metadata mapping based on combined text."""
    if not texts_to_index:
        print("No text data to index.")
        return

    print(f"Loading sentence transformer model: {EMBEDDING_MODEL}")
    # Consider adding device='mps' if on Apple Silicon and PyTorch supports it well enough,
    # but start with CPU for stability.
    model = SentenceTransformer(EMBEDDING_MODEL)

    print(f"Generating embeddings for combined text in batches of {ENCODE_BATCH_SIZE}...")
    all_embeddings = []
    for i in tqdm(range(0, len(texts_to_index), ENCODE_BATCH_SIZE), desc="Encoding batches"):
        batch = texts_to_index[i:i+ENCODE_BATCH_SIZE]
        batch_embeddings = model.encode(batch, convert_to_numpy=True)
        all_embeddings.append(batch_embeddings)

    if not all_embeddings:
        print("No embeddings generated. Cannot build index.")
        return

    embeddings = np.vstack(all_embeddings)  # Combine embeddings from all batches

    # Ensure embeddings are float32 for FAISS
    embeddings = embeddings.astype('float32')

    # Build FAISS index
    print("Building FAISS index...")
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)  # Using simple L2 distance
    index.add(embeddings)
    print(f"FAISS index built with {index.ntotal} vectors.")

    # Save the index
    faiss.write_index(index, INDEX_FILE)
    print(f"FAISS index saved to: {INDEX_FILE}")

    # Create mapping from index position to metadata dictionary
    index_to_metadata = {i: metadata for i, metadata in enumerate(metadata_list)}
    with open(MAP_FILE, 'wb') as f:
        pickle.dump(index_to_metadata, f)
    print(f"Index-to-Metadata mapping saved to: {MAP_FILE}")

def main():
    """Builds the index from the JSON data directory (importable as `from build_index import main` by daily_update.py)."""
    combined_texts, metadata_list = load_model_data(MODEL_DATA_DIR)
    build_and_save_index(combined_texts, metadata_list)
    print("\nIndex building complete.")

if __name__ == "__main__":
    main()
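For reference, the artifacts written above (index.faiss and index_to_metadata.pkl) are what the /search route in app.py is expected to query. A minimal retrieval sketch under that assumption; the query string and result count are illustrative.

import faiss
import pickle
from sentence_transformers import SentenceTransformer

index = faiss.read_index("index.faiss")                # INDEX_FILE written above
with open("index_to_metadata.pkl", "rb") as f:
    index_to_metadata = pickle.load(f)                 # MAP_FILE written above
model = SentenceTransformer("all-mpnet-base-v2")       # same EMBEDDING_MODEL as at build time

# Embed the query the same way the combined texts were embedded, then run an L2 search
query_vec = model.encode(["text to image diffusion model"], convert_to_numpy=True).astype("float32")
distances, ids = index.search(query_vec, 5)
results = [index_to_metadata[int(i)] for i in ids[0] if i != -1]
for r in results:
    print(r["model_id"], r.get("model_explanation_gemini", ""))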
daily_update.py
ADDED
@@ -0,0 +1,107 @@
import logging
import sys
import traceback

# Configure basic logging for the orchestration script
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def run_step(step_func, step_name):
    """Runs a step and logs its success or failure."""
    logging.info(f"--- Starting step: {step_name} ---")
    try:
        step_func()
        logging.info(f"--- Finished step: {step_name} successfully ---")
        return True
    except Exception as e:
        logging.error(f"--- Step failed: {step_name} ---")
        logging.error(f"Error: {e}")
        # Log the full traceback for detailed debugging
        logging.error(traceback.format_exc())
        return False

def main():
    """Runs the daily update sequence."""
    logging.info("=== Starting Daily Model Update Process ===")

    all_steps_succeeded = True

    # --- Step 1: Fetch new/updated model descriptions ---
    try:
        # Import the script's main function dynamically
        from huggingface_model_descriptions import main as fetch_models_main
        if not run_step(fetch_models_main, "Fetch Hugging Face Models"):
            all_steps_succeeded = False
            # Decide whether to continue if fetching fails (maybe the index can still be built?)
            # For now, stop if the first step fails.
            logging.error("Stopping update process for this cycle due to failure in fetching models.")
            return  # Exit the main function for this cycle
    except ImportError:
        logging.error("Failed to import huggingface_model_descriptions.py. Ensure it's in the same directory or Python path.")
        all_steps_succeeded = False
        return  # Exit the main function for this cycle
    except Exception as e:  # Catch any unexpected error during import/setup
        logging.error(f"Unexpected error setting up model fetching step: {e}")
        logging.error(traceback.format_exc())
        all_steps_succeeded = False
        return  # Exit the main function for this cycle


    # --- Step 2: Add explanations using the DeepSeek API ---
    # Only proceed if the previous step was successful
    if all_steps_succeeded:
        try:
            from add_model_explanations import main as add_explanations_main
            # Check for the API key *before* running the step
            import os
            if not os.getenv("DEEPSEEK_API_KEY"):
                logging.warning("DEEPSEEK_API_KEY environment variable not set. Explanation step will fail or do nothing.")
                # Optionally, you could skip this step entirely if the key is missing:
                # logging.warning("Skipping explanation generation step.")
                # pass  # Move to the next step

            if not run_step(add_explanations_main, "Generate Model Explanations (DeepSeek)"):
                all_steps_succeeded = False
                # Decide whether index building should proceed if explanations fail
                logging.warning("Explanation generation failed. Index will be built with potentially missing explanations.")
                # We will continue to the next step in this case

        except ImportError:
            logging.error("Failed to import add_model_explanations.py. Ensure it's in the same directory or Python path.")
            all_steps_succeeded = False
            # Stop if the explanation script is missing
            return  # Exit the main function for this cycle
        except Exception as e:  # Catch any unexpected error during import/setup
            logging.error(f"Unexpected error setting up explanation generation step: {e}")
            logging.error(traceback.format_exc())
            all_steps_succeeded = False
            return  # Exit the main function for this cycle

    # --- Step 3: Rebuild the search index ---
    # Only proceed if fetching models (Step 1) succeeded. Allow proceeding if Step 2 failed.
    if 'fetch_models_main' in locals() or 'fetch_models_main' in globals():  # Check if Step 1 setup occurred
        try:
            from build_index import main as build_index_main
            if not run_step(build_index_main, "Build Search Index (FAISS)"):
                all_steps_succeeded = False
                logging.error("Index building failed. The search index may be outdated or corrupted.")
                # Stop if index building fails
                return  # Exit the main function for this cycle
        except ImportError:
            logging.error("Failed to import build_index.py. Ensure it's in the same directory or Python path.")
            all_steps_succeeded = False
            return  # Exit the main function for this cycle
        except Exception as e:  # Catch any unexpected error during import/setup
            logging.error(f"Unexpected error setting up index building step: {e}")
            logging.error(traceback.format_exc())
            all_steps_succeeded = False
            return  # Exit the main function for this cycle


    logging.info("===========================================")
    if all_steps_succeeded:
        logging.info("=== Daily Model Update Process Completed Successfully ===")
    else:
        logging.error("=== Daily Model Update Process Completed with Errors ===")

if __name__ == "__main__":
    main()
huggingface_model_descriptions.py
ADDED
@@ -0,0 +1,253 @@
import os
import requests
from tqdm import tqdm
import time
import re
import json
from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.utils import RepositoryNotFoundError, EntryNotFoundError, HFValidationError
from requests.exceptions import RequestException
from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle  # Add pickle for caching

# Create a directory to store JSON data
OUTPUT_DIR = "model_data_json"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Number of worker threads for parallel processing - REDUCED
NUM_WORKERS = 4

# Add a delay between download attempts across threads
DOWNLOAD_DELAY_SECONDS = 0.2  # Adjust as needed

# --- README Cleaning ---
def clean_readme_content(text):
    """Basic cleaning of README markdown: remove code blocks, links."""
    if not text:
        return ""

    # Remove fenced code blocks (``` ... ```)
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
    # Remove inline code (`...`)
    text = re.sub(r'`[^`]+`', '', text)
    # Remove markdown links ([text](url))
    text = re.sub(r'\[([^]]+)\]\([^)]+\)', r'\1', text)  # Keep link text
    # Remove standalone URLs (simple version)
    text = re.sub(r'https?://\S+', '', text)
    # Remove markdown images
    text = re.sub(r'!\[[^]]*\]\([^)]+\)', '', text)
    # Replace multiple newlines/spaces with single ones
    text = ' '.join(text.split())
    return text
# ---

MODELS_CACHE_FILE = "models_list_cache.pkl"  # File to cache the raw model list

def get_all_models_with_downloads(min_downloads=10000):
    """Fetch all models from Hugging Face with at least min_downloads, using a local cache for the list."""
    models_list = None

    # 1. Check for cache
    if os.path.exists(MODELS_CACHE_FILE):
        try:
            print(f"Loading cached model list from {MODELS_CACHE_FILE}...")
            with open(MODELS_CACHE_FILE, 'rb') as f:
                models_list = pickle.load(f)
            print(f"Loaded {len(models_list)} models from cache.")
        except Exception as e:
            print(f"Error loading cache file {MODELS_CACHE_FILE}: {e}. Fetching from API.")
            models_list = None  # Ensure fetching if cache loading fails

    # 2. Fetch from API if cache doesn't exist or failed to load
    if models_list is None:
        print(f"Fetching all models with more than {min_downloads} downloads from API...")
        try:
            print("Initializing HfApi...")
            api = HfApi()
            print("HfApi initialized. Calling list_models...")
            # Fetch the iterator
            models_iterator = api.list_models(sort="downloads", direction=-1, fetch_config=False, cardData=True)
            print("list_models call returned. Converting iterator to list...")
            # Convert the iterator to a list TO ALLOW CACHING
            models_list = list(models_iterator)
            print(f"Converted to list with {len(models_list)} models.")

            # Save to cache
            try:
                print(f"Saving model list to cache file: {MODELS_CACHE_FILE}...")
                with open(MODELS_CACHE_FILE, 'wb') as f:
                    pickle.dump(models_list, f)
                print("Model list saved to cache.")
            except Exception as e:
                print(f"Error saving cache file {MODELS_CACHE_FILE}: {e}")

        except Exception as e:
            print(f"Error during HfApi initialization or list_models call: {e}")
            return []  # Return empty list on error

    # 3. Filter the loaded/fetched list
    if not models_list:
        print("Model list is empty after fetching/loading.")
        return []

    qualifying_models = []
    print(f"Filtering {len(models_list)} models by download count...")
    for model in models_list:  # Iterate through the list (from cache or API)
        # No need for prints inside this loop now, as it should be fast
        if not hasattr(model, 'downloads') or model.downloads is None:
            continue

        if model.downloads < min_downloads:
            # Since the list is sorted by downloads, we can stop
            break

        qualifying_models.append(model)

    print(f"Found {len(qualifying_models)} models with more than {min_downloads} downloads")
    return qualifying_models

def get_model_readme(model_id):
    """Get README.md content for a specific model using hf_hub_download. Returns None if not found or inaccessible."""
    filenames_to_try = ["README.md", "readme.md"]
    branches_to_try = ["main", "master"]

    for branch in branches_to_try:
        for filename in filenames_to_try:
            try:
                # print(f"Attempting download: repo={model_id}, branch={branch}, file={filename}")  # Debug
                # Use hf_hub_download, which uses the stored token
                readme_path = hf_hub_download(
                    repo_id=model_id,
                    filename=filename,
                    revision=branch,
                    repo_type="model",
                    local_files_only=False,  # Ensure it tries to download
                    # token=True  # Often not needed if logged in via CLI, but can be explicit
                )

                # If the download succeeded, read the content
                # print(f"Successfully downloaded {filename} from {branch} to {readme_path}")  # Debug
                with open(readme_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                return content

            except RepositoryNotFoundError:
                print(f"Repository {model_id} not found.")
                return None  # If the repo doesn't exist, no point trying other files/branches
            except EntryNotFoundError:
                # print(f"{filename} not found in branch {branch} for {model_id}. Trying next...")  # Debug
                continue  # File not found in this specific branch/filename combination, try next
            except HFValidationError as e:  # Catch invalid repo ID or filename errors
                print(f"Validation error for {model_id} (branch: {branch}, file: {filename}): {e}")
                continue  # Try next filename/branch
            except Exception as e:  # Catch other potential errors (like 401 HfHubHTTPError, network issues)
                print(f"Error downloading {filename} from branch {branch} for {model_id}: {e}")
                # Check if it's a likely authentication error (401/403)
                if "401" in str(e) or "403" in str(e):
                    print(f"Authentication error (401/403) for {model_id}. Ensure you are logged in and accepted terms.")
                    return None  # Don't try other files/branches if auth failed
                # For other errors, continue to the next filename/branch attempt
                continue

    # If all attempts failed
    print(f"Could not fetch README for {model_id} from any standard location.")
    return None

def get_filename_for_model(model_id):
    """Generate JSON filename for a model"""
    safe_id = model_id.replace("/", "_")
    return os.path.join(OUTPUT_DIR, f"{safe_id}.json")  # Change extension to .json

def save_model_data(model_id, data):
    """Save model data (description, tags, downloads) to a JSON file."""
    filename = get_filename_for_model(model_id)
    try:
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        return filename
    except Exception as e:
        print(f"Error saving JSON for {model_id} to {filename}: {e}")
        return None

def file_exists_for_model(model_id):
    """Check if a JSON file already exists for this model"""
    filename = get_filename_for_model(model_id)
    return os.path.exists(filename)

def process_model(model):
    """Process a single model - fetch README, clean it, save as JSON."""
    model_id = model.modelId
    downloads = model.downloads
    tags = getattr(model, 'tags', [])  # Get tags if available

    # Check if JSON file already exists
    if file_exists_for_model(model_id):
        return (model_id, downloads, None, "skipped")

    # --- Add Delay Before Download Attempt ---
    time.sleep(DOWNLOAD_DELAY_SECONDS)
    # ---------------------------------------

    # Get model README content
    readme_content = get_model_readme(model_id)

    # If README is not available, skip saving this model
    if readme_content is None:
        return (model_id, downloads, None, "no_readme")

    # Clean the README
    cleaned_readme = clean_readme_content(readme_content)

    # Prepare data payload
    model_data = {
        "model_id": model_id,
        "downloads": downloads,
        "tags": tags,
        "description": cleaned_readme
    }

    # Save data as JSON
    filename = save_model_data(model_id, model_data)
    if filename:
        return (model_id, downloads, filename, "downloaded")
    else:
        return (model_id, downloads, None, "save_failed")

def main():
    qualifying_models = get_all_models_with_downloads(min_downloads=10000)
    if not qualifying_models:
        print("No qualifying models found")
        return

    print(f"Processing {len(qualifying_models)} models, saving to '{OUTPUT_DIR}'...")
    downloaded = 0
    skipped = 0
    no_readme = 0
    failed = 0

    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        future_to_model = {executor.submit(process_model, model): model for model in qualifying_models}

        for future in tqdm(as_completed(future_to_model), total=len(qualifying_models)):
            try:
                model_id, downloads, filename, status = future.result()
                if status == "downloaded":
                    # Don't print every success to avoid clutter
                    # print(f"Saved data for {model_id} ({downloads} downloads) to {filename}")
                    downloaded += 1
                elif status == "skipped":
                    skipped += 1
                elif status == "no_readme":
                    no_readme += 1
                else:  # save_failed or other errors
                    failed += 1
            except Exception as e:
                # Extract model_id for better error reporting if possible
                processed_model = future_to_model[future]
                print(f"Error processing model {getattr(processed_model, 'modelId', 'unknown')}: {e}")
                failed += 1

    print(f"\nCompleted! Downloaded: {downloaded}, Skipped existing: {skipped}, No README found: {no_readme}, Failed: {failed}")

if __name__ == "__main__":
    main()
requirements.txt
CHANGED
@@ -4,4 +4,6 @@ sentence-transformers>=2.3.0
 numpy>=1.20.0
 faiss-cpu>=1.7.0 # Use faiss-gpu if you need GPU support on HF Spaces
 huggingface-hub>=0.15.1 # Version compatible with sentence-transformers >= 2.3.0
-gunicorn # Added for deployment on Hugging Face Spaces
+gunicorn # Added for deployment on Hugging Face Spaces
+openai>=1.0.0 # Added back for DeepSeek API via OpenAI client
+schedule>=1.0.0 # Added for in-app scheduling