shayan5422 committed · verified · Commit 0db8b33 · Parent(s): 951f99a

Upload 10 files

add_model_explanations.py ADDED
@@ -0,0 +1,253 @@
+ import os
+ import json
+ from typing import Dict, Any, Optional
+ import logging
+ import time
+ # import google.generativeai as genai # Remove Gemini import
+ from openai import OpenAI, APIError # Add back OpenAI imports
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ MODEL_DATA_DIR = "model_data_json"
+ EXPLANATION_KEY = "model_explanation_gemini" # Key name kept from the Gemini version so existing data stays compatible
+ DESCRIPTION_KEY = "description"
+ MAX_RETRIES = 3 # Retries for API calls
+ RETRY_DELAY_SECONDS = 5 # Delay between retries
+
+ # --- DeepSeek API Configuration (Restored) ---
+ DEEPSEEK_API_KEY_ENV_VAR = "DEEPSEEK_API_KEY" # Environment variable for the key
+ DEEPSEEK_BASE_URL = "https://api.deepseek.com"
+ DEEPSEEK_MODEL_NAME = "deepseek-chat"
+ # ---
+
+ # Remove Gemini configuration
+ # GEMINI_API_KEY_ENV_VAR = "GEMINI_API_KEY"
+ # GEMINI_MODEL_NAME = "gemini-1.5-flash-latest"
+
+ # Global variable for the DeepSeek/OpenAI client
+ client: Optional[OpenAI] = None # Use OpenAI client type
+ # gemini_model: Optional[genai.GenerativeModel] = None # Remove Gemini model variable
+
+ def configure_llm_client():
+     """Configures the OpenAI client for the DeepSeek API using the API key from environment variables."""
+     global client
+     # global gemini_model # Remove
+     api_key = os.getenv(DEEPSEEK_API_KEY_ENV_VAR) # Use DeepSeek env var
+     if not api_key:
+         logging.error(f"Error: {DEEPSEEK_API_KEY_ENV_VAR} environment variable not set.")
+         logging.error("Please set the environment variable with your DeepSeek API key before running the script.")
+         return False
+     try:
+         # Configure the OpenAI client for DeepSeek
+         client = OpenAI(api_key=api_key, base_url=DEEPSEEK_BASE_URL)
+         logging.info(f"DeepSeek API client configured successfully for model: {DEEPSEEK_MODEL_NAME}.")
+         return True
+     except Exception as e:
+         logging.error(f"Failed to configure DeepSeek API client: {e}")
+         client = None
+         return False
+
+ # --- End DeepSeek API Configuration ---
+
+ def generate_explanation(model_id: str, description: str) -> Optional[str]:
+     """
+     Generates a short English explanation for the model based on its description
+     by calling the DeepSeek API via the OpenAI library.
+
+     Args:
+         model_id: The ID of the model (for context).
+         description: The model description text.
+
+     Returns:
+         A short English explanation string from DeepSeek, or None if generation fails.
+     """
+     global client # Use OpenAI client
+     # global gemini_model # Remove
+     if not client:
+         logging.error(f"[{model_id}] DeepSeek client not configured. Cannot generate explanation.")
+         return None
+
+     if not description or not isinstance(description, str):
+         logging.warning(f"[{model_id}] Description is empty or not a string. Skipping explanation generation.")
+         return None
+
+     # Truncate very long descriptions (adjust the limit if needed for DeepSeek)
+     max_desc_length = 4000
+     if len(description) > max_desc_length:
+         logging.warning(f"[{model_id}] Description truncated to {max_desc_length} chars for API call.")
+         description = description[:max_desc_length] + "... [truncated]"
+
+     # Construct the messages for the DeepSeek API (restores the original format)
+     messages = [
+         {"role": "system", "content": "You are an AI assistant tasked with summarizing Hugging Face model descriptions concisely."},
+         {"role": "user", "content": (
+             f"Analyze the following description for the Hugging Face model '{model_id}'. "
+             f"Based **only** on this description, provide a concise, one-sentence explanation in English "
+             f"summarizing what this model does and its primary purpose or task. "
+             f"Focus on the core functionality mentioned. Avoid adding introductory phrases like 'This model is...' or 'The model...'."
+             f"\n\n---\nModel Description:\n{description}\n---\n\nConcise Explanation:"
+         )}
+     ]
+
+     # Remove Gemini prompt construction
+     # prompt = (...)
+
+     retries = 0
+     while retries < MAX_RETRIES:
+         try:
+             logging.info(f"[{model_id}] Calling DeepSeek API (Attempt {retries + 1}/{MAX_RETRIES})...")
+             # Use the OpenAI client call format
+             response = client.chat.completions.create(
+                 model=DEEPSEEK_MODEL_NAME,
+                 messages=messages,
+                 stream=False,
+                 max_tokens=100, # Limit response length
+                 temperature=0.2 # Lower temperature for a more focused summary
+             )
+
+             # Remove Gemini response handling
+             # if not response.candidates: ...
+
+             explanation = response.choices[0].message.content.strip() # Explanation from the OpenAI response structure
+             logging.info(f"[{model_id}] Explanation received from DeepSeek: '{explanation}'")
+
+             # Basic post-processing: remove surrounding quotes
+             if explanation.startswith('"') and explanation.endswith('"'):
+                 explanation = explanation[1:-1]
+             # Remove Gemini-specific post-processing
+             # explanation = explanation.replace('**', '')
+             return explanation
+
+         # Restore the specific APIError catch for the OpenAI client
+         except APIError as e:
+             retries += 1
+             logging.error(f"[{model_id}] DeepSeek API Error (Attempt {retries}/{MAX_RETRIES}): {e}")
+             if retries < MAX_RETRIES:
+                 logging.info(f"Retrying in {RETRY_DELAY_SECONDS} seconds...")
+                 time.sleep(RETRY_DELAY_SECONDS)
+             else:
+                 logging.error(f"[{model_id}] Max retries reached. Failed to generate explanation via DeepSeek.")
+                 return None
+         # Keep the general Exception catch
+         except Exception as e:
+             retries += 1 # Retry unexpected errors as well
+             logging.error(f"[{model_id}] Unexpected Error during API call (Attempt {retries}/{MAX_RETRIES}): {e}")
+             if retries < MAX_RETRIES:
+                 logging.info(f"Retrying in {RETRY_DELAY_SECONDS} seconds...")
+                 time.sleep(RETRY_DELAY_SECONDS)
+             else:
+                 logging.error(f"[{model_id}] Max retries reached. Failed to generate explanation due to unexpected errors.")
+                 return None
+
+     return None # Should not be reached; kept as a safeguard
+
+ def process_json_file(filepath: str):
+     """Reads, updates (only if explanation missing), and writes a single JSON file."""
+     model_id = os.path.basename(filepath).replace('.json', '')
+     logging.info(f"Processing {filepath}...")
+
+     try:
+         with open(filepath, 'r', encoding='utf-8') as f:
+             data = json.load(f)
+     except json.JSONDecodeError:
+         logging.error(f"[{model_id}] Invalid JSON format in {filepath}. Skipping.")
+         return False # Indicate failure/skip
+     except FileNotFoundError:
+         logging.error(f"[{model_id}] File not found: {filepath}. Skipping.")
+         return False
+     except Exception as e:
+         logging.error(f"[{model_id}] Error reading {filepath}: {e}. Skipping.")
+         return False
+
+     if not isinstance(data, dict):
+         logging.error(f"[{model_id}] Expected JSON object (dict) but got {type(data)} in {filepath}. Skipping.")
+         return False
+
+     # --- Check if explanation already exists ---
+     if EXPLANATION_KEY in data and data[EXPLANATION_KEY]: # Key exists AND has non-empty content
+         logging.info(f"[{model_id}] Explanation already exists. Skipping generation.")
+         return False # Indicate no update was needed
+
+     # --- Deletion logic REMOVED ---
+     # if EXPLANATION_KEY in data: ...
+
+     # --- Generation logic ---
+     description = data.get(DESCRIPTION_KEY)
+     if not description:
+         logging.warning(f"[{model_id}] Description field is missing or empty. Cannot generate explanation.")
+         return False # Cannot generate, so no update possible
+
+     explanation = generate_explanation(model_id, description) # Try to generate a new one
+
+     # --- Update and write logic ---
+     if explanation: # Only update if generation was successful
+         data[EXPLANATION_KEY] = explanation
+         try:
+             with open(filepath, 'w', encoding='utf-8') as f:
+                 json.dump(data, f, ensure_ascii=False, indent=4)
+             logging.info(f"[{model_id}] Successfully generated and updated {filepath} with new explanation.")
+             return True # Indicate success/update
+         except IOError as e:
+             logging.error(f"[{model_id}] Error writing updated data to {filepath}: {e}")
+             return False
+         except Exception as e:
+             logging.error(f"[{model_id}] Unexpected error writing {filepath}: {e}")
+             return False
+     else: # Explanation generation failed
+         logging.warning(f"[{model_id}] Failed to generate new explanation for {filepath} via API. File not updated.")
+         return False # Indicate failure/no update
+
+
+ def main():
+     """Main function to iterate through the directory and process files."""
+     if not configure_llm_client():
+         return # Stop if the API key is not configured
+
+     if not os.path.isdir(MODEL_DATA_DIR):
+         logging.error(f"Directory not found: {MODEL_DATA_DIR}")
+         return
+
+     logging.info(f"Starting processing directory: {MODEL_DATA_DIR}")
+     processed_files = 0
+     updated_files = 0 # Count files actually updated
+     skipped_existing = 0 # Count files skipped because explanation existed
+     skipped_error = 0 # Count files skipped due to read/write/API errors or no description
+
+     all_files = [f for f in os.listdir(MODEL_DATA_DIR) if f.lower().endswith(".json")]
+     total_files = len(all_files)
+     logging.info(f"Found {total_files} JSON files to process.")
+
+     for i, filename in enumerate(all_files):
+         filepath = os.path.join(MODEL_DATA_DIR, filename)
+         logging.info(f"--- Processing file {i+1}/{total_files}: {filename} ---")
+         try:
+             # process_json_file returns True if updated, False otherwise
+             updated = process_json_file(filepath)
+             processed_files += 1
+             if updated:
+                 updated_files += 1
+             else:
+                 # A boolean return cannot distinguish "skipped existing" from an error,
+                 # and re-reading the file to find out would be inefficient; the logs in
+                 # process_json_file record the reason. Status codes would allow exact counts.
+                 pass # Logging within the function indicates the reason (skipped existing, API failure, etc.)
+
+         except Exception as e:
+             logging.error(f"Unexpected error processing file loop for {filename}: {e}")
+             skipped_error += 1 # Count generic loop errors
+         # Add a small delay between files to avoid hitting API rate limits
+         # (tune for the DeepSeek quota; 0.2s allows roughly five requests per second)
+         time.sleep(0.2)
+
+
+     logging.info("--- Processing complete ---")
+     logging.info(f"Total JSON files found: {total_files}")
+     logging.info(f"Files processed (attempted): {processed_files}")
+     logging.info(f"Files successfully updated with new explanation: {updated_files}")
+     # skipped_existing vs skipped_error cannot be counted precisely without richer return values
+     # logging.info(f"Files skipped (existing explanation, errors, or no description): {total_files - updated_files}")
+
+
+ if __name__ == "__main__":
+     main()
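
For reference, the transformation this script applies to each file in `model_data_json/` looks like the following sketch; the field values are illustrative, only the key names come from the code above:

```python
# Hypothetical contents of model_data_json/bert-base-uncased.json before the run:
record = {
    "model_id": "bert-base-uncased",
    "downloads": 1000000,
    "tags": ["fill-mask", "en"],
    "description": "BERT base model (uncased), pretrained on English text...",
}

# After a successful run the script adds one field under EXPLANATION_KEY
# ("model_explanation_gemini"); existing fields are left untouched:
record["model_explanation_gemini"] = (
    "Provides a pretrained bidirectional transformer for masked-token "
    "prediction and English fine-tuning tasks."  # illustrative API output
)
```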
app.py CHANGED
@@ -5,8 +5,26 @@ from flask_cors import CORS
 import numpy as np
 import json
 import traceback
+ import logging # Added for background task logging
+ import threading # Added for background task
+ import time # Added for background task
+ import schedule # Added for background task
+
+ # --- Import the daily update function ---
+ try:
+     from daily_update import main as run_daily_update
+     # Set up logging for the daily_update module if it uses logging
+     # logging.getLogger('daily_update').setLevel(logging.INFO) # Example
+ except ImportError:
+     logging.error("Failed to import daily_update.py. The daily update task will not run.")
+     run_daily_update = None # Define as None if the import fails
+ # ---

 app = Flask(__name__) # Create app object FIRST
+
+ # Configure Flask app logging (optional but recommended)
+ # app.logger.setLevel(logging.INFO)
+
 # Allow requests from the Vercel frontend and localhost for development
 CORS(app, origins=["http://127.0.0.1:3000", "http://localhost:3000", "https://rag-huggingface.vercel.app"], supports_credentials=True)

@@ -94,6 +112,70 @@ def load_resources():
 load_resources()
 # ---

+ # --- Background Update Task ---
+
+ UPDATE_INTERVAL_HOURS = 24 # Check every 24 hours
+ UPDATE_TIME = "02:00" # Time to run the update (24-hour format)
+
+ def run_update_task():
+     """Wrapper function to run the daily update and handle errors."""
+     if run_daily_update is None:
+         logging.warning("run_daily_update function not available (import failed). Skipping task.")
+         return
+
+     logging.info(f"Background task: Starting daily update check (scheduled for {UPDATE_TIME})...")
+     try:
+         # Make sure the DEEPSEEK_API_KEY is set before running
+         if not os.getenv("DEEPSEEK_API_KEY"):
+             logging.error("Background task: DEEPSEEK_API_KEY not set. Daily update cannot run.")
+             return # Don't run if the key is missing
+
+         run_daily_update() # Call the main function from daily_update.py
+         logging.info("Background task: Daily update process finished.")
+     except Exception as e:
+         logging.error(f"Background task: Error during daily update execution: {e}")
+         logging.error(traceback.format_exc())
+
+ def background_scheduler():
+     """Runs the scheduler loop in a background thread."""
+     logging.info(f"Background scheduler started. Will run update task daily around {UPDATE_TIME}.")
+
+     if run_daily_update is None:
+         logging.error("Background scheduler: daily_update.py could not be imported. Scheduler will not run tasks.")
+         return # Stop the thread if the core function isn't available
+
+     # Schedule the job
+     # schedule.every(UPDATE_INTERVAL_HOURS).hours.do(run_update_task) # Alternative: run every X hours
+     schedule.every().day.at(UPDATE_TIME).do(run_update_task)
+     logging.info(f"Scheduled daily update task for {UPDATE_TIME}.")
+
+     # Run once immediately on startup? (Optional)
+     # logging.info("Running initial update task on startup...")
+     # run_update_task()
+     # logging.info("Initial update task finished.")
+
+     while True:
+         schedule.run_pending()
+         time.sleep(60) # Check every 60 seconds whether a task is due
+
+ # Start the background scheduler thread only if this is the main process.
+ # This check helps prevent duplicate schedulers when using workers (like Gunicorn).
+ # Note: this might not be perfectly reliable with all WSGI servers/configs.
+ # Consider a more robust method for ensuring single execution if needed (e.g., a file lock or an external process manager).
+ if os.environ.get("WERKZEUG_RUN_MAIN") == "true" or os.environ.get("FLASK_ENV") != "development":
+     # Start only in the main Werkzeug process OR if not in Flask development mode (like production with Gunicorn)
+     # Check that the function is available before starting the thread
+     if run_daily_update is not None:
+         scheduler_thread = threading.Thread(target=background_scheduler, daemon=True)
+         scheduler_thread.start()
+         logging.info("Background scheduler thread started.")
+     else:
+         logging.warning("Background scheduler thread NOT started because daily_update.py failed to import.")
+ else:
+     logging.info("Skipping background scheduler start in Werkzeug reloader process.")
+
+ # --- End Background Update Task ---
+
 @app.route('/search', methods=['POST'])
 def search():
     """Handles search requests, embedding the query and searching the FAISS index."""
build_index.py ADDED
@@ -0,0 +1,144 @@
+ import os
+ os.environ['OMP_NUM_THREADS'] = '1' # Limit OpenMP threads, which might help prevent crashes
+ import faiss
+ from sentence_transformers import SentenceTransformer
+ import numpy as np
+ import pickle
+ import json # Import json module
+ from tqdm import tqdm
+
+ # --- Configuration ---
+ MODEL_DATA_DIR = "model_data_json" # Path to downloaded JSON data
+ INDEX_FILE = "index.faiss"
+ MAP_FILE = "index_to_metadata.pkl" # Changed filename to reflect content
+ EMBEDDING_MODEL = 'all-mpnet-base-v2' # Efficient and good-quality model
+ ENCODE_BATCH_SIZE = 32 # Process descriptions in smaller batches
+ # Tags to exclude from indexing text
+ COMMON_EXCLUDED_TAGS = {'transformers'} # Add other common tags if needed
+ EXCLUDED_TAG_PREFIXES = ('arxiv:', 'base_model:', 'dataset:', 'diffusers:', 'license:') # Add other prefixes if needed
+ MODEL_EXPLANATION_KEY = "model_explanation_gemini" # Key for the new explanation field
+ # ---
+
+ def load_model_data(directory):
+     """Loads model data, filters tags (by length, common words, prefixes), and combines relevant info for indexing."""
+     all_texts = [] # Store combined text (model_id + description + filtered_tags)
+     all_metadata = [] # Store dicts: {'model_id': ..., 'tags': ..., 'downloads': ...}
+     print(f"Loading model data from JSON files in: {directory}")
+     if not os.path.isdir(directory):
+         print(f"Error: Directory not found: {directory}")
+         return [], []
+
+     filenames = [f for f in os.listdir(directory) if f.endswith(".json")] # Look for .json files
+     for filename in tqdm(filenames, desc="Reading JSON files"):
+         filepath = os.path.join(directory, filename)
+         try:
+             with open(filepath, 'r', encoding='utf-8') as f:
+                 data = json.load(f)
+             # Ensure required fields exist
+             if 'description' in data and 'model_id' in data:
+                 description = data['description']
+                 model_id = data['model_id'] # Get model_id
+                 if description: # Only index if description is not empty
+                     original_tags = data.get('tags', [])
+                     # Filter tags: remove short tags, common tags, and tags with specific prefixes
+                     filtered_tags = [
+                         tag for tag in original_tags
+                         if (
+                             isinstance(tag, str) and # Ensure tag is a string
+                             len(tag) > 3 and
+                             tag.lower() not in COMMON_EXCLUDED_TAGS and
+                             not tag.lower().startswith(EXCLUDED_TAG_PREFIXES) # Check for prefixes
+                         )
+                     ]
+                     tag_string = " ".join(filtered_tags)
+                     explanation = data.get(MODEL_EXPLANATION_KEY) # Get the new explanation
+
+                     # --- Construct combined text with priority weighting ---
+                     text_parts = []
+                     # 1. Add explanation (repeated for emphasis) if available
+                     if explanation and isinstance(explanation, str):
+                         text_parts.append(f"Summary: {explanation}")
+                         text_parts.append(f"Summary: {explanation}") # Repeat for higher weight
+                     # 2. Add model name
+                     text_parts.append(f"Model: {model_id}")
+                     # 3. Add filtered tags if available
+                     if tag_string:
+                         text_parts.append(f"Tags: {tag_string}")
+                     # 4. Add original description
+                     text_parts.append(f"Description: {description}")
+
+                     combined_text = " ".join(text_parts).strip() # Join all parts
+                     # --- End construction ---
+
+                     all_texts.append(combined_text)
+                     # Add explanation to metadata as well for potential display
+                     metadata_entry = {
+                         "model_id": model_id,
+                         "tags": original_tags, # Keep ORIGINAL tags in metadata
+                         "downloads": data.get('downloads', 0)
+                     }
+                     if explanation and isinstance(explanation, str):
+                         metadata_entry[MODEL_EXPLANATION_KEY] = explanation
+                     all_metadata.append(metadata_entry)
+             else:
+                 print(f"Warning: Skipping {filename}, missing 'description' or 'model_id' key.")
+         except json.JSONDecodeError:
+             print(f"Warning: Skipping {filename}, invalid JSON.")
+         except Exception as e:
+             print(f"Warning: Could not read or process {filename}: {e}")
+
+     print(f"Loaded data for {len(all_texts)} models with valid descriptions after tag filtering.")
+     return all_texts, all_metadata
+
+ def build_and_save_index(texts_to_index, metadata_list):
+     """Builds and saves the FAISS index and metadata mapping based on combined text."""
+     if not texts_to_index:
+         print("No text data to index.")
+         return
+
+     print(f"Loading sentence transformer model: {EMBEDDING_MODEL}")
+     # Consider adding device='mps' on Apple Silicon if PyTorch supports it well enough,
+     # but start with CPU for stability.
+     model = SentenceTransformer(EMBEDDING_MODEL)
+
+     print(f"Generating embeddings for combined text in batches of {ENCODE_BATCH_SIZE}...")
+     all_embeddings = []
+     for i in tqdm(range(0, len(texts_to_index), ENCODE_BATCH_SIZE), desc="Encoding batches"):
+         batch = texts_to_index[i:i+ENCODE_BATCH_SIZE]
+         batch_embeddings = model.encode(batch, convert_to_numpy=True)
+         all_embeddings.append(batch_embeddings)
+
+     if not all_embeddings:
+         print("No embeddings generated. Cannot build index.")
+         return
+
+     embeddings = np.vstack(all_embeddings) # Combine embeddings from all batches
+
+     # Ensure embeddings are float32 for FAISS
+     embeddings = embeddings.astype('float32')
+
+     # Build FAISS index
+     print("Building FAISS index...")
+     dimension = embeddings.shape[1]
+     index = faiss.IndexFlatL2(dimension) # Using simple L2 distance
+     index.add(embeddings)
+     print(f"FAISS index built with {index.ntotal} vectors.")
+
+     # Save the index
+     faiss.write_index(index, INDEX_FILE)
+     print(f"FAISS index saved to: {INDEX_FILE}")
+
+     # Create mapping from index position to metadata dictionary
+     index_to_metadata = {i: metadata for i, metadata in enumerate(metadata_list)}
+     with open(MAP_FILE, 'wb') as f:
+         pickle.dump(index_to_metadata, f)
+     print(f"Index-to-Metadata mapping saved to: {MAP_FILE}")
+
+ def main():
+     """Loads model data, then builds and saves the index; importable by daily_update.py."""
+     combined_texts, metadata_list = load_model_data(MODEL_DATA_DIR)
+     build_and_save_index(combined_texts, metadata_list)
+     print("\nIndex building complete.")
+
+ if __name__ == "__main__":
+     main()
daily_update.py ADDED
@@ -0,0 +1,107 @@
+ import logging
+ import sys
+ import traceback
+
+ # Configure basic logging for the orchestration script
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ def run_step(step_func, step_name):
+     """Runs a step and logs its success or failure."""
+     logging.info(f"--- Starting step: {step_name} ---")
+     try:
+         step_func()
+         logging.info(f"--- Finished step: {step_name} successfully ---")
+         return True
+     except Exception as e:
+         logging.error(f"--- Step failed: {step_name} ---")
+         logging.error(f"Error: {e}")
+         # Log the full traceback for detailed debugging
+         logging.error(traceback.format_exc())
+         return False
+
+ def main():
+     """Runs the daily update sequence."""
+     logging.info("=== Starting Daily Model Update Process ===")
+
+     all_steps_succeeded = True
+
+     # --- Step 1: Fetch new/updated model descriptions ---
+     try:
+         # Import the script's main function dynamically
+         from huggingface_model_descriptions import main as fetch_models_main
+         if not run_step(fetch_models_main, "Fetch Hugging Face Models"):
+             all_steps_succeeded = False
+             # The index could arguably still be rebuilt from existing data,
+             # but for now, stop if the first step fails.
+             logging.error("Stopping update process for this cycle due to failure in fetching models.")
+             return # Exit the main function for this cycle
+     except ImportError:
+         logging.error("Failed to import huggingface_model_descriptions.py. Ensure it's in the same directory or Python path.")
+         all_steps_succeeded = False
+         return # Exit the main function for this cycle
+     except Exception as e: # Catch any unexpected error during import/setup
+         logging.error(f"Unexpected error setting up model fetching step: {e}")
+         logging.error(traceback.format_exc())
+         all_steps_succeeded = False
+         return # Exit the main function for this cycle
+
+
+     # --- Step 2: Add explanations using DeepSeek ---
+     # Only proceed if the previous step was successful
+     if all_steps_succeeded:
+         try:
+             from add_model_explanations import main as add_explanations_main
+             # Check for the API key *before* running the step
+             import os
+             if not os.getenv("DEEPSEEK_API_KEY"):
+                 logging.warning("DEEPSEEK_API_KEY environment variable not set. Explanation step will fail or do nothing.")
+                 # Optionally, skip this step entirely if the key is missing:
+                 # logging.warning("Skipping explanation generation step.")
+                 # pass # Move to the next step
+
+             if not run_step(add_explanations_main, "Generate Model Explanations (DeepSeek)"):
+                 all_steps_succeeded = False
+                 # Index building may still proceed if explanations fail
+                 logging.warning("Explanation generation failed. Index will be built with potentially missing explanations.")
+                 # We will continue to the next step in this case
+
+         except ImportError:
+             logging.error("Failed to import add_model_explanations.py. Ensure it's in the same directory or Python path.")
+             all_steps_succeeded = False
+             # Stop if the explanation script is missing
+             return # Exit the main function for this cycle
+         except Exception as e: # Catch any unexpected error during import/setup
+             logging.error(f"Unexpected error setting up explanation generation step: {e}")
+             logging.error(traceback.format_exc())
+             all_steps_succeeded = False
+             return # Exit the main function for this cycle
+
+     # --- Step 3: Rebuild the search index ---
+     # Only proceed if fetching models (Step 1) succeeded. Allow proceeding if Step 2 failed.
+     if 'fetch_models_main' in locals() or 'fetch_models_main' in globals(): # Check that Step 1 setup occurred
+         try:
+             from build_index import main as build_index_main
+             if not run_step(build_index_main, "Build Search Index (FAISS)"):
+                 all_steps_succeeded = False
+                 logging.error("Index building failed. The search index may be outdated or corrupted.")
+                 # Stop if index building fails
+                 return # Exit the main function for this cycle
+         except ImportError:
+             logging.error("Failed to import build_index.py. Ensure it's in the same directory or Python path.")
+             all_steps_succeeded = False
+             return # Exit the main function for this cycle
+         except Exception as e: # Catch any unexpected error during import/setup
+             logging.error(f"Unexpected error setting up index building step: {e}")
+             logging.error(traceback.format_exc())
+             all_steps_succeeded = False
+             return # Exit the main function for this cycle
+
+
+     logging.info("===========================================")
+     if all_steps_succeeded:
+         logging.info("=== Daily Model Update Process Completed Successfully ===")
+     else:
+         logging.error("=== Daily Model Update Process Completed with Errors ===")
+
+ if __name__ == "__main__":
+     main()
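
The `run_step` contract is the load-bearing piece here: any zero-argument callable can be wrapped, and exceptions are logged rather than propagated. A quick illustration with a deliberately failing step (the step function is made up):

```python
from daily_update import run_step

def flaky_step():
    raise RuntimeError("simulated failure")

ok = run_step(flaky_step, "Flaky Step")  # logs the error and full traceback
assert ok is False                       # the exception never propagates
```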
huggingface_model_descriptions.py ADDED
@@ -0,0 +1,253 @@
+ import os
+ import requests
+ from tqdm import tqdm
+ import time
+ import re
+ import json
+ from huggingface_hub import HfApi, hf_hub_download
+ from huggingface_hub.utils import RepositoryNotFoundError, EntryNotFoundError, HFValidationError
+ from requests.exceptions import RequestException
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ import pickle # Add pickle for caching
+
+ # Create a directory to store JSON data
+ OUTPUT_DIR = "model_data_json"
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+ # Number of worker threads for parallel processing - REDUCED
+ NUM_WORKERS = 4
+
+ # Add a delay between download attempts across threads
+ DOWNLOAD_DELAY_SECONDS = 0.2 # Adjust as needed
+
+ # --- README Cleaning ---
+ def clean_readme_content(text):
+     """Basic cleaning of README markdown: remove code blocks, links."""
+     if not text:
+         return ""
+
+     # Remove fenced code blocks (``` ... ```)
+     text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
+     # Remove inline code (`...`)
+     text = re.sub(r'`[^`]+`', '', text)
+     # Remove markdown links ([text](url))
+     text = re.sub(r'\[([^]]+)\]\([^)]+\)', r'\1', text) # Keep link text
+     # Remove standalone URLs (simple version)
+     text = re.sub(r'https?://\S+', '', text)
+     # Remove markdown images (![alt](url))
+     text = re.sub(r'!\[[^]]*\]\([^)]+\)', '', text)
+     # Replace multiple newlines/spaces with single ones
+     text = ' '.join(text.split())
+     return text
+ # ---
+
+ MODELS_CACHE_FILE = "models_list_cache.pkl" # File to cache the raw model list
+
+ def get_all_models_with_downloads(min_downloads=10000):
+     """Fetch all models from Hugging Face with at least min_downloads, using a local cache for the list."""
+     models_list = None
+
+     # 1. Check for cache
+     if os.path.exists(MODELS_CACHE_FILE):
+         try:
+             print(f"Loading cached model list from {MODELS_CACHE_FILE}...")
+             with open(MODELS_CACHE_FILE, 'rb') as f:
+                 models_list = pickle.load(f)
+             print(f"Loaded {len(models_list)} models from cache.")
+         except Exception as e:
+             print(f"Error loading cache file {MODELS_CACHE_FILE}: {e}. Fetching from API.")
+             models_list = None # Ensure fetching if cache loading fails
+
+     # 2. Fetch from the API if the cache doesn't exist or failed to load
+     if models_list is None:
+         print(f"Fetching all models with at least {min_downloads} downloads from API...")
+         try:
+             print("Initializing HfApi...")
+             api = HfApi()
+             print("HfApi initialized. Calling list_models...")
+             # Fetch the iterator
+             models_iterator = api.list_models(sort="downloads", direction=-1, fetch_config=False, cardData=True)
+             print("list_models call returned. Converting iterator to list...")
+             # Convert the iterator to a list to allow caching
+             models_list = list(models_iterator)
+             print(f"Converted to list with {len(models_list)} models.")
+
+             # Save to cache
+             try:
+                 print(f"Saving model list to cache file: {MODELS_CACHE_FILE}...")
+                 with open(MODELS_CACHE_FILE, 'wb') as f:
+                     pickle.dump(models_list, f)
+                 print("Model list saved to cache.")
+             except Exception as e:
+                 print(f"Error saving cache file {MODELS_CACHE_FILE}: {e}")
+
+         except Exception as e:
+             print(f"Error during HfApi initialization or list_models call: {e}")
+             return [] # Return empty list on error
+
+     # 3. Filter the loaded/fetched list
+     if not models_list:
+         print("Model list is empty after fetching/loading.")
+         return []
+
+     qualifying_models = []
+     print(f"Filtering {len(models_list)} models by download count...")
+     for model in models_list: # Iterate through the list (from cache or API)
+         # This loop is fast, so no per-model progress output is needed
+         if not hasattr(model, 'downloads') or model.downloads is None:
+             continue
+
+         if model.downloads < min_downloads:
+             # Since the list is sorted by downloads, we can stop here
+             break
+
+         qualifying_models.append(model)
+
+     print(f"Found {len(qualifying_models)} models with at least {min_downloads} downloads")
+     return qualifying_models
+
+ def get_model_readme(model_id):
+     """Get README.md content for a specific model using hf_hub_download. Returns None if not found or inaccessible."""
+     filenames_to_try = ["README.md", "readme.md"]
+     branches_to_try = ["main", "master"]
+
+     for branch in branches_to_try:
+         for filename in filenames_to_try:
+             try:
+                 # print(f"Attempting download: repo={model_id}, branch={branch}, file={filename}") # Debug
+                 # Use hf_hub_download, which uses the stored token
+                 readme_path = hf_hub_download(
+                     repo_id=model_id,
+                     filename=filename,
+                     revision=branch,
+                     repo_type="model",
+                     local_files_only=False, # Ensure it tries to download
+                     # token=True # Often not needed if logged in via CLI, but can be explicit
+                 )
+
+                 # If the download succeeded, read the content
+                 # print(f"Successfully downloaded {filename} from {branch} to {readme_path}") # Debug
+                 with open(readme_path, 'r', encoding='utf-8') as f:
+                     content = f.read()
+                 return content
+
+             except RepositoryNotFoundError:
+                 print(f"Repository {model_id} not found.")
+                 return None # If the repo doesn't exist, no point trying other files/branches
+             except EntryNotFoundError:
+                 # print(f"{filename} not found in branch {branch} for {model_id}. Trying next...") # Debug
+                 continue # File not found in this specific branch/filename combination, try next
+             except HFValidationError as e: # Catch invalid repo ID or filename errors
+                 print(f"Validation error for {model_id} (branch: {branch}, file: {filename}): {e}")
+                 continue # Try next filename/branch
+             except Exception as e: # Catch other potential errors (like 401 HfHubHTTPError, network issues)
+                 print(f"Error downloading {filename} from branch {branch} for {model_id}: {e}")
+                 # Check if it's a likely authentication error (401/403)
+                 if "401" in str(e) or "403" in str(e):
+                     print(f"Authentication error (401/403) for {model_id}. Ensure you are logged in and accepted terms.")
+                     return None # Don't try other files/branches if auth failed
+                 # For other errors, continue to the next filename/branch attempt
+                 continue
+
+     # If all attempts failed
+     print(f"Could not fetch README for {model_id} from any standard location.")
+     return None
+
+ def get_filename_for_model(model_id):
+     """Generate the JSON filename for a model."""
+     safe_id = model_id.replace("/", "_")
+     return os.path.join(OUTPUT_DIR, f"{safe_id}.json") # Change extension to .json
+
+ def save_model_data(model_id, data):
+     """Save model data (description, tags, downloads) to a JSON file."""
+     filename = get_filename_for_model(model_id)
+     try:
+         with open(filename, "w", encoding="utf-8") as f:
+             json.dump(data, f, ensure_ascii=False, indent=4)
+         return filename
+     except Exception as e:
+         print(f"Error saving JSON for {model_id} to {filename}: {e}")
+         return None
+
+ def file_exists_for_model(model_id):
+     """Check if a JSON file already exists for this model."""
+     filename = get_filename_for_model(model_id)
+     return os.path.exists(filename)
+
+ def process_model(model):
+     """Process a single model - fetch README, clean it, save as JSON."""
+     model_id = model.modelId
+     downloads = model.downloads
+     tags = getattr(model, 'tags', []) # Get tags if available
+
+     # Check if a JSON file already exists
+     if file_exists_for_model(model_id):
+         return (model_id, downloads, None, "skipped")
+
+     # --- Add delay before the download attempt ---
+     time.sleep(DOWNLOAD_DELAY_SECONDS)
+     # ---------------------------------------------
+
+     # Get model README content
+     readme_content = get_model_readme(model_id)
+
+     # If the README is not available, skip saving this model
+     if readme_content is None:
+         return (model_id, downloads, None, "no_readme")
+
+     # Clean the README
+     cleaned_readme = clean_readme_content(readme_content)
+
+     # Prepare the data payload
+     model_data = {
+         "model_id": model_id,
+         "downloads": downloads,
+         "tags": tags,
+         "description": cleaned_readme
+     }
+
+     # Save data as JSON
+     filename = save_model_data(model_id, model_data)
+     if filename:
+         return (model_id, downloads, filename, "downloaded")
+     else:
+         return (model_id, downloads, None, "save_failed")
+
+ def main():
+     qualifying_models = get_all_models_with_downloads(min_downloads=10000)
+     if not qualifying_models:
+         print("No qualifying models found")
+         return
+
+     print(f"Processing {len(qualifying_models)} models, saving to '{OUTPUT_DIR}'...")
+     downloaded = 0
+     skipped = 0
+     no_readme = 0
+     failed = 0
+
+     with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
+         future_to_model = {executor.submit(process_model, model): model for model in qualifying_models}
+
+         for future in tqdm(as_completed(future_to_model), total=len(qualifying_models)):
+             try:
+                 model_id, downloads, filename, status = future.result()
+                 if status == "downloaded":
+                     # Don't print every success to avoid clutter
+                     # print(f"Saved data for {model_id} ({downloads} downloads) to {filename}")
+                     downloaded += 1
+                 elif status == "skipped":
+                     skipped += 1
+                 elif status == "no_readme":
+                     no_readme += 1
+                 else: # save_failed or other errors
+                     failed += 1
+             except Exception as e:
+                 # Extract model_id for better error reporting if possible
+                 processed_model = future_to_model[future]
+                 print(f"Error processing model {getattr(processed_model, 'modelId', 'unknown')}: {e}")
+                 failed += 1
+
+     print(f"\nCompleted! Downloaded: {downloaded}, Skipped existing: {skipped}, No README found: {no_readme}, Failed: {failed}")
+
+ if __name__ == "__main__":
+     main()
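
The regex-based cleaning in `clean_readme_content` is easiest to see on a tiny input. A quick sanity check (the sample README is invented; the output comment is approximate, not byte-exact):

```python
from huggingface_model_descriptions import clean_readme_content

fence = "`" * 3  # avoids embedding a literal code fence in this example
raw = (
    "# My Model\n"
    "See [the paper](https://example.com/paper) for details.\n"
    f"{fence}python\nprint('hello')\n{fence}\n"
    "Use `pipeline()` to load it."
)

print(clean_readme_content(raw))
# approximately: "# My Model See the paper for details. Use to load it."
```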
requirements.txt CHANGED
@@ -4,4 +4,6 @@ sentence-transformers>=2.3.0
 numpy>=1.20.0
 faiss-cpu>=1.7.0 # Use faiss-gpu if you need GPU support on HF Spaces
 huggingface-hub>=0.15.1 # Version compatible with sentence-transformers >= 2.3.0
- gunicorn # Added for deployment on Hugging Face Spaces
+ gunicorn # Added for deployment on Hugging Face Spaces
+ openai>=1.0.0 # Added back for DeepSeek API via OpenAI client
+ schedule>=1.0.0 # Added for in-app scheduling