Upload 10 files
- add_model_explanations.py +253 -0
- app.py +82 -0
- build_index.py +140 -0
- daily_update.py +107 -0
- huggingface_model_descriptions.py +253 -0
- requirements.txt +3 -1
add_model_explanations.py
ADDED
@@ -0,0 +1,253 @@
import os
import json
from typing import Dict, Any, Optional
import logging
import time
# import google.generativeai as genai  # Remove Gemini import
from openai import OpenAI, APIError  # Add back OpenAI imports

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

MODEL_DATA_DIR = "model_data_json"
EXPLANATION_KEY = "model_explanation_gemini"
DESCRIPTION_KEY = "description"
MAX_RETRIES = 3          # Retries for API calls
RETRY_DELAY_SECONDS = 5  # Delay between retries

# --- DeepSeek API Configuration (Restored) ---
DEEPSEEK_API_KEY_ENV_VAR = "DEEPSEEK_API_KEY"  # Environment variable for the key
DEEPSEEK_BASE_URL = "https://api.deepseek.com"
DEEPSEEK_MODEL_NAME = "deepseek-chat"
# ---

# Remove Gemini configuration
# GEMINI_API_KEY_ENV_VAR = "GEMINI_API_KEY"
# GEMINI_MODEL_NAME = "gemini-1.5-flash-latest"

# Global client variable for the DeepSeek/OpenAI client
client: Optional[OpenAI] = None  # Use OpenAI client type
# gemini_model: Optional[genai.GenerativeModel] = None  # Remove Gemini model variable

def configure_llm_client():
    """Configures the OpenAI client for the DeepSeek API using the API key from environment variables."""
    global client
    # global gemini_model  # Remove
    api_key = os.getenv(DEEPSEEK_API_KEY_ENV_VAR)  # Use DeepSeek env var
    if not api_key:
        logging.error(f"Error: {DEEPSEEK_API_KEY_ENV_VAR} environment variable not set.")
        logging.error("Please set the environment variable with your DeepSeek API key before running the script.")
        return False
    try:
        # Configure OpenAI client for DeepSeek
        client = OpenAI(api_key=api_key, base_url=DEEPSEEK_BASE_URL)
        logging.info(f"DeepSeek API client configured successfully for model: {DEEPSEEK_MODEL_NAME}.")
        return True
    except Exception as e:
        logging.error(f"Failed to configure DeepSeek API client: {e}")
        client = None
        return False

# --- End DeepSeek API Configuration ---

def generate_explanation(model_id: str, description: str) -> Optional[str]:
    """
    Generates a short English explanation for the model based on its description
    by calling the DeepSeek API via the OpenAI library.

    Args:
        model_id: The ID of the model (for context).
        description: The model description text.

    Returns:
        A short English explanation string from DeepSeek, or None if generation fails.
    """
    global client  # Use OpenAI client
    # global gemini_model  # Remove
    if not client:
        logging.error(f"[{model_id}] DeepSeek client not configured. Cannot generate explanation.")
        return None

    if not description or not isinstance(description, str):
        logging.warning(f"[{model_id}] Description is empty or not a string. Skipping explanation generation.")
        return None

    # Truncate very long descriptions (adjust the limit if needed for DeepSeek)
    max_desc_length = 4000
    if len(description) > max_desc_length:
        logging.warning(f"[{model_id}] Description truncated to {max_desc_length} chars for API call.")
        description = description[:max_desc_length] + "... [truncated]"

    # Construct the messages for the DeepSeek API (restore original format)
    messages = [
        {"role": "system", "content": "You are an AI assistant tasked with summarizing Hugging Face model descriptions concisely."},
        {"role": "user", "content": (
            f"Analyze the following description for the Hugging Face model '{model_id}'. "
            f"Based **only** on this description, provide a concise, one-sentence explanation in English "
            f"summarizing what this model does and its primary purpose or task. "
            f"Focus on the core functionality mentioned. Avoid adding introductory phrases like 'This model is...' or 'The model...'."
            f"\n\n---\nModel Description:\n{description}\n---\n\nConcise Explanation:"
        )}
    ]

    # Remove Gemini prompt construction
    # prompt = (...)

    retries = 0
    while retries < MAX_RETRIES:
        try:
            logging.info(f"[{model_id}] Calling DeepSeek API (Attempt {retries + 1}/{MAX_RETRIES})...")
            # Use OpenAI client call format
            response = client.chat.completions.create(
                model=DEEPSEEK_MODEL_NAME,
                messages=messages,
                stream=False,
                max_tokens=100,   # Limit response length
                temperature=0.2   # Lower temperature for a more focused summary
            )

            # Remove Gemini response handling
            # if not response.candidates: ...

            explanation = response.choices[0].message.content.strip()  # Get explanation from OpenAI response structure
            logging.info(f"[{model_id}] Explanation received from DeepSeek: '{explanation}'")

            # Basic post-processing: remove surrounding quotes
            if explanation.startswith('"') and explanation.endswith('"'):
                explanation = explanation[1:-1]
            # Remove Gemini-specific post-processing
            # explanation = explanation.replace('**', '')
            return explanation

        # Restore specific APIError catch for the OpenAI client
        except APIError as e:
            retries += 1
            logging.error(f"[{model_id}] DeepSeek API Error (Attempt {retries}/{MAX_RETRIES}): {e}")
            if retries < MAX_RETRIES:
                logging.info(f"Retrying in {RETRY_DELAY_SECONDS} seconds...")
                time.sleep(RETRY_DELAY_SECONDS)
            else:
                logging.error(f"[{model_id}] Max retries reached. Failed to generate explanation via DeepSeek.")
                return None
        # Keep general Exception catch
        except Exception as e:
            retries += 1  # Consider retrying general errors too, or handle them differently
            logging.error(f"[{model_id}] Unexpected error during API call (Attempt {retries}/{MAX_RETRIES}): {e}")
            if retries < MAX_RETRIES:
                logging.info(f"Retrying in {RETRY_DELAY_SECONDS} seconds...")
                time.sleep(RETRY_DELAY_SECONDS)
            else:
                logging.error(f"[{model_id}] Max retries reached. Failed to generate explanation due to unexpected errors.")
                return None

    return None  # Should not be reached if the loop returns earlier

def process_json_file(filepath: str):
    """Reads, updates (only if the explanation is missing), and writes a single JSON file."""
    model_id = os.path.basename(filepath).replace('.json', '')
    logging.info(f"Processing {filepath}...")

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError:
        logging.error(f"[{model_id}] Invalid JSON format in {filepath}. Skipping.")
        return False  # Indicate failure/skip
    except FileNotFoundError:
        logging.error(f"[{model_id}] File not found: {filepath}. Skipping.")
        return False
    except Exception as e:
        logging.error(f"[{model_id}] Error reading {filepath}: {e}. Skipping.")
        return False

    if not isinstance(data, dict):
        logging.error(f"[{model_id}] Expected JSON object (dict) but got {type(data)} in {filepath}. Skipping.")
        return False

    # --- Check if explanation already exists ---
    if EXPLANATION_KEY in data and data[EXPLANATION_KEY]:  # Check if key exists AND has non-empty content
        logging.info(f"[{model_id}] Explanation already exists. Skipping generation.")
        return False  # Indicate no update was needed

    # --- Deletion logic REMOVED ---
    # if EXPLANATION_KEY in data: ...

    # --- Generation logic ---
    description = data.get(DESCRIPTION_KEY)
    if not description:
        logging.warning(f"[{model_id}] Description field is missing or empty. Cannot generate explanation.")
        return False  # Cannot generate, so no update possible

    explanation = generate_explanation(model_id, description)  # Try to generate a new one

    # --- Update and write logic ---
    if explanation:  # Only update if generation was successful
        data[EXPLANATION_KEY] = explanation
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=4)
            logging.info(f"[{model_id}] Successfully generated and updated {filepath} with new explanation.")
            return True  # Indicate success/update
        except IOError as e:
            logging.error(f"[{model_id}] Error writing updated data to {filepath}: {e}")
            return False
        except Exception as e:
            logging.error(f"[{model_id}] Unexpected error writing {filepath}: {e}")
            return False
    else:  # Explanation generation failed
        logging.warning(f"[{model_id}] Failed to generate new explanation for {filepath} via API. File not updated.")
        return False  # Indicate failure/no update


def main():
    """Main function to iterate through the directory and process files."""
    if not configure_llm_client():
        return  # Stop if the API key is not configured

    if not os.path.isdir(MODEL_DATA_DIR):
        logging.error(f"Directory not found: {MODEL_DATA_DIR}")
        return

    logging.info(f"Starting processing directory: {MODEL_DATA_DIR}")
    processed_files = 0
    updated_files = 0     # Count files actually updated
    skipped_existing = 0  # Count files skipped because an explanation already existed
    skipped_error = 0     # Count files skipped due to read/write/API errors or a missing description

    all_files = [f for f in os.listdir(MODEL_DATA_DIR) if f.lower().endswith(".json")]
    total_files = len(all_files)
    logging.info(f"Found {total_files} JSON files to process.")

    for i, filename in enumerate(all_files):
        filepath = os.path.join(MODEL_DATA_DIR, filename)
        logging.info(f"--- Processing file {i+1}/{total_files}: {filename} ---")
        try:
            # process_json_file returns True if updated, False otherwise
            updated = process_json_file(filepath)
            processed_files += 1
            if updated:
                updated_files += 1
            else:
                # Differentiating why it wasn't updated would require re-reading, which is inefficient.
                # Rely on the logs from process_json_file for now.
                # A better approach would be for process_json_file to return status codes.
                pass  # Logging within the function indicates the reason (existing explanation, API failure, etc.)

        except Exception as e:
            logging.error(f"Unexpected error processing file loop for {filename}: {e}")
            skipped_error += 1  # Count generic loop errors
        # Add a small delay between files to avoid hitting rate limits
        # Adjust the delay based on DeepSeek quota/limits (might need less than 0.5s)
        time.sleep(0.2)


    logging.info(f"--- Processing complete ---")
    logging.info(f"Total JSON files found: {total_files}")
    logging.info(f"Files processed (attempted): {processed_files}")
    logging.info(f"Files successfully updated with new explanation: {updated_files}")
    # Cannot precisely count skipped_existing vs skipped_error without better return values
    # logging.info(f"Files skipped (existing explanation, errors, or no description): {total_files - updated_files}")


if __name__ == "__main__":
    main()
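For a quick smoke test of the explanation pass, the two functions above can be driven directly from a Python shell. This is only an illustrative sketch, assuming DEEPSEEK_API_KEY is exported and model_data_json/ already contains files written by huggingface_model_descriptions.py; the example filename is hypothetical, not part of this commit.

import os
from add_model_explanations import configure_llm_client, process_json_file

assert os.getenv("DEEPSEEK_API_KEY"), "export DEEPSEEK_API_KEY first"
if configure_llm_client():
    # process_json_file returns True only when a new explanation was generated and written back
    updated = process_json_file("model_data_json/google_flan-t5-base.json")  # hypothetical file
    print("updated:", updated)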
app.py
CHANGED
@@ -5,8 +5,26 @@ from flask_cors import CORS
 import numpy as np
 import json
 import traceback
+import logging    # Added for background task logging
+import threading  # Added for background task
+import time       # Added for background task
+import schedule   # Added for background task
+
+# --- Import the daily update function ---
+try:
+    from daily_update import main as run_daily_update
+    # Set up logging for the daily_update module if it uses logging
+    # logging.getLogger('daily_update').setLevel(logging.INFO)  # Example
+except ImportError:
+    logging.error("Failed to import daily_update.py. The daily update task will not run.")
+    run_daily_update = None  # Define as None if import fails
+# ---
 
 app = Flask(__name__)  # Create app object FIRST
+
+# Configure Flask app logging (optional but recommended)
+# app.logger.setLevel(logging.INFO)
+
 # Allow requests from the Vercel frontend and localhost for development
 CORS(app, origins=["http://127.0.0.1:3000", "http://localhost:3000", "https://rag-huggingface.vercel.app"], supports_credentials=True)
 
@@ -94,6 +112,70 @@ def load_resources():
 load_resources()
 # ---
 
+# --- Background Update Task ---
+
+UPDATE_INTERVAL_HOURS = 24  # Check every 24 hours
+UPDATE_TIME = "02:00"       # Time to run the update (24-hour format)
+
+def run_update_task():
+    """Wrapper function to run the daily update and handle errors."""
+    if run_daily_update is None:
+        logging.warning("run_daily_update function not available (import failed). Skipping task.")
+        return
+
+    logging.info(f"Background task: Starting daily update check (scheduled for {UPDATE_TIME})...")
+    try:
+        # Make sure the DEEPSEEK_API_KEY is set before running
+        if not os.getenv("DEEPSEEK_API_KEY"):
+            logging.error("Background task: DEEPSEEK_API_KEY not set. Daily update cannot run.")
+            return  # Don't run if the key is missing
+
+        run_daily_update()  # Call the main function from daily_update.py
+        logging.info("Background task: Daily update process finished.")
+    except Exception as e:
+        logging.error(f"Background task: Error during daily update execution: {e}")
+        logging.error(traceback.format_exc())
+
+def background_scheduler():
+    """Runs the scheduler loop in a background thread."""
+    logging.info(f"Background scheduler started. Will run update task daily around {UPDATE_TIME}.")
+
+    if run_daily_update is None:
+        logging.error("Background scheduler: daily_update.py could not be imported. Scheduler will not run tasks.")
+        return  # Stop the thread if the core function isn't available
+
+    # Schedule the job
+    # schedule.every(UPDATE_INTERVAL_HOURS).hours.do(run_update_task)  # Alternative: run every X hours
+    schedule.every().day.at(UPDATE_TIME).do(run_update_task)
+    logging.info(f"Scheduled daily update task for {UPDATE_TIME}.")
+
+    # Run once immediately on startup? (Optional)
+    # logging.info("Running initial update task on startup...")
+    # run_update_task()
+    # logging.info("Initial update task finished.")
+
+    while True:
+        schedule.run_pending()
+        time.sleep(60)  # Check every 60 seconds whether a task is due
+
+# Start the background scheduler thread only if this is the main process.
+# This check helps prevent duplicate schedulers when using workers (like Gunicorn).
+# Note: this might not be perfectly reliable with all WSGI servers/configs.
+# Consider using a more robust method for ensuring single execution if needed (e.g., file lock, external process manager).
+if os.environ.get("WERKZEUG_RUN_MAIN") == "true" or os.environ.get("FLASK_ENV") != "development":
+    # Start only in the main Werkzeug process OR when not in Flask development mode (e.g., production with Gunicorn)
+    # Check that the function is available before starting the thread
+    if run_daily_update is not None:
+        scheduler_thread = threading.Thread(target=background_scheduler, daemon=True)
+        scheduler_thread.start()
+        logging.info("Background scheduler thread started.")
+    else:
+        logging.warning("Background scheduler thread NOT started because daily_update.py failed to import.")
+else:
+    logging.info("Skipping background scheduler start in Werkzeug reloader process.")
+
+# --- End Background Update Task ---
+
 @app.route('/search', methods=['POST'])
 def search():
     """Handles search requests, embedding the query and searching the FAISS index."""
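The WERKZEUG_RUN_MAIN/FLASK_ENV check above only reduces the chance of duplicate schedulers; with multiple Gunicorn workers each process would still start its own thread. A minimal sketch of the file-lock alternative mentioned in the comment, assuming a POSIX host; the lock path and helper name are illustrative and not part of this commit.

import fcntl

SCHEDULER_LOCK_PATH = "/tmp/rag_hf_scheduler.lock"  # illustrative path

def acquire_scheduler_lock():
    """Return a locked file handle if this process should own the scheduler, else None."""
    lock_file = open(SCHEDULER_LOCK_PATH, "w")
    try:
        # Non-blocking exclusive lock: only one worker process can hold it at a time
        fcntl.flock(lock_file, fcntl.LOCK_EX | fcntl.LOCK_NB)
        return lock_file  # keep the handle referenced for the lifetime of the process
    except OSError:
        lock_file.close()
        return None  # another worker already runs the scheduler

# Usage idea: start the scheduler thread only when acquire_scheduler_lock() returns a handle.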
build_index.py
ADDED
@@ -0,0 +1,140 @@
import os
os.environ['OMP_NUM_THREADS'] = '1'  # Limit OpenMP threads, might help prevent crashes
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np
import pickle
import json  # Import json module
from tqdm import tqdm

# --- Configuration ---
MODEL_DATA_DIR = "model_data_json"  # Path to downloaded JSON data
INDEX_FILE = "index.faiss"
MAP_FILE = "index_to_metadata.pkl"  # Changed filename to reflect content
EMBEDDING_MODEL = 'all-mpnet-base-v2'  # Efficient and good quality model
ENCODE_BATCH_SIZE = 32  # Process descriptions in smaller batches
# Tags to exclude from indexing text
COMMON_EXCLUDED_TAGS = {'transformers'}  # Add other common tags if needed
EXCLUDED_TAG_PREFIXES = ('arxiv:', 'base_model:', 'dataset:', 'diffusers:', 'license:')  # Add other prefixes if needed
MODEL_EXPLANATION_KEY = "model_explanation_gemini"  # Key for the new explanation field
# ---

def load_model_data(directory):
    """Loads model data, filters tags (by length, common words, prefixes), and combines relevant info for indexing."""
    all_texts = []     # Store combined text (model_id + description + filtered_tags)
    all_metadata = []  # Store dicts: {'model_id': ..., 'tags': ..., 'downloads': ...}
    print(f"Loading model data from JSON files in: {directory}")
    if not os.path.isdir(directory):
        print(f"Error: Directory not found: {directory}")
        return [], []

    filenames = [f for f in os.listdir(directory) if f.endswith(".json")]  # Look for .json files
    for filename in tqdm(filenames, desc="Reading JSON files"):
        filepath = os.path.join(directory, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Ensure required fields exist
            if 'description' in data and 'model_id' in data:
                description = data['description']
                model_id = data['model_id']  # Get model_id
                if description:  # Only index if description is not empty
                    original_tags = data.get('tags', [])
                    # Filter tags: remove short tags, common tags, and tags with specific prefixes
                    filtered_tags = [
                        str_tag for tag in original_tags
                        if (
                            tag and isinstance(tag, str) and  # Ensure tag exists and is a string
                            len(tag) > 3 and
                            (str_tag := str(tag)).lower() not in COMMON_EXCLUDED_TAGS and
                            not str_tag.lower().startswith(EXCLUDED_TAG_PREFIXES)  # Check for prefixes
                        )
                    ]
                    tag_string = " ".join(filtered_tags)
                    explanation = data.get(MODEL_EXPLANATION_KEY)  # Get the new explanation

                    # --- Construct combined text with priority weighting ---
                    text_parts = []
                    # 1. Add explanation (repeated for emphasis) if available
                    if explanation and isinstance(explanation, str):
                        text_parts.append(f"Summary: {explanation}")
                        text_parts.append(f"Summary: {explanation}")  # Repeat for higher weight
                    # 2. Add model name
                    text_parts.append(f"Model: {model_id}")
                    # 3. Add filtered tags if available
                    if tag_string:
                        text_parts.append(f"Tags: {tag_string}")
                    # 4. Add original description
                    text_parts.append(f"Description: {description}")

                    combined_text = " ".join(text_parts).strip()  # Join all parts
                    # --- End construction ---

                    all_texts.append(combined_text)
                    # Add explanation to metadata as well for potential display
                    metadata_entry = {
                        "model_id": model_id,
                        "tags": original_tags,  # Keep ORIGINAL tags in metadata
                        "downloads": data.get('downloads', 0)
                    }
                    if explanation and isinstance(explanation, str):
                        metadata_entry[MODEL_EXPLANATION_KEY] = explanation
                    all_metadata.append(metadata_entry)
            else:
                print(f"Warning: Skipping {filename}, missing 'description' or 'model_id' key.")
        except json.JSONDecodeError:
            print(f"Warning: Skipping {filename}, invalid JSON.")
        except Exception as e:
            print(f"Warning: Could not read or process {filename}: {e}")

    print(f"Loaded data for {len(all_texts)} models with valid descriptions after tag filtering.")
    return all_texts, all_metadata

def build_and_save_index(texts_to_index, metadata_list):
    """Builds and saves the FAISS index and metadata mapping based on combined text."""
    if not texts_to_index:
        print("No text data to index.")
        return

    print(f"Loading sentence transformer model: {EMBEDDING_MODEL}")
    # Consider adding device='mps' if on Apple Silicon and PyTorch supports it well enough,
    # but start with CPU for stability.
    model = SentenceTransformer(EMBEDDING_MODEL)

    print(f"Generating embeddings for combined text in batches of {ENCODE_BATCH_SIZE}...")
    all_embeddings = []
    for i in tqdm(range(0, len(texts_to_index), ENCODE_BATCH_SIZE), desc="Encoding batches"):
        batch = texts_to_index[i:i+ENCODE_BATCH_SIZE]
        batch_embeddings = model.encode(batch, convert_to_numpy=True)
        all_embeddings.append(batch_embeddings)

    if not all_embeddings:
        print("No embeddings generated. Cannot build index.")
        return

    embeddings = np.vstack(all_embeddings)  # Combine embeddings from all batches

    # Ensure embeddings are float32 for FAISS
    embeddings = embeddings.astype('float32')

    # Build FAISS index
    print("Building FAISS index...")
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)  # Using simple L2 distance
    index.add(embeddings)
    print(f"FAISS index built with {index.ntotal} vectors.")

    # Save the index
    faiss.write_index(index, INDEX_FILE)
    print(f"FAISS index saved to: {INDEX_FILE}")

    # Create mapping from index position to metadata dictionary
    index_to_metadata = {i: metadata for i, metadata in enumerate(metadata_list)}
    with open(MAP_FILE, 'wb') as f:
        pickle.dump(index_to_metadata, f)
    print(f"Index-to-Metadata mapping saved to: {MAP_FILE}")

def main():
    """Builds the index from the JSON data directory (importable as `from build_index import main` by daily_update.py)."""
    combined_texts, metadata_list = load_model_data(MODEL_DATA_DIR)
    build_and_save_index(combined_texts, metadata_list)
    print("\nIndex building complete.")

if __name__ == "__main__":
    main()
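For reference, the artifacts written above (index.faiss and index_to_metadata.pkl) are what the /search route in app.py is expected to query. A minimal retrieval sketch under that assumption; the query string and result count are illustrative.

import faiss
import pickle
from sentence_transformers import SentenceTransformer

index = faiss.read_index("index.faiss")                # INDEX_FILE written above
with open("index_to_metadata.pkl", "rb") as f:
    index_to_metadata = pickle.load(f)                 # MAP_FILE written above
model = SentenceTransformer("all-mpnet-base-v2")       # same EMBEDDING_MODEL as at build time

# Embed the query the same way the combined texts were embedded, then run an L2 search
query_vec = model.encode(["text to image diffusion model"], convert_to_numpy=True).astype("float32")
distances, ids = index.search(query_vec, 5)
results = [index_to_metadata[int(i)] for i in ids[0] if i != -1]
for r in results:
    print(r["model_id"], r.get("model_explanation_gemini", ""))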
daily_update.py
ADDED
@@ -0,0 +1,107 @@
import logging
import sys
import traceback

# Configure basic logging for the orchestration script
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def run_step(step_func, step_name):
    """Runs a step and logs its success or failure."""
    logging.info(f"--- Starting step: {step_name} ---")
    try:
        step_func()
        logging.info(f"--- Finished step: {step_name} successfully ---")
        return True
    except Exception as e:
        logging.error(f"--- Step failed: {step_name} ---")
        logging.error(f"Error: {e}")
        # Log the full traceback for detailed debugging
        logging.error(traceback.format_exc())
        return False

def main():
    """Runs the daily update sequence."""
    logging.info("=== Starting Daily Model Update Process ===")

    all_steps_succeeded = True

    # --- Step 1: Fetch new/updated model descriptions ---
    try:
        # Import the script's main function dynamically
        from huggingface_model_descriptions import main as fetch_models_main
        if not run_step(fetch_models_main, "Fetch Hugging Face Models"):
            all_steps_succeeded = False
            # Decide whether to continue if fetching fails (maybe the index can still be built?)
            # For now, stop if the first step fails.
            logging.error("Stopping update process for this cycle due to failure in fetching models.")
            return  # Exit the main function for this cycle
    except ImportError:
        logging.error("Failed to import huggingface_model_descriptions.py. Ensure it's in the same directory or Python path.")
        all_steps_succeeded = False
        return  # Exit the main function for this cycle
    except Exception as e:  # Catch any unexpected error during import/setup
        logging.error(f"Unexpected error setting up model fetching step: {e}")
        logging.error(traceback.format_exc())
        all_steps_succeeded = False
        return  # Exit the main function for this cycle


    # --- Step 2: Add explanations using the DeepSeek API ---
    # Only proceed if the previous step was successful
    if all_steps_succeeded:
        try:
            from add_model_explanations import main as add_explanations_main
            # Check for the API key *before* running the step
            import os
            if not os.getenv("DEEPSEEK_API_KEY"):
                logging.warning("DEEPSEEK_API_KEY environment variable not set. Explanation step will fail or do nothing.")
                # Optionally, you could skip this step entirely if the key is missing:
                # logging.warning("Skipping explanation generation step.")
                # pass  # Move to the next step

            if not run_step(add_explanations_main, "Generate Model Explanations (DeepSeek)"):
                all_steps_succeeded = False
                # Decide whether index building should proceed if explanations fail
                logging.warning("Explanation generation failed. Index will be built with potentially missing explanations.")
                # We will continue to the next step in this case

        except ImportError:
            logging.error("Failed to import add_model_explanations.py. Ensure it's in the same directory or Python path.")
            all_steps_succeeded = False
            # Stop if the explanation script is missing
            return  # Exit the main function for this cycle
        except Exception as e:  # Catch any unexpected error during import/setup
            logging.error(f"Unexpected error setting up explanation generation step: {e}")
            logging.error(traceback.format_exc())
            all_steps_succeeded = False
            return  # Exit the main function for this cycle

    # --- Step 3: Rebuild the search index ---
    # Only proceed if fetching models (Step 1) succeeded. Allow proceeding if Step 2 failed.
    if 'fetch_models_main' in locals() or 'fetch_models_main' in globals():  # Check if Step 1 setup occurred
        try:
            from build_index import main as build_index_main
            if not run_step(build_index_main, "Build Search Index (FAISS)"):
                all_steps_succeeded = False
                logging.error("Index building failed. The search index may be outdated or corrupted.")
                # Stop if index building fails
                return  # Exit the main function for this cycle
        except ImportError:
            logging.error("Failed to import build_index.py. Ensure it's in the same directory or Python path.")
            all_steps_succeeded = False
            return  # Exit the main function for this cycle
        except Exception as e:  # Catch any unexpected error during import/setup
            logging.error(f"Unexpected error setting up index building step: {e}")
            logging.error(traceback.format_exc())
            all_steps_succeeded = False
            return  # Exit the main function for this cycle


    logging.info("===========================================")
    if all_steps_succeeded:
        logging.info("=== Daily Model Update Process Completed Successfully ===")
    else:
        logging.error("=== Daily Model Update Process Completed with Errors ===")

if __name__ == "__main__":
    main()
huggingface_model_descriptions.py
ADDED
@@ -0,0 +1,253 @@
import os
import requests
from tqdm import tqdm
import time
import re
import json
from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.utils import RepositoryNotFoundError, EntryNotFoundError, HFValidationError
from requests.exceptions import RequestException
from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle  # Add pickle for caching

# Create a directory to store JSON data
OUTPUT_DIR = "model_data_json"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Number of worker threads for parallel processing - REDUCED
NUM_WORKERS = 4

# Add a delay between download attempts across threads
DOWNLOAD_DELAY_SECONDS = 0.2  # Adjust as needed

# --- README Cleaning ---
def clean_readme_content(text):
    """Basic cleaning of README markdown: remove code blocks, links."""
    if not text:
        return ""

    # Remove fenced code blocks (``` ... ```)
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
    # Remove inline code (`...`)
    text = re.sub(r'`[^`]+`', '', text)
    # Remove markdown links ([text](url))
    text = re.sub(r'\[([^]]+)\]\([^)]+\)', r'\1', text)  # Keep link text
    # Remove standalone URLs (simple version)
    text = re.sub(r'https?://\S+', '', text)
    # Remove markdown images
    text = re.sub(r'!\[[^]]*\]\([^)]+\)', '', text)
    # Replace multiple newlines/spaces with single ones
    text = ' '.join(text.split())
    return text
# ---

MODELS_CACHE_FILE = "models_list_cache.pkl"  # File to cache the raw model list

def get_all_models_with_downloads(min_downloads=10000):
    """Fetch all models from Hugging Face with at least min_downloads, using a local cache for the list."""
    models_list = None

    # 1. Check for cache
    if os.path.exists(MODELS_CACHE_FILE):
        try:
            print(f"Loading cached model list from {MODELS_CACHE_FILE}...")
            with open(MODELS_CACHE_FILE, 'rb') as f:
                models_list = pickle.load(f)
            print(f"Loaded {len(models_list)} models from cache.")
        except Exception as e:
            print(f"Error loading cache file {MODELS_CACHE_FILE}: {e}. Fetching from API.")
            models_list = None  # Ensure fetching if cache loading fails

    # 2. Fetch from API if cache doesn't exist or failed to load
    if models_list is None:
        print(f"Fetching all models with more than {min_downloads} downloads from API...")
        try:
            print("Initializing HfApi...")
            api = HfApi()
            print("HfApi initialized. Calling list_models...")
            # Fetch the iterator
            models_iterator = api.list_models(sort="downloads", direction=-1, fetch_config=False, cardData=True)
            print("list_models call returned. Converting iterator to list...")
            # Convert the iterator to a list TO ALLOW CACHING
            models_list = list(models_iterator)
            print(f"Converted to list with {len(models_list)} models.")

            # Save to cache
            try:
                print(f"Saving model list to cache file: {MODELS_CACHE_FILE}...")
                with open(MODELS_CACHE_FILE, 'wb') as f:
                    pickle.dump(models_list, f)
                print("Model list saved to cache.")
            except Exception as e:
                print(f"Error saving cache file {MODELS_CACHE_FILE}: {e}")

        except Exception as e:
            print(f"Error during HfApi initialization or list_models call: {e}")
            return []  # Return empty list on error

    # 3. Filter the loaded/fetched list
    if not models_list:
        print("Model list is empty after fetching/loading.")
        return []

    qualifying_models = []
    print(f"Filtering {len(models_list)} models by download count...")
    for model in models_list:  # Iterate through the list (from cache or API)
        # No need for prints inside this loop now, as it should be fast
        if not hasattr(model, 'downloads') or model.downloads is None:
            continue

        if model.downloads < min_downloads:
            # Since the list is sorted by downloads, we can stop
            break

        qualifying_models.append(model)

    print(f"Found {len(qualifying_models)} models with more than {min_downloads} downloads")
    return qualifying_models

def get_model_readme(model_id):
    """Get README.md content for a specific model using hf_hub_download. Returns None if not found or inaccessible."""
    filenames_to_try = ["README.md", "readme.md"]
    branches_to_try = ["main", "master"]

    for branch in branches_to_try:
        for filename in filenames_to_try:
            try:
                # print(f"Attempting download: repo={model_id}, branch={branch}, file={filename}")  # Debug
                # Use hf_hub_download, which uses the stored token
                readme_path = hf_hub_download(
                    repo_id=model_id,
                    filename=filename,
                    revision=branch,
                    repo_type="model",
                    local_files_only=False,  # Ensure it tries to download
                    # token=True  # Often not needed if logged in via CLI, but can be explicit
                )

                # If the download succeeded, read the content
                # print(f"Successfully downloaded {filename} from {branch} to {readme_path}")  # Debug
                with open(readme_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                return content

            except RepositoryNotFoundError:
                print(f"Repository {model_id} not found.")
                return None  # If the repo doesn't exist, no point trying other files/branches
            except EntryNotFoundError:
                # print(f"{filename} not found in branch {branch} for {model_id}. Trying next...")  # Debug
                continue  # File not found in this specific branch/filename combination, try next
            except HFValidationError as e:  # Catch invalid repo ID or filename errors
                print(f"Validation error for {model_id} (branch: {branch}, file: {filename}): {e}")
                continue  # Try next filename/branch
            except Exception as e:  # Catch other potential errors (like 401 HfHubHTTPError, network issues)
                print(f"Error downloading {filename} from branch {branch} for {model_id}: {e}")
                # Check if it's a likely authentication error (401/403)
                if "401" in str(e) or "403" in str(e):
                    print(f"Authentication error (401/403) for {model_id}. Ensure you are logged in and accepted terms.")
                    return None  # Don't try other files/branches if auth failed
                # For other errors, continue to the next filename/branch attempt
                continue

    # If all attempts failed
    print(f"Could not fetch README for {model_id} from any standard location.")
    return None

def get_filename_for_model(model_id):
    """Generate JSON filename for a model"""
    safe_id = model_id.replace("/", "_")
    return os.path.join(OUTPUT_DIR, f"{safe_id}.json")  # Change extension to .json

def save_model_data(model_id, data):
    """Save model data (description, tags, downloads) to a JSON file."""
    filename = get_filename_for_model(model_id)
    try:
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        return filename
    except Exception as e:
        print(f"Error saving JSON for {model_id} to {filename}: {e}")
        return None

def file_exists_for_model(model_id):
    """Check if a JSON file already exists for this model"""
    filename = get_filename_for_model(model_id)
    return os.path.exists(filename)

def process_model(model):
    """Process a single model - fetch README, clean it, save as JSON."""
    model_id = model.modelId
    downloads = model.downloads
    tags = getattr(model, 'tags', [])  # Get tags if available

    # Check if JSON file already exists
    if file_exists_for_model(model_id):
        return (model_id, downloads, None, "skipped")

    # --- Add Delay Before Download Attempt ---
    time.sleep(DOWNLOAD_DELAY_SECONDS)
    # ---------------------------------------

    # Get model README content
    readme_content = get_model_readme(model_id)

    # If README is not available, skip saving this model
    if readme_content is None:
        return (model_id, downloads, None, "no_readme")

    # Clean the README
    cleaned_readme = clean_readme_content(readme_content)

    # Prepare data payload
    model_data = {
        "model_id": model_id,
        "downloads": downloads,
        "tags": tags,
        "description": cleaned_readme
    }

    # Save data as JSON
    filename = save_model_data(model_id, model_data)
    if filename:
        return (model_id, downloads, filename, "downloaded")
    else:
        return (model_id, downloads, None, "save_failed")

def main():
    qualifying_models = get_all_models_with_downloads(min_downloads=10000)
    if not qualifying_models:
        print("No qualifying models found")
        return

    print(f"Processing {len(qualifying_models)} models, saving to '{OUTPUT_DIR}'...")
    downloaded = 0
    skipped = 0
    no_readme = 0
    failed = 0

    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        future_to_model = {executor.submit(process_model, model): model for model in qualifying_models}

        for future in tqdm(as_completed(future_to_model), total=len(qualifying_models)):
            try:
                model_id, downloads, filename, status = future.result()
                if status == "downloaded":
                    # Don't print every success to avoid clutter
                    # print(f"Saved data for {model_id} ({downloads} downloads) to {filename}")
                    downloaded += 1
                elif status == "skipped":
                    skipped += 1
                elif status == "no_readme":
                    no_readme += 1
                else:  # save_failed or other errors
                    failed += 1
            except Exception as e:
                # Extract model_id for better error reporting if possible
                processed_model = future_to_model[future]
                print(f"Error processing model {getattr(processed_model, 'modelId', 'unknown')}: {e}")
                failed += 1

    print(f"\nCompleted! Downloaded: {downloaded}, Skipped existing: {skipped}, No README found: {no_readme}, Failed: {failed}")

if __name__ == "__main__":
    main()
requirements.txt
CHANGED
@@ -4,4 +4,6 @@ sentence-transformers>=2.3.0
 numpy>=1.20.0
 faiss-cpu>=1.7.0 # Use faiss-gpu if you need GPU support on HF Spaces
 huggingface-hub>=0.15.1 # Version compatible with sentence-transformers >= 2.3.0
-gunicorn # Added for deployment on Hugging Face Spaces
+gunicorn # Added for deployment on Hugging Face Spaces
+openai>=1.0.0 # Added back for DeepSeek API via OpenAI client
+schedule>=1.0.0 # Added for in-app scheduling