shayan5422 committed · verified
Commit ec1f977 · 1 Parent(s): d11e38f

Upload 5 files

add_model_explanations.py CHANGED
@@ -9,7 +9,11 @@ from openai import OpenAI, APIError # Add back OpenAI imports
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
-MODEL_DATA_DIR = "model_data_json"
+# Define the base persistent storage path (must match other scripts)
+PERSISTENT_STORAGE_PATH = "/data" # <-- ADJUST IF YOUR PATH IS DIFFERENT
+
+# Point to the JSON data within persistent storage
+MODEL_DATA_DIR = os.path.join(PERSISTENT_STORAGE_PATH, "model_data_json")
 EXPLANATION_KEY = "model_explanation_gemini"
 DESCRIPTION_KEY = "description"
 MAX_RETRIES = 3 # Retries for API calls
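Since PERSISTENT_STORAGE_PATH is now hard-coded identically in every script, one way to keep them in sync is to read the storage root from the environment once, falling back to the "/data" default this commit introduces. A minimal sketch, assuming a hypothetical PERSISTENT_STORAGE_PATH environment variable that is not part of this commit:

import os

# Hypothetical refactor: let a deployment override the storage root without
# editing each script; defaults to the hard-coded "/data" from this commit.
PERSISTENT_STORAGE_PATH = os.environ.get("PERSISTENT_STORAGE_PATH", "/data")
MODEL_DATA_DIR = os.path.join(PERSISTENT_STORAGE_PATH, "model_data_json")
os.makedirs(MODEL_DATA_DIR, exist_ok=True)  # ensure the directory exists
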
app.py CHANGED
@@ -22,6 +22,9 @@ except ImportError:
 
 app = Flask(__name__) # Create app object FIRST
 
+# Define the base persistent storage path (must match other scripts)
+PERSISTENT_STORAGE_PATH = "/data" # <-- ADJUST IF YOUR PATH IS DIFFERENT
+
 # Configure Flask app logging (optional but recommended)
 # app.logger.setLevel(logging.INFO)
 
@@ -29,11 +32,12 @@ app = Flask(__name__) # Create app object FIRST
 CORS(app, origins=["http://127.0.0.1:3000", "http://localhost:3000", "https://rag-huggingface.vercel.app"], supports_credentials=True)
 
 # --- Configuration ---
-INDEX_FILE = "index.faiss"
-MAP_FILE = "index_to_metadata.pkl"
+# Point to index/map files in persistent storage
+INDEX_FILE = os.path.join(PERSISTENT_STORAGE_PATH, "index.faiss")
+MAP_FILE = os.path.join(PERSISTENT_STORAGE_PATH, "index_to_metadata.pkl")
 EMBEDDING_MODEL = 'all-mpnet-base-v2'
-# Corrected path joining for model_data_json - relative to app.py location
-MODEL_DATA_DIR = os.path.join(os.path.dirname(__file__), 'model_data_json')
+# Point to model data JSON in persistent storage
+MODEL_DATA_DIR = os.path.join(PERSISTENT_STORAGE_PATH, "model_data_json")
 # ---
 
 # --- Global variables for resources ---
@@ -72,7 +76,8 @@ def load_resources():
         print("Sentence transformer model loaded successfully.")
 
         # Load FAISS Index
-        index_path = os.path.join(os.path.dirname(__file__), INDEX_FILE)
+        # index_path = os.path.join(os.path.dirname(__file__), INDEX_FILE) # Old path
+        index_path = INDEX_FILE # Use configured path
         print(f"Loading FAISS index from: {index_path}")
         if not os.path.exists(index_path):
             raise FileNotFoundError(f"FAISS index file not found at {index_path}")
@@ -81,7 +86,8 @@ def load_resources():
         print("FAISS index loaded successfully.")
 
         # Load Index-to-Metadata Map
-        map_path = os.path.join(os.path.dirname(__file__), MAP_FILE)
+        # map_path = os.path.join(os.path.dirname(__file__), MAP_FILE) # Old path
+        map_path = MAP_FILE # Use configured path
         print(f"Loading index-to-Metadata map from: {map_path}")
         if not os.path.exists(map_path):
             raise FileNotFoundError(f"Metadata map file not found at {map_path}")
@@ -95,8 +101,8 @@ def load_resources():
 
     except FileNotFoundError as fnf_error:
         print(f"Error: {fnf_error}")
-        print(f"Please ensure {INDEX_FILE} and {MAP_FILE} exist in the 'backend' directory relative to app.py.")
-        print("You might need to run 'python build_index.py' first.")
+        print(f"Please ensure {os.path.basename(INDEX_FILE)} and {os.path.basename(MAP_FILE)} exist in the persistent storage directory ({PERSISTENT_STORAGE_PATH}).")
+        print("You might need to run the update process first or manually place initial files there.")
         RESOURCES_LOADED = False # Keep as False
     except ImportError as import_error:
         print(f"Import Error loading resources: {import_error}")
@@ -235,7 +241,7 @@ def search():
             # --- Add description from model_data_json ---
             model_id = metadata.get('model_id')
             description = None
-            # Use the globally defined and corrected MODEL_DATA_DIR
+            # Use the globally defined MODEL_DATA_DIR pointing to persistent storage
             if model_id and MODEL_DATA_DIR:
                 filename = model_id.replace('/', '_') + '.json'
                 filepath = os.path.join(MODEL_DATA_DIR, filename)
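For reference, the lookup the last hunk annotates follows the repo's flattened-filename convention: a model_id like "org/name" is stored as org_name.json under MODEL_DATA_DIR. A minimal sketch of that lookup as a standalone helper; the function name load_description is hypothetical:

import json
import os

MODEL_DATA_DIR = os.path.join("/data", "model_data_json")

def load_description(model_id):
    # Mirrors the lookup inside search(): flatten "org/name" -> "org_name.json".
    filepath = os.path.join(MODEL_DATA_DIR, model_id.replace('/', '_') + '.json')
    if not os.path.exists(filepath):
        return None
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f).get('description')
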
build_index.py CHANGED
@@ -7,10 +7,15 @@ import pickle
 import json # Import json module
 from tqdm import tqdm
 
+# Define the base persistent storage path (must match other scripts)
+PERSISTENT_STORAGE_PATH = "/data" # <-- ADJUST IF YOUR PATH IS DIFFERENT
+
 # --- Configuration ---
-MODEL_DATA_DIR = "model_data_json" # Path to downloaded JSON data
-INDEX_FILE = "index.faiss"
-MAP_FILE = "index_to_metadata.pkl" # Changed filename to reflect content
+# Point to the JSON data within persistent storage
+MODEL_DATA_DIR = os.path.join(PERSISTENT_STORAGE_PATH, "model_data_json")
+# Save index and map to persistent storage
+INDEX_FILE = os.path.join(PERSISTENT_STORAGE_PATH, "index.faiss")
+MAP_FILE = os.path.join(PERSISTENT_STORAGE_PATH, "index_to_metadata.pkl")
 EMBEDDING_MODEL = 'all-mpnet-base-v2' # Efficient and good quality model
 ENCODE_BATCH_SIZE = 32 # Process descriptions in smaller batches
 # Tags to exclude from indexing text
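The write side of these paths is symmetric with the load in app.py: build the FAISS index from the encoded descriptions, then persist both the index and the pickled metadata map under /data. A minimal sketch, assuming an inner-product flat index over normalized embeddings; the actual index type used by build_index.py is not visible in this diff:

import os
import pickle
import faiss  # faiss-cpu
import numpy as np

PERSISTENT_STORAGE_PATH = "/data"
INDEX_FILE = os.path.join(PERSISTENT_STORAGE_PATH, "index.faiss")
MAP_FILE = os.path.join(PERSISTENT_STORAGE_PATH, "index_to_metadata.pkl")

def save_index(embeddings: np.ndarray, index_to_metadata: dict) -> None:
    """Persist the FAISS index and metadata map to persistent storage."""
    index = faiss.IndexFlatIP(embeddings.shape[1])  # inner product (assumed)
    index.add(embeddings.astype(np.float32))        # FAISS requires float32
    faiss.write_index(index, INDEX_FILE)
    with open(MAP_FILE, "wb") as f:
        pickle.dump(index_to_metadata, f)

With normalized vectors (e.g. SentenceTransformer's normalize_embeddings=True), inner product is equivalent to cosine similarity, which is a common pairing with all-mpnet-base-v2.
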
huggingface_model_descriptions.py CHANGED
@@ -10,8 +10,11 @@ from requests.exceptions import RequestException
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import pickle # Add pickle for caching
 
-# Create a directory to store JSON data
-OUTPUT_DIR = "model_data_json"
+# Define the base persistent storage path
+PERSISTENT_STORAGE_PATH = "/data" # <-- ADJUST IF YOUR PATH IS DIFFERENT
+
+# Create a directory to store JSON data within persistent storage
+OUTPUT_DIR = os.path.join(PERSISTENT_STORAGE_PATH, "model_data_json")
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 
 # Number of worker threads for parallel processing - REDUCED
@@ -41,7 +44,8 @@ def clean_readme_content(text):
     return text
 # ---
 
-MODELS_CACHE_FILE = "models_list_cache.pkl" # File to cache the raw model list
+# Use persistent storage for the cache file
+MODELS_CACHE_FILE = os.path.join(PERSISTENT_STORAGE_PATH, "models_list_cache.pkl") # File to cache the raw model list
 
 def get_all_models_with_downloads(min_downloads=10000):
     """Fetch all models from Hugging Face with at least min_downloads, using a local cache for the list."""
@@ -154,9 +158,9 @@ def get_model_readme(model_id):
     return None
 
 def get_filename_for_model(model_id):
-    """Generate JSON filename for a model"""
+    """Generate JSON filename for a model (uses global OUTPUT_DIR)"""
     safe_id = model_id.replace("/", "_")
-    return os.path.join(OUTPUT_DIR, f"{safe_id}.json") # Change extension to .json
+    return os.path.join(OUTPUT_DIR, f"{safe_id}.json") # OUTPUT_DIR is already correct path
 
 def save_model_data(model_id, data):
     """Save model data (description, tags, downloads) to a JSON file."""