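"""Regenerate model explanations in Hugging Face model metadata files.

Iterates over the JSON files in MODEL_DATA_DIR, removes any existing
"model_explanation_gemini" entry, and regenerates it from the "description"
field by calling the DeepSeek chat API through the OpenAI client library.
"""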
import json
import logging
import os
import time
from typing import Optional

from openai import OpenAI, APIError

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
MODEL_DATA_DIR = "model_data_json"
EXPLANATION_KEY = "model_explanation_gemini"
DESCRIPTION_KEY = "description"
MAX_RETRIES = 3 # Retries for API calls
RETRY_DELAY_SECONDS = 5 # Delay between retries
# --- DeepSeek API Configuration ---
DEEPSEEK_API_KEY_ENV_VAR = "DEEPSEEK_API_KEY"
DEEPSEEK_BASE_URL = "https://api.deepseek.com"
DEEPSEEK_MODEL_NAME = "deepseek-chat"
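# The key is read from the environment at startup; for example (value is
# illustrative):
#   export DEEPSEEK_API_KEY="your-api-key"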
# Global client variable
client: Optional[OpenAI] = None

def configure_llm_client() -> bool:
    """Configures the OpenAI client for the DeepSeek API using the API key from environment variables."""
    global client
    api_key = os.getenv(DEEPSEEK_API_KEY_ENV_VAR)
    if not api_key:
        logging.error(f"Error: {DEEPSEEK_API_KEY_ENV_VAR} environment variable not set.")
        logging.error("Please set the environment variable before running the script.")
        return False
    try:
        client = OpenAI(api_key=api_key, base_url=DEEPSEEK_BASE_URL)
        logging.info("DeepSeek API client configured successfully.")
        return True
    except Exception as e:
        logging.error(f"Failed to configure DeepSeek API client: {e}")
        client = None
        return False
# --- End DeepSeek API Configuration ---

def generate_explanation(model_id: str, description: str) -> Optional[str]:
    """
    Generates a short English explanation for the model based on its description
    by calling the DeepSeek API via the OpenAI library.

    Args:
        model_id: The ID of the model (for context).
        description: The model description text.

    Returns:
        A short English explanation string from DeepSeek, or None if generation fails.
    """
    if not client:
        logging.error(f"[{model_id}] DeepSeek client not configured. Cannot generate explanation.")
        return None
    if not description or not isinstance(description, str):
        logging.warning(f"[{model_id}] Description is empty or not a string. Skipping explanation generation.")
        return None

    # Truncate very long descriptions to keep the request within a reasonable size
    max_desc_length = 4000
    if len(description) > max_desc_length:
        logging.warning(f"[{model_id}] Description truncated to {max_desc_length} chars for API call.")
        description = description[:max_desc_length] + "... [truncated]"

    # Construct the messages for the DeepSeek chat completion request
    messages = [
        {"role": "system", "content": "You are an AI assistant tasked with summarizing Hugging Face model descriptions concisely."},
        {"role": "user", "content": (
            f"Analyze the following description for the Hugging Face model '{model_id}'. "
            f"Based **only** on this description, provide a concise, one-sentence explanation in English "
            f"summarizing what this model does and its primary purpose or task. "
            f"Focus on the core functionality mentioned. Avoid adding introductory phrases like 'This model is...' or 'The model...'."
            f"\n\n---\nModel Description:\n{description}\n---\n\nConcise Explanation:"
        )}
    ]

    retries = 0
    while retries < MAX_RETRIES:
        try:
            logging.info(f"[{model_id}] Calling DeepSeek API (Attempt {retries + 1}/{MAX_RETRIES})...")
            response = client.chat.completions.create(
                model=DEEPSEEK_MODEL_NAME,
                messages=messages,
                stream=False,
                max_tokens=100,  # Limit response length
                temperature=0.2  # Lower temperature for a more focused summary
            )
            explanation = (response.choices[0].message.content or "").strip()
            logging.info(f"[{model_id}] Explanation received from DeepSeek: '{explanation}'")
            # Basic post-processing: strip surrounding quotes if present
            if explanation.startswith('"') and explanation.endswith('"'):
                explanation = explanation[1:-1]
            return explanation
        except APIError as e:
            retries += 1
            logging.error(f"[{model_id}] DeepSeek API Error (Attempt {retries}/{MAX_RETRIES}): {e}")
            if retries < MAX_RETRIES:
                logging.info(f"Retrying in {RETRY_DELAY_SECONDS} seconds...")
                time.sleep(RETRY_DELAY_SECONDS)
            else:
                logging.error(f"[{model_id}] Max retries reached. Failed to generate explanation via DeepSeek.")
                return None
        except Exception as e:  # Catch other potential errors
            logging.error(f"[{model_id}] Unexpected error during DeepSeek API call: {e}")
            return None  # Don't retry for unexpected errors
    return None
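# Note: the retry loop above uses a fixed delay; if rate limits become an
# issue, an exponential backoff is a common alternative, e.g.
#   time.sleep(RETRY_DELAY_SECONDS * 2 ** (retries - 1))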

def process_json_file(filepath: str):
    """Reads, updates, and writes a single JSON file."""
    model_id = os.path.splitext(os.path.basename(filepath))[0]
    logging.info(f"Processing {filepath}...")
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError:
        logging.error(f"[{model_id}] Invalid JSON format in {filepath}. Skipping.")
        return
    except FileNotFoundError:
        logging.error(f"[{model_id}] File not found: {filepath}. Skipping.")
        return
    except Exception as e:
        logging.error(f"[{model_id}] Error reading {filepath}: {e}. Skipping.")
        return

    if not isinstance(data, dict):
        logging.error(f"[{model_id}] Expected JSON object (dict) but got {type(data)} in {filepath}. Skipping.")
        return

    description = data.get(DESCRIPTION_KEY)
    explanation_overwritten = False

    # --- Deletion logic: always remove any existing explanation before regenerating ---
    if EXPLANATION_KEY in data:
        logging.info(f"[{model_id}] Existing explanation found. Deleting before regenerating.")
        del data[EXPLANATION_KEY]
        explanation_overwritten = True  # Mark that we intend to replace it

    # --- Generation logic ---
    if not description:
        logging.warning(f"[{model_id}] Description field is missing or empty. Cannot generate explanation.")
        return
    explanation = generate_explanation(model_id, description)  # Try to generate a new one

    # --- Update and write logic ---
    if explanation:  # Only update if generation was successful
        data[EXPLANATION_KEY] = explanation
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=4)
            if explanation_overwritten:
                logging.info(f"[{model_id}] Successfully overwrote and updated {filepath} with a new explanation.")
            else:
                logging.info(f"[{model_id}] Successfully generated and updated {filepath} with a new explanation.")
        except IOError as e:
            logging.error(f"[{model_id}] Error writing updated data to {filepath}: {e}")
        except Exception as e:
            logging.error(f"[{model_id}] Unexpected error writing {filepath}: {e}")
    else:  # Explanation generation failed
        log_message = f"[{model_id}] Failed to generate a new explanation for {filepath} via the API."
        if explanation_overwritten:
            log_message += " The existing explanation was left untouched on disk, since the file is only rewritten on success."
        logging.warning(log_message)
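# Note: process_json_file logs outcomes rather than returning them. Having it
# return a bool (True when the file was rewritten) would let main() report an
# exact count of updated files; the current version keeps the simpler interface.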

def main():
    """Main function to iterate through the directory and process files."""
    # Configure the LLM client at the start
    if not configure_llm_client():
        return  # Stop if the API key is not configured
    if not os.path.isdir(MODEL_DATA_DIR):
        logging.error(f"Directory not found: {MODEL_DATA_DIR}")
        return

    logging.info(f"Starting processing directory: {MODEL_DATA_DIR}")
    processed_files = 0
    skipped_files = 0
    all_files = [f for f in os.listdir(MODEL_DATA_DIR) if f.lower().endswith(".json")]
    total_files = len(all_files)
    logging.info(f"Found {total_files} JSON files to process.")

    for i, filename in enumerate(all_files):
        filepath = os.path.join(MODEL_DATA_DIR, filename)
        logging.info(f"--- Processing file {i + 1}/{total_files}: {filename} ---")
        try:
            process_json_file(filepath)
            processed_files += 1
        except Exception as e:
            logging.error(f"Unexpected error processing file {filename}: {e}")
            skipped_files += 1
        # Small delay between files to avoid hitting rate limits
        time.sleep(0.5)  # Adjust delay as needed

    logging.info("--- Processing complete ---")
    logging.info(f"Total JSON files found: {total_files}")
    logging.info(f"Files processed (attempted): {processed_files}")
    # An exact count of updated files would require process_json_file to return
    # a status; success and failure are logged per file instead.
    logging.info(f"Files skipped due to unexpected errors: {skipped_files}")

if __name__ == "__main__":
    main()
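# Example run (script filename is illustrative; DEEPSEEK_API_KEY must be set):
#   python regenerate_explanations.py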