import os
import json
from typing import Dict, Any, Optional
import logging
import time
from openai import OpenAI, APIError

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

MODEL_DATA_DIR = "model_data_json"
EXPLANATION_KEY = "model_explanation_gemini"
DESCRIPTION_KEY = "description"
MAX_RETRIES = 3  # Retries for API calls
RETRY_DELAY_SECONDS = 5  # Delay between retries

# --- DeepSeek API Configuration ---
DEEPSEEK_API_KEY_ENV_VAR = "DEEPSEEK_API_KEY"
DEEPSEEK_BASE_URL = "https://api.deepseek.com"
DEEPSEEK_MODEL_NAME = "deepseek-chat"
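# The API key is read from the environment at runtime. Illustrative shell setup
# (the key value below is a placeholder, not a real credential):
#   export DEEPSEEK_API_KEY="sk-..."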
# Global client variable
client: Optional[OpenAI] = None

def configure_llm_client():
    """Configures the OpenAI client for the DeepSeek API using the API key from environment variables."""
    global client
    api_key = os.getenv(DEEPSEEK_API_KEY_ENV_VAR)
    if not api_key:
        logging.error(f"Error: {DEEPSEEK_API_KEY_ENV_VAR} environment variable not set.")
        logging.error("Please set the environment variable before running the script.")
        return False
    try:
        client = OpenAI(api_key=api_key, base_url=DEEPSEEK_BASE_URL)
        logging.info("DeepSeek API client configured successfully.")
        return True
    except Exception as e:
        logging.error(f"Failed to configure DeepSeek API client: {e}")
        client = None
        return False
# --- End DeepSeek API Configuration ---
def generate_explanation(model_id: str, description: str) -> Optional[str]:
    """
    Generates a short English explanation for the model based on its description
    by calling the DeepSeek API via the OpenAI library.

    Args:
        model_id: The ID of the model (for context).
        description: The model description text.

    Returns:
        A short English explanation string from DeepSeek, or None if generation fails.
    """
    global client
    if not client:
        logging.error(f"[{model_id}] DeepSeek client not configured. Cannot generate explanation.")
        return None
    if not description or not isinstance(description, str):
        logging.warning(f"[{model_id}] Description is empty or not a string. Skipping explanation generation.")
        return None

    # Truncate very long descriptions to keep the prompt to a manageable size
    max_desc_length = 4000
    if len(description) > max_desc_length:
        logging.warning(f"[{model_id}] Description truncated to {max_desc_length} chars for API call.")
        description = description[:max_desc_length] + "... [truncated]"

    # Construct the messages for the DeepSeek API
    messages = [
        {"role": "system", "content": "You are an AI assistant tasked with summarizing Hugging Face model descriptions concisely."},
        {"role": "user", "content": (
            f"Analyze the following description for the Hugging Face model '{model_id}'. "
            f"Based **only** on this description, provide a concise, one-sentence explanation in English "
            f"summarizing what this model does and its primary purpose or task. "
            f"Focus on the core functionality mentioned. Avoid adding introductory phrases like 'This model is...' or 'The model...'."
            f"\n\n---\nModel Description:\n{description}\n---\n\nConcise Explanation:"
        )}
    ]

    retries = 0
    while retries < MAX_RETRIES:
        try:
            logging.info(f"[{model_id}] Calling DeepSeek API (Attempt {retries + 1}/{MAX_RETRIES})...")
            response = client.chat.completions.create(
                model=DEEPSEEK_MODEL_NAME,
                messages=messages,
                stream=False,
                max_tokens=100,  # Limit response length
                temperature=0.2  # Lower temperature for a more focused summary
            )
            explanation = response.choices[0].message.content.strip()
            logging.info(f"[{model_id}] Explanation received from DeepSeek: '{explanation}'")
            # Basic post-processing: strip surrounding quotes if the model added them
            if explanation.startswith('"') and explanation.endswith('"'):
                explanation = explanation[1:-1]
            return explanation
        except APIError as e:
            retries += 1
            logging.error(f"[{model_id}] DeepSeek API Error (Attempt {retries}/{MAX_RETRIES}): {e}")
            if retries < MAX_RETRIES:
                logging.info(f"Retrying in {RETRY_DELAY_SECONDS} seconds...")
                time.sleep(RETRY_DELAY_SECONDS)
            else:
                logging.error(f"[{model_id}] Max retries reached. Failed to generate explanation via DeepSeek.")
                return None
        except Exception as e:  # Catch other potential errors
            logging.error(f"[{model_id}] Unexpected error during DeepSeek API call: {e}")
            return None  # Don't retry on unexpected errors
    return None
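# Illustrative use of generate_explanation (the model id, description, and returned
# sentence below are hypothetical; they only show the expected shape of a call):
#   generate_explanation("org/some-model", "A BERT-based classifier fine-tuned for sentiment analysis ...")
#   -> e.g. "Classifies the sentiment of input text using a fine-tuned BERT encoder."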
def process_json_file(filepath: str):
    """Reads, updates, and writes a single JSON file."""
    model_id = os.path.basename(filepath).replace('.json', '')
    logging.info(f"Processing {filepath}...")
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError:
        logging.error(f"[{model_id}] Invalid JSON format in {filepath}. Skipping.")
        return
    except FileNotFoundError:
        logging.error(f"[{model_id}] File not found: {filepath}. Skipping.")
        return
    except Exception as e:
        logging.error(f"[{model_id}] Error reading {filepath}: {e}. Skipping.")
        return

    if not isinstance(data, dict):
        logging.error(f"[{model_id}] Expected JSON object (dict) but got {type(data)} in {filepath}. Skipping.")
        return

    description = data.get(DESCRIPTION_KEY)
    explanation_overwritten = False

    # --- Deletion logic: always remove any existing explanation before regenerating ---
    if EXPLANATION_KEY in data:
        logging.info(f"[{model_id}] Existing explanation found. Deleting before regenerating.")
        del data[EXPLANATION_KEY]
        explanation_overwritten = True  # Mark that we intend to replace it

    # --- Generation logic ---
    if not description:
        logging.warning(f"[{model_id}] Description field is missing or empty. Cannot generate explanation.")
        return
    explanation = generate_explanation(model_id, description)  # Try to generate a new one

    # --- Update and write logic ---
    if explanation:  # Only update if generation was successful
        data[EXPLANATION_KEY] = explanation
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=4)
            if explanation_overwritten:
                logging.info(f"[{model_id}] Successfully overwrote and updated {filepath} with a new explanation.")
            else:
                logging.info(f"[{model_id}] Successfully generated and updated {filepath} with a new explanation.")
        except IOError as e:
            logging.error(f"[{model_id}] Error writing updated data to {filepath}: {e}")
        except Exception as e:
            logging.error(f"[{model_id}] Unexpected error writing {filepath}: {e}")
    else:  # Explanation generation failed
        log_message = f"[{model_id}] Failed to generate a new explanation for {filepath} via the API."
        if explanation_overwritten:
            log_message += " Existing explanation was removed but not replaced due to API failure."
        logging.warning(log_message)
def main():
    """Main function to iterate through the directory and process files."""
    # Configure the LLM client at the start
    if not configure_llm_client():
        return  # Stop if the API key is not configured
    if not os.path.isdir(MODEL_DATA_DIR):
        logging.error(f"Directory not found: {MODEL_DATA_DIR}")
        return

    logging.info(f"Starting processing directory: {MODEL_DATA_DIR}")
    processed_files = 0
    skipped_files = 0
    all_files = [f for f in os.listdir(MODEL_DATA_DIR) if f.lower().endswith(".json")]
    total_files = len(all_files)
    logging.info(f"Found {total_files} JSON files to process.")

    for i, filename in enumerate(all_files):
        filepath = os.path.join(MODEL_DATA_DIR, filename)
        logging.info(f"--- Processing file {i+1}/{total_files}: {filename} ---")
        try:
            # process_json_file handles existing explanations itself: it deletes any
            # previous value, regenerates it, and logs success or failure.
            process_json_file(filepath)
            processed_files += 1  # Counted as processed even if no update was written
        except Exception as e:
            logging.error(f"Unexpected error processing file {filename}: {e}")
            skipped_files += 1
        # Small delay between files to reduce the chance of hitting rate limits
        time.sleep(0.5)  # Adjust delay as needed

    logging.info("--- Processing complete ---")
    logging.info(f"Total JSON files found: {total_files}")
    logging.info(f"Files processed (attempted): {processed_files}")
    # An exact count of files actually updated would require process_json_file to return a status.
    logging.info(f"Files skipped due to unexpected errors: {skipped_files}")

if __name__ == "__main__":
    main()
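# Example invocation (run from the directory that contains the model_data_json
# folder; the script filename below is illustrative, not necessarily the real one):
#   python update_model_explanations.py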