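"""Regenerate model explanations for Hugging Face model JSON files.

For every .json file in the model_data_json directory, this script removes any
existing "model_explanation_gemini" entry and calls the DeepSeek chat API
(through the OpenAI-compatible client) to produce a fresh one-sentence English
summary of the "description" field, then writes the result back to the file.

The DEEPSEEK_API_KEY environment variable must be set before running.
"""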
import os
import json
from typing import Dict, Any, Optional
import logging
import time
from openai import OpenAI, APIError

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

MODEL_DATA_DIR = "model_data_json"
EXPLANATION_KEY = "model_explanation_gemini"
DESCRIPTION_KEY = "description"
MAX_RETRIES = 3 # Retries for API calls
RETRY_DELAY_SECONDS = 5 # Delay between retries

# --- DeepSeek API Configuration ---
DEEPSEEK_API_KEY_ENV_VAR = "DEEPSEEK_API_KEY"
DEEPSEEK_BASE_URL = "https://api.deepseek.com"
DEEPSEEK_MODEL_NAME = "deepseek-chat"

# Global client variable
client: Optional[OpenAI] = None

def configure_llm_client():
    """Configures the OpenAI client for DeepSeek API using the API key from environment variables."""
    global client
    api_key = os.getenv(DEEPSEEK_API_KEY_ENV_VAR)
    if not api_key:
        logging.error(f"Error: {DEEPSEEK_API_KEY_ENV_VAR} environment variable not set.")
        logging.error("Please set the environment variable before running the script.")
        return False
    try:
        client = OpenAI(api_key=api_key, base_url=DEEPSEEK_BASE_URL)
        logging.info("DeepSeek API client configured successfully.")
        return True
    except Exception as e:
        logging.error(f"Failed to configure DeepSeek API client: {e}")
        client = None
        return False

# --- End DeepSeek API Configuration ---

def generate_explanation(model_id: str, description: str) -> Optional[str]:
    """
    Generates a short English explanation for the model based on its description
    by calling the DeepSeek API via the OpenAI library.

    Args:
        model_id: The ID of the model (for context).
        description: The model description text.

    Returns:
        A short English explanation string from DeepSeek, or None if generation fails.
    """
    global client
    if not client:
        logging.error(f"[{model_id}] DeepSeek client not configured. Cannot generate explanation.")
        return None

    if not description or not isinstance(description, str):
        logging.warning(f"[{model_id}] Description is empty or not a string. Skipping explanation generation.")
        return None

    # Truncate very long descriptions
    max_desc_length = 4000
    if len(description) > max_desc_length:
        logging.warning(f"[{model_id}] Description truncated to {max_desc_length} chars for API call.")
        description = description[:max_desc_length] + "... [truncated]"

    # Construct the messages for DeepSeek API
    messages = [
        {"role": "system", "content": "You are an AI assistant tasked with summarizing Hugging Face model descriptions concisely."},        
        {"role": "user", "content": (
            f"Analyze the following description for the Hugging Face model '{model_id}'. "
            f"Based **only** on this description, provide a concise, one-sentence explanation in English "
            f"summarizing what this model does and its primary purpose or task. "
            f"Focus on the core functionality mentioned. Avoid adding introductory phrases like 'This model is...' or 'The model...'."
            f"\n\n---\nModel Description:\n{description}\n---\n\nConcise Explanation:"
        )}
    ]

    retries = 0
    while retries < MAX_RETRIES:
        try:
            logging.info(f"[{model_id}] Calling DeepSeek API (Attempt {retries + 1}/{MAX_RETRIES})...")
            response = client.chat.completions.create(
                model=DEEPSEEK_MODEL_NAME,
                messages=messages,
                stream=False,
                max_tokens=100, # Limit response length
                temperature=0.2 # Lower temperature for more focused summary
            )

            explanation = response.choices[0].message.content.strip()
            logging.info(f"[{model_id}] Explanation received from DeepSeek: '{explanation}'")
            # Basic post-processing: remove potential quotes
            if explanation.startswith('"') and explanation.endswith('"'):
                explanation = explanation[1:-1]
            return explanation

        except APIError as e:
            retries += 1
            logging.error(f"[{model_id}] DeepSeek API Error (Attempt {retries}/{MAX_RETRIES}): {e}")
            if retries < MAX_RETRIES:
                logging.info(f"Retrying in {RETRY_DELAY_SECONDS} seconds...")
                time.sleep(RETRY_DELAY_SECONDS)
            else:
                logging.error(f"[{model_id}] Max retries reached. Failed to generate explanation via DeepSeek.")
                return None
        except Exception as e: # Catch other potential errors
            logging.error(f"[{model_id}] Unexpected error during DeepSeek API call: {e}")
            return None # Don't retry for unexpected errors

    return None

def process_json_file(filepath: str):
    """Reads, updates, and writes a single JSON file."""
    model_id = os.path.basename(filepath).replace('.json', '')
    logging.info(f"Processing {filepath}...")

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError:
        logging.error(f"[{model_id}] Invalid JSON format in {filepath}. Skipping.")
        return
    except FileNotFoundError:
        logging.error(f"[{model_id}] File not found: {filepath}. Skipping.")
        return
    except Exception as e:
        logging.error(f"[{model_id}] Error reading {filepath}: {e}. Skipping.")
        return

    if not isinstance(data, dict):
        logging.error(f"[{model_id}] Expected JSON object (dict) but got {type(data)} in {filepath}. Skipping.")
        return

    description = data.get(DESCRIPTION_KEY)
    explanation_overwritten = False

    # --- Deletion Logic: Always remove existing explanation before trying to regenerate ---
    if EXPLANATION_KEY in data:
        logging.info(f"[{model_id}] Existing explanation found. Deleting before regenerating.")
        del data[EXPLANATION_KEY]
        explanation_overwritten = True # Mark that we intend to replace it

    # --- Generation Logic ---
    if not description:
         logging.warning(f"[{model_id}] Description field is missing or empty. Cannot generate explanation.")
         return

    explanation = generate_explanation(model_id, description) # Try to generate a new one

    # --- Update and Write Logic ---
    if explanation: # Only update if generation was successful
        data[EXPLANATION_KEY] = explanation
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=4)
            if explanation_overwritten:
                logging.info(f"[{model_id}] Successfully overwrote and updated {filepath} with new explanation.")
            else:
                logging.info(f"[{model_id}] Successfully generated and updated {filepath} with new explanation.")
        except IOError as e:
            logging.error(f"[{model_id}] Error writing updated data to {filepath}: {e}")
        except Exception as e:
            logging.error(f"[{model_id}] Unexpected error writing {filepath}: {e}")
    else: # Explanation generation failed
        log_message = f"[{model_id}] Failed to generate new explanation for {filepath} via API."
        if explanation_overwritten:
            log_message += " Existing explanation was removed but not replaced due to API failure."
        logging.warning(log_message)


def main():
    """Main function to iterate through the directory and process files."""
    # Configure LLM client at the start
    if not configure_llm_client():
        return # Stop if API key is not configured

    if not os.path.isdir(MODEL_DATA_DIR):
        logging.error(f"Directory not found: {MODEL_DATA_DIR}")
        return

    logging.info(f"Starting processing directory: {MODEL_DATA_DIR}")
    processed_files = 0
    skipped_files = 0

    all_files = [f for f in os.listdir(MODEL_DATA_DIR) if f.lower().endswith(".json")]
    total_files = len(all_files)
    logging.info(f"Found {total_files} JSON files to process.")

    for i, filename in enumerate(all_files):
        filepath = os.path.join(MODEL_DATA_DIR, filename)
        logging.info(f"--- Processing file {i+1}/{total_files}: {filename} ---")
        try:
            # process_json_file removes any existing explanation and regenerates it via
            # the API, so every file with a description results in an API call.
            process_json_file(filepath)
            processed_files += 1 # Count every file that was attempted

            # process_json_file does not return a status; whether a file was actually
            # updated is logged there rather than counted here.

        except Exception as e:
            logging.error(f"Unexpected error processing file {filename}: {e}")
            skipped_files += 1
        # Add a small delay between files to potentially avoid hitting rate limits
        time.sleep(0.5) # Adjust delay as needed


    logging.info(f"--- Processing complete ---")
    # Refine reporting slightly
    logging.info(f"Total JSON files found: {total_files}")
    logging.info(f"Files processed (attempted): {processed_files}")
    # A more accurate count of updated files would require modifying process_json_file to return status
    logging.info(f"Files skipped due to unexpected errors: {skipped_files}")

if __name__ == "__main__":
    main()
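
# Example invocation (shell). The script filename and API key value below are
# placeholders for illustration, not values taken from the original repository:
#
#   export DEEPSEEK_API_KEY="sk-..."
#   python regenerate_explanations.py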