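"""Regenerate model explanations for Hugging Face model JSON files.

For every .json file in the model_data_json directory, this script removes any
existing "model_explanation_gemini" entry and calls the DeepSeek chat API
(through the OpenAI-compatible client) to produce a fresh one-sentence English
summary of the "description" field, then writes the result back to the file.

The DEEPSEEK_API_KEY environment variable must be set before running.
"""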
import os
import json
from typing import Dict, Any, Optional
import logging
import time
from openai import OpenAI, APIError

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

MODEL_DATA_DIR = "model_data_json"
EXPLANATION_KEY = "model_explanation_gemini"
DESCRIPTION_KEY = "description"
MAX_RETRIES = 3 # Retries for API calls
RETRY_DELAY_SECONDS = 5 # Delay between retries

# --- DeepSeek API Configuration ---
DEEPSEEK_API_KEY_ENV_VAR = "DEEPSEEK_API_KEY"
DEEPSEEK_BASE_URL = "https://api.deepseek.com"
DEEPSEEK_MODEL_NAME = "deepseek-chat"

# Global client variable
client: Optional[OpenAI] = None

def configure_llm_client():
    """Configures the OpenAI client for DeepSeek API using the API key from environment variables."""
    global client
    api_key = os.getenv(DEEPSEEK_API_KEY_ENV_VAR)
    if not api_key:
        logging.error(f"Error: {DEEPSEEK_API_KEY_ENV_VAR} environment variable not set.")
        logging.error("Please set the environment variable before running the script.")
        return False
    try:
        client = OpenAI(api_key=api_key, base_url=DEEPSEEK_BASE_URL)
        logging.info("DeepSeek API client configured successfully.")
        return True
    except Exception as e:
        logging.error(f"Failed to configure DeepSeek API client: {e}")
        client = None
        return False

# --- End DeepSeek API Configuration ---

def generate_explanation(model_id: str, description: str) -> Optional[str]:
    """
    Generates a short English explanation for the model based on its description
    by calling the DeepSeek API via the OpenAI library.

    Args:
        model_id: The ID of the model (for context).
        description: The model description text.

    Returns:
        A short English explanation string from DeepSeek, or None if generation fails.
    """
    global client
    if not client:
        logging.error(f"[{model_id}] DeepSeek client not configured. Cannot generate explanation.")
        return None

    if not description or not isinstance(description, str):
        logging.warning(f"[{model_id}] Description is empty or not a string. Skipping explanation generation.")
        return None

    # Truncate very long descriptions
    max_desc_length = 4000
    if len(description) > max_desc_length:
        logging.warning(f"[{model_id}] Description truncated to {max_desc_length} chars for API call.")
        description = description[:max_desc_length] + "... [truncated]"

    # Construct the messages for DeepSeek API
    messages = [
        {"role": "system", "content": "You are an AI assistant tasked with summarizing Hugging Face model descriptions concisely."},        
        {"role": "user", "content": (
            f"Analyze the following description for the Hugging Face model '{model_id}'. "
            f"Based **only** on this description, provide a concise, one-sentence explanation in English "
            f"summarizing what this model does and its primary purpose or task. "
            f"Focus on the core functionality mentioned. Avoid adding introductory phrases like 'This model is...' or 'The model...'."
            f"\n\n---\nModel Description:\n{description}\n---\n\nConcise Explanation:"
        )}
    ]

    retries = 0
    while retries < MAX_RETRIES:
        try:
            logging.info(f"[{model_id}] Calling DeepSeek API (Attempt {retries + 1}/{MAX_RETRIES})...")
            response = client.chat.completions.create(
                model=DEEPSEEK_MODEL_NAME,
                messages=messages,
                stream=False,
                max_tokens=100, # Limit response length
                temperature=0.2 # Lower temperature for more focused summary
            )

            explanation = response.choices[0].message.content.strip()
            logging.info(f"[{model_id}] Explanation received from DeepSeek: '{explanation}'")
            # Basic post-processing: remove potential quotes
            if explanation.startswith('"') and explanation.endswith('"'):
                explanation = explanation[1:-1]
            return explanation

        except APIError as e:
            retries += 1
            logging.error(f"[{model_id}] DeepSeek API Error (Attempt {retries}/{MAX_RETRIES}): {e}")
            if retries < MAX_RETRIES:
                logging.info(f"Retrying in {RETRY_DELAY_SECONDS} seconds...")
                time.sleep(RETRY_DELAY_SECONDS)
            else:
                logging.error(f"[{model_id}] Max retries reached. Failed to generate explanation via DeepSeek.")
                return None
        except Exception as e: # Catch other potential errors
            logging.error(f"[{model_id}] Unexpected error during DeepSeek API call: {e}")
            return None # Don't retry for unexpected errors

    return None

def process_json_file(filepath: str):
    """Reads, updates, and writes a single JSON file."""
    model_id = os.path.basename(filepath).replace('.json', '')
    logging.info(f"Processing {filepath}...")

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError:
        logging.error(f"[{model_id}] Invalid JSON format in {filepath}. Skipping.")
        return
    except FileNotFoundError:
        logging.error(f"[{model_id}] File not found: {filepath}. Skipping.")
        return
    except Exception as e:
        logging.error(f"[{model_id}] Error reading {filepath}: {e}. Skipping.")
        return

    if not isinstance(data, dict):
        logging.error(f"[{model_id}] Expected JSON object (dict) but got {type(data)} in {filepath}. Skipping.")
        return

    description = data.get(DESCRIPTION_KEY)
    explanation_overwritten = False

    # --- Deletion Logic: Always remove existing explanation before trying to regenerate ---
    if EXPLANATION_KEY in data:
        logging.info(f"[{model_id}] Existing explanation found. Deleting before regenerating.")
        del data[EXPLANATION_KEY]
        explanation_overwritten = True # Mark that we intend to replace it

    # --- Generation Logic ---
    if not description:
         logging.warning(f"[{model_id}] Description field is missing or empty. Cannot generate explanation.")
         return

    explanation = generate_explanation(model_id, description) # Try to generate a new one

    # --- Update and Write Logic ---
    if explanation: # Only update if generation was successful
        data[EXPLANATION_KEY] = explanation
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=4)
            if explanation_overwritten:
                logging.info(f"[{model_id}] Successfully overwrote and updated {filepath} with new explanation.")
            else:
                logging.info(f"[{model_id}] Successfully generated and updated {filepath} with new explanation.")
        except IOError as e:
            logging.error(f"[{model_id}] Error writing updated data to {filepath}: {e}")
        except Exception as e:
            logging.error(f"[{model_id}] Unexpected error writing {filepath}: {e}")
    else: # Explanation generation failed
        log_message = f"[{model_id}] Failed to generate new explanation for {filepath} via API."
        if explanation_overwritten:
            log_message += " Existing explanation was removed but not replaced due to API failure."
        logging.warning(log_message)


def main():
    """Main function to iterate through the directory and process files."""
    # Configure LLM client at the start
    if not configure_llm_client():
        return # Stop if API key is not configured

    if not os.path.isdir(MODEL_DATA_DIR):
        logging.error(f"Directory not found: {MODEL_DATA_DIR}")
        return

    logging.info(f"Starting processing directory: {MODEL_DATA_DIR}")
    processed_files = 0
    skipped_files = 0

    all_files = [f for f in os.listdir(MODEL_DATA_DIR) if f.lower().endswith(".json")]
    total_files = len(all_files)
    logging.info(f"Found {total_files} JSON files to process.")

    for i, filename in enumerate(all_files):
        filepath = os.path.join(MODEL_DATA_DIR, filename)
        logging.info(f"--- Processing file {i+1}/{total_files}: {filename} ---")
        try:
            # process_json_file removes any existing explanation and regenerates it via
            # the API, so every file with a description results in an API call.
            process_json_file(filepath)
            processed_files += 1 # Count every file that was attempted

            # process_json_file does not return a status; whether a file was actually
            # updated is logged there rather than counted here.

        except Exception as e:
            logging.error(f"Unexpected error processing file {filename}: {e}")
            skipped_files += 1
        # Add a small delay between files to potentially avoid hitting rate limits
        time.sleep(0.5) # Adjust delay as needed


    logging.info(f"--- Processing complete ---")
    # Refine reporting slightly
    logging.info(f"Total JSON files found: {total_files}")
    logging.info(f"Files processed (attempted): {processed_files}")
    # A more accurate count of updated files would require modifying process_json_file to return status
    logging.info(f"Files skipped due to unexpected errors: {skipped_files}")

if __name__ == "__main__":
    main()
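
# Example invocation (shell). The script filename and API key value below are
# placeholders for illustration, not values taken from the original repository:
#
#   export DEEPSEEK_API_KEY="sk-..."
#   python regenerate_explanations.py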