import os
import json
from typing import Optional
import logging
import time
from openai import OpenAI, APIError  # the DeepSeek API is OpenAI-compatible

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define the base persistent storage path (must match other scripts)
PERSISTENT_STORAGE_PATH = "/data" # <-- ADJUST IF YOUR PATH IS DIFFERENT

# Point to the JSON data within persistent storage
MODEL_DATA_DIR = os.path.join(PERSISTENT_STORAGE_PATH, "model_data_json")
EXPLANATION_KEY = "model_explanation_gemini"  # key name retained from the earlier Gemini-based version of this script
DESCRIPTION_KEY = "description"
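# Illustrative shape of one file in MODEL_DATA_DIR (hypothetical example; real
# files may contain additional fields):
# {
#     "description": "Full model card text pulled from Hugging Face ...",
#     "model_explanation_gemini": "One-sentence summary added by this script."
# }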
MAX_RETRIES = 3 # Retries for API calls
RETRY_DELAY_SECONDS = 5 # Delay between retries
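
# A fixed retry delay is used below; an exponential backoff variant (sketch
# only, not used by this script) would compute the wait per attempt as:
#   delay = RETRY_DELAY_SECONDS * (2 ** attempt)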

# --- DeepSeek API Configuration ---
DEEPSEEK_API_KEY_ENV_VAR = "DEEPSEEK_API_KEY" # Environment variable for the key
DEEPSEEK_BASE_URL = "https://api.deepseek.com"
DEEPSEEK_MODEL_NAME = "deepseek-chat"
# ---

# Global handle for the DeepSeek (OpenAI-compatible) client
client: Optional[OpenAI] = None

def configure_llm_client():
    """Configures the OpenAI client for the DeepSeek API using the API key from the environment."""
    global client
    api_key = os.getenv(DEEPSEEK_API_KEY_ENV_VAR)
    if not api_key:
        logging.error(f"Error: {DEEPSEEK_API_KEY_ENV_VAR} environment variable not set.")
        logging.error("Please set the environment variable with your DeepSeek API key before running the script.")
        return False
    try:
        # DeepSeek's API is OpenAI-compatible, so the standard OpenAI client
        # works once pointed at the DeepSeek base URL.
        client = OpenAI(api_key=api_key, base_url=DEEPSEEK_BASE_URL)
        logging.info(f"DeepSeek API client configured successfully for model: {DEEPSEEK_MODEL_NAME}.")
        return True
    except Exception as e:
        logging.error(f"Failed to configure DeepSeek API client: {e}")
        client = None
        return False

# --- End DeepSeek API Configuration ---
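
# Example of supplying the API key before a run (hypothetical shell session;
# the script filename below is illustrative, not necessarily the actual one):
#   export DEEPSEEK_API_KEY="sk-..."
#   python generate_explanations.py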

def generate_explanation(model_id: str, description: str) -> Optional[str]:
    """
    Generates a short English explanation for the model based on its description
    by calling the DeepSeek API via the OpenAI library.

    Args:
        model_id: The ID of the model (for context).
        description: The model description text.

    Returns:
        A short English explanation string from DeepSeek, or None if generation fails.
    """
    global client
    if not client:
        logging.error(f"[{model_id}] DeepSeek client not configured. Cannot generate explanation.")
        return None

    if not description or not isinstance(description, str):
        logging.warning(f"[{model_id}] Description is empty or not a string. Skipping explanation generation.")
        return None

    # Truncate very long descriptions to stay comfortably within the model's context window
    max_desc_length = 4000
    if len(description) > max_desc_length:
        logging.warning(f"[{model_id}] Description truncated to {max_desc_length} chars for API call.")
        description = description[:max_desc_length] + "... [truncated]"

    # Construct the chat messages for the DeepSeek API
    messages = [
        {"role": "system", "content": "You are an AI assistant tasked with summarizing Hugging Face model descriptions concisely."},
        {"role": "user", "content": (
            f"Analyze the following description for the Hugging Face model '{model_id}'. "
            f"Based **only** on this description, provide a concise, one-sentence explanation in English "
            f"summarizing what this model does and its primary purpose or task. "
            f"Focus on the core functionality mentioned. Avoid adding introductory phrases like 'This model is...' or 'The model...'."
            f"\n\n---\nModel Description:\n{description}\n---\n\nConcise Explanation:"
        )}
    ]

    retries = 0
    while retries < MAX_RETRIES:
        try:
            logging.info(f"[{model_id}] Calling DeepSeek API (Attempt {retries + 1}/{MAX_RETRIES})...")
            # Use OpenAI client call format
            response = client.chat.completions.create(
                model=DEEPSEEK_MODEL_NAME,
                messages=messages,
                stream=False,
                max_tokens=100, # Limit response length
                temperature=0.2 # Lower temperature for more focused summary
            )

            # Remove Gemini response handling
            # if not response.candidates: ...
            
            explanation = response.choices[0].message.content.strip() # Get explanation from OpenAI response structure
            logging.info(f"[{model_id}] Explanation received from DeepSeek: '{explanation}'")
            
            # Basic post-processing: remove potential quotes
            if explanation.startswith('"') and explanation.endswith('"'):
                explanation = explanation[1:-1]
            # Remove Gemini specific post-processing
            # explanation = explanation.replace('**', '') 
            return explanation

        # APIError covers HTTP-level failures returned by the DeepSeek endpoint
        except APIError as e:
            retries += 1
            logging.error(f"[{model_id}] DeepSeek API Error (Attempt {retries}/{MAX_RETRIES}): {e}")
            if retries < MAX_RETRIES:
                logging.info(f"Retrying in {RETRY_DELAY_SECONDS} seconds...")
                time.sleep(RETRY_DELAY_SECONDS)
            else:
                logging.error(f"[{model_id}] Max retries reached. Failed to generate explanation via DeepSeek.")
                return None
        # Catch-all for unexpected failures (network errors, malformed responses, etc.)
        except Exception as e:
            retries += 1
            logging.error(f"[{model_id}] Unexpected Error during API call (Attempt {retries}/{MAX_RETRIES}): {e}")
            if retries < MAX_RETRIES:
                logging.info(f"Retrying in {RETRY_DELAY_SECONDS} seconds...")
                time.sleep(RETRY_DELAY_SECONDS)
            else:
                logging.error(f"[{model_id}] Max retries reached. Failed to generate explanation due to unexpected errors.")
                return None

    return None # Should not be reached if loop finishes without returning
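
# Minimal usage sketch for generate_explanation (hypothetical model id and
# description; assumes configure_llm_client() has already succeeded):
#   if configure_llm_client():
#       summary = generate_explanation("org/model-name", "A fine-tuned sentiment classifier for ...")
#       print(summary)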

def process_json_file(filepath: str):
    """Reads, updates (only if explanation missing), and writes a single JSON file."""
    model_id = os.path.splitext(os.path.basename(filepath))[0]  # strip only the trailing .json extension
    logging.info(f"Processing {filepath}...")

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError:
        logging.error(f"[{model_id}] Invalid JSON format in {filepath}. Skipping.")
        return False # Indicate failure/skip
    except FileNotFoundError:
        logging.error(f"[{model_id}] File not found: {filepath}. Skipping.")
        return False
    except Exception as e:
        logging.error(f"[{model_id}] Error reading {filepath}: {e}. Skipping.")
        return False

    if not isinstance(data, dict):
        logging.error(f"[{model_id}] Expected JSON object (dict) but got {type(data)} in {filepath}. Skipping.")
        return False

    # --- Check if explanation already exists ---
    existing_explanation = data.get(EXPLANATION_KEY)
    logging.debug(f"[{model_id}] Checking for existing explanation. Key: '{EXPLANATION_KEY}'. Found value: '{existing_explanation}' (Type: {type(existing_explanation)})")
    if existing_explanation:  # any non-empty, non-None value means an explanation already exists
        logging.info(f"[{model_id}] Explanation already exists. Skipping generation.")
        return False # Indicate no update was needed

    # --- Generation Logic ---
    logging.info(f"[{model_id}] Existing explanation is missing or empty. Proceeding with generation.")
    description = data.get(DESCRIPTION_KEY)
    if not description:
        logging.warning(f"[{model_id}] Description field is missing or empty. Cannot generate explanation.")
        return False  # Cannot generate, so no update possible

    explanation = generate_explanation(model_id, description) # Try to generate a new one

    # --- Update and Write Logic ---
    if explanation: # Only update if generation was successful
        data[EXPLANATION_KEY] = explanation
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=4)
            logging.info(f"[{model_id}] Successfully generated and updated {filepath} with new explanation.")
            return True # Indicate success/update
        except IOError as e:
            logging.error(f"[{model_id}] Error writing updated data to {filepath}: {e}")
            return False
        except Exception as e:
            logging.error(f"[{model_id}] Unexpected error writing {filepath}: {e}")
            return False
    else:  # Explanation generation failed
        logging.warning(f"[{model_id}] Failed to generate new explanation for {filepath} via API. File not updated.")
        return False  # Indicate failure / no update
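
# Note: the write above truncates the file in place, so a crash mid-write can
# lose data. A sketch of a safer alternative (assumption: a temp file in the
# same directory is acceptable), not used by this script:
#   tmp_path = filepath + ".tmp"
#   with open(tmp_path, 'w', encoding='utf-8') as f:
#       json.dump(data, f, ensure_ascii=False, indent=4)
#   os.replace(tmp_path, filepath)  # atomic on POSIX filesystems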


def main():
    """Main function to iterate through the directory and process files."""
    if not configure_llm_client():
        return # Stop if API key is not configured

    if not os.path.isdir(MODEL_DATA_DIR):
        logging.error(f"Directory not found: {MODEL_DATA_DIR}")
        return

    logging.info(f"Starting processing directory: {MODEL_DATA_DIR}")
    processed_files = 0
    updated_files = 0  # files actually updated with a new explanation
    skipped_error = 0  # files that hit unexpected errors in the processing loop

    all_files = [f for f in os.listdir(MODEL_DATA_DIR) if f.lower().endswith(".json")]
    total_files = len(all_files)
    logging.info(f"Found {total_files} JSON files to process.")

    for i, filename in enumerate(all_files):
        filepath = os.path.join(MODEL_DATA_DIR, filename)
        logging.info(f"--- Processing file {i+1}/{total_files}: {filename} ---")
        try:
            # process_json_file returns True if the file was updated, False otherwise
            updated = process_json_file(filepath)
            processed_files += 1
            if updated:
                updated_files += 1
            # When no update happened, the logs inside process_json_file record the
            # reason (existing explanation, missing description, read/write error,
            # or API failure); richer return codes would enable per-reason counters.

        except Exception as e:
            logging.error(f"Unexpected error processing file loop for {filename}: {e}")
            skipped_error += 1 # Count generic loop errors
        # Small delay between files to reduce the risk of hitting API rate limits;
        # tune this to match your DeepSeek quota.
        time.sleep(0.2)


    logging.info(f"--- Processing complete ---")
    logging.info(f"Total JSON files found: {total_files}")
    logging.info(f"Files processed (attempted): {processed_files}")
    logging.info(f"Files successfully updated with new explanation: {updated_files}")
    # Cannot precisely count skipped_existing vs skipped_error without better return values
    # logging.info(f"Files skipped (existing explanation, errors, or no description): {total_files - updated_files}")


if __name__ == "__main__":
    main()