Yacine Jernite committed
Commit 36de078 · 1 Parent(s): d6d8868

added TLDR functionality

Files changed (4)
  1. analysis_utils.py +684 -0
  2. app.py +548 -349
  3. llm_interface.py +1 -0
  4. utils.py +86 -34
analysis_utils.py ADDED
@@ -0,0 +1,684 @@
+import json  # Added for TLDR JSON parsing
+import logging
+import os
+import tempfile
+
+from huggingface_hub import HfApi
+from huggingface_hub.inference._generated.types import \
+    ChatCompletionOutput  # Added for type hinting
+
+# Imports from other project modules
+from llm_interface import (ERROR_503_DICT, parse_qwen_response,
+                           query_qwen_endpoint)
+from prompts import format_privacy_prompt, format_summary_highlights_prompt
+from utils import (PRIVACY_FILENAME,  # Import constants for filenames
+                   SUMMARY_FILENAME, TLDR_FILENAME, check_report_exists,
+                   download_cached_reports, get_space_code_files)
+
+# Configure logging (can inherit from app.py if called from there, but good practice)
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+# Load environment variables - redundant if always called by app.py which already loads them
+# load_dotenv()
+
+# Constants needed by helper functions (can be passed as args too)
+# Consider passing these from app.py if they might change or for clarity
+CACHE_INFO_MSG = "\n\n*(Report retrieved from cache)*"
+TRUNCATION_WARNING = """**⚠️ Warning:** The input data (code and/or prior analysis) was too long for the AI model's context limit and had to be truncated. The analysis below may be incomplete or based on partial information.\n\n---\n\n"""
+
+# --- Constants for TLDR Generation ---
+TLDR_SYSTEM_PROMPT = (
+    "You are an AI assistant specialized in summarizing privacy analysis reports for Hugging Face Spaces. "
+    "You will receive two reports: a detailed privacy analysis and a summary/highlights report. "
+    "Based **only** on the content of these two reports, generate a concise JSON object containing a structured TLDR (Too Long; Didn't Read). "
+    "Do not use any information not present in the provided reports. "
+    "The JSON object must have the following keys:\n"
+    '- "app_description": A 1-2 sentence summary of what the application does from a user\'s perspective.\n'
+    '- "privacy_tldr": A 2-3 sentence high-level overview of privacy. Mention if the analysis was conclusive based on available code, if data processing is local, or if/what data goes to external services.\n'
+    '- "data_types": A list of JSON objects, where each object has two keys: \'name\' (a short, unique identifier string for the data type, e.g., "User Text") and \'description\' (a brief string explaining the data type in context, max 6-8 words, e.g., "Text prompt entered by the user").\n'
+    "- \"user_input_data\": A list of strings, where each string is the 'name' of a data type defined in 'data_types' that is provided by the user to the app.\n"
+    "- \"local_processing\": A list of strings describing data processed locally. Each string should start with the 'name' of a data type defined in 'data_types', followed by details (like the processing model) in parentheses if mentioned in the reports. Example: \"User Text (Local Model XYZ)\".\n"
+    "- \"remote_processing\": A list of strings describing data sent to remote services. Each string should start with the 'name' of a data type defined in 'data_types', followed by the service/model name in parentheses if mentioned in the reports. Example: \"User Text (HF Inference API)\".\n"
+    "- \"external_logging\": A list of strings describing data logged or saved externally. Each string should start with the 'name' of a data type defined in 'data_types', followed by the location/service in parentheses if mentioned. Example: \"User Text (External DB)\".\n"
+    "Ensure the output is **only** a valid JSON object, starting with `{` and ending with `}`. Ensure all listed data types in the processing/logging lists exactly match a 'name' defined in the 'data_types' list."
+)
+
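For reference, a response that satisfies this schema would be a single JSON object along the following lines; the values are invented for illustration (not part of this commit), but the key names and nesting come directly from the prompt above:

```python
# Illustrative TLDR object matching the schema requested by TLDR_SYSTEM_PROMPT.
# All values are invented; keys and structure follow the prompt exactly.
example_tldr = {
    "app_description": "Lets the user enter a text prompt and generates an image.",
    "privacy_tldr": (
        "The analysis was conclusive based on the available code. Prompts are "
        "sent to a remote inference service; no external logging was found."
    ),
    "data_types": [
        {"name": "User Text", "description": "Text prompt entered by the user"},
        {"name": "Generated Image", "description": "Image produced from the prompt"},
    ],
    "user_input_data": ["User Text"],
    "local_processing": [],
    "remote_processing": ["User Text (HF Inference API)"],
    "external_logging": [],
}
```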
+# --- Analysis Pipeline Helper Functions ---
+
+
+def check_cache_and_download(space_id: str, dataset_id: str, hf_token: str | None):
+    """Checks cache and downloads if reports exist."""
+    logging.info(f"Checking cache for '{space_id}'...")
+    found_in_cache = False
+    if hf_token:
+        try:
+            found_in_cache = check_report_exists(space_id, dataset_id, hf_token)
+        except Exception as e:
+            logging.warning(f"Cache check failed for {space_id}: {e}. Proceeding.")
+            # Return cache_miss even if check failed, proceed to live analysis
+            return {"status": "cache_miss", "error_message": f"Cache check failed: {e}"}
+
+    if found_in_cache:
+        logging.info(f"Cache hit for {space_id}. Downloading.")
+        try:
+            cached_reports = download_cached_reports(space_id, dataset_id, hf_token)
+            summary_report = (
+                cached_reports.get("summary", "Error: Cached summary not found.")
+                + CACHE_INFO_MSG
+            )
+            privacy_report = (
+                cached_reports.get("privacy", "Error: Cached privacy report not found.")
+                + CACHE_INFO_MSG
+            )
+            logging.info(f"Successfully downloaded cached reports for {space_id}.")
+            return {
+                "status": "cache_hit",
+                "summary": summary_report,
+                "privacy": privacy_report,
+                "tldr_json_str": cached_reports.get("tldr_json_str"),
+            }
+        except Exception as e:
+            error_msg = f"Cache download failed for {space_id}: {e}"
+            logging.warning(f"{error_msg}. Proceeding with live analysis.")
+            # Return error, but let caller decide if live analysis proceeds
+            return {"status": "cache_error", "ui_message": error_msg}
+    else:
+        logging.info(f"Cache miss for {space_id}. Performing live analysis.")
+        return {"status": "cache_miss"}
+
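A minimal sketch of how a caller can branch on the status contract above (the statuses and keys come from check_cache_and_download itself; show_reports and run_live_analysis are hypothetical stand-ins):

```python
result = check_cache_and_download("owner/space", "yjernite/spaces-privacy-reports", hf_token)
if result["status"] == "cache_hit":
    show_reports(result["summary"], result["privacy"])  # hypothetical display helper
elif result["status"] == "cache_error":
    logging.warning(result["ui_message"])  # download failed; fall back to a live run
    run_live_analysis()  # hypothetical
else:  # "cache_miss", possibly with "error_message" if the check itself failed
    run_live_analysis()  # hypothetical
```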
+def check_endpoint_status(
+    endpoint_name: str, hf_token: str | None, error_503_user_message: str
+):
+    """Checks the status of the inference endpoint."""
+    logging.info(f"Checking endpoint status for '{endpoint_name}'...")
+    if not hf_token:
+        # Allow proceeding if token missing, maybe endpoint is public
+        logging.warning("HF_TOKEN not set, cannot check endpoint status definitively.")
+        return {"status": "ready", "warning": "HF_TOKEN not set"}
+
+    try:
+        api = HfApi(token=hf_token)
+        endpoint = api.get_inference_endpoint(name=endpoint_name)
+        status = endpoint.status
+        logging.info(f"Endpoint '{endpoint_name}' status: {status}")
+
+        if status == "running":
+            return {"status": "ready"}
+        else:
+            logging.warning(
+                f"Endpoint '{endpoint_name}' is not ready (Status: {status})."
+            )
+            if status == "scaledToZero":
+                logging.info(
+                    f"Endpoint '{endpoint_name}' is scaled to zero. Attempting to resume..."
+                )
+                try:
+                    endpoint.resume()
+                    # Still return an error message suggesting retry, as resume takes time
+                    # Keep this message concise as the action is specific (wait)
+                    msg = f"**Endpoint Resuming:** The analysis endpoint ('{endpoint_name}') was scaled to zero and is now restarting.\n\n{error_503_user_message}"
+                    return {"status": "error", "ui_message": msg}
+                except Exception as resume_error:
+                    # Resume failed, provide detailed message
+                    logging.error(
+                        f"Failed to resume endpoint {endpoint_name}: {resume_error}"
+                    )
+                    # Construct detailed message including full explanation
+                    msg = f"**Endpoint Issue:** The analysis endpoint ('{endpoint_name}') is currently {status} and an attempt to resume it failed ({resume_error}).\n\n{error_503_user_message}"
+                    return {"status": "error", "ui_message": msg}
+            else:  # Paused, failed, pending, etc.
+                # Construct detailed message including full explanation
+                msg = f"**Endpoint Issue:** The analysis endpoint ('{endpoint_name}') status is currently <span style='color:red'>**{status}**</span>.\n\n{error_503_user_message}"
+                return {"status": "error", "ui_message": msg}
+
+    except Exception as e:
+        error_msg = f"Error checking analysis endpoint status for {endpoint_name}: {e}"
+        logging.error(error_msg)
+        # Let analysis stop if endpoint check fails critically
+        return {"status": "error", "ui_message": f"Error checking endpoint status: {e}"}
+
+
+def fetch_and_validate_code(space_id: str):
+    """Fetches and validates code files for the space."""
+    logging.info(f"Fetching code files for {space_id}...")
+    code_files = get_space_code_files(space_id)
+    if not code_files:
+        error_msg = f"Could not retrieve code files for '{space_id}'. Check ID and ensure it's a public Space."
+        logging.warning(error_msg)
+        return {
+            "status": "error",
+            "ui_message": f"**Error:**\n{error_msg}\nAnalysis Canceled.",
+        }
+    logging.info(f"Successfully fetched {len(code_files)} files for {space_id}.")
+    return {"status": "success", "code_files": code_files}
+
+
+def generate_detailed_report(
+    space_id: str, code_files: dict, error_503_user_message: str
+):
+    """Generates the detailed privacy report using the LLM."""
+    logging.info("Generating detailed privacy analysis report...")
+    privacy_prompt_messages, privacy_truncated = format_privacy_prompt(
+        space_id, code_files
+    )
+
+    privacy_api_response = query_qwen_endpoint(privacy_prompt_messages, max_tokens=3072)
+
+    if privacy_api_response == ERROR_503_DICT:
+        logging.warning("LLM Call 1 (Privacy) failed with 503.")
+        return {"status": "error", "ui_message": error_503_user_message}
+
+    detailed_privacy_report = parse_qwen_response(privacy_api_response)
+
+    if "Error:" in detailed_privacy_report:
+        error_msg = (
+            f"Failed to generate detailed privacy report: {detailed_privacy_report}"
+        )
+        logging.error(error_msg)
+        return {
+            "status": "error",
+            "ui_message": f"**Error Generating Detailed Privacy Report:**\n{detailed_privacy_report}\nAnalysis Halted.",
+        }
+
+    if privacy_truncated:
+        detailed_privacy_report = TRUNCATION_WARNING + detailed_privacy_report
+
+    logging.info("Successfully generated detailed privacy report.")
+    return {
+        "status": "success",
+        "report": detailed_privacy_report,
+        "truncated": privacy_truncated,
+    }
+
+
+def generate_summary_report(
+    space_id: str,
+    code_files: dict,
+    detailed_privacy_report: str,
+    error_503_user_message: str,
+):
+    """Generates the summary & highlights report using the LLM."""
+    logging.info("Generating summary and highlights report...")
+    # Remove potential truncation warning from detailed report before sending to next LLM
+    clean_detailed_report = detailed_privacy_report.replace(TRUNCATION_WARNING, "")
+
+    summary_highlights_prompt_messages, summary_truncated = (
+        format_summary_highlights_prompt(space_id, code_files, clean_detailed_report)
+    )
+
+    summary_highlights_api_response = query_qwen_endpoint(
+        summary_highlights_prompt_messages, max_tokens=2048
+    )
+
+    if summary_highlights_api_response == ERROR_503_DICT:
+        logging.warning("LLM Call 2 (Summary) failed with 503.")
+        # Return specific status to indicate partial success
+        return {"status": "error_503_summary", "ui_message": error_503_user_message}
+
+    summary_highlights_report = parse_qwen_response(summary_highlights_api_response)
+
+    if "Error:" in summary_highlights_report:
+        error_msg = (
+            f"Failed to generate summary/highlights report: {summary_highlights_report}"
+        )
+        logging.error(error_msg)
+        # Return specific status to indicate partial success
+        return {
+            "status": "error_summary",
+            "ui_message": f"**Error Generating Summary/Highlights:**\n{summary_highlights_report}",
+        }
+
+    if summary_truncated:
+        summary_highlights_report = TRUNCATION_WARNING + summary_highlights_report
+
+    logging.info("Successfully generated summary & highlights report.")
+    return {
+        "status": "success",
+        "report": summary_highlights_report,
+        "truncated": summary_truncated,
+    }
+
+
+def upload_results(
+    space_id: str,
+    summary_report: str,
+    detailed_report: str,
+    dataset_id: str,
+    hf_token: str | None,
+    tldr_json_data: dict | None = None,
+):
+    """Uploads the generated reports (Markdown and optional JSON TLDR) to the specified dataset repository."""
+    if not hf_token:
+        logging.warning("HF Token not provided, skipping dataset report upload.")
+        return {"status": "skipped", "reason": "HF_TOKEN not set"}
+    if "Error:" in detailed_report or "Error:" in summary_report:
+        msg = "Skipping cache upload due to errors in generated reports."
+        logging.warning(msg)
+        return {"status": "skipped", "reason": msg}
+
+    safe_space_id = space_id.replace("..", "")
+
+    try:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Define local paths
+            summary_path_local = os.path.join(tmpdir, SUMMARY_FILENAME)
+            privacy_path_local = os.path.join(tmpdir, PRIVACY_FILENAME)
+            tldr_json_path_local = os.path.join(tmpdir, TLDR_FILENAME)
+
+            # Write Markdown reports
+            with open(summary_path_local, "w", encoding="utf-8") as f:
+                f.write(summary_report)
+            with open(privacy_path_local, "w", encoding="utf-8") as f:
+                f.write(detailed_report)
+
+            # Prepare commit message
+            commit_message = f"Add analysis reports for Space: {safe_space_id}"
+            if tldr_json_data:
+                commit_message += " (including TLDR JSON)"
+                # Write JSON TLDR data if available
+                try:
+                    with open(tldr_json_path_local, "w", encoding="utf-8") as f:
+                        json.dump(tldr_json_data, f, indent=2, ensure_ascii=False)
+                    logging.info(
+                        f"Successfully wrote TLDR JSON locally for {safe_space_id}."
+                    )
+                except Exception as json_err:
+                    logging.error(
+                        f"Failed to write TLDR JSON locally for {safe_space_id}: {json_err}"
+                    )
+                    tldr_json_data = None  # Prevent upload attempt if writing failed
+
+            # Ensure repo exists
+            api = HfApi(token=hf_token)
+            repo_url = api.create_repo(
+                repo_id=dataset_id,
+                repo_type="dataset",
+                exist_ok=True,
+            )
+            logging.info(f"Ensured dataset repo {repo_url} exists.")
+
+            # Upload summary report
+            api.upload_file(
+                path_or_fileobj=summary_path_local,
+                path_in_repo=f"{safe_space_id}/{SUMMARY_FILENAME}",
+                repo_id=dataset_id,
+                repo_type="dataset",
+                commit_message=commit_message,
+            )
+            logging.info(f"Successfully uploaded summary report for {safe_space_id}.")
+
+            # Upload privacy report
+            api.upload_file(
+                path_or_fileobj=privacy_path_local,
+                path_in_repo=f"{safe_space_id}/{PRIVACY_FILENAME}",
+                repo_id=dataset_id,
+                repo_type="dataset",
+                commit_message=commit_message,
+            )
+            logging.info(
+                f"Successfully uploaded detailed privacy report for {safe_space_id}."
+            )
+            # print(f"Successfully uploaded detailed privacy report for {safe_space_id}.")  # Keep if needed for debug
+
+            # Upload JSON TLDR if it was successfully written locally
+            if tldr_json_data and os.path.exists(tldr_json_path_local):
+                api.upload_file(
+                    path_or_fileobj=tldr_json_path_local,
+                    path_in_repo=f"{safe_space_id}/{TLDR_FILENAME}",
+                    repo_id=dataset_id,
+                    repo_type="dataset",
+                    commit_message=commit_message,  # Can reuse commit message or make specific
+                )
+                logging.info(f"Successfully uploaded TLDR JSON for {safe_space_id}.")
+                print(f"Successfully uploaded TLDR JSON for {safe_space_id}.")
+
+            # Return success if all uploads finished without error
+            return {"status": "success"}
+
+    except Exception as e:
+        error_msg = f"Non-critical error during report upload for {safe_space_id}: {e}"
+        logging.error(error_msg)
+        print(error_msg)
+        return {"status": "error", "message": error_msg}
+
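An illustrative call under assumed inputs (summary_md, privacy_md, and tldr are placeholders for previously generated content); the reports are cached under a "safe_space_id/" prefix in the dataset repo, with the TLDR JSON alongside them:

```python
result = upload_results(
    space_id="owner/space",
    summary_report=summary_md,
    detailed_report=privacy_md,
    dataset_id="yjernite/spaces-privacy-reports",
    hf_token=os.environ.get("HF_TOKEN"),
    tldr_json_data=tldr,  # optional; omit to upload only the Markdown reports
)
assert result["status"] in {"success", "skipped", "error"}
```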
+
+# --- New TLDR Generation Functions ---
+
+
+def format_tldr_prompt(
+    detailed_report: str, summary_report: str
+) -> list[dict[str, str]]:
+    """Formats the prompt for the TLDR generation task."""
+    # Clean potential cache/truncation markers from input reports for the LLM
+    cleaned_detailed = detailed_report.replace(CACHE_INFO_MSG, "").replace(
+        TRUNCATION_WARNING, ""
+    )
+    cleaned_summary = summary_report.replace(CACHE_INFO_MSG, "").replace(
+        TRUNCATION_WARNING, ""
+    )
+
+    user_content = (
+        "Please generate a structured JSON TLDR based on the following reports:\n\n"
+        "--- DETAILED PRIVACY ANALYSIS REPORT START ---\n"
+        f"{cleaned_detailed}\n"
+        "--- DETAILED PRIVACY ANALYSIS REPORT END ---\n\n"
+        "--- SUMMARY & HIGHLIGHTS REPORT START ---\n"
+        f"{cleaned_summary}\n"
+        "--- SUMMARY & HIGHLIGHTS REPORT END ---"
+    )
+
+    # Note: We are not handling truncation here, assuming the input reports
+    # are already reasonably sized from the previous steps.
+    # If reports could be extremely long, add truncation logic similar to other format_* functions.
+
+    messages = [
+        {"role": "system", "content": TLDR_SYSTEM_PROMPT},
+        {"role": "user", "content": user_content},
+    ]
+    return messages
+
+
+def parse_tldr_json_response(
+    response: ChatCompletionOutput | dict | None,
+) -> dict | None:
+    """Parses the LLM response, expecting JSON content for the TLDR."""
+    if response is None:
+        logging.error("TLDR Generation: Failed to get response from LLM.")
+        return None
+
+    # Check for 503 error dict first
+    if isinstance(response, dict) and response.get("error_type") == "503":
+        logging.error(f"TLDR Generation: Received 503 error: {response.get('message')}")
+        return None  # Treat 503 as failure for this specific task
+
+    # --- Direct Content Extraction (Replaces call to parse_qwen_response) ---
+    raw_content = ""
+    try:
+        # Check if it's likely the expected ChatCompletionOutput structure
+        if not hasattr(response, "choices"):
+            logging.error(
+                f"TLDR Generation: Unexpected response type received: {type(response)}. Content: {response}"
+            )
+            return None  # Return None if not the expected structure
+
+        # Access the generated content according to the ChatCompletionOutput structure
+        if response.choices and len(response.choices) > 0:
+            content = response.choices[0].message.content
+            if content:
+                raw_content = content.strip()
+                logging.info(
+                    "TLDR Generation: Successfully extracted raw content from response."
+                )
+            else:
+                logging.warning(
+                    "TLDR Generation: Response received, but content is empty."
+                )
+                return None
+        else:
+            logging.warning("TLDR Generation: Response received, but no choices found.")
+            return None
+    except AttributeError as e:
+        # This might catch cases where response looks like the object but lacks expected attributes
+        logging.error(
+            f"TLDR Generation: Attribute error parsing response object: {e}. Response structure might be unexpected. Response: {response}"
+        )
+        return None
+    except Exception as e:
+        logging.error(
+            f"TLDR Generation: Unexpected error extracting content from response object: {e}"
+        )
+        return None
+    # --- End Direct Content Extraction ---
+
+    # --- JSON Parsing Logic ---
+    if not raw_content:  # Should be caught by checks above, but belts and suspenders
+        logging.error("TLDR Generation: Raw content is empty after extraction attempt.")
+        return None
+
+    try:
+        # Clean potential markdown code block formatting
+        if raw_content.strip().startswith("```json"):
+            raw_content = raw_content.strip()[7:-3].strip()
+        elif raw_content.strip().startswith("```"):
+            raw_content = raw_content.strip()[3:-3].strip()
+
+        tldr_data = json.loads(raw_content)
+
+        # Validate structure: Check if it's a dict and has all required keys
+        required_keys = [
+            "app_description",
+            "privacy_tldr",
+            "data_types",
+            "user_input_data",
+            "local_processing",
+            "remote_processing",
+            "external_logging",
+        ]
+        if not isinstance(tldr_data, dict):
+            logging.error(
+                f"TLDR Generation: Parsed content is not a dictionary. Content: {raw_content[:500]}..."
+            )
+            return None
+        if not all(key in tldr_data for key in required_keys):
+            missing_keys = [key for key in required_keys if key not in tldr_data]
+            logging.error(
+                f"TLDR Generation: Parsed JSON is missing required keys: {missing_keys}. Content: {raw_content[:500]}..."
+            )
+            return None
+
+        # --- Add validation for the new data_types structure ---
+        data_types_list = tldr_data.get("data_types")
+        if not isinstance(data_types_list, list):
+            logging.error(
+                f"TLDR Generation: 'data_types' is not a list. Content: {data_types_list}"
+            )
+            return None
+        for item in data_types_list:
+            if (
+                not isinstance(item, dict)
+                or "name" not in item
+                or "description" not in item
+            ):
+                logging.error(
+                    f"TLDR Generation: Invalid item found in 'data_types' list: {item}. Must be dict with 'name' and 'description'."
+                )
+                return None
+            if not isinstance(item["name"], str) or not isinstance(
+                item["description"], str
+            ):
+                logging.error(
+                    f"TLDR Generation: Invalid types for name/description in 'data_types' item: {item}. Must be strings."
+                )
+                return None
+        # --- End validation for data_types ---
+
+        # Basic validation for other lists (should contain strings)
+        validation_passed = True
+        for key in [
+            "user_input_data",
+            "local_processing",
+            "remote_processing",
+            "external_logging",
+        ]:
+            data_list = tldr_data.get(key)
+            # Add more detailed check and logging
+            if not isinstance(data_list, list):
+                logging.error(
+                    f"TLDR Generation Validation Error: Key '{key}' is not a list. Found type: {type(data_list)}, Value: {data_list}"
+                )
+                validation_passed = False
+                # Allow continuing validation for other keys, but mark as failed
+            elif not all(isinstance(x, str) for x in data_list):
+                # This check might be too strict if LLM includes non-strings, but keep for now
+                logging.warning(
+                    f"TLDR Generation Validation Warning: Not all items in list '{key}' are strings. Content: {data_list}"
+                )
+                # Decide if this should cause failure - currently it doesn't, just warns
+
+        if not validation_passed:
+            logging.error(
+                "TLDR Generation: Validation failed due to incorrect list types."
+            )
+            return None  # Ensure failure if any key wasn't a list
+
+        logging.info("Successfully parsed and validated TLDR JSON response.")
+        return tldr_data
+
+    except json.JSONDecodeError as e:
+        logging.error(
+            f"TLDR Generation: Failed to decode JSON response: {e}. Content: {raw_content[:500]}..."
+        )
+        return None
+    except Exception as e:
+        logging.error(f"TLDR Generation: Unexpected error parsing JSON response: {e}")
+        return None
+
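A small test-style sketch of the happy path, using types.SimpleNamespace to stand in for a ChatCompletionOutput object (an assumption made for illustration only); it also exercises the markdown-fence stripping above:

```python
import json
from types import SimpleNamespace

payload = {
    "app_description": "Captions images uploaded by the user.",
    "privacy_tldr": "Analysis was conclusive; images go to a remote API.",
    "data_types": [
        {"name": "User Image", "description": "Image uploaded by the user"}
    ],
    "user_input_data": ["User Image"],
    "local_processing": [],
    "remote_processing": ["User Image (HF Inference API)"],
    "external_logging": [],
}
# The model often wraps its JSON in a ```json fence; the parser strips it.
content = "```json\n" + json.dumps(payload) + "\n```"
fake_response = SimpleNamespace(
    choices=[SimpleNamespace(message=SimpleNamespace(content=content))]
)
assert parse_tldr_json_response(fake_response) == payload
```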
+
+def render_tldr_markdown(tldr_data: dict | None, space_id: str | None = None) -> str:
+    """Renders the top-level TLDR (description, privacy) data into a Markdown string.
+
+    (Does not include the data lists)
+    """
+    if not tldr_data:
+        # Return a more specific message for this part
+        return "*TLDR Summary could not be generated.*\n"
+
+    output = []
+
+    # Add Space link if space_id is provided
+    if space_id:
+        output.append(
+            f"**Source Space:** [`{space_id}`](https://huggingface.co/spaces/{space_id})\n"
+        )
+
+    output.append(f"**App Description:** {tldr_data.get('app_description', 'N/A')}\n")
+    privacy_summary = tldr_data.get("privacy_tldr", "N/A")
+    output.append(f"**Privacy TLDR:** {privacy_summary}")  # Removed extra newline
+
+    # Removed data list rendering from this function
+
+    return "\n".join(output)
+
+
+def render_data_details_markdown(tldr_data: dict | None) -> str:
+    """Renders the data lists (types, input, processing, logging) from TLDR data."""
+    if not tldr_data:
+        return "*Data details could not be generated.*\n"
+
+    output = []
+    # Get defined names for formatting
+    defined_names = sorted(
+        [
+            dt.get("name", "")
+            for dt in tldr_data.get("data_types", [])
+            if dt.get("name")
+        ],
+        key=len,
+        reverse=True,
+    )
+
+    output.append("**Data Types Defined:**")  # Renamed slightly for clarity
+    data_types = tldr_data.get("data_types")
+    if data_types and isinstance(data_types, list):
+        if not data_types:
+            output.append("- None identified.")
+        else:
+            for item in data_types:
+                name = item.get("name", "Unnamed")
+                desc = item.get("description", "No description")
+                output.append(f"- `{name}`: {desc}")
+    else:
+        output.append("- (Error loading data types)")
+    output.append("")  # Add newline for spacing
+
+    # Reusable helper for rendering lists
+    def render_list(title, key):
+        output.append(f"**{title}:**")
+        data_list = tldr_data.get(key)
+        if isinstance(data_list, list):
+            if not data_list:
+                output.append("- None identified.")
+            else:
+                for item_str in data_list:
+                    formatted_item = item_str  # Default
+                    found_match = False
+                    for name in defined_names:
+                        if item_str == name:
+                            formatted_item = f"`{name}`"
+                            found_match = True
+                            break
+                        elif item_str.startswith(name + " "):
+                            formatted_item = f"`{name}`{item_str[len(name):]}"
+                            found_match = True
+                            break
+                    if (
+                        not found_match
+                        and " " not in item_str
+                        and not item_str.startswith("`")
+                    ):
+                        formatted_item = f"`{item_str}`"
+                    output.append(f"- {formatted_item}")
+        else:
+            output.append("- (Error loading list)")
+        output.append("")
+
+    render_list("Data Sent by User to App", "user_input_data")
+    render_list("Data Processed Locally within App", "local_processing")
+    render_list("Data Processed Remotely", "remote_processing")
+    render_list("Data Logged/Saved Externally", "external_logging")
+
+    # Remove the last empty line
+    if output and output[-1] == "":
+        output.pop()
+
+    return "\n".join(output)
+
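Fed the example object from earlier, the two renderers produce markdown along these lines (a sketch; exact spacing follows from the code above):

```python
print(render_tldr_markdown(example_tldr, space_id="owner/space"))
# **Source Space:** [`owner/space`](https://huggingface.co/spaces/owner/space)
#
# **App Description:** Lets the user enter a text prompt and generates an image.
#
# **Privacy TLDR:** The analysis was conclusive based on the available code. ...

print(render_data_details_markdown(example_tldr))
# **Data Types Defined:**
# - `User Text`: Text prompt entered by the user
# - `Generated Image`: Image produced from the prompt
#
# **Data Sent by User to App:**
# - `User Text`
# ...
```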
+
+# --- Combined TLDR Generation Function ---
+
+
+def generate_and_parse_tldr(detailed_report: str, summary_report: str) -> dict | None:
+    """Formats prompt, queries LLM, and parses JSON response for TLDR.
+
+    Args:
+        detailed_report: The detailed privacy report content.
+        summary_report: The summary & highlights report content.
+
+    Returns:
+        A dictionary with the parsed TLDR data, or None if any step fails.
+    """
+    logging.info("Starting TLDR generation and parsing...")
+    try:
+        # Format
+        tldr_prompt_messages = format_tldr_prompt(detailed_report, summary_report)
+        if not tldr_prompt_messages:
+            logging.error("TLDR Generation: Failed to format prompt.")
+            return None
+
+        # Query (using existing import within analysis_utils)
+        # Use slightly smaller max_tokens
+        llm_response = query_qwen_endpoint(tldr_prompt_messages, max_tokens=1024)
+        if llm_response is None:  # Check if query itself failed critically
+            logging.error("TLDR Generation: LLM query returned None.")
+            return None
+        # 503 handled within parse function below
+
+        # Parse
+        parsed_data = parse_tldr_json_response(llm_response)
+        if parsed_data:
+            logging.info("Successfully generated and parsed TLDR.")
+            return parsed_data
+        else:
+            logging.error("TLDR Generation: Failed to parse JSON response.")
+            return None
+
+    except Exception as e:
+        logging.error(
+            f"TLDR Generation: Unexpected error in generate_and_parse_tldr: {e}",
+            exc_info=True,
+        )
+        return None
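Taken together, the helpers above form a pipeline. A rough sketch of how a caller like app.py chains them for one Space (error handling elided; the message strings are placeholders, not part of this commit):

```python
def analyze_space(space_id: str, dataset_id: str, hf_token: str | None) -> dict | None:
    cached = check_cache_and_download(space_id, dataset_id, hf_token)
    if cached["status"] == "cache_hit":
        return cached  # summary, privacy, and optional tldr_json_str

    code = fetch_and_validate_code(space_id)
    if code["status"] != "success":
        return None

    detailed = generate_detailed_report(space_id, code["code_files"], "Endpoint busy.")
    if detailed["status"] != "success":
        return None
    summary = generate_summary_report(
        space_id, code["code_files"], detailed["report"], "Endpoint busy."
    )
    if summary["status"] != "success":
        return None

    tldr = generate_and_parse_tldr(detailed["report"], summary["report"])  # may be None
    upload_results(
        space_id, summary["report"], detailed["report"], dataset_id, hf_token,
        tldr_json_data=tldr,
    )
    return {"summary": summary["report"], "privacy": detailed["report"], "tldr": tldr}
```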
app.py CHANGED
@@ -1,25 +1,37 @@
 import logging
 import os
 
 import gradio as gr
 from dotenv import load_dotenv
-
 from huggingface_hub import HfApi
 
-from llm_interface import ERROR_503_DICT  # Import error dict
-from llm_interface import parse_qwen_response, query_qwen_endpoint
 
-# Updated prompt imports for new order
-from prompts import format_privacy_prompt, format_summary_highlights_prompt
 
-# Import helper functions from other modules
-from utils import list_cached_spaces  # Added import
-from utils import (
-    check_report_exists,
-    download_cached_reports,
-    get_space_code_files,
-    upload_reports_to_dataset,
-)
 
 # Configure logging
 logging.basicConfig(
@@ -34,10 +46,13 @@ load_dotenv()
 HF_TOKEN = os.getenv("HF_TOKEN")
 ENDPOINT_NAME = "qwen2-5-coder-32b-instruct-pmf"
 DATASET_ID = "yjernite/spaces-privacy-reports"
-CACHE_INFO_MSG = "\n\n*(Report retrieved from cache)*"
 DEFAULT_SELECTION = "HuggingFaceTB/SmolVLM2"
 
-TRUNCATION_WARNING = """**⚠️ Warning:** The input data (code and/or prior analysis) was too long for the AI model's context limit and had to be truncated. The analysis below may be incomplete or based on partial information.\n\n---\n\n"""
 
 ERROR_503_USER_MESSAGE = """It appears that the analysis model endpoint is currently down or starting up.
 
@@ -49,419 +64,582 @@ You have a few options:
 """
 
 
-def get_space_report_wrapper(
-    selected_cached_space: str | None,
-    new_space_id: str | None,
-    progress=gr.Progress(track_tqdm=True),
-):
     """
-    Wrapper function to decide whether to fetch cache or run live analysis.
-    Handles the logic based on Dropdown and Textbox inputs.
     Yields tuples of Gradio updates.
     """
-    target_space_id = None
-    source = "new"  # Assume new input unless dropdown is chosen
 
-    # Prioritize new_space_id if provided
-    if new_space_id and new_space_id.strip():
-        target_space_id = new_space_id.strip()
-        if target_space_id == selected_cached_space:
-            source = "dropdown_match"  # User typed ID that exists in dropdown
-        else:
-            source = "new"
-    elif selected_cached_space:
-        target_space_id = selected_cached_space
-        source = "dropdown"
 
-    if not target_space_id:
-        # No input provided
-        return (
-            gr.update(
-                value="Please select an existing report or enter a new Space ID.",
-                visible=True,
-            ),
-            gr.update(value="", visible=False),
-            gr.update(visible=True, open=True),
-            gr.update(visible=False),
-        )
 
-    # Validate format
-    if "/" not in target_space_id:
-        return (
             gr.update(
-                value=f"Invalid Space ID format: '{target_space_id}'. Use 'owner/name'.",
                 visible=True,
             ),
-            gr.update(value="", visible=False),
-            gr.update(visible=True, open=True),
-            gr.update(visible=False),
-        )
-
-    logging.info(f"Request received for: '{target_space_id}' (Source: {source})")
-
-    # --- Cache Handling ---
-    # If the user explicitly selected from the dropdown, try to fetch it directly.
-    if source == "dropdown":
-        progress(
-            0.1, desc="Fetching cached report..."
-        )  # Simple progress for cache fetch
-        yield (
-            gr.update(value="Fetching selected cached report...", visible=True),
             gr.update(value="", visible=True),
-            gr.update(visible=True, open=True),
             gr.update(visible=True, open=False),
         )
-        try:
-            cached_reports = download_cached_reports(
-                target_space_id, DATASET_ID, HF_TOKEN
-            )
-            summary_report = (
-                cached_reports.get("summary", "Error: Cached summary not found.")
-                + CACHE_INFO_MSG
-            )
-            privacy_report = (
-                cached_reports.get("privacy", "Error: Cached privacy report not found.")
-                + CACHE_INFO_MSG
-            )
-            logging.info(
-                f"Successfully displayed cached reports for selected '{target_space_id}'."
-            )
-            progress(1.0, desc="Complete (from cache)")
-            yield (
-                gr.update(value=summary_report, visible=True),
-                gr.update(value=privacy_report, visible=True),
-                gr.update(visible=True, open=True),
-                gr.update(visible=True, open=True),
-            )
-        except Exception as e:
-            error_msg = f"Failed to download cached report for selected '{target_space_id}': {e}"
-            logging.error(error_msg)
-            progress(1.0, desc="Error")
-            yield (
-                gr.update(value=error_msg, visible=True),
-                gr.update(value="", visible=False),
-                gr.update(visible=True, open=True),
-                gr.update(visible=False),
-            )
-
-    # --- Live Analysis or Check Cache for New Input ---
-    # If it came from the textbox OR was a dropdown match, we first check cache, then run live.
-    else:  # source == "new" or source == "dropdown_match"
-        # This generator now performs the full analysis if needed
-        # Yield intermediate updates from the generator
-        # Important: Need to use a loop to consume the generator
-        final_update = None
-        for update_tuple in _run_live_analysis(target_space_id, progress):
-            yield update_tuple
-            final_update = update_tuple  # Keep track of the last update
-        yield final_update  # Return the very last state
-
 
-def _run_live_analysis(space_id: str, progress=gr.Progress(track_tqdm=True)):
-    """
-    Performs the full analysis pipeline: cache check, code fetch, LLM calls, upload.
-    Yields tuples of Gradio updates.
-    (This contains the logic previously in analyze_space_privacy, minus initial input handling)
-    """
-    steps = 8  # Steps for the full pipeline
-    privacy_truncated = False
-    summary_truncated = False
-
-    # --- Step 1: Check Cache --- (Check again for new/matched input)
-    progress(1 / steps, desc="Step 1/8: Checking cache...")
-    logging.info(f"Step 1/8: Checking cache for '{space_id}'...")
     yield (
-        gr.update(value="Checking cache for existing reports...", visible=True),
         gr.update(value="", visible=True),
-        gr.update(visible=True, open=True),
         gr.update(visible=True, open=False),
     )
-    found_in_cache = False
-    if HF_TOKEN:
-        try:
-            found_in_cache = check_report_exists(space_id, DATASET_ID, HF_TOKEN)
-        except Exception as e:
-            logging.warning(f"Cache check failed: {e}. Proceeding.")
-            yield (
-                gr.update(
-                    value="Cache check failed, proceeding with live analysis...",
-                    visible=True,
-                ),
-                gr.update(value="", visible=True),
-                gr.update(visible=True, open=True),
-                gr.update(visible=True, open=False),
-            )
 
-    if found_in_cache:
-        logging.info(f"Cache hit for {space_id}. Downloading.")
-        progress(2 / steps, desc="Step 2/8: Cache hit! Downloading reports...")
         yield (
-            gr.update(value="Cache hit! Downloading reports...", visible=True),
-            gr.update(value="", visible=True),
-            gr.update(visible=True, open=True),
-            gr.update(visible=True, open=False),
-        )
-        try:
-            cached_reports = download_cached_reports(space_id, DATASET_ID, HF_TOKEN)
-            summary_report = (
-                cached_reports.get("summary", "Error: Cached summary not found.")
-                + CACHE_INFO_MSG
-            )
-            privacy_report = (
-                cached_reports.get("privacy", "Error: Cached privacy report not found.")
-                + CACHE_INFO_MSG
-            )
-            logging.info(f"Successfully displayed cached reports for {space_id}.")
-            progress(8 / steps, desc="Complete (from cache)")
-            yield (
-                gr.update(value=summary_report, visible=True),
-                gr.update(value=privacy_report, visible=True),
-                gr.update(visible=True, open=True),
-                gr.update(visible=True, open=True),
-            )
-            return  # End generation here if cache successful
-        except Exception as e:
-            logging.warning(f"Cache download failed for {space_id}: {e}. Proceeding.")
-            yield (
-                gr.update(
-                    value="Cache download failed, proceeding with live analysis...",
-                    visible=True,
-                ),
-                gr.update(value="", visible=True),
-                gr.update(visible=True, open=True),
-                gr.update(visible=True, open=False),
-            )
-    else:
-        logging.info(f"Cache miss for {space_id}. Performing live analysis.")
-        yield (
-            gr.update(value="Cache miss. Fetching code...", visible=True),
-            gr.update(value="", visible=True),
-            gr.update(visible=True, open=True),
-            gr.update(visible=True, open=False),
-        )
 
-    # --- Step 2: Check Endpoint Status ---
-    progress(2 / steps, desc="Step 2/8: Checking endpoint status...")
-    logging.info("Step 2/8: Checking endpoint status...")
     yield (
-        gr.update(value="Checking whether model endpoint is active...", visible=True),
         gr.update(value="", visible=True),
-        gr.update(visible=True, open=True),
         gr.update(visible=True, open=False),
     )
 
-    endpoint_ready = False
-    if HF_TOKEN:
-        try:
-            api = HfApi(token=HF_TOKEN)
-            endpoint = api.get_inference_endpoint(name=ENDPOINT_NAME)
-            status = endpoint.status
-
-            logging.info(f"Endpoint '{ENDPOINT_NAME}' status: {status}")
-
-            if status == 'running':
-                endpoint_ready = True
-            else:
-                logging.warning(f"Endpoint '{ENDPOINT_NAME}' is not ready (Status: {status}).")
-                if status == 'scaledToZero':
-                    logging.info(f"Endpoint '{ENDPOINT_NAME}' is scaled to zero. Attempting to resume...")
-                    endpoint.resume()
-                msg_503 = f"**Full Service Temporarily Unavailable**: but you can **browse existing reports** or **check back later!**\n\n The status of the Qwen2.5-Coder-32B-Instruct endpoint powering the analysis is currently: <span style='color:red'>**{status}**</span>\n\n" + ERROR_503_USER_MESSAGE
-                yield (
-                    gr.update(value=msg_503, visible=True),
-                    gr.update(value="", visible=False),
-                    gr.update(visible=True, open=True),
-                    gr.update(visible=False)
-                )
-                return  # Stop analysis, user needs to retry
-        except Exception as e:
-            logging.error(f"Error checking endpoint status for {ENDPOINT_NAME}: {e}")
-            yield (
-                gr.update(value=f"Error checking analysis endpoint status: {e}", visible=True),
-                gr.update(value="", visible=False),
-                gr.update(visible=True, open=True),
-                gr.update(visible=False)
-            )
-            return  # Stop analysis
-
-    # --- Step 3: Fetch Code Files (if not cached) ---
-    progress(3 / steps, desc="Step 3/8: Fetching code files...")
-    logging.info("Step 3/8: Fetching code files...")
-    code_files = get_space_code_files(space_id)
-    if not code_files:
-        error_msg = f"Could not retrieve code files for '{space_id}'. Check ID and ensure it's a public Space."
-        logging.warning(error_msg)
         yield (
-            gr.update(value=f"**Error:**\n{error_msg}", visible=True),
             gr.update(value="Analysis Canceled", visible=True),
-            gr.update(visible=True, open=True),
             gr.update(visible=True, open=False),
         )
-        return  # End generation on error
 
     # --- Step 4: Generate DETAILED Privacy Report (LLM Call 1) ---
-    progress(
-        4 / steps, desc="Step 4/8: Generating detailed privacy report (AI Call 1)..."
     )
-    logging.info("Step 4/8: Generating detailed privacy analysis report...")
     yield (
-        gr.update(value="Generating detailed privacy report...", visible=True),
         gr.update(value="Generating detailed privacy report via AI...", visible=True),
-        gr.update(visible=True, open=True),
         gr.update(visible=True, open=True),
     )
-    privacy_prompt_messages, privacy_truncated = format_privacy_prompt(
-        space_id, code_files
     )
 
-    # --- Check for 503 after query ---
-    privacy_api_response = query_qwen_endpoint(privacy_prompt_messages, max_tokens=3072)
-    if privacy_api_response == ERROR_503_DICT:
-        logging.warning("LLM Call 1 failed with 503.")
     yield (
-        gr.update(
-            value=ERROR_503_USER_MESSAGE, visible=True
-        ),  # Show 503 message in summary area
-        gr.update(value="", visible=False),  # Clear privacy area
-        gr.update(visible=True, open=True),  # Keep summary open
-        gr.update(visible=False),  # Hide privacy accordion
     )
-    return  # Stop analysis
 
-    detailed_privacy_report = parse_qwen_response(privacy_api_response)
 
-    if "Error:" in detailed_privacy_report:
-        logging.error(
-            f"Failed to generate detailed privacy report: {detailed_privacy_report}"
     )
     yield (
-        gr.update(value="Analysis Halted due to Error", visible=True),
         gr.update(
-            value=f"**Error Generating Detailed Privacy Report:**\n{detailed_privacy_report}",
             visible=True,
         ),
-        gr.update(visible=True, open=True),
         gr.update(visible=True, open=True),
     )
-    return  # End generation on error
-    if privacy_truncated:
-        detailed_privacy_report = TRUNCATION_WARNING + detailed_privacy_report
 
     yield (
-        gr.update(value="Extracting model info...", visible=True),
-        gr.update(value=detailed_privacy_report, visible=True),
-        gr.update(visible=True, open=True),
         gr.update(visible=True, open=True),
     )
 
-    # --- Step 5: Fetch Model Descriptions ---
-    progress(5 / steps, desc="Step 5/8: Fetching model descriptions...")
-    logging.info("Step 5/8: Fetching model descriptions...")
     yield (
-        gr.update(value="Fetching model descriptions...", visible=True),
         gr.update(),
         gr.update(),
         gr.update(),
     )
-    # --- Step 6: Generate Summary + Highlights Report (LLM Call 2) ---
-    progress(6 / steps, desc="Step 6/8: Generating summary & highlights (AI Call 2)...")
-    logging.info("Step 6/8: Generating summary and highlights report...")
     yield (
-        gr.update(value="Generating summary & highlights via AI...", visible=True),
         gr.update(),
         gr.update(),
         gr.update(),
     )
-    summary_highlights_prompt_messages, summary_truncated = (
-        format_summary_highlights_prompt(space_id, code_files, detailed_privacy_report)
     )
 
-    # --- Check for 503 after query ---
-    summary_highlights_api_response = query_qwen_endpoint(
-        summary_highlights_prompt_messages, max_tokens=2048
     )
-    if summary_highlights_api_response == ERROR_503_DICT:
-        logging.warning("LLM Call 2 failed with 503.")
     yield (
         gr.update(
-            value=ERROR_503_USER_MESSAGE, visible=True
-        ),  # Show 503 message in summary area
-        gr.update(
-            value=detailed_privacy_report, visible=True
-        ),  # Keep previous report visible
-        gr.update(visible=True, open=True),  # Keep summary open
-        gr.update(visible=True, open=True),  # Keep privacy open
     )
-    return  # Stop analysis
-
-    summary_highlights_report = parse_qwen_response(summary_highlights_api_response)
 
-    if "Error:" in summary_highlights_report:
-        logging.error(
-            f"Failed to generate summary/highlights report: {summary_highlights_report}"
-        )
     yield (
         gr.update(
-            value=f"**Error Generating Summary/Highlights:**\n{summary_highlights_report}",
             visible=True,
         ),
-        gr.update(value=detailed_privacy_report, visible=True),
-        gr.update(visible=True, open=True),
-        gr.update(visible=True, open=True),
     )
-    return  # End generation on error
-    if summary_truncated:
-        summary_highlights_report = TRUNCATION_WARNING + summary_highlights_report
 
-    # Yield summary report before attempting upload
-    yield (
-        gr.update(value=summary_highlights_report, visible=True),
-        gr.update(value=detailed_privacy_report, visible=True),
-        gr.update(visible=True, open=True),
-        gr.update(visible=True, open=True),
-    )
 
-    # --- Step 7: Upload to Cache ---
-    progress(7 / steps, desc="Step 7/8: Uploading results to cache...")
-    logging.info("Step 7/8: Attempting to upload results to dataset cache...")
-    try:
-        if (
-            HF_TOKEN
-            and not found_in_cache
-            and "Error:" not in detailed_privacy_report
-            and "Error:" not in summary_highlights_report
-        ):
-            summary_to_save = summary_highlights_report.replace(
-                TRUNCATION_WARNING, ""
-            ).replace(CACHE_INFO_MSG, "")
-            privacy_to_save = detailed_privacy_report.replace(
-                TRUNCATION_WARNING, ""
-            ).replace(CACHE_INFO_MSG, "")
-            upload_reports_to_dataset(
-                space_id=space_id,
-                summary_report=summary_to_save,
-                detailed_report=privacy_to_save,
-                dataset_id=DATASET_ID,
-                hf_token=HF_TOKEN,
             )
-        elif not HF_TOKEN:
-            logging.warning("Skipping cache upload as HF_TOKEN is not set.")
-        elif found_in_cache:
-            logging.info("Skipping cache upload as results were loaded from cache.")
-    except Exception as e:
-        logging.error(f"Non-critical error during report upload: {e}")
 
-    logging.info("Step 8/8: Analysis complete.")
-    progress(8 / steps, desc="Step 8/8: Analysis Complete!")
 
-    # --- Step 8: Yield Final Results --- (Ensure final state is correct)
-    yield (
-        gr.update(value=summary_highlights_report, visible=True),
-        gr.update(value=detailed_privacy_report, visible=True),
-        gr.update(visible=True, open=True),
-        gr.update(visible=True, open=True),
-    )
 
 # --- Load Initial Data Function (for demo.load) ---
@@ -511,7 +689,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
        with gr.Column(scale=1):  # Left column for inputs
            description_accordion = gr.Accordion(
-                "What Privacy Questions do 🤗 Spaces Raise? Click here for Demo Description 👇", open=False, visible=True
            )
            with description_accordion:
                gr.Markdown(DESCRIPTION)
@@ -532,12 +712,28 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
        analyze_button = gr.Button("Get Space Report", variant="primary", scale=1)
 
    with gr.Column(scale=1):  # Right column for outputs
-        # Define Accordions first, open by default, hidden initially
        summary_accordion = gr.Accordion(
-            "Summary & Privacy Highlights", open=True, visible=True
        )
        privacy_accordion = gr.Accordion(
-            "Detailed Privacy Analysis Report", open=False, visible=True
        )
        with summary_accordion:
            summary_markdown = gr.Markdown(
@@ -559,8 +755,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
        fn=get_space_report_wrapper,
        inputs=[cached_spaces_dropdown, space_id_input],
        outputs=[
            summary_markdown,
            privacy_markdown,
            summary_accordion,
            privacy_accordion,
        ],
1
+ import json
2
  import logging
3
  import os
4
 
5
  import gradio as gr
6
  from dotenv import load_dotenv
 
7
  from huggingface_hub import HfApi
8
 
9
+ # Import analysis pipeline helpers
10
+ from analysis_utils import (check_cache_and_download, check_endpoint_status,
11
+ fetch_and_validate_code, format_tldr_prompt,
12
+ generate_and_parse_tldr, generate_detailed_report,
13
+ generate_summary_report, parse_tldr_json_response,
14
+ render_data_details_markdown, render_tldr_markdown,
15
+ upload_results)
16
+ # Import general utils
17
+ from utils import list_cached_spaces # Added import
18
+
19
+ # Removed LLM interface imports, handled by analysis_utils
20
+ # from llm_interface import ERROR_503_DICT
21
+ # from llm_interface import parse_qwen_response, query_qwen_endpoint
22
+
23
+ # Removed prompts import, handled by analysis_utils
24
+ # from prompts import format_privacy_prompt, format_summary_highlights_prompt
25
 
 
 
26
 
27
+
28
+ # Removed specific utils imports now handled via analysis_utils
29
+ # from utils import (
30
+ # check_report_exists,
31
+ # download_cached_reports,
32
+ # get_space_code_files,
33
+ # upload_reports_to_dataset,
34
+ # )
35
 
36
  # Configure logging
37
  logging.basicConfig(
 
46
  HF_TOKEN = os.getenv("HF_TOKEN")
47
  ENDPOINT_NAME = "qwen2-5-coder-32b-instruct-pmf"
48
  DATASET_ID = "yjernite/spaces-privacy-reports"
49
+ CACHE_INFO_MSG = (
50
+ "\n\n*(Report retrieved from cache)*" # Still needed for dropdown cache hit message
51
+ )
52
  DEFAULT_SELECTION = "HuggingFaceTB/SmolVLM2"
53
 
54
+ # TRUNCATION_WARNING now defined and used within analysis_utils
55
+ # TRUNCATION_WARNING = """**⚠️ Warning:** The input data (code and/or prior analysis) was too long for the AI model's context limit and had to be truncated. The analysis below may be incomplete or based on partial information.\n\n---\n\n"""
56
 
57
  ERROR_503_USER_MESSAGE = """It appears that the analysis model endpoint is currently down or starting up.
58
 
 
64
  """
65
 
66
 
67
+ def _run_live_analysis(space_id: str, progress=gr.Progress(track_tqdm=True)):
 
 
 
 
68
  """
69
+ Performs the full analysis pipeline using helper functions from analysis_utils.
 
70
  Yields tuples of Gradio updates.
71
  """
72
+ total_steps = 9 # Increased step count for TLDR generation
73
+ current_step = 0
74
+ summary_report = ""
75
+ privacy_report = ""
76
+ tldr_data = None
77
+ tldr_markdown_content = "*TLDR loading...*"
78
+ data_details_content = (
79
+ "*Data details loading...*" # Default message for new component
80
+ )
81
 
82
+ # Initial message before first step
83
+ tldr_status_message = "*Starting analysis...*"
 
 
 
 
 
 
 
 
84
 
85
+ # --- Step 1: Check Cache ---
86
+ current_step += 1
87
+ progress_desc = f"Step {current_step}/{total_steps}: Checking cache..."
88
+ progress(current_step / total_steps, desc=progress_desc)
89
+ tldr_status_message = f"*{progress_desc}*"
90
+ yield (
91
+ gr.update(value=tldr_status_message, visible=True), # TLDR shows progress
92
+ gr.update(value="*Checking cache...*", visible=True),
93
+ gr.update(value="Checking cache for existing reports...", visible=True),
94
+ gr.update(value="", visible=True),
95
+ gr.update(visible=True, open=False),
96
+ gr.update(visible=True, open=False),
97
+ gr.update(visible=True, open=False),
98
+ )
99
+ cache_result = check_cache_and_download(space_id, DATASET_ID, HF_TOKEN)
100
+
101
+ if cache_result["status"] == "cache_hit":
102
+ progress(total_steps / total_steps, desc="Complete (from cache)")
103
+ # Try to parse and render TLDR from cache
104
+ tldr_json_str = cache_result.get("tldr_json_str")
105
+ rendered_tldr = "*TLDR not found in cache.*"
106
+ if tldr_json_str:
107
+ try:
108
+ cached_tldr_data = json.loads(tldr_json_str)
109
+ # Render both parts
110
+ rendered_tldr = render_tldr_markdown(cached_tldr_data, space_id)
111
+ rendered_data_details = render_data_details_markdown(cached_tldr_data)
112
+ except Exception as parse_err:
113
+ logging.warning(
114
+ f"Failed to parse cached TLDR JSON for {space_id}: {parse_err}"
115
+ )
116
+ rendered_tldr = "*Error parsing cached TLDR.*"
117
+ rendered_data_details = (
118
+ "*Could not load data details due to parsing error.*"
119
+ )
120
 
121
+ yield (
122
+ gr.update(value=rendered_tldr, visible=True),
123
+ gr.update(value=rendered_data_details, visible=True),
124
+ gr.update(value=cache_result["summary"], visible=True),
125
+ gr.update(value=cache_result["privacy"], visible=True),
126
+ gr.update(visible=True, open=False),
127
+ gr.update(visible=True, open=False),
128
+ gr.update(visible=True, open=False),
129
+ )
130
+ return # End generation successfully from cache
131
+ elif cache_result["status"] == "cache_error":
132
+ # Display final error in TLDR field
133
+ tldr_status_message = (
134
+ f"*Cache download failed. {cache_result.get('ui_message', '')}*"
135
+ )
136
+ data_details_content = "*Data details unavailable due to cache error.*"
137
+ yield (
138
+ gr.update(value=tldr_status_message, visible=True),
139
+ gr.update(value=data_details_content, visible=True),
140
+ gr.update(value=cache_result["ui_message"], visible=True),
141
+ gr.update(value="", visible=True),
142
+ gr.update(visible=True, open=False),
143
+ gr.update(visible=True, open=False),
144
+ gr.update(visible=True, open=False),
145
+ )
146
+ # Still continue to live analysis if cache download fails
147
+ elif cache_result["status"] == "cache_miss":
148
+ tldr_status_message = f"*{progress_desc} - Cache miss.*" # Update status
149
+ data_details_content = "*Generating report...*"
150
+ yield (
151
+ gr.update(value=tldr_status_message, visible=True),
152
+ gr.update(value=data_details_content, visible=True),
153
+ gr.update(value="Cache miss. Starting live analysis...", visible=True),
154
+ gr.update(value="", visible=True),
155
+ gr.update(visible=True, open=False),
156
+ gr.update(visible=True, open=False),
157
+ gr.update(visible=True, open=False),
158
+ )
159
+ elif "error_message" in cache_result:
160
+ # Display final error in TLDR field
161
+ tldr_status_message = (
162
+ f"*Cache check failed. {cache_result.get('error_message', '')}*"
163
+ )
164
+ data_details_content = "*Data details unavailable due to cache error.*"
165
+ yield (
166
+ gr.update(value=tldr_status_message, visible=True),
167
+ gr.update(value=data_details_content, visible=True),
168
  gr.update(
169
+ value=f"Cache check failed: {cache_result.get('error_message', 'Unknown error')}. Proceeding with live analysis...",
170
  visible=True,
171
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  gr.update(value="", visible=True),
173
+ gr.update(visible=True, open=False),
174
+ gr.update(visible=True, open=False),
175
  gr.update(visible=True, open=False),
176
  )
177
+ # Still continue if cache check fails
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
+    # --- Step 2: Check Endpoint Status ---
+    current_step += 1
+    progress_desc = f"Step {current_step}/{total_steps}: Checking endpoint..."
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
     yield (
+        gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
+        gr.update(),
+        gr.update(value="Checking analysis model endpoint status...", visible=True),
         gr.update(value="", visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
         gr.update(visible=True, open=False),
     )
+    endpoint_result = check_endpoint_status(
+        ENDPOINT_NAME, HF_TOKEN, ERROR_503_USER_MESSAGE
+    )
 
+    if endpoint_result["status"] == "error":
+        progress(total_steps / total_steps, desc="Endpoint Error")
+        # Display final error in TLDR field
+        tldr_markdown_content = endpoint_result["ui_message"]
         yield (
+            gr.update(value=tldr_markdown_content, visible=True),
+            gr.update(value="", visible=False),
+            gr.update(value="", visible=False),
+            gr.update(value="", visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
         )
+        return
 
+    # --- Step 3: Fetch Code Files ---
+    current_step += 1
+    progress_desc = f"Step {current_step}/{total_steps}: Fetching code..."
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
     yield (
+        gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
+        gr.update(),
+        gr.update(value="Fetching code files from the Space...", visible=True),
         gr.update(value="", visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
         gr.update(visible=True, open=False),
     )
+    code_result = fetch_and_validate_code(space_id)
 
+    if code_result["status"] == "error":
+        progress(total_steps / total_steps, desc="Code Fetch Error")
+        # Display final error in TLDR field
+        tldr_markdown_content = (
+            f"**Error:** {code_result.get('ui_message', 'Failed to fetch code.')}"
+        )
         yield (
+            gr.update(value=tldr_markdown_content, visible=True),
+            gr.update(value="", visible=False),
+            gr.update(value="", visible=False),
             gr.update(value="Analysis Canceled", visible=True),
+            gr.update(visible=False),
+            gr.update(visible=False),
             gr.update(visible=True, open=False),
         )
+        return
+    code_files = code_result["code_files"]
 
     # --- Step 4: Generate DETAILED Privacy Report (LLM Call 1) ---
+    current_step += 1
+    progress_desc = (
+        f"Step {current_step}/{total_steps}: Generating privacy report (AI Call 1)..."
     )
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
     yield (
+        gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
+        gr.update(),
+        gr.update(
+            value="Generating detailed privacy report (AI Call 1)...", visible=True
+        ),
         gr.update(value="Generating detailed privacy report via AI...", visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
         gr.update(visible=True, open=True),
     )
+    privacy_result = generate_detailed_report(
+        space_id, code_files, ERROR_503_USER_MESSAGE
     )
 
+    if privacy_result["status"] == "error":
+        progress(total_steps / total_steps, desc="Privacy Report Error")
+        # Display final error in TLDR field
+        tldr_markdown_content = f"**Error:** {privacy_result.get('ui_message', 'Failed during detailed report generation.')}"
         yield (
+            gr.update(value=tldr_markdown_content, visible=True),
+            gr.update(value="", visible=False),
+            gr.update(value="", visible=False),
+            gr.update(value="", visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
         )
+        return
+    privacy_report = privacy_result["report"]
+
+    # Update UI with successful detailed report
+    yield (
+        gr.update(value=tldr_status_message, visible=True),  # Still show progress
+        gr.update(),
+        gr.update(
+            value="Detailed privacy report generated. Proceeding...", visible=True
+        ),
+        gr.update(value=privacy_report, visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=True),
+    )
 
+    # --- Step 5: Fetch Model Descriptions (Placeholder/Optional) ---
+    current_step += 1
+    progress_desc = f"Step {current_step}/{total_steps}: Extracting model info..."
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
+    logging.info(progress_desc + " (Placeholder)")
+    yield (
+        gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
+        gr.update(),
+        gr.update(value="Extracting model info...", visible=True),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+    )
+    # model_ids = extract_hf_model_ids(code_files)  # utils function not imported
+    # model_descriptions = get_model_descriptions(model_ids)  # utils function not imported
+    # Add model_descriptions to context if needed for summary prompt later
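+    # (Enabling the two commented-out calls above would also require importing
+    # extract_hf_model_ids and get_model_descriptions from utils.)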
 
+    # --- Step 6: Generate Summary + Highlights Report (LLM Call 2) ---
+    current_step += 1
+    progress_desc = (
+        f"Step {current_step}/{total_steps}: Generating summary (AI Call 2)..."
+    )
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
+    yield (
+        gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
+        gr.update(),
+        gr.update(value="Generating summary & highlights (AI Call 2)...", visible=True),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+    )
+    summary_result = generate_summary_report(
+        space_id, code_files, privacy_report, ERROR_503_USER_MESSAGE
+    )
+
+    if (
+        summary_result["status"] == "error_503_summary"
+        or summary_result["status"] == "error_summary"
+    ):
+        progress(total_steps / total_steps, desc="Summary Report Error")
+        # Display error in TLDR, show partial results below
+        tldr_markdown_content = f"**Error:** {summary_result.get('ui_message', 'Failed during summary generation.')}"
+        data_details_content = "*Data details may be incomplete.*"
+        yield (
+            gr.update(value=tldr_markdown_content, visible=True),
+            gr.update(value=data_details_content, visible=True),
+            gr.update(value=summary_result["ui_message"], visible=True),
+            gr.update(value=privacy_report, visible=True),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=True),
         )
+        return
+    elif summary_result["status"] != "success":
+        progress(total_steps / total_steps, desc="Summary Report Error")
+        # Display error in TLDR, show partial results below
+        tldr_markdown_content = f"**Error:** Unexpected error generating summary: {summary_result.get('ui_message', 'Unknown')}"
+        data_details_content = "*Data details unavailable.*"
         yield (
+            gr.update(value=tldr_markdown_content, visible=True),
+            gr.update(value=data_details_content, visible=True),
             gr.update(
+                value=f"Unexpected error generating summary: {summary_result.get('ui_message', 'Unknown')}",
                 visible=True,
             ),
+            gr.update(value=privacy_report, visible=True),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
             gr.update(visible=True, open=True),
         )
+        return
+
+    summary_report = summary_result["report"]
 
+    # Update UI with successful summary report before TLDR generation
+    tldr_status_message = (
+        f"*{progress_desc} - Success. Generating TLDR...*"  # Update status
+    )
+    data_details_content = "*Generating data details...*"
     yield (
+        gr.update(value=tldr_status_message, visible=True),
+        gr.update(value=data_details_content, visible=True),
+        gr.update(value=summary_report, visible=True),
+        gr.update(value=privacy_report, visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
         gr.update(visible=True, open=True),
     )
 
+    # --- Step 7: Generate TLDR --- (New Step)
+    current_step += 1
+    progress_desc = f"Step {current_step}/{total_steps}: Generating TLDR summary..."
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
+    yield (
+        gr.update(value=tldr_status_message, visible=True),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+    )
+    tldr_data = None  # Reset tldr_data before attempt
+    try:
+        # Call the combined helper function from analysis_utils
+        tldr_data = generate_and_parse_tldr(privacy_report, summary_report)
+
+        if tldr_data:
+            logging.info(f"Successfully generated and parsed TLDR for {space_id}.")
+            tldr_markdown_content = render_tldr_markdown(tldr_data, space_id)
+            data_details_content = render_data_details_markdown(tldr_data)
+        else:
+            logging.warning(
+                f"Failed to generate or parse TLDR for {space_id}. Proceeding without it."
+            )
+            tldr_markdown_content = "*TLDR generation failed.*"
+            data_details_content = "*Data details generation failed.*"
+    except Exception as tldr_err:
+        # This catch block might be redundant now if generate_and_parse_tldr handles its errors
+        logging.error(
+            f"Unexpected error during TLDR generation step call for {space_id}: {tldr_err}"
+        )
+        tldr_markdown_content = "*Error during TLDR generation step.*"
+        data_details_content = "*Error generating data details.*"
+        tldr_data = None  # Ensure it's None on error
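+    # tldr_data (possibly None) is passed to upload_results in Step 8 as
+    # tldr_json_data, so a failed TLDR does not prevent the reports from being cached.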
 
+    # Update UI including the generated (or failed) TLDR before upload
     yield (
+        gr.update(value=tldr_markdown_content, visible=True),
+        gr.update(value=data_details_content, visible=True),
         gr.update(),
         gr.update(),
+        gr.update(visible=True, open=False),
+        gr.update(),
         gr.update(),
     )
+
+    # --- Step 8: Upload to Cache --- (Old Step 7)
+    current_step += 1
+    progress_desc = f"Step {current_step}/{total_steps}: Uploading to cache..."
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"  # Display final action in TLDR field
     yield (
+        gr.update(value=tldr_status_message, visible=True),
+        gr.update(),
+        gr.update(value="Uploading results to cache...", visible=True),
+        gr.update(),
         gr.update(),
         gr.update(),
         gr.update(),
     )
+    upload_needed = (
+        cache_result["status"] != "cache_hit"
+        and cache_result["status"] != "cache_error"
     )
+    if upload_needed:
+        # Call imported function, now passing tldr_data
+        upload_result = upload_results(
+            space_id,
+            summary_report,
+            privacy_report,
+            DATASET_ID,
+            HF_TOKEN,
+            tldr_json_data=tldr_data,
+        )
+        if upload_result["status"] == "error":
+            # Ensure logging uses f-string if adding step count here
+            logging.error(
+                f"Cache upload failed: {upload_result.get('message', 'Unknown error')}"
+            )
+            # Non-critical, don't stop the UI, just log
+        elif upload_result["status"] == "skipped":
+            logging.info(f"Cache upload skipped: {upload_result.get('reason', '')}")
+    else:
+        logging.info(
+            "Skipping cache upload as results were loaded from cache or cache check failed."
+        )
 
+    # Update UI with the final TLDR and data details after the upload step
+    # Yield 7 updates
+    yield (
+        gr.update(value=tldr_markdown_content, visible=True),
+        gr.update(value=data_details_content, visible=True),
+        gr.update(value=summary_report, visible=True),
+        gr.update(value=privacy_report, visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
+    )
+
+    # --- Step 9: Final Update --- (Old Step 8)
+    current_step += 1
+    progress_desc = f"Step {current_step}/{total_steps}: Analysis Complete!"
+    progress(current_step / total_steps, desc=progress_desc)
+    logging.info(progress_desc + f" Analysis complete for {space_id}.")
+    # Yield final state again to ensure UI is correct after potential upload messages
+    # Display final generated TLDR and Data Details
+    yield (
+        gr.update(value=tldr_markdown_content, visible=True),
+        gr.update(value=data_details_content, visible=True),
+        gr.update(value=summary_report, visible=True),
+        gr.update(value=privacy_report, visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
     )
+
+
+# --- Original Input Handling Wrapper (updated yields for initial errors) ---
+def get_space_report_wrapper(
+    selected_cached_space: str | None,
+    new_space_id: str | None,
+    progress=gr.Progress(track_tqdm=True),
+):
+    """
+    Wrapper function to decide whether to fetch cache or run live analysis.
+    Handles the logic based on Dropdown and Textbox inputs.
+    Yields tuples of Gradio updates.
+    """
+    target_space_id = None
+    source = "new"  # Assume new input unless dropdown is chosen
+
+    # Prioritize new_space_id if provided
+    if new_space_id and new_space_id.strip():
+        target_space_id = new_space_id.strip()
+        if target_space_id == selected_cached_space:
+            source = "dropdown_match"  # User typed ID that exists in dropdown
+        else:
+            source = "new"
+    elif selected_cached_space:
+        target_space_id = selected_cached_space
+        source = "dropdown"
+
+    if not target_space_id:
+        # Yield 7 updates
         yield (
+            gr.update(value="*Please provide a Space ID.*", visible=True),
+            gr.update(value="", visible=False),
             gr.update(
+                value="Please select an existing report or enter a new Space ID.",
+                visible=True,
+            ),
+            gr.update(value="", visible=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=False),
         )
+        return
 
+    if "/" not in target_space_id:
+        # Yield 7 updates
         yield (
+            gr.update(value="*Invalid Space ID format.*", visible=True),
+            gr.update(value="", visible=False),
             gr.update(
+                value=f"Invalid Space ID format: '{target_space_id}'. Use 'owner/name'.",
                 visible=True,
             ),
+            gr.update(value="", visible=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=False),
         )
+        return
 
+    logging.info(f"Request received for: '{target_space_id}' (Source: {source})")
 
+    if source == "dropdown":
+        progress(0.1, desc="Fetching selected cached report...")
+        # Yield 7 updates (initial placeholder)
+        yield (
+            gr.update(value="*Loading TLDR...*", visible=True),
+            gr.update(value="*Loading data details...*", visible=True),
+            gr.update(value="Fetching selected cached report...", visible=True),
+            gr.update(value="", visible=True),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+        )
+        cache_result = check_cache_and_download(target_space_id, DATASET_ID, HF_TOKEN)
+        if cache_result["status"] == "cache_hit":
+            logging.info(
+                f"Successfully displayed cached reports for selected '{target_space_id}'."
             )
+            progress(1.0, desc="Complete (from cache)")
+            # Use the cached report text directly here, adding the cache message is done within the helper now.
+            # Parse and render TLDR if available
+            tldr_json_str = cache_result.get("tldr_json_str")
+            rendered_tldr = "*TLDR not found in cache.*"
+            rendered_data_details = "*Data details not found in cache.*"
+            if tldr_json_str:
+                try:
+                    cached_tldr_data = json.loads(tldr_json_str)
+                    rendered_tldr = render_tldr_markdown(
+                        cached_tldr_data, target_space_id
+                    )
+                    rendered_data_details = render_data_details_markdown(
+                        cached_tldr_data
+                    )
+                except Exception as parse_err:
+                    logging.warning(
+                        f"Failed to parse cached TLDR JSON for {target_space_id}: {parse_err}"
+                    )
+                    rendered_tldr = "*Error parsing cached TLDR.*"
+                    rendered_data_details = (
+                        "*Could not load data details due to parsing error.*"
+                    )
 
+            yield (
+                gr.update(value=rendered_tldr, visible=True),
+                gr.update(value=rendered_data_details, visible=True),
+                gr.update(value=cache_result["summary"], visible=True),
+                gr.update(value=cache_result["privacy"], visible=True),
+                gr.update(visible=True, open=False),
+                gr.update(visible=True, open=False),
+                gr.update(visible=True, open=False),
+            )
+        else:  # Cache miss or error for a dropdown selection is an error state
+            error_msg = cache_result.get(
+                "ui_message",
+                f"Failed to find or download cached report for selected '{target_space_id}'.",
+            )
+            logging.error(error_msg)
+            progress(1.0, desc="Error")
+            yield (
+                gr.update(value="*TLDR load failed.*", visible=True),
+                gr.update(value="*Data details load failed.*", visible=True),
+                gr.update(value=error_msg, visible=True),
+                gr.update(value="", visible=False),
+                gr.update(visible=True, open=False),
+                gr.update(visible=True, open=False),
+                gr.update(visible=False),
+            )
+        return  # Stop after handling dropdown source
 
+    # --- Live Analysis or Check Cache for New Input ---
+    # If it came from the textbox OR was a dropdown match, run the full live analysis pipeline
+    # which includes its own cache check at the beginning.
+    else:  # source == "new" or source == "dropdown_match"
+        # Yield intermediate updates from the generator by iterating through it
+        for update_tuple in _run_live_analysis(target_space_id, progress):
+            yield update_tuple
 
 
 # --- Load Initial Data Function (for demo.load) ---
 
     with gr.Row():
         with gr.Column(scale=1):  # Left column for inputs
             description_accordion = gr.Accordion(
+                "What Privacy Questions do 🤗 Spaces Raise? Click here for Demo Description 👇",
+                open=False,
+                visible=True,
             )
             with description_accordion:
                 gr.Markdown(DESCRIPTION)
 
             analyze_button = gr.Button("Get Space Report", variant="primary", scale=1)
 
         with gr.Column(scale=1):  # Right column for outputs
+            # Define TLDR Markdown component first, always visible
+            gr.Markdown("### Privacy TLDR 🕵️\n", visible=True)
+            tldr_markdown = gr.Markdown(
+                "*Select or enter a Space ID to get started.*", visible=True
+            )
+
+            # Define Accordions next, closed by default, visible
+            data_types_accordion = gr.Accordion(
+                "Data Types at Play", open=False, visible=True
+            )
+            with data_types_accordion:
+                data_details_markdown = gr.Markdown("*Data details will appear here.*")
+
             summary_accordion = gr.Accordion(
+                "Summary & Privacy Highlights",
+                open=False,  # Changed to open=False
+                visible=True,
             )
             privacy_accordion = gr.Accordion(
+                "Detailed Privacy Analysis Report",
+                open=False,  # Changed to open=False
+                visible=True,
             )
             with summary_accordion:
                 summary_markdown = gr.Markdown(
 
         fn=get_space_report_wrapper,
         inputs=[cached_spaces_dropdown, space_id_input],
         outputs=[
+            tldr_markdown,
+            data_details_markdown,  # Added data details output
             summary_markdown,
             privacy_markdown,
+            data_types_accordion,  # Added data details accordion output
             summary_accordion,
             privacy_accordion,
         ],
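+        # NOTE: the order of outputs above must match the 7-tuples yielded by
+        # get_space_report_wrapper / _run_live_analysis.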
llm_interface.py CHANGED
@@ -79,6 +79,7 @@ def query_qwen_endpoint(
         return None  # Return None for other HTTP errors
     except Exception as e:
         logging.error(f"An unexpected error occurred querying Inference Endpoint: {e}")
+        print(f"An unexpected error occurred querying Inference Endpoint: {e}")
         return None
 
 
utils.py CHANGED
@@ -80,6 +80,7 @@ MAX_MODEL_DESC_LENGTH = 1500
 
 SUMMARY_FILENAME = "summary_highlights.md"
 PRIVACY_FILENAME = "privacy_report.md"
+TLDR_FILENAME = "tldr_summary.json"
 
 
 def _is_relevant_file(filename):
@@ -367,7 +368,13 @@ def check_report_exists(space_id: str, dataset_id: str, hf_token: str | None) ->
 def download_cached_reports(
     space_id: str, dataset_id: str, hf_token: str | None
 ) -> dict[str, str]:
-    """Downloads cached reports from the dataset repo. Raises error on failure."""
+    """Downloads cached reports (summary, privacy, tldr json) from the dataset repo.
+
+    Returns:
+        Dict containing report contents keyed by 'summary', 'privacy', 'tldr_json_str'.
+        Keys will be missing if a specific file is not found.
+        Raises error on critical download failures (repo not found, etc.).
+    """
     if not hf_token:
         raise ValueError("HF Token required to download cached reports.")
 
@@ -378,50 +385,95 @@ def download_cached_reports(
     # Define paths relative to dataset root for hf_hub_download
     summary_repo_path = f"{space_id}/{SUMMARY_FILENAME}"
     privacy_repo_path = f"{space_id}/{PRIVACY_FILENAME}"
+    tldr_repo_path = f"{space_id}/{TLDR_FILENAME}"  # Path for TLDR JSON
+
     try:
         # Download summary
-        summary_path_local = hf_hub_download(
-            repo_id=dataset_id,
-            filename=summary_repo_path,
-            repo_type="dataset",
-            token=hf_token,
-        )
-        with open(summary_path_local, "r", encoding="utf-8") as f:
-            reports["summary"] = f.read()
-        logging.info(f"Successfully downloaded cached summary for {space_id}.")
+        try:
+            summary_path_local = hf_hub_download(
+                repo_id=dataset_id,
+                filename=summary_repo_path,
+                repo_type="dataset",
+                token=hf_token,
+            )
+            with open(summary_path_local, "r", encoding="utf-8") as f:
+                reports["summary"] = f.read()
+            logging.info(f"Successfully downloaded cached summary for {space_id}.")
+        except EntryNotFoundError:
+            logging.warning(
+                f"Cached summary file {summary_repo_path} not found for {space_id}."
+            )
+        except Exception as e_summary:
+            logging.error(
+                f"Error downloading cached summary for {space_id}: {e_summary}"
+            )
+            # Decide if this is critical - for now, we warn and continue
 
         # Download privacy report
-        privacy_path_local = hf_hub_download(
-            repo_id=dataset_id,
-            filename=privacy_repo_path,
-            repo_type="dataset",
-            token=hf_token,
-        )
-        with open(privacy_path_local, "r", encoding="utf-8") as f:
-            reports["privacy"] = f.read()
-        logging.info(f"Successfully downloaded cached privacy report for {space_id}.")
+        try:
+            privacy_path_local = hf_hub_download(
+                repo_id=dataset_id,
+                filename=privacy_repo_path,
+                repo_type="dataset",
+                token=hf_token,
+            )
+            with open(privacy_path_local, "r", encoding="utf-8") as f:
+                reports["privacy"] = f.read()
+            logging.info(
+                f"Successfully downloaded cached privacy report for {space_id}."
+            )
+        except EntryNotFoundError:
+            logging.warning(
+                f"Cached privacy file {privacy_repo_path} not found for {space_id}."
+            )
+        except Exception as e_privacy:
+            logging.error(
+                f"Error downloading cached privacy report for {space_id}: {e_privacy}"
+            )
+            # Decide if this is critical - for now, we warn and continue
+
+        # Download TLDR JSON
+        try:
+            tldr_path_local = hf_hub_download(
+                repo_id=dataset_id,
+                filename=tldr_repo_path,
+                repo_type="dataset",
+                token=hf_token,
+            )
+            with open(tldr_path_local, "r", encoding="utf-8") as f:
+                reports["tldr_json_str"] = f.read()  # Store raw string content
+            logging.info(f"Successfully downloaded cached TLDR JSON for {space_id}.")
+        except EntryNotFoundError:
+            logging.warning(
+                f"Cached TLDR file {tldr_repo_path} not found for {space_id}."
+            )
+            # Don't treat TLDR absence as an error, just won't be in the dict
+        except Exception as e_tldr:
+            logging.error(
+                f"Error downloading cached TLDR JSON for {space_id}: {e_tldr}"
+            )
+            # Don't treat TLDR download error as critical, just won't be included
+
+        # Check if at least one report was downloaded successfully
+        if not reports.get("summary") and not reports.get("privacy"):
+            raise FileNotFoundError(
+                f"Failed to download *any* primary cache files (summary/privacy) for {space_id}"
+            )
 
         return reports
 
-    except EntryNotFoundError as e:
-        # More specific error based on which file failed
-        missing_file = (
-            summary_repo_path if summary_repo_path in str(e) else privacy_repo_path
-        )
+    except RepositoryNotFoundError as e_repo:
         logging.error(
-            f"Cache download error: Report file {missing_file} not found for {space_id} in {dataset_id}. {e}"
+            f"Cache download error: Dataset repo {dataset_id} not found. {e_repo}"
        )
-        raise FileNotFoundError(
-            f"Cached report file {missing_file} not found for {space_id}"
-        ) from e
-    except RepositoryNotFoundError as e:
-        logging.error(f"Cache download error: Dataset repo {dataset_id} not found. {e}")
-        raise FileNotFoundError(f"Dataset repo {dataset_id} not found") from e
-    except Exception as e:
+        raise FileNotFoundError(f"Dataset repo {dataset_id} not found") from e_repo
+    except Exception as e_critical:  # Catch other potential critical errors
         logging.error(
-            f"Unexpected error downloading cached reports for {space_id} from {dataset_id}: {e}"
+            f"Unexpected critical error downloading cached reports for {space_id} from {dataset_id}: {e_critical}"
         )
-        raise IOError(f"Failed to download cached reports for {space_id}") from e
+        raise IOError(
+            f"Failed critically during cached report download for {space_id}"
+        ) from e_critical
 
 
 def upload_reports_to_dataset(