yjernite (HF Staff) committed
Commit 28be125 · verified · 1 Parent(s): b650e6f

Upload 6 files

Files changed (6):
  1. .gitignore +78 -0
  2. app.py +521 -0
  3. llm_interface.py +154 -0
  4. prompts.py +267 -0
  5. requirements.txt +4 -0
  6. utils.py +507 -0
.gitignore ADDED
@@ -0,0 +1,78 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ .pytest_cache/
+ .hypothesis/
+
+ # Environments
+ .env
+ .venv
+ venv/
+ ENV/
+ env/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # VS Code settings
+ .vscode/
+
+ # Temp files
+ *.tmp
app.py ADDED
@@ -0,0 +1,521 @@
+ import logging
+ import os
+
+ import gradio as gr
+ from dotenv import load_dotenv
+
+ from llm_interface import ERROR_503_DICT, parse_qwen_response, query_qwen_endpoint
+ from prompts import format_privacy_prompt, format_summary_highlights_prompt
+
+ # Import helper functions from other modules
+ from utils import (
+     check_report_exists,
+     download_cached_reports,
+     get_space_code_files,
+     list_cached_spaces,
+     upload_reports_to_dataset,
+ )
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+ )
+
+ # Load environment variables from .env file
+ # This is important to ensure API keys and endpoints are loaded before use
+ load_dotenv()
+
+ # --- Constants ---
+ HF_TOKEN = os.getenv("HF_TOKEN")
+ DATASET_ID = "yjernite/spaces-privacy-reports"
+ CACHE_INFO_MSG = "\n\n*(Report retrieved from cache)*"
+ DEFAULT_SELECTION = "HuggingFaceTB/SmolVLM2"
+
+ TRUNCATION_WARNING = """**⚠️ Warning:** The input data (code and/or prior analysis) was too long for the AI model's context limit and had to be truncated. The analysis below may be incomplete or based on partial information.\n\n---\n\n"""
+
+ ERROR_503_USER_MESSAGE = """**503 Service Unavailable**: It appears that the analysis model endpoint is currently down or starting up.
+
+ You have a few options:
+
+ * **Wait & Retry:** Try clicking "Get Space Report" again in ~3-5 minutes. Endpoints often scale down to save resources and take a short time to wake up.
+ * **Select Cached Report:** Use the dropdown above to view a report for a Space that has already been analyzed.
+ * **Request Analysis:** If the error persists, please open an issue or discussion in the Space's Community tab requesting analysis for your target Space ID. We can run the job manually when the endpoint is available.
+ """
+
+
+ def get_space_report_wrapper(
+     selected_cached_space: str | None,
+     new_space_id: str | None,
+     progress=gr.Progress(track_tqdm=True),
+ ):
+     """
+     Wrapper function to decide whether to fetch a cached report or run a live analysis.
+     Handles the logic based on the Dropdown and Textbox inputs.
+     Yields tuples of Gradio updates.
+     """
+     target_space_id = None
+     source = "new"  # Assume new input unless the dropdown is chosen
+
+     # Prioritize new_space_id if provided
+     if new_space_id and new_space_id.strip():
+         target_space_id = new_space_id.strip()
+         if target_space_id == selected_cached_space:
+             source = "dropdown_match"  # User typed an ID that exists in the dropdown
+         else:
+             source = "new"
+     elif selected_cached_space:
+         target_space_id = selected_cached_space
+         source = "dropdown"
+
+     if not target_space_id:
+         # No input provided. This function is a generator, so errors must be
+         # yielded; a plain `return value` would be discarded by Gradio.
+         yield (
+             gr.update(
+                 value="Please select an existing report or enter a new Space ID.",
+                 visible=True,
+             ),
+             gr.update(value="", visible=False),
+             gr.update(visible=True, open=True),
+             gr.update(visible=False),
+         )
+         return
+
+     # Validate format
+     if "/" not in target_space_id:
+         yield (
+             gr.update(
+                 value=f"Invalid Space ID format: '{target_space_id}'. Use 'owner/name'.",
+                 visible=True,
+             ),
+             gr.update(value="", visible=False),
+             gr.update(visible=True, open=True),
+             gr.update(visible=False),
+         )
+         return
+
+     logging.info(f"Request received for: '{target_space_id}' (Source: {source})")
+
+     # --- Cache Handling ---
+     # If the user explicitly selected from the dropdown, try to fetch it directly.
+     if source == "dropdown":
+         progress(0.1, desc="Fetching cached report...")  # Simple progress for cache fetch
+         yield (
+             gr.update(value="Fetching selected cached report...", visible=True),
+             gr.update(value="", visible=True),
+             gr.update(visible=True, open=True),
+             gr.update(visible=True, open=False),
+         )
+         try:
+             cached_reports = download_cached_reports(
+                 target_space_id, DATASET_ID, HF_TOKEN
+             )
+             summary_report = (
+                 cached_reports.get("summary", "Error: Cached summary not found.")
+                 + CACHE_INFO_MSG
+             )
+             privacy_report = (
+                 cached_reports.get("privacy", "Error: Cached privacy report not found.")
+                 + CACHE_INFO_MSG
+             )
+             logging.info(
+                 f"Successfully displayed cached reports for selected '{target_space_id}'."
+             )
+             progress(1.0, desc="Complete (from cache)")
+             yield (
+                 gr.update(value=summary_report, visible=True),
+                 gr.update(value=privacy_report, visible=True),
+                 gr.update(visible=True, open=True),
+                 gr.update(visible=True, open=True),
+             )
+         except Exception as e:
+             error_msg = f"Failed to download cached report for selected '{target_space_id}': {e}"
+             logging.error(error_msg)
+             progress(1.0, desc="Error")
+             yield (
+                 gr.update(value=error_msg, visible=True),
+                 gr.update(value="", visible=False),
+                 gr.update(visible=True, open=True),
+                 gr.update(visible=False),
+             )
+
+     # --- Live Analysis or Check Cache for New Input ---
+     # If it came from the textbox OR was a dropdown match, check the cache first, then run live.
+     else:  # source == "new" or source == "dropdown_match"
+         # Delegate to the full analysis generator and pass its updates through.
+         yield from _run_live_analysis(target_space_id, progress)
+
+
+ def _run_live_analysis(space_id: str, progress=gr.Progress(track_tqdm=True)):
+     """
+     Performs the full analysis pipeline: cache check, code fetch, LLM calls, upload.
+     Yields tuples of Gradio updates.
+     """
+     steps = 8  # Steps for the full pipeline
+     privacy_truncated = False
+     summary_truncated = False
+
+     # --- Step 1: Check Cache --- (check again for new/matched input)
+     progress(1 / steps, desc="Step 1/8: Checking cache...")
+     logging.info(f"Step 1/8: Checking cache for '{space_id}'...")
+     yield (
+         gr.update(value="Checking cache for existing reports...", visible=True),
+         gr.update(value="", visible=True),
+         gr.update(visible=True, open=True),
+         gr.update(visible=True, open=False),
+     )
+     found_in_cache = False
+     if HF_TOKEN:
+         try:
+             found_in_cache = check_report_exists(space_id, DATASET_ID, HF_TOKEN)
+         except Exception as e:
+             logging.warning(f"Cache check failed: {e}. Proceeding.")
+             yield (
+                 gr.update(
+                     value="Cache check failed, proceeding with live analysis...",
+                     visible=True,
+                 ),
+                 gr.update(value="", visible=True),
+                 gr.update(visible=True, open=True),
+                 gr.update(visible=True, open=False),
+             )
+
+     if found_in_cache:
+         logging.info(f"Cache hit for {space_id}. Downloading.")
+         progress(2 / steps, desc="Step 2/8: Cache hit! Downloading reports...")
+         yield (
+             gr.update(value="Cache hit! Downloading reports...", visible=True),
+             gr.update(value="", visible=True),
+             gr.update(visible=True, open=True),
+             gr.update(visible=True, open=False),
+         )
+         try:
+             cached_reports = download_cached_reports(space_id, DATASET_ID, HF_TOKEN)
+             summary_report = (
+                 cached_reports.get("summary", "Error: Cached summary not found.")
+                 + CACHE_INFO_MSG
+             )
+             privacy_report = (
+                 cached_reports.get("privacy", "Error: Cached privacy report not found.")
+                 + CACHE_INFO_MSG
+             )
+             logging.info(f"Successfully displayed cached reports for {space_id}.")
+             progress(8 / steps, desc="Complete (from cache)")
+             yield (
+                 gr.update(value=summary_report, visible=True),
+                 gr.update(value=privacy_report, visible=True),
+                 gr.update(visible=True, open=True),
+                 gr.update(visible=True, open=True),
+             )
+             return  # End generation here if cache successful
+         except Exception as e:
+             logging.warning(f"Cache download failed for {space_id}: {e}. Proceeding.")
+             yield (
+                 gr.update(
+                     value="Cache download failed, proceeding with live analysis...",
+                     visible=True,
+                 ),
+                 gr.update(value="", visible=True),
+                 gr.update(visible=True, open=True),
+                 gr.update(visible=True, open=False),
+             )
+     else:
+         logging.info(f"Cache miss for {space_id}. Performing live analysis.")
+         yield (
+             gr.update(value="Cache miss. Fetching code...", visible=True),
+             gr.update(value="", visible=True),
+             gr.update(visible=True, open=True),
+             gr.update(visible=True, open=False),
+         )
+
+     # --- Step 2: Fetch Code Files (if not cached) ---
+     progress(2 / steps, desc="Step 2/8: Fetching code files...")
+     logging.info("Step 2/8: Fetching code files...")
+     code_files = get_space_code_files(space_id)
+     if not code_files:
+         error_msg = f"Could not retrieve code files for '{space_id}'. Check the ID and ensure it's a public Space."
+         logging.warning(error_msg)
+         yield (
+             gr.update(value=f"**Error:**\n{error_msg}", visible=True),
+             gr.update(value="Analysis Canceled", visible=True),
+             gr.update(visible=True, open=True),
+             gr.update(visible=True, open=False),
+         )
+         return  # End generation on error
+
+     # --- Step 3: Generate DETAILED Privacy Report (LLM Call 1) ---
+     progress(
+         3 / steps, desc="Step 3/8: Generating detailed privacy report (AI Call 1)..."
+     )
+     logging.info("Step 3/8: Generating detailed privacy analysis report...")
+     yield (
+         gr.update(value="Generating detailed privacy report...", visible=True),
+         gr.update(value="Generating detailed privacy report via AI...", visible=True),
+         gr.update(visible=True, open=True),
+         gr.update(visible=True, open=True),
+     )
+     privacy_prompt_messages, privacy_truncated = format_privacy_prompt(
+         space_id, code_files
+     )
+
+     # --- Check for 503 after query ---
+     privacy_api_response = query_qwen_endpoint(privacy_prompt_messages, max_tokens=3072)
+     if privacy_api_response == ERROR_503_DICT:
+         logging.warning("LLM Call 1 failed with 503.")
+         yield (
+             gr.update(value=ERROR_503_USER_MESSAGE, visible=True),  # Show 503 message in summary area
+             gr.update(value="", visible=False),  # Clear privacy area
+             gr.update(visible=True, open=True),  # Keep summary open
+             gr.update(visible=False),  # Hide privacy accordion
+         )
+         return  # Stop analysis
+
+     detailed_privacy_report = parse_qwen_response(privacy_api_response)
+
+     if "Error:" in detailed_privacy_report:
+         logging.error(
+             f"Failed to generate detailed privacy report: {detailed_privacy_report}"
+         )
+         yield (
+             gr.update(value="Analysis Halted due to Error", visible=True),
+             gr.update(
+                 value=f"**Error Generating Detailed Privacy Report:**\n{detailed_privacy_report}",
+                 visible=True,
+             ),
+             gr.update(visible=True, open=True),
+             gr.update(visible=True, open=True),
+         )
+         return  # End generation on error
+     if privacy_truncated:
+         detailed_privacy_report = TRUNCATION_WARNING + detailed_privacy_report
+
+     yield (
+         gr.update(value="Extracting model info...", visible=True),
+         gr.update(value=detailed_privacy_report, visible=True),
+         gr.update(visible=True, open=True),
+         gr.update(visible=True, open=True),
+     )
+
+     # --- Step 4: Extract Model IDs ---
+     progress(4 / steps, desc="Step 4/8: Extracting model IDs...")
+     logging.info("Step 4/8: Extracting potential model IDs...")
+
+     # --- Step 5: Fetch Model Descriptions ---
+     progress(5 / steps, desc="Step 5/8: Fetching model descriptions...")
+     logging.info("Step 5/8: Fetching model descriptions...")
+     yield (
+         gr.update(value="Fetching model descriptions...", visible=True),
+         gr.update(),
+         gr.update(),
+         gr.update(),
+     )
+     # --- Step 6: Generate Summary + Highlights Report (LLM Call 2) ---
+     progress(6 / steps, desc="Step 6/8: Generating summary & highlights (AI Call 2)...")
+     logging.info("Step 6/8: Generating summary and highlights report...")
+     yield (
+         gr.update(value="Generating summary & highlights via AI...", visible=True),
+         gr.update(),
+         gr.update(),
+         gr.update(),
+     )
+     summary_highlights_prompt_messages, summary_truncated = (
+         format_summary_highlights_prompt(space_id, code_files, detailed_privacy_report)
+     )
+
+     # --- Check for 503 after query ---
+     summary_highlights_api_response = query_qwen_endpoint(
+         summary_highlights_prompt_messages, max_tokens=2048
+     )
+     if summary_highlights_api_response == ERROR_503_DICT:
+         logging.warning("LLM Call 2 failed with 503.")
+         yield (
+             gr.update(value=ERROR_503_USER_MESSAGE, visible=True),  # Show 503 message in summary area
+             gr.update(value=detailed_privacy_report, visible=True),  # Keep previous report visible
+             gr.update(visible=True, open=True),  # Keep summary open
+             gr.update(visible=True, open=True),  # Keep privacy open
+         )
+         return  # Stop analysis
+
+     summary_highlights_report = parse_qwen_response(summary_highlights_api_response)
+
+     if "Error:" in summary_highlights_report:
+         logging.error(
+             f"Failed to generate summary/highlights report: {summary_highlights_report}"
+         )
+         yield (
+             gr.update(
+                 value=f"**Error Generating Summary/Highlights:**\n{summary_highlights_report}",
+                 visible=True,
+             ),
+             gr.update(value=detailed_privacy_report, visible=True),
+             gr.update(visible=True, open=True),
+             gr.update(visible=True, open=True),
+         )
+         return  # End generation on error
+     if summary_truncated:
+         summary_highlights_report = TRUNCATION_WARNING + summary_highlights_report
+
+     # Yield the summary report before attempting the upload
+     yield (
+         gr.update(value=summary_highlights_report, visible=True),
+         gr.update(value=detailed_privacy_report, visible=True),
+         gr.update(visible=True, open=True),
+         gr.update(visible=True, open=True),
+     )
+
+     # --- Step 7: Upload to Cache ---
+     progress(7 / steps, desc="Step 7/8: Uploading results to cache...")
+     logging.info("Step 7/8: Attempting to upload results to dataset cache...")
+     try:
+         if (
+             HF_TOKEN
+             and not found_in_cache
+             and "Error:" not in detailed_privacy_report
+             and "Error:" not in summary_highlights_report
+         ):
+             summary_to_save = summary_highlights_report.replace(
+                 TRUNCATION_WARNING, ""
+             ).replace(CACHE_INFO_MSG, "")
+             privacy_to_save = detailed_privacy_report.replace(
+                 TRUNCATION_WARNING, ""
+             ).replace(CACHE_INFO_MSG, "")
+             upload_reports_to_dataset(
+                 space_id=space_id,
+                 summary_report=summary_to_save,
+                 detailed_report=privacy_to_save,
+                 dataset_id=DATASET_ID,
+                 hf_token=HF_TOKEN,
+             )
+         elif not HF_TOKEN:
+             logging.warning("Skipping cache upload as HF_TOKEN is not set.")
+         elif found_in_cache:
+             logging.info("Skipping cache upload as results were loaded from cache.")
+     except Exception as e:
+         logging.error(f"Non-critical error during report upload: {e}")
+
+     logging.info("Step 8/8: Analysis complete.")
+     progress(8 / steps, desc="Step 8/8: Analysis Complete!")
+
+     # --- Step 8: Yield Final Results --- (ensure the final state is correct)
+     yield (
+         gr.update(value=summary_highlights_report, visible=True),
+         gr.update(value=detailed_privacy_report, visible=True),
+         gr.update(visible=True, open=True),
+         gr.update(visible=True, open=True),
+     )
+
+
+ # --- Load Initial Data Function (for demo.load) ---
+ def load_cached_list():
+     """Fetches the list of cached spaces and determines the default selection."""
+     logging.info("Running demo.load: Fetching list of cached spaces...")
+     # Use os.getenv here directly as HF_TOKEN might be loaded after initial import
+     token = os.getenv("HF_TOKEN")
+     cached_list = list_cached_spaces(DATASET_ID, token)
+     default_value = DEFAULT_SELECTION if DEFAULT_SELECTION in cached_list else None
+     if not cached_list:
+         logging.warning(
+             "No cached spaces found or failed to fetch list during demo.load."
+         )
+     # Return an update object for the dropdown using gr.update()
+     return gr.update(choices=cached_list, value=default_value)
+
+
+ # --- Gradio Interface Definition ---
+ # Use HTML/CSS for centering the title
+ TITLE = "<div style='text-align: center;'><h1>🤗 Space Privacy Analyzer 🕵️</h1></div>\n<div style='text-align: center;'><h4>Automatic code and data transfer review powered by <a href='https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct' target='_blank'>Qwen2.5-Coder-32B-Instruct</a></h4></div>"
+
+ DESCRIPTION = """
+ ### What Privacy Questions do 🤗 Spaces Raise?
+
+ [Hugging Face Spaces](https://huggingface.co/spaces) offer a convenient way to build and share demos leveraging AI models.
+ In most cases, the code for these demos is open source &mdash; which provides a unique opportunity to check **how they manage the privacy** of the data in use.
+
+ This demo leverages a code analysis model ([Qwen2.5-Coder-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct)) to help explore these questions in two steps:
+ it first obtains and **parses the code** to identify data inputs, AI model use, API calls, and data transfer, then generates a summary of the app's function and **key privacy points**.
+
+ Use the dropdown menu below to explore the [reports generated for some popular Spaces](https://huggingface.co/datasets/yjernite/spaces-privacy-reports/tree/main), or enter a new Space ID to query your own 👇
+
+ *Please note the following limitations:*
+ - *The model may easily miss important details in the code, especially when it leverages Dockerfiles or external libraries.*
+ - *This app uses the base Qwen Coder model without task-specific adaptation. We'd love to discuss how to improve this; if you want to participate, feel free to open a discussion!*
+ """
+
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown(TITLE)  # Renders the centered HTML title
+
+     with gr.Row():
+         with gr.Column(scale=1):  # Left column for inputs
+             gr.Markdown(DESCRIPTION)
+
+             cached_spaces_dropdown = gr.Dropdown(
+                 label="Select Existing Report",
+                 info="Select a Space whose report has been previously generated.",
+                 choices=[],  # Initialize empty; populated by demo.load
+                 value=None,  # Initialize empty
+             )
+
+             space_id_input = gr.Textbox(
+                 label="Or Enter New Space ID",
+                 placeholder="owner/space-name",
+                 info="Enter a new Space ID to analyze (takes precedence over selection).",
+             )
+
+             analyze_button = gr.Button("Get Space Report", variant="primary", scale=1)
+
+         with gr.Column(scale=1):  # Right column for outputs
+             # Define Accordions first; both visible, with the detailed report collapsed initially
+             summary_accordion = gr.Accordion(
+                 "Summary & Privacy Highlights", open=True, visible=True
+             )
+             privacy_accordion = gr.Accordion(
+                 "Detailed Privacy Analysis Report", open=False, visible=True
+             )
+             with summary_accordion:
+                 summary_markdown = gr.Markdown(
+                     "Enter or select a Space ID and click Get Report.",
+                     show_copy_button=True,
+                 )
+             with privacy_accordion:
+                 privacy_markdown = gr.Markdown(
+                     "Detailed report will appear here.", show_copy_button=True
+                 )
+
+     # --- Event Listeners ---
+
+     # Load event to populate the dropdown when the UI loads for a user session
+     demo.load(fn=load_cached_list, inputs=None, outputs=cached_spaces_dropdown)
+
+     # Button click event
+     analyze_button.click(
+         fn=get_space_report_wrapper,
+         inputs=[cached_spaces_dropdown, space_id_input],
+         outputs=[
+             summary_markdown,
+             privacy_markdown,
+             summary_accordion,
+             privacy_accordion,
+         ],
+         show_progress="full",
+     )
+
+ # --- Application Entry Point ---
+
+ if __name__ == "__main__":
+     logging.info("Starting Gradio application...")
+     demo.launch()
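
Note: `get_space_report_wrapper` and `_run_live_analysis` rely on Gradio's streaming support for generator event handlers: each yielded tuple supplies one `gr.update` per output component, in the order listed in the `.click()` call. Here is a minimal, self-contained sketch of that pattern (the `slow_task` function and component names are hypothetical, not part of this commit):

import time

import gradio as gr


def slow_task(name: str):
    # Each yield provides one value per output component, in order.
    yield gr.update(value=f"Starting analysis for {name}..."), gr.update(visible=False)
    time.sleep(1)  # stand-in for a long-running step (e.g. an LLM call)
    yield gr.update(value=f"Done with {name}."), gr.update(value="Details appear here.", visible=True)


with gr.Blocks() as sketch:
    inp = gr.Textbox(label="Space ID")
    status = gr.Markdown()
    details = gr.Markdown(visible=False)
    gr.Button("Run").click(fn=slow_task, inputs=inp, outputs=[status, details])

if __name__ == "__main__":
    sketch.launch()

This is also why the two input-validation branches in `get_space_report_wrapper` yield their error tuple before returning: in a generator handler, a plain `return value` would be discarded.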
llm_interface.py ADDED
@@ -0,0 +1,154 @@
+ import logging
+ import os
+
+ from dotenv import load_dotenv
+ from huggingface_hub import InferenceClient
+ from huggingface_hub.inference._generated.types import ChatCompletionOutput
+ from huggingface_hub.utils import HfHubHTTPError
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+ )
+
+ # Load environment variables from .env so the module also works when imported
+ # or used outside of app.py
+ load_dotenv()
+
+ HF_TOKEN = os.getenv("HF_TOKEN")
+ HF_INFERENCE_ENDPOINT_URL = os.getenv("HF_INFERENCE_ENDPOINT_URL")
+
+ # Default parameters for the LLM call
+ DEFAULT_MAX_TOKENS = 2048
+ DEFAULT_TEMPERATURE = 0.1  # Lower temperature for more deterministic analysis
+
+ # Special dictionary to indicate a 503 error
+ ERROR_503_DICT = {"error_type": "503", "message": "Service Unavailable"}
+
+
+ def query_qwen_endpoint(
+     formatted_prompt: list[dict[str, str]], max_tokens: int = DEFAULT_MAX_TOKENS
+ ) -> ChatCompletionOutput | dict | None:
+     """
+     Queries the specified Qwen Inference Endpoint with the formatted prompt.
+
+     Args:
+         formatted_prompt: A list of message dictionaries for the chat completion API.
+         max_tokens: The maximum number of tokens to generate.
+
+     Returns:
+         The ChatCompletionOutput object from the inference client,
+         a specific dictionary (ERROR_503_DICT) if a 503 error occurs,
+         or None if another error occurs.
+     """
+     if not HF_INFERENCE_ENDPOINT_URL:
+         logging.error("HF_INFERENCE_ENDPOINT_URL environment variable not set.")
+         return None
+     if not HF_TOKEN:
+         logging.warning(
+             "HF_TOKEN environment variable not set. Requests might fail if the endpoint requires authentication."
+         )
+         # Depending on endpoint config, it might still work without a token
+
+     logging.info(f"Querying Inference Endpoint: {HF_INFERENCE_ENDPOINT_URL}")
+     client = InferenceClient(model=HF_INFERENCE_ENDPOINT_URL, token=HF_TOKEN)
+
+     try:
+         response = client.chat_completion(
+             messages=formatted_prompt,
+             max_tokens=max_tokens,
+             temperature=DEFAULT_TEMPERATURE,
+             # Qwen models often benefit from setting stop sequences if known,
+             # but we'll rely on max_tokens and the model's natural stopping for now.
+             # stop=["<|im_end|>"]  # Example stop token if needed for specific Qwen finetunes
+         )
+         logging.info("Successfully received response from Inference Endpoint.")
+         return response
+     except HfHubHTTPError as e:
+         # Check specifically for 503 Service Unavailable
+         if e.response is not None and e.response.status_code == 503:
+             logging.warning(
+                 f"Encountered 503 Service Unavailable from endpoint: {HF_INFERENCE_ENDPOINT_URL}"
+             )
+             return ERROR_503_DICT  # Return special dict for 503
+         else:
+             # Handle other HTTP errors
+             logging.error(f"HTTP error querying Inference Endpoint: {e}")
+             if e.response is not None:
+                 logging.error(f"Response details: {e.response.text}")
+             return None  # Return None for other HTTP errors
+     except Exception as e:
+         logging.error(f"An unexpected error occurred querying Inference Endpoint: {e}")
+         return None
+
+
+ def parse_qwen_response(response: ChatCompletionOutput | dict | None) -> str:
+     """
+     Parses the response from the Qwen model to extract the generated text.
+     Handles potential None or error dict inputs.
+
+     Args:
+         response: The ChatCompletionOutput object, ERROR_503_DICT, or None.
+
+     Returns:
+         The extracted response text as a string, or an error message string.
+     """
+     if response is None:
+         return "Error: Failed to get response from the language model."
+
+     # Check if it's our specific 503 error signal before trying to parse as ChatCompletionOutput
+     if isinstance(response, dict) and response.get("error_type") == "503":
+         return f"Error: {response['error_type']} {response['message']}"
+
+     # Check if it's likely the expected ChatCompletionOutput structure
+     if not hasattr(response, "choices"):
+         logging.error(
+             f"Unexpected response type received by parse_qwen_response: {type(response)}. Content: {response}"
+         )
+         return "Error: Received an unexpected response format from the language model endpoint."
+
+     try:
+         # Access the generated content according to the ChatCompletionOutput structure
+         if response.choices and len(response.choices) > 0:
+             content = response.choices[0].message.content
+             if content:
+                 logging.info("Successfully parsed response content.")
+                 return content.strip()
+             else:
+                 logging.warning("Response received, but content is empty.")
+                 return "Error: Received an empty response from the language model."
+         else:
+             logging.warning("Response received, but no choices found.")
+             return "Error: No response choices found in the language model output."
+     except AttributeError as e:
+         # This might catch cases where the response looks like the object but lacks expected attributes
+         logging.error(
+             f"Attribute error parsing response: {e}. Response structure might be unexpected."
+         )
+         logging.error(f"Raw response object: {response}")
+         return "Error: Could not parse the structure of the language model response."
+     except Exception as e:
+         logging.error(f"An unexpected error occurred parsing the response: {e}")
+         return "Error: An unexpected error occurred while parsing the language model response."
+
+
+ # Example Usage (for testing - requires .env setup and prompts.py)
+ # if __name__ == '__main__':
+ #     try:
+ #         from prompts import format_privacy_prompt
+ #         # Create a dummy prompt for testing
+ #         test_files = {"app.py": "print('hello')"}
+ #         test_prompt, _was_truncated = format_privacy_prompt("test/minimal", test_files)
+ #         print("--- Sending Test Prompt ---")
+ #         print(test_prompt)
+ #         api_response = query_qwen_endpoint(test_prompt)
+ #         print("\n--- Raw API Response ---")
+ #         print(api_response)
+ #         print("\n--- Parsed Response ---")
+ #         parsed_text = parse_qwen_response(api_response)
+ #         print(parsed_text)
+ #     except ImportError:
+ #         print("Could not import prompts.py for testing. Run this test from the project root.")
+ #     except Exception as e:
+ #         print(f"An error occurred during testing: {e}")
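
For reference, a minimal sketch of how the two functions above are meant to be used together, including the 503 sentinel check (assumes `HF_TOKEN` and `HF_INFERENCE_ENDPOINT_URL` are set in the environment; the messages are illustrative only):

from llm_interface import ERROR_503_DICT, parse_qwen_response, query_qwen_endpoint

messages = [
    {"role": "system", "content": "You are a code privacy analyst."},
    {"role": "user", "content": "Analyze this code: print('hello')"},
]

response = query_qwen_endpoint(messages, max_tokens=256)
if response == ERROR_503_DICT:
    # The endpoint is scaled to zero or restarting; retry in a few minutes.
    print("Endpoint unavailable (503).")
else:
    # parse_qwen_response returns the generated text, or an "Error: ..." string.
    print(parse_qwen_response(response))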
prompts.py ADDED
@@ -0,0 +1,267 @@
+ import logging
+ from collections import defaultdict
+
+ MAX_FILE_CONTENT_LENGTH = 10000  # Limit the length of individual file contents
+ MAX_MODEL_DESC_LENGTH = 1500  # Limit the length of fetched model descriptions
+ MAX_PROMPT_CHARS = 110000  # Approx < 30k tokens (using ~4 chars/token heuristic)
+
+ # Prompt for the first LLM call: Detailed Privacy Analysis
+ PRIVACY_SYSTEM_PROMPT = (
+     "You are a helpful AI assistant specialized in analyzing Hugging Face Spaces code for privacy concerns. "
+     "Your goal is to identify data flows and potential privacy risks based *only* on the provided code files. "
+     "Analyze the following aspects and provide relevant code snippets (formatted as Markdown code blocks) as evidence for each point. "
+     "**Crucially, include the filename for each code snippet.** Example: `(filename.py)`\n\n"
+     "**Note:** If the app uses externally defined or inaccessible code to upload or process data, say so.\n\n"
+     "1. **Data Inputs:**\n"
+     "   - What types of user data does the application accept as input (e.g., text, images, audio, files)?\n"
+     "   - Where in the code are these inputs defined (e.g., Gradio input widgets, file uploads)? Provide the filename and code snippet.\n\n"
+     "2. **Processing Services & Data Transmission:**\n"
+     "   - What specific internal or external APIs, models, or services are used to process the input data?\n"
+     "   - What specific AI models or services are used to process the input data? Are any of these Hugging Face-hosted models?\n"
+     "   - Where in the code are these services called (e.g., `requests.post`, `InferenceClient`, specific API endpoint URLs) or defined (e.g., `transformers` library)? Provide the filename and code snippet.\n"
+     "   - Is it likely that user data is transmitted to these external services, and what kind of data is transmitted by each service or API? Mention if the services are known (like Hugging Face Inference API/Endpoints) or potentially unknown third parties.\n\n"
+     "3. **Execution Environment & Potential Local Processing:**\n"
+     "   - Does the code indicate that models or significant processing might run *locally* within the Space container? Provide the filename and code snippet.\n"
+     "   - Does the code explicitly use external *inference services* to query AI models? If so, reiterate the relevant code snippet from point 2 with filename.\n\n"
+     "   - Does the code mention interactions with remote databases (e.g., `sqlite`, `postgres`, `mysql`, `redis`, `mongodb`, etc.), storage (e.g., `s3`, `gcs`, `azure`, etc.), or Cloud-based data services? If so, provide the filename and code snippet.\n\n"
+     "4. **Explicit Data Storage/Logging:**\n"
+     "   - Is there any code that explicitly stores user input or results to files, databases, or external logging services? Provide the filename and code snippet.\n\n"
+     "5. **Overall Privacy Risk Summary:**\n"
+     "   - Based ONLY on the evidence from the code snippets above, provide a concise summary paragraph highlighting the main potential privacy considerations or risks.\n\n"
+     "Format your entire response clearly using Markdown. Ensure all code snippets include filename and are properly formatted."
+ )
+
+ # Prompt for the second LLM call: Space Summary + Privacy Highlights
+ SUMMARY_HIGHLIGHTS_SYSTEM_PROMPT = (
+     "You are an AI assistant reviewing a Hugging Face Space. You have been provided with: "
+     "(1) the application code, and "
+     "(2) a detailed preliminary privacy analysis report.\n\n"
+     "Your task is to generate a summary report containing two parts:\n\n"
+     "**Part 1: Space Summary**\n"
+     "- Based on the code and privacy analysis report, provide a concise summary (4-6 sentences max) of what the application does from a user's perspective.\n\n"
+     "**Part 2: Privacy Highlights**\n"
+     "- Using information from the preliminary privacy report (cross-referencing code/descriptions as needed), list the following key privacy aspects:\n"
+     "  1. **Data Inputs:** List the main types of data provided to the application with a brief description for each. List where the data is used or stored by the application.\n"
+     "  2. **AI Models/Services:** List the core AI models or services used. For each, specify: Is it run locally or remotely? What library or service is used, or is the code defined within the app?\n"
+     "  3. **Other Remote Data or Dataset Calls:** List any other identified remote data calls that might upload or transmit data outside of the app (e.g., to databases, external APIs not covered above, cloud storage).\n"
+     "  4. **Libraries Suggesting Data Transmission:** List libraries used (e.g., `requests`, `gradio[sharing]`) that might implicitly or explicitly transmit data, suggesting where users might look for more details (e.g., library documentation, specific code sections).\n\n"
+     "Format the entire response clearly using Markdown. Do not include the preliminary privacy report itself in your output."
+ )
+
+
+ def _generate_file_structure(code_files: dict[str, str]) -> str:
+     """Generates a tree-like textual representation of the file structure."""
+     tree = defaultdict(dict)
+     files = sorted(code_files.keys())
+
+     for fpath in files:
+         parts = fpath.split("/")
+         node = tree
+         for i, part in enumerate(parts):
+             if i == len(parts) - 1:  # It's a file
+                 node[part] = None  # Mark as file
+             else:  # It's a directory
+                 if part not in node:
+                     node[part] = defaultdict(dict)  # Create dir node if not exists
+                 # Check if we previously marked this as a file (edge case where dir name = file name at higher level)
+                 elif node[part] is None:
+                     node[part] = defaultdict(dict)  # Convert file marker to dir
+                 node = node[part]  # Move deeper
+
+     output_lines = ["Project File Structure:"]
+
+     def build_tree_lines(node, prefix=""):
+         # Sort items: directories first (defaultdict instances), then files (keys with None value)
+         items = sorted(
+             node.items(),
+             key=lambda item: isinstance(item[1], defaultdict),
+             reverse=True,
+         )
+
+         pointers = ["├── " for _ in range(len(items) - 1)] + ["└── "]
+         for pointer, (name, sub_node) in zip(pointers, items):
+             output_lines.append(prefix + pointer + name)
+             if isinstance(sub_node, defaultdict):  # It's a directory
+                 extension = "│   " if pointer == "├── " else "    "
+                 build_tree_lines(sub_node, prefix + extension)
+
+     build_tree_lines(tree)
+     return "\n".join(output_lines)
+
+
+ def _format_code_files_for_prompt(code_files: dict[str, str]) -> str:
+     """Formats the code files into a single string for the prompt, sorted by depth and path."""
+
+     def sort_key(filepath):
+         parts = filepath.split("/")
+         depth = len(parts) - 1
+         dir_path = "/".join(parts[:-1]) if depth > 0 else ""
+         filename = parts[-1]
+         return (depth, dir_path, filename)
+
+     sorted_filenames = sorted(code_files.keys(), key=sort_key)
+
+     output_parts = []
+     for filename in sorted_filenames:
+         content = code_files[filename]
+         # Compute the truncation marker outside the f-string: backslashes are
+         # not allowed inside f-string expressions before Python 3.12.
+         truncation_marker = (
+             "\n... [truncated]" if len(content) > MAX_FILE_CONTENT_LENGTH else ""
+         )
+         output_parts.append(
+             f"--- File: {filename} ---\n```\n{content[:MAX_FILE_CONTENT_LENGTH]}{truncation_marker}\n```"
+         )
+
+     return "\n".join(output_parts)
+
+
+ def format_privacy_prompt(
+     space_id: str, code_files: dict[str, str]
+ ) -> tuple[list[dict[str, str]], bool]:
+     """
+     Formats the prompt for the initial detailed privacy analysis task.
+     Returns the messages list and a boolean indicating whether truncation occurred.
+     """
+     was_truncated = False
+     file_structure = _generate_file_structure(code_files)
+     formatted_code = _format_code_files_for_prompt(code_files)
+
+     # Define components for length calculation
+     prompt_header = f"Please perform a detailed privacy analysis for the Hugging Face Space '{space_id}'.\n\n{file_structure}\n\nCode Files Content:\n"
+     base_length = len(prompt_header) + len(PRIVACY_SYSTEM_PROMPT)
+
+     # Check if the formatted code needs truncation for the overall prompt
+     available_chars_for_code = MAX_PROMPT_CHARS - base_length
+     if available_chars_for_code < 0:  # Header itself is too long (unlikely)
+         available_chars_for_code = 0
+         was_truncated = True
+
+     if len(formatted_code) > available_chars_for_code:
+         formatted_code = (
+             formatted_code[:available_chars_for_code]
+             + "\n... [Code Section Truncated Due to Overall Prompt Length] ..."
+         )
+         was_truncated = True
+         logging.warning(
+             f"Privacy prompt code section truncated for Space ID {space_id} due to overall length."
+         )
+
+     user_content = prompt_header + formatted_code
+
+     messages = [
+         {"role": "system", "content": PRIVACY_SYSTEM_PROMPT},
+         {"role": "user", "content": user_content},
+     ]
+     return messages, was_truncated
+
+
+ def format_summary_highlights_prompt(
+     space_id: str, code_files: dict[str, str], detailed_privacy_report: str
+ ) -> tuple[list[dict[str, str]], bool]:
+     """
+     Formats the prompt for the final summary + highlights report.
+     Returns the messages list and a boolean indicating whether truncation occurred.
+     """
+     was_truncated = False
+     file_structure = _generate_file_structure(code_files)
+     formatted_code = _format_code_files_for_prompt(code_files)
+
+     # Define components for length calculation
+     prompt_header = f"Please generate a final summary and privacy highlights report for the Hugging Face Space '{space_id}'.\n\n"
+     report_header = "**Preliminary Detailed Privacy Report:**\n---\n"
+     report_footer = "\n---\n\n"
+     support_header = f"**Supporting Information:**\n{file_structure}\n\n"
+     code_header = "**Original Code Files Content:**\n"
+
+     base_length = (
+         len(prompt_header)
+         + len(report_header)
+         + len(report_footer)
+         + len(support_header)
+         + len(code_header)
+         + len(SUMMARY_HIGHLIGHTS_SYSTEM_PROMPT)
+     )
+     available_chars_total = MAX_PROMPT_CHARS - base_length
+
+     if available_chars_total < 0:  # Base structure is too long
+         logging.error(
+             f"Base prompt structure for summary highlights exceeds limit for Space ID {space_id}. Cannot proceed effectively."
+         )
+         # Return minimal user content to avoid errors, but flag truncation heavily
+         user_content = (
+             prompt_header
+             + report_header
+             + "[TRUNCATED DUE TO LENGTH]"
+             + report_footer
+             + support_header
+             + code_header
+             + "[TRUNCATED DUE TO LENGTH]"
+         )
+         was_truncated = True
+     else:
+         # Prioritize truncating the detailed report first
+         available_chars_for_report = available_chars_total - len(
+             formatted_code
+         )  # Reserve space for code
+         if available_chars_for_report < 0:
+             available_chars_for_report = 0  # Cannot fit report
+
+         if len(detailed_privacy_report) > available_chars_for_report:
+             detailed_privacy_report = (
+                 detailed_privacy_report[:available_chars_for_report]
+                 + "\n... [Detailed Privacy Report Truncated Due to Overall Prompt Length] ..."
+             )
+             was_truncated = True
+             logging.warning(
+                 f"Summary prompt detailed report section truncated for Space ID {space_id}."
+             )
+
+         # Now check the code length again with the (potentially truncated) report length
+         available_chars_for_code = available_chars_total - len(detailed_privacy_report)
+         if available_chars_for_code < 0:
+             available_chars_for_code = 0  # Cannot fit code
+
+         if len(formatted_code) > available_chars_for_code:
+             formatted_code = (
+                 formatted_code[:available_chars_for_code]
+                 + "\n... [Code Section Truncated Due to Overall Prompt Length] ..."
+             )
+             was_truncated = True
+             logging.warning(
+                 f"Summary prompt code section truncated for Space ID {space_id}."
+             )
+
+         # Assemble the final user content
+         user_content = (
+             prompt_header
+             + report_header
+             + detailed_privacy_report
+             + report_footer
+             + support_header
+             + code_header
+             + formatted_code
+         )
+
+     messages = [
+         {"role": "system", "content": SUMMARY_HIGHLIGHTS_SYSTEM_PROMPT},
+         {"role": "user", "content": user_content},
+     ]
+     return messages, was_truncated
+
+
+ # Example usage (for testing)
+ # if __name__ == '__main__':
+ #     test_files = {
+ #         "app.py": "import gradio as gr\n\ndef greet(name):\n    # Potentially send data to external service?\n    # requests.post('http://example.com/log', json={'user': name})\n    return f'Hello {name}!'",
+ #         "requirements.txt": "gradio\nrequests",
+ #         "nested/utils.py": "def helper():\n    pass",
+ #         "README.md": "This should be ignored.",  # Example of a file that *should* be filtered out before reaching here
+ #         "very_long_file.py": "print('hello' * 5000)"  # Test truncation
+ #     }
+ #     # Typically, files like README.md would be filtered by get_space_code_files in utils.py.
+ #     # We include it here just for demo purposes if you were to test prompts.py directly.
+ #     filtered_test_files = {k: v for k, v in test_files.items() if not k.endswith('.md')}
+ #     prompt_messages, was_truncated = format_privacy_prompt("test/space", filtered_test_files)
+ #     print("--- System Prompt ---")
+ #     print(prompt_messages[0]['content'])
+ #     print("\n--- User Prompt ---")
+ #     print(prompt_messages[1]['content'])
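
A minimal sketch of how these prompt builders are consumed (the dummy `code_files` dict is for illustration only):

from prompts import format_privacy_prompt, format_summary_highlights_prompt

code_files = {
    "app.py": "import gradio as gr\n# ...",
    "requirements.txt": "gradio\nrequests",
}

# Each builder returns (messages, was_truncated); messages is a system+user
# pair ready for a chat-completion API.
messages, was_truncated = format_privacy_prompt("owner/space-name", code_files)
assert messages[0]["role"] == "system"
if was_truncated:
    print("Code section was cut to fit the ~110k-character prompt budget.")

# The second call additionally folds in the first report:
summary_messages, _ = format_summary_highlights_prompt(
    "owner/space-name", code_files, "(detailed privacy report text)"
)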
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ gradio
+ huggingface-hub
+ python-dotenv
+ requests
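
To run the app locally with these dependencies, the modules above also expect two environment variables, loaded from a `.env` file via python-dotenv (the values below are placeholders):

HF_TOKEN=hf_...your-token...           # dataset cache access and endpoint auth
HF_INFERENCE_ENDPOINT_URL=https://...  # Qwen2.5-Coder chat-completion endpoint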
utils.py ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import re
4
+ import tempfile
5
+
6
+ from huggingface_hub import HfApi, hf_hub_download
7
+ from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError
8
+
9
+ # Configure logging
10
+ logging.basicConfig(
11
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
12
+ )
13
+
14
+ # Files/extensions to definitely include
15
+ INCLUDE_PATTERNS = [
16
+ ".py",
17
+ "requirements.txt",
18
+ "Dockerfile",
19
+ ".js",
20
+ ".jsx",
21
+ ".ts",
22
+ ".tsx",
23
+ ".html",
24
+ ".css",
25
+ ".svelte",
26
+ ".vue",
27
+ ".json",
28
+ ".yaml",
29
+ ".yml",
30
+ ".toml",
31
+ "Procfile",
32
+ ".sh",
33
+ ]
34
+
35
+ # Files/extensions/folders to ignore
36
+ IGNORE_PATTERNS = [
37
+ ".git",
38
+ ".hfignore",
39
+ "README.md",
40
+ "LICENSE",
41
+ "__pycache__",
42
+ ".ipynb_checkpoints",
43
+ ".png",
44
+ ".jpg",
45
+ ".jpeg",
46
+ ".gif",
47
+ ".svg",
48
+ ".ico",
49
+ ".mp3",
50
+ ".wav",
51
+ ".mp4",
52
+ ".mov",
53
+ ".avi",
54
+ ".onnx",
55
+ ".pt",
56
+ ".pth",
57
+ ".bin",
58
+ ".safetensors",
59
+ ".tflite",
60
+ ".pickle",
61
+ ".pkl",
62
+ ".joblib",
63
+ ".parquet",
64
+ ".csv",
65
+ ".tsv",
66
+ ".zip",
67
+ ".tar.gz",
68
+ ".gz",
69
+ ".ipynb",
70
+ ".DS_Store",
71
+ "node_modules",
72
+ ]
73
+
74
+ # Regex to find potential Hugging Face model IDs (e.g., "org/model-name", "user/model-name")
75
+ # This is a simple heuristic and might catch non-model strings or miss complex cases.
76
+ HF_MODEL_ID_PATTERN = re.compile(r"([\"\'])([\w\-.]+/[\w\-\.]+)\1\'")
77
+
78
+ # Max length for model descriptions to keep prompts manageable
79
+ MAX_MODEL_DESC_LENGTH = 1500
80
+
81
+ SUMMARY_FILENAME = "summary_highlights.md"
82
+ PRIVACY_FILENAME = "privacy_report.md"
83
+
84
+
85
+ def _is_relevant_file(filename):
86
+ """Check if a file should be included based on patterns."""
87
+ # Ignore files matching ignore patterns (case-insensitive check for some)
88
+ lower_filename = filename.lower()
89
+ if any(
90
+ pattern in lower_filename
91
+ for pattern in [".git", ".hfignore", "readme.md", "license"]
92
+ ):
93
+ return False
94
+ if any(
95
+ filename.endswith(ext) for ext in IGNORE_PATTERNS if ext.startswith(".")
96
+ ): # Check extensions
97
+ return False
98
+ if any(
99
+ part == pattern
100
+ for part in filename.split("/")
101
+ for pattern in IGNORE_PATTERNS
102
+ if "." not in pattern and "/" not in pattern
103
+ ): # Check directory/file names
104
+ return False
105
+ if filename in IGNORE_PATTERNS: # Check full filenames
106
+ return False
107
+
108
+ # Include files matching include patterns
109
+ if any(filename.endswith(ext) for ext in INCLUDE_PATTERNS if ext.startswith(".")):
110
+ return True
111
+ if any(filename == pattern for pattern in INCLUDE_PATTERNS if "." not in pattern):
112
+ return True
113
+
114
+ # Default to False if not explicitly included (safer)
115
+ # logging.debug(f"File '{filename}' excluded by default.")
116
+ return False
117
+
118
+
119
+ def get_space_code_files(space_id: str) -> dict[str, str]:
120
+ """
121
+ Downloads relevant code and configuration files from a Hugging Face Space.
122
+
123
+ Args:
124
+ space_id: The ID of the Hugging Face Space (e.g., 'gradio/hello_world').
125
+
126
+ Returns:
127
+ A dictionary where keys are filenames and values are file contents as strings.
128
+ Returns an empty dictionary if the space is not found or has no relevant files.
129
+ """
130
+ code_files = {}
131
+ api = HfApi()
132
+
133
+ try:
134
+ logging.info(f"Fetching file list for Space: {space_id}")
135
+ repo_files = api.list_repo_files(repo_id=space_id, repo_type="space")
136
+ logging.info(f"Found {len(repo_files)} total files in {space_id}.")
137
+
138
+ relevant_files = [f for f in repo_files if _is_relevant_file(f)]
139
+ logging.info(f"Identified {len(relevant_files)} relevant files for download.")
140
+
141
+ for filename in relevant_files:
142
+ try:
143
+ logging.debug(f"Downloading {filename} from {space_id}...")
144
+ file_path = hf_hub_download(
145
+ repo_id=space_id,
146
+ filename=filename,
147
+ repo_type="space",
148
+ # Consider adding use_auth_token=os.getenv("HF_TOKEN") if accessing private spaces later
149
+ )
150
+ with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
151
+ content = f.read()
152
+ code_files[filename] = content
153
+ logging.debug(f"Successfully read content of {filename}")
154
+ except EntryNotFoundError:
155
+ logging.warning(
156
+ f"File {filename} listed but not found in repo {space_id}."
157
+ )
158
+ except UnicodeDecodeError:
159
+ logging.warning(
160
+ f"Could not decode file {filename} from {space_id} as UTF-8. Skipping."
161
+ )
162
+ except OSError as e:
163
+ logging.warning(f"OS error reading file {filename} from cache: {e}")
164
+ except Exception as e:
165
+ logging.error(
166
+ f"Unexpected error downloading or reading file {filename} from {space_id}: {e}"
167
+ )
168
+
169
+ except RepositoryNotFoundError:
170
+ logging.error(f"Space repository '{space_id}' not found.")
171
+ return {}
172
+ except Exception as e:
173
+ logging.error(f"Failed to list or process files for space {space_id}: {e}")
174
+ return {}
175
+
176
+ logging.info(
177
+ f"Successfully retrieved content for {len(code_files)} files from {space_id}."
178
+ )
179
+ return code_files
180
+
181
+
182
+ def extract_hf_model_ids(code_files: dict[str, str]) -> set[str]:
183
+ """
184
+ Extracts potential Hugging Face model IDs mentioned in code files.
185
+
186
+ Args:
187
+ code_files: Dictionary of {filename: content}.
188
+
189
+ Returns:
190
+ A set of unique potential model IDs found.
191
+ """
192
+ potential_ids = set()
193
+ for filename, content in code_files.items():
194
+ # Limit search to relevant file types
195
+ if filename.endswith((".py", ".json", ".yaml", ".yml", ".toml", ".md")):
196
+ try:
197
+ matches = HF_MODEL_ID_PATTERN.findall(content)
198
+ for _, model_id in matches:
199
+ # Basic validation: must contain exactly one '/'
200
+ if model_id.count("/") == 1:
201
+ # Avoid adding common paths that look like IDs
202
+ if not any(
203
+ part in model_id.lower()
204
+ for part in ["http", "www", "@", " ", ".", ":"]
205
+ ): # Check if '/' is only separator
206
+ if len(model_id) < 100: # Avoid overly long strings
207
+ potential_ids.add(model_id)
208
+ except Exception as e:
209
+ logging.warning(f"Regex error processing file {filename}: {e}")
210
+
211
+ logging.info(f"Extracted {len(potential_ids)} potential model IDs.")
212
+ # Add simple filter for very common false positives if needed
213
+ # potential_ids = {id for id in potential_ids if id not in ['user/repo']}
214
+ return potential_ids
215
+
216
+
217
+ def get_model_descriptions(model_ids: set[str]) -> dict[str, str]:
218
+ """
219
+ Fetches the README.md content (description) for a set of model IDs.
220
+
221
+ Args:
222
+ model_ids: A set of Hugging Face model IDs.
223
+
224
+ Returns:
225
+ A dictionary mapping model_id to its description string (or an error message).
226
+ """
227
+ descriptions = {}
228
+ if not model_ids:
229
+ return descriptions
230
+
231
+ logging.info(f"Fetching descriptions for {len(model_ids)} models...")
232
+ for model_id in model_ids:
233
+ try:
234
+ # Check if the model exists first (optional but good practice)
235
+ # api.model_info(model_id)
236
+
237
+ # Download README.md
238
+ readme_path = hf_hub_download(
239
+ repo_id=model_id,
240
+ filename="README.md",
241
+ repo_type="model",
242
+ # Add token if needing to access private/gated models - unlikely for Space analysis
243
+ # use_auth_token=os.getenv("HF_TOKEN"),
244
+ error_if_not_found=True, # Raise error if README doesn't exist
245
+ )
246
+ with open(readme_path, "r", encoding="utf-8", errors="ignore") as f:
247
+ description = f.read()
248
+ descriptions[model_id] = description[:MAX_MODEL_DESC_LENGTH] + (
249
+ "... [truncated]" if len(description) > MAX_MODEL_DESC_LENGTH else ""
250
+ )
251
+ logging.debug(f"Successfully fetched description for {model_id}")
252
+ except RepositoryNotFoundError:
253
+ logging.warning(f"Model repository '{model_id}' not found.")
254
+ descriptions[model_id] = "[Model repository not found]"
255
+ except EntryNotFoundError:
256
+ logging.warning(f"README.md not found in model repository '{model_id}'.")
257
+ descriptions[model_id] = "[README.md not found in model repository]"
258
+ except Exception as e:
259
+ logging.error(f"Error fetching description for model '{model_id}': {e}")
260
+ descriptions[model_id] = f"[Error fetching description: {e}]"
261
+
262
+ logging.info(f"Finished fetching descriptions for {len(descriptions)} models.")
263
+ return descriptions
264
+
265
+
266
+ def list_cached_spaces(dataset_id: str, hf_token: str | None) -> list[str]:
267
+ """Lists the space IDs (owner/name) that have cached reports in the dataset repository."""
268
+ if not hf_token:
269
+ logging.warning("HF Token not provided, cannot list cached spaces.")
270
+ return []
271
+ try:
272
+ api = HfApi(token=hf_token)
273
+ # Get all filenames in the dataset repository
274
+ all_files = api.list_repo_files(repo_id=dataset_id, repo_type="dataset")
275
+
276
+ # Extract unique directory paths that look like owner/space_name
277
+ # by checking if they contain our specific report files.
278
+ space_ids = set()
279
+ for f_path in all_files:
280
+ # Check if the file is one of our report files
281
+ if f_path.endswith(f"/{PRIVACY_FILENAME}") or f_path.endswith(
282
+ f"/{SUMMARY_FILENAME}"
283
+ ):
284
+ # Extract the directory path part (owner/space_name)
285
+ parts = f_path.split("/")
286
+ if len(parts) == 3: # Expecting owner/space_name/filename.md
287
+ owner_slash_space_name = "/".join(parts[:-1])
288
+ # Basic validation: owner and space name shouldn't start with '.'
289
+ if not parts[0].startswith(".") and not parts[1].startswith("."):
290
+ space_ids.add(owner_slash_space_name)
291
+
292
+ sorted_space_ids = sorted(list(space_ids))
293
+ logging.info(
294
+ f"Found {len(sorted_space_ids)} cached space reports in {dataset_id} via HfApi."
295
+ )
296
+ return sorted_space_ids
297
+
298
+ except RepositoryNotFoundError:
299
+ logging.warning(
300
+ f"Dataset {dataset_id} not found or empty when listing cached spaces."
301
+ )
302
+ return []
303
+ except Exception as e:
304
+ logging.error(f"Error listing cached spaces in {dataset_id} via HfApi: {e}")
305
+ return [] # Return empty list on error
306
+
307
+
308
+ def check_report_exists(space_id: str, dataset_id: str, hf_token: str | None) -> bool:
+     """Checks if report files already exist in the target dataset repo using HfApi."""
+     print(
+         f"[Debug Cache Check] Checking for space_id: '{space_id}' in dataset: '{dataset_id}'"
+     )  # DEBUG
+     if not hf_token:
+         logging.warning("HF Token not provided, cannot check dataset cache.")
+         print("[Debug Cache Check] No HF Token, returning False.")  # DEBUG
+         return False
+     try:
+         api = HfApi(token=hf_token)
+         # List ALL files in the repo
+         print(f"[Debug Cache Check] Listing ALL files in repo '{dataset_id}'")  # DEBUG
+         all_repo_files = api.list_repo_files(repo_id=dataset_id, repo_type="dataset")
+         # DEBUG: optionally print a subset if the list is huge
+         # print(f"[Debug Cache Check] First 10 files returned by API: {all_repo_files[:10]}")
+
+         # Construct the exact paths we expect for the target space_id
+         expected_summary_path = f"{space_id}/{SUMMARY_FILENAME}"
+         expected_privacy_path = f"{space_id}/{PRIVACY_FILENAME}"
+         print(
+             f"[Debug Cache Check] Expecting summary file: '{expected_summary_path}'"
+         )  # DEBUG
+         print(
+             f"[Debug Cache Check] Expecting privacy file: '{expected_privacy_path}'"
+         )  # DEBUG
+
+         # A cache hit requires BOTH expected paths to be in the full file list
+         summary_exists = expected_summary_path in all_repo_files
+         privacy_exists = expected_privacy_path in all_repo_files
+         exists = summary_exists and privacy_exists
+         print(
+             f"[Debug Cache Check] Summary exists in full list: {summary_exists}"
+         )  # DEBUG
+         print(
+             f"[Debug Cache Check] Privacy exists in full list: {privacy_exists}"
+         )  # DEBUG
+         print(f"[Debug Cache Check] Overall exists check result: {exists}")  # DEBUG
+         return exists
+
+     except RepositoryNotFoundError:
+         logging.warning(
+             f"Dataset repository {dataset_id} not found or not accessible during check."
+         )
+         print(
+             f"[Debug Cache Check] Repository {dataset_id} not found, returning False."
+         )  # DEBUG
+     except Exception as e:
+         # A 404 check based on path_in_repo is no longer applicable here;
+         # we rely on RepositoryNotFoundError or the general Exception instead.
+         print(f"[Debug Cache Check] Exception caught: {type(e).__name__}: {e}")  # DEBUG
+         logging.error(
+             f"Error checking dataset {dataset_id} for {space_id} via HfApi: {e}"
+         )
+         print("[Debug Cache Check] Other exception, returning False.")  # DEBUG
+     return False  # Treat errors as a cache miss
+
+
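+ # Minimal sketch (commented out) of the membership check used above, with
+ # hypothetical filenames; the real constants SUMMARY_FILENAME and
+ # PRIVACY_FILENAME are defined earlier in this module:
+ # >>> all_repo_files = ["owner/space/summary.md", "owner/space/privacy.md"]
+ # >>> "owner/space/summary.md" in all_repo_files and "owner/space/privacy.md" in all_repo_files
+ # True
+
+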
+ def download_cached_reports(
+     space_id: str, dataset_id: str, hf_token: str | None
+ ) -> dict[str, str]:
+     """Downloads cached reports from the dataset repo. Raises an error on failure."""
+     if not hf_token:
+         raise ValueError("HF Token required to download cached reports.")
+
+     logging.info(
+         f"Attempting to download cached reports for {space_id} from {dataset_id}..."
+     )
+     reports = {}
+     # Define paths relative to the dataset root, as expected by hf_hub_download
+     summary_repo_path = f"{space_id}/{SUMMARY_FILENAME}"
+     privacy_repo_path = f"{space_id}/{PRIVACY_FILENAME}"
+     try:
+         # Download the summary report
+         summary_path_local = hf_hub_download(
+             repo_id=dataset_id,
+             filename=summary_repo_path,
+             repo_type="dataset",
+             token=hf_token,
+         )
+         with open(summary_path_local, "r", encoding="utf-8") as f:
+             reports["summary"] = f.read()
+         logging.info(f"Successfully downloaded cached summary for {space_id}.")
+
+         # Download the privacy report
+         privacy_path_local = hf_hub_download(
+             repo_id=dataset_id,
+             filename=privacy_repo_path,
+             repo_type="dataset",
+             token=hf_token,
+         )
+         with open(privacy_path_local, "r", encoding="utf-8") as f:
+             reports["privacy"] = f.read()
+         logging.info(f"Successfully downloaded cached privacy report for {space_id}.")
+
+         return reports
+
+     except EntryNotFoundError as e:
+         # Report which of the two files was missing, based on the error message
+         missing_file = (
+             summary_repo_path if summary_repo_path in str(e) else privacy_repo_path
+         )
+         logging.error(
+             f"Cache download error: Report file {missing_file} not found for {space_id} in {dataset_id}. {e}"
+         )
+         raise FileNotFoundError(
+             f"Cached report file {missing_file} not found for {space_id}"
+         ) from e
+     except RepositoryNotFoundError as e:
+         logging.error(f"Cache download error: Dataset repo {dataset_id} not found. {e}")
+         raise FileNotFoundError(f"Dataset repo {dataset_id} not found") from e
+     except Exception as e:
+         logging.error(
+             f"Unexpected error downloading cached reports for {space_id} from {dataset_id}: {e}"
+         )
+         raise IOError(f"Failed to download cached reports for {space_id}") from e
+
+
+ def upload_reports_to_dataset(
+     space_id: str,
+     summary_report: str,
+     detailed_report: str,
+     dataset_id: str,
+     hf_token: str | None,
+ ):
+     """Uploads the generated reports to the specified dataset repository."""
+     if not hf_token:
+         logging.warning("HF Token not provided, skipping dataset report upload.")
+         return
+
+     logging.info(
+         f"Attempting to upload reports for {space_id} to dataset {dataset_id}..."
+     )
+     api = HfApi(token=hf_token)
+
+     # Sanitize space_id for path safety (though the HF Hub usually handles this)
+     safe_space_id = space_id.replace("..", "")
+
+     try:
+         with tempfile.TemporaryDirectory() as tmpdir:
+             # Write both reports to a temporary directory before uploading
+             summary_path_local = os.path.join(tmpdir, SUMMARY_FILENAME)
+             privacy_path_local = os.path.join(tmpdir, PRIVACY_FILENAME)
+
+             with open(summary_path_local, "w", encoding="utf-8") as f:
+                 f.write(summary_report)
+             with open(privacy_path_local, "w", encoding="utf-8") as f:
+                 f.write(detailed_report)
+
+             commit_message = f"Add privacy analysis reports for Space: {safe_space_id}"
+             # Make sure the dataset repo exists before uploading into it
+             repo_url = api.create_repo(
+                 repo_id=dataset_id,
+                 repo_type="dataset",
+                 exist_ok=True,
+             )
+             logging.info(f"Ensured dataset repo {repo_url} exists.")
+
+             api.upload_file(
+                 path_or_fileobj=summary_path_local,
+                 path_in_repo=f"{safe_space_id}/{SUMMARY_FILENAME}",
+                 repo_id=dataset_id,
+                 repo_type="dataset",
+                 commit_message=commit_message,
+             )
+             logging.info(f"Successfully uploaded summary report for {safe_space_id}.")
+
+             api.upload_file(
+                 path_or_fileobj=privacy_path_local,
+                 path_in_repo=f"{safe_space_id}/{PRIVACY_FILENAME}",
+                 repo_id=dataset_id,
+                 repo_type="dataset",
+                 commit_message=commit_message,
+             )
+             logging.info(
+                 f"Successfully uploaded detailed privacy report for {safe_space_id}."
+             )
+
+     except Exception as e:
+         # Upload failures are logged but not re-raised: report upload is best-effort
+         logging.error(
+             f"Failed to upload reports for {safe_space_id} to dataset {dataset_id}: {e}"
+         )
+
+
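+ # Hedged sketch (commented out) of the dataset layout produced by a successful
+ # upload_reports_to_dataset call; the repo id is a hypothetical placeholder and
+ # the filenames come from the module constants SUMMARY_FILENAME / PRIVACY_FILENAME:
+ #
+ # some-org/privacy-reports  (dataset repo)
+ # └── owner/space_name/
+ #     ├── <SUMMARY_FILENAME>
+ #     └── <PRIVACY_FILENAME>
+
+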
+ # Example usage (for testing)
+ # if __name__ == '__main__':
+ #     # Make sure HF_TOKEN is set if accessing private spaces or for higher rate limits
+ #     from dotenv import load_dotenv
+ #     load_dotenv()
+ #     # test_space = "gradio/hello_world"
+ #     test_space = "huggingface-projects/diffusers-gallery"  # A more complex example
+ #     # test_space = "nonexistent/space"  # Test not found
+ #     files_content = get_space_code_files(test_space)
+ #     if files_content:
+ #         print(f"\n--- Files retrieved from {test_space} ---")
+ #         for name in files_content.keys():
+ #             print(f"- {name}")
+ #         # print("\n--- Content of app.py (first 200 chars) ---")
+ #         # print(files_content.get("app.py", "app.py not found")[:200])
+ #     else:
+ #         print(f"Could not retrieve files from {test_space}")