Upload 6 files

Files changed:
- .gitignore (+78, -0)
- app.py (+521, -0)
- llm_interface.py (+154, -0)
- prompts.py (+267, -0)
- requirements.txt (+4, -0)
- utils.py (+507, -0)
.gitignore
ADDED
@@ -0,0 +1,78 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+.pytest_cache/
+.hypothesis/
+
+# Environments
+.env
+.venv
+venv/
+ENV/
+env/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# VS Code settings
+.vscode/
+
+# Temp files
+*.tmp
app.py
ADDED
@@ -0,0 +1,521 @@
+import logging
+import os
+
+import gradio as gr
+from dotenv import load_dotenv
+
+from llm_interface import ERROR_503_DICT  # Import error dict
+from llm_interface import parse_qwen_response, query_qwen_endpoint
+
+# Updated prompt imports for new order
+from prompts import format_privacy_prompt, format_summary_highlights_prompt
+
+# Import helper functions from other modules
+from utils import list_cached_spaces  # Added import
+from utils import (
+    check_report_exists,
+    download_cached_reports,
+    get_space_code_files,
+    upload_reports_to_dataset,
+)
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+# Load environment variables from .env file
+# This is important to ensure API keys and endpoints are loaded before use
+load_dotenv()
+
+# --- Constants ---
+HF_TOKEN = os.getenv("HF_TOKEN")
+DATASET_ID = "yjernite/spaces-privacy-reports"
+CACHE_INFO_MSG = "\n\n*(Report retrieved from cache)*"
+DEFAULT_SELECTION = "HuggingFaceTB/SmolVLM2"
+
+TRUNCATION_WARNING = """**⚠️ Warning:** The input data (code and/or prior analysis) was too long for the AI model's context limit and had to be truncated. The analysis below may be incomplete or based on partial information.\n\n---\n\n"""
+
+ERROR_503_USER_MESSAGE = """**503 Service Unavailable**: It appears that the analysis model endpoint is currently down or starting up.
+
+You have a few options:
+
+* **Wait & Retry:** Try clicking "Get Space Report" again in ~3-5 minutes. Endpoints often scale down to save resources and take a short time to wake up.
+* **Select Cached Report:** Use the dropdown above to view a report for a Space that has already been analyzed.
+* **Request Analysis:** If the error persists, please open an issue or discussion in the Space's Community tab requesting analysis for your target Space ID. We can run the job manually when the endpoint is available.
+"""
+
+
+def get_space_report_wrapper(
+    selected_cached_space: str | None,
+    new_space_id: str | None,
+    progress=gr.Progress(track_tqdm=True),
+):
+    """
+    Wrapper function to decide whether to fetch cache or run live analysis.
+    Handles the logic based on Dropdown and Textbox inputs.
+    Yields tuples of Gradio updates.
+    """
+    target_space_id = None
+    source = "new"  # Assume new input unless dropdown is chosen
+
+    # Prioritize new_space_id if provided
+    if new_space_id and new_space_id.strip():
+        target_space_id = new_space_id.strip()
+        if target_space_id == selected_cached_space:
+            source = "dropdown_match"  # User typed ID that exists in dropdown
+        else:
+            source = "new"
+    elif selected_cached_space:
+        target_space_id = selected_cached_space
+        source = "dropdown"
+
+    if not target_space_id:
+        # No input provided
+        return (
+            gr.update(
+                value="Please select an existing report or enter a new Space ID.",
+                visible=True,
+            ),
+            gr.update(value="", visible=False),
+            gr.update(visible=True, open=True),
+            gr.update(visible=False),
+        )
+
+    # Validate format
+    if "/" not in target_space_id:
+        return (
+            gr.update(
+                value=f"Invalid Space ID format: '{target_space_id}'. Use 'owner/name'.",
+                visible=True,
+            ),
+            gr.update(value="", visible=False),
+            gr.update(visible=True, open=True),
+            gr.update(visible=False),
+        )
+
+    logging.info(f"Request received for: '{target_space_id}' (Source: {source})")
+
+    # --- Cache Handling ---
+    # If the user explicitly selected from the dropdown, try to fetch it directly.
+    if source == "dropdown":
+        progress(
+            0.1, desc="Fetching cached report..."
+        )  # Simple progress for cache fetch
+        yield (
+            gr.update(value="Fetching selected cached report...", visible=True),
+            gr.update(value="", visible=True),
+            gr.update(visible=True, open=True),
+            gr.update(visible=True, open=False),
+        )
+        try:
+            cached_reports = download_cached_reports(
+                target_space_id, DATASET_ID, HF_TOKEN
+            )
+            summary_report = (
+                cached_reports.get("summary", "Error: Cached summary not found.")
+                + CACHE_INFO_MSG
+            )
+            privacy_report = (
+                cached_reports.get("privacy", "Error: Cached privacy report not found.")
+                + CACHE_INFO_MSG
+            )
+            logging.info(
+                f"Successfully displayed cached reports for selected '{target_space_id}'."
+            )
+            progress(1.0, desc="Complete (from cache)")
+            yield (
+                gr.update(value=summary_report, visible=True),
+                gr.update(value=privacy_report, visible=True),
+                gr.update(visible=True, open=True),
+                gr.update(visible=True, open=True),
+            )
+        except Exception as e:
+            error_msg = f"Failed to download cached report for selected '{target_space_id}': {e}"
+            logging.error(error_msg)
+            progress(1.0, desc="Error")
+            yield (
+                gr.update(value=error_msg, visible=True),
+                gr.update(value="", visible=False),
+                gr.update(visible=True, open=True),
+                gr.update(visible=False),
+            )
+
+    # --- Live Analysis or Check Cache for New Input ---
+    # If it came from the textbox OR was a dropdown match, we first check cache, then run live.
+    else:  # source == "new" or source == "dropdown_match"
+        # This generator now performs the full analysis if needed
+        # Yield intermediate updates from the generator
+        # Important: Need to use a loop to consume the generator
+        final_update = None
+        for update_tuple in _run_live_analysis(target_space_id, progress):
+            yield update_tuple
+            final_update = update_tuple  # Keep track of the last update
+        yield final_update  # Return the very last state
+
+
+def _run_live_analysis(space_id: str, progress=gr.Progress(track_tqdm=True)):
+    """
+    Performs the full analysis pipeline: cache check, code fetch, LLM calls, upload.
+    Yields tuples of Gradio updates.
+    (This contains the logic previously in analyze_space_privacy, minus initial input handling)
+    """
+    steps = 8  # Steps for the full pipeline
+    privacy_truncated = False
+    summary_truncated = False
+
+    # --- Step 1: Check Cache --- (Check again for new/matched input)
+    progress(1 / steps, desc="Step 1/8: Checking cache...")
+    logging.info(f"Step 1/8: Checking cache for '{space_id}'...")
+    yield (
+        gr.update(value="Checking cache for existing reports...", visible=True),
+        gr.update(value="", visible=True),
+        gr.update(visible=True, open=True),
+        gr.update(visible=True, open=False),
+    )
+    found_in_cache = False
+    if HF_TOKEN:
+        try:
+            found_in_cache = check_report_exists(space_id, DATASET_ID, HF_TOKEN)
+        except Exception as e:
+            logging.warning(f"Cache check failed: {e}. Proceeding.")
+            yield (
+                gr.update(
+                    value="Cache check failed, proceeding with live analysis...",
+                    visible=True,
+                ),
+                gr.update(value="", visible=True),
+                gr.update(visible=True, open=True),
+                gr.update(visible=True, open=False),
+            )
+
+    if found_in_cache:
+        logging.info(f"Cache hit for {space_id}. Downloading.")
+        progress(2 / steps, desc="Step 2/8: Cache hit! Downloading reports...")
+        yield (
+            gr.update(value="Cache hit! Downloading reports...", visible=True),
+            gr.update(value="", visible=True),
+            gr.update(visible=True, open=True),
+            gr.update(visible=True, open=False),
+        )
+        try:
+            cached_reports = download_cached_reports(space_id, DATASET_ID, HF_TOKEN)
+            summary_report = (
+                cached_reports.get("summary", "Error: Cached summary not found.")
+                + CACHE_INFO_MSG
+            )
+            privacy_report = (
+                cached_reports.get("privacy", "Error: Cached privacy report not found.")
+                + CACHE_INFO_MSG
+            )
+            logging.info(f"Successfully displayed cached reports for {space_id}.")
+            progress(8 / steps, desc="Complete (from cache)")
+            yield (
+                gr.update(value=summary_report, visible=True),
+                gr.update(value=privacy_report, visible=True),
+                gr.update(visible=True, open=True),
+                gr.update(visible=True, open=True),
+            )
+            return  # End generation here if cache successful
+        except Exception as e:
+            logging.warning(f"Cache download failed for {space_id}: {e}. Proceeding.")
+            yield (
+                gr.update(
+                    value="Cache download failed, proceeding with live analysis...",
+                    visible=True,
+                ),
+                gr.update(value="", visible=True),
+                gr.update(visible=True, open=True),
+                gr.update(visible=True, open=False),
+            )
+    else:
+        logging.info(f"Cache miss for {space_id}. Performing live analysis.")
+        yield (
+            gr.update(value="Cache miss. Fetching code...", visible=True),
+            gr.update(value="", visible=True),
+            gr.update(visible=True, open=True),
+            gr.update(visible=True, open=False),
+        )
+
+    # --- Step 2: Fetch Code Files (if not cached) ---
+    progress(2 / steps, desc="Step 2/8: Fetching code files...")
+    logging.info("Step 2/8: Fetching code files...")
+    code_files = get_space_code_files(space_id)
+    if not code_files:
+        error_msg = f"Could not retrieve code files for '{space_id}'. Check ID and ensure it's a public Space."
+        logging.warning(error_msg)
+        yield (
+            gr.update(value=f"**Error:**\n{error_msg}", visible=True),
+            gr.update(value="Analysis Canceled", visible=True),
+            gr.update(visible=True, open=True),
+            gr.update(visible=True, open=False),
+        )
+        return  # End generation on error
+
+    # --- Step 3: Generate DETAILED Privacy Report (LLM Call 1) ---
+    progress(
+        3 / steps, desc="Step 3/8: Generating detailed privacy report (AI Call 1)..."
+    )
+    logging.info("Step 3/8: Generating detailed privacy analysis report...")
+    yield (
+        gr.update(value="Generating detailed privacy report...", visible=True),
+        gr.update(value="Generating detailed privacy report via AI...", visible=True),
+        gr.update(visible=True, open=True),
+        gr.update(visible=True, open=True),
+    )
+    privacy_prompt_messages, privacy_truncated = format_privacy_prompt(
+        space_id, code_files
+    )
+
+    # --- Check for 503 after query ---
+    privacy_api_response = query_qwen_endpoint(privacy_prompt_messages, max_tokens=3072)
+    if privacy_api_response == ERROR_503_DICT:
+        logging.warning("LLM Call 1 failed with 503.")
+        yield (
+            gr.update(
+                value=ERROR_503_USER_MESSAGE, visible=True
+            ),  # Show 503 message in summary area
+            gr.update(value="", visible=False),  # Clear privacy area
+            gr.update(visible=True, open=True),  # Keep summary open
+            gr.update(visible=False),  # Hide privacy accordion
+        )
+        return  # Stop analysis
+
+    detailed_privacy_report = parse_qwen_response(privacy_api_response)
+
+    if "Error:" in detailed_privacy_report:
+        logging.error(
+            f"Failed to generate detailed privacy report: {detailed_privacy_report}"
+        )
+        yield (
+            gr.update(value="Analysis Halted due to Error", visible=True),
+            gr.update(
+                value=f"**Error Generating Detailed Privacy Report:**\n{detailed_privacy_report}",
+                visible=True,
+            ),
+            gr.update(visible=True, open=True),
+            gr.update(visible=True, open=True),
+        )
+        return  # End generation on error
+    if privacy_truncated:
+        detailed_privacy_report = TRUNCATION_WARNING + detailed_privacy_report
+
+    yield (
+        gr.update(value="Extracting model info...", visible=True),
+        gr.update(value=detailed_privacy_report, visible=True),
+        gr.update(visible=True, open=True),
+        gr.update(visible=True, open=True),
+    )
+
+    # --- Step 4: Extract Model IDs ---
+    progress(4 / steps, desc="Step 4/8: Extracting model IDs...")
+    logging.info("Step 4/8: Extracting potential model IDs...")
+
+    # --- Step 5: Fetch Model Descriptions ---
+    progress(5 / steps, desc="Step 5/8: Fetching model descriptions...")
+    logging.info("Step 5/8: Fetching model descriptions...")
+    yield (
+        gr.update(value="Fetching model descriptions...", visible=True),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+    )
+    # --- Step 6: Generate Summary + Highlights Report (LLM Call 2) ---
+    progress(6 / steps, desc="Step 6/8: Generating summary & highlights (AI Call 2)...")
+    logging.info("Step 6/8: Generating summary and highlights report...")
+    yield (
+        gr.update(value="Generating summary & highlights via AI...", visible=True),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+    )
+    summary_highlights_prompt_messages, summary_truncated = (
+        format_summary_highlights_prompt(space_id, code_files, detailed_privacy_report)
+    )
+
+    # --- Check for 503 after query ---
+    summary_highlights_api_response = query_qwen_endpoint(
+        summary_highlights_prompt_messages, max_tokens=2048
+    )
+    if summary_highlights_api_response == ERROR_503_DICT:
+        logging.warning("LLM Call 2 failed with 503.")
+        yield (
+            gr.update(
+                value=ERROR_503_USER_MESSAGE, visible=True
+            ),  # Show 503 message in summary area
+            gr.update(
+                value=detailed_privacy_report, visible=True
+            ),  # Keep previous report visible
+            gr.update(visible=True, open=True),  # Keep summary open
+            gr.update(visible=True, open=True),  # Keep privacy open
+        )
+        return  # Stop analysis
+
+    summary_highlights_report = parse_qwen_response(summary_highlights_api_response)
+
+    if "Error:" in summary_highlights_report:
+        logging.error(
+            f"Failed to generate summary/highlights report: {summary_highlights_report}"
+        )
+        yield (
+            gr.update(
+                value=f"**Error Generating Summary/Highlights:**\n{summary_highlights_report}",
+                visible=True,
+            ),
+            gr.update(value=detailed_privacy_report, visible=True),
+            gr.update(visible=True, open=True),
+            gr.update(visible=True, open=True),
+        )
+        return  # End generation on error
+    if summary_truncated:
+        summary_highlights_report = TRUNCATION_WARNING + summary_highlights_report
+
+    # Yield summary report before attempting upload
+    yield (
+        gr.update(value=summary_highlights_report, visible=True),
+        gr.update(value=detailed_privacy_report, visible=True),
+        gr.update(visible=True, open=True),
+        gr.update(visible=True, open=True),
+    )
+
+    # --- Step 7: Upload to Cache ---
+    progress(7 / steps, desc="Step 7/8: Uploading results to cache...")
+    logging.info("Step 7/8: Attempting to upload results to dataset cache...")
+    try:
+        if (
+            HF_TOKEN
+            and not found_in_cache
+            and "Error:" not in detailed_privacy_report
+            and "Error:" not in summary_highlights_report
+        ):
+            summary_to_save = summary_highlights_report.replace(
+                TRUNCATION_WARNING, ""
+            ).replace(CACHE_INFO_MSG, "")
+            privacy_to_save = detailed_privacy_report.replace(
+                TRUNCATION_WARNING, ""
+            ).replace(CACHE_INFO_MSG, "")
+            upload_reports_to_dataset(
+                space_id=space_id,
+                summary_report=summary_to_save,
+                detailed_report=privacy_to_save,
+                dataset_id=DATASET_ID,
+                hf_token=HF_TOKEN,
+            )
+        elif not HF_TOKEN:
+            logging.warning("Skipping cache upload as HF_TOKEN is not set.")
+        elif found_in_cache:
+            logging.info("Skipping cache upload as results were loaded from cache.")
+    except Exception as e:
+        logging.error(f"Non-critical error during report upload: {e}")
+
+    logging.info("Step 8/8: Analysis complete.")
+    progress(8 / steps, desc="Step 8/8: Analysis Complete!")
+
+    # --- Step 8: Yield Final Results --- (Ensure final state is correct)
+    yield (
+        gr.update(value=summary_highlights_report, visible=True),
+        gr.update(value=detailed_privacy_report, visible=True),
+        gr.update(visible=True, open=True),
+        gr.update(visible=True, open=True),
+    )
+
+
+# --- Load Initial Data Function (for demo.load) ---
+def load_cached_list():
+    """Fetches the list of cached spaces and determines the default selection."""
+    print("Running demo.load: Fetching list of cached spaces...")
+    # Use os.getenv here directly as HF_TOKEN might be loaded after initial import
+    token = os.getenv("HF_TOKEN")
+    cached_list = list_cached_spaces(DATASET_ID, token)
+    default_value = DEFAULT_SELECTION if DEFAULT_SELECTION in cached_list else None
+    if not cached_list:
+        print(
+            "WARNING: No cached spaces found or failed to fetch list during demo.load."
+        )
+    # Return an update object for the dropdown using gr.update()
+    return gr.update(choices=cached_list, value=default_value)
+
+
+# --- Gradio Interface Definition ---
+# Use HTML/CSS for centering the title
+TITLE = "<div style='text-align: center;'><h1>🤗 Space Privacy Analyzer 🕵️</h1></div>\n<div style='text-align: center;'><h4>Automatic code Data transfer review powered by <a href='https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct' target='_blank'>Qwen2.5-Coder-32B-Instruct</a></h4></div>"

+DESCRIPTION = """
+### What Privacy Questions do 🤗 Spaces Raise?
+
+[Hugging Face Spaces](https://huggingface.co/spaces) offer a convenient way to build and share demos leveraging AI models.
+In most cases, the code for these demos is open source — which provides a unique opportunity to check **how they manage the privacy** of the data in use.
+
+This demo leverages a code analysis model ([Qwen2.5-Coder-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct)) to help explore these questions in two steps:
+it first obtains and **parses the code** to identify data inputs, AI model use, API calls, and data transfer, then generates a summary of the app's function and **key privacy points**.
+
+Use the dropdown menu below to explore the [reports generated for some popular Spaces](https://huggingface.co/datasets/yjernite/spaces-privacy-reports/tree/main), or enter a new Space ID to query your own 👇
+
+*Please note the following limitations:*
+- *The model may easily miss important details in the code, especially when it leverages docker files or external libraries.*
+- *This app uses the base Qwen Coder model without specific adaptation to the task. We'd love to discuss how to improve this, if you want to participate feel free to open a discussion!*
+"""
+
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown(TITLE)  # This will now render the centered HTML
+
+    with gr.Row():
+        with gr.Column(scale=1):  # Left column for inputs
+            gr.Markdown(DESCRIPTION)
+
+            cached_spaces_dropdown = gr.Dropdown(
+                label="Select Existing Report",
+                info="Select a Space whose report has been previously generated.",
+                choices=[],  # Initialize empty, will be populated by demo.load
+                value=None,  # Initialize empty
+            )
+
+            space_id_input = gr.Textbox(
+                label="Or Enter New Space ID",
+                placeholder="owner/space-name",
+                info="Enter a new Space ID to analyze (takes precedence over selection).",
+            )
+
+            analyze_button = gr.Button("Get Space Report", variant="primary", scale=1)
+
+        with gr.Column(scale=1):  # Right column for outputs
+            # Define Accordions first, open by default, hidden initially
+            summary_accordion = gr.Accordion(
+                "Summary & Privacy Highlights", open=True, visible=True
+            )
+            privacy_accordion = gr.Accordion(
+                "Detailed Privacy Analysis Report", open=False, visible=True
+            )
+            with summary_accordion:
+                summary_markdown = gr.Markdown(
+                    "Enter or select a Space ID and click Get Report.",
+                    show_copy_button=True,
+                )
+            with privacy_accordion:
+                privacy_markdown = gr.Markdown(
+                    "Detailed report will appear here.", show_copy_button=True
+                )
+
+    # --- Event Listeners ---
+
+    # Load event to populate the dropdown when the UI loads for a user session
+    demo.load(fn=load_cached_list, inputs=None, outputs=cached_spaces_dropdown)
+
+    # Button click event
+    analyze_button.click(
+        fn=get_space_report_wrapper,
+        inputs=[cached_spaces_dropdown, space_id_input],
+        outputs=[
+            summary_markdown,
+            privacy_markdown,
+            summary_accordion,
+            privacy_accordion,
+        ],
+        show_progress="full",
+    )
+
+# --- Application Entry Point ---
+
+if __name__ == "__main__":
+    logging.info("Starting Gradio application...")
+    demo.launch()
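Before running app.py locally, note that it and llm_interface.py both read their configuration from the environment. A minimal pre-flight check, as a sketch (the two variable names are taken from the code above; nothing else is assumed):

import os

from dotenv import load_dotenv

# app.py and llm_interface.py both call load_dotenv() and then read these two
# variables: HF_TOKEN gates the report cache, HF_INFERENCE_ENDPOINT_URL the LLM calls.
load_dotenv()
for var in ("HF_TOKEN", "HF_INFERENCE_ENDPOINT_URL"):
    print(var, "is set" if os.getenv(var) else "is MISSING")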
llm_interface.py
ADDED
@@ -0,0 +1,154 @@
+import logging
+import os
+
+from dotenv import load_dotenv
+from huggingface_hub import InferenceClient
+from huggingface_hub.inference._generated.types import ChatCompletionOutput
+from huggingface_hub.utils import HfHubHTTPError
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+# Load environment variables from .env file
+# load_dotenv()  # Removed: This should be loaded only at the main entry point (app.py)
+load_dotenv()  # Restored: Ensure env vars are loaded when this module is imported/used
+
+HF_TOKEN = os.getenv("HF_TOKEN")
+HF_INFERENCE_ENDPOINT_URL = os.getenv("HF_INFERENCE_ENDPOINT_URL")
+
+# Default parameters for the LLM call
+DEFAULT_MAX_TOKENS = 2048
+DEFAULT_TEMPERATURE = 0.1  # Lower temperature for more deterministic analysis
+
+# Special dictionary to indicate a 503 error
+ERROR_503_DICT = {"error_type": "503", "message": "Service Unavailable"}
+
+
+def query_qwen_endpoint(
+    formatted_prompt: list[dict[str, str]], max_tokens: int = DEFAULT_MAX_TOKENS
+) -> ChatCompletionOutput | dict | None:
+    """
+    Queries the specified Qwen Inference Endpoint with the formatted prompt.
+
+    Args:
+        formatted_prompt: A list of message dictionaries for the chat completion API.
+        max_tokens: The maximum number of tokens to generate.
+
+    Returns:
+        The ChatCompletionOutput object from the inference client,
+        a specific dictionary (ERROR_503_DICT) if a 503 error occurs,
+        or None if another error occurs.
+    """
+    if not HF_INFERENCE_ENDPOINT_URL:
+        logging.error("HF_INFERENCE_ENDPOINT_URL environment variable not set.")
+        return None
+    if not HF_TOKEN:
+        logging.warning(
+            "HF_TOKEN environment variable not set. Requests might fail if the endpoint requires authentication."
+        )
+        # Depending on endpoint config, it might still work without token
+
+    logging.info(f"Querying Inference Endpoint: {HF_INFERENCE_ENDPOINT_URL}")
+    client = InferenceClient(model=HF_INFERENCE_ENDPOINT_URL, token=HF_TOKEN)
+
+    try:
+        response = client.chat_completion(
+            messages=formatted_prompt,
+            max_tokens=max_tokens,
+            temperature=DEFAULT_TEMPERATURE,
+            # Qwen models often benefit from setting stop sequences if known,
+            # but we'll rely on max_tokens and model's natural stopping for now.
+            # stop=["<|im_end|>"]  # Example stop token if needed for specific Qwen finetunes
+        )
+        logging.info("Successfully received response from Inference Endpoint.")
+        return response
+    except HfHubHTTPError as e:
+        # Check specifically for 503 Service Unavailable
+        if e.response is not None and e.response.status_code == 503:
+            logging.warning(
+                f"Encountered 503 Service Unavailable from endpoint: {HF_INFERENCE_ENDPOINT_URL}"
+            )
+            return ERROR_503_DICT  # Return special dict for 503
+        else:
+            # Handle other HTTP errors
+            logging.error(f"HTTP error querying Inference Endpoint: {e}")
+            if e.response is not None:
+                logging.error(f"Response details: {e.response.text}")
+            return None  # Return None for other HTTP errors
+    except Exception as e:
+        logging.error(f"An unexpected error occurred querying Inference Endpoint: {e}")
+        return None
+
+
+def parse_qwen_response(response: ChatCompletionOutput | dict | None) -> str:
+    """
+    Parses the response from the Qwen model to extract the generated text.
+    Handles potential None or error dict inputs.
+
+    Args:
+        response: The ChatCompletionOutput object, ERROR_503_DICT, or None.
+
+    Returns:
+        The extracted response text as a string, or an error message string.
+    """
+    if response is None:
+        return "Error: Failed to get response from the language model."
+
+    # Check if it's our specific 503 error signal before trying to parse as ChatCompletionOutput
+    if isinstance(response, dict) and response.get("error_type") == "503":
+        return f"Error: {response['error_type']} {response['message']}"
+
+    # Check if it's likely the expected ChatCompletionOutput structure
+    if not hasattr(response, "choices"):
+        logging.error(
+            f"Unexpected response type received by parse_qwen_response: {type(response)}. Content: {response}"
+        )
+        return "Error: Received an unexpected response format from the language model endpoint."
+
+    try:
+        # Access the generated content according to the ChatCompletionOutput structure
+        if response.choices and len(response.choices) > 0:
+            content = response.choices[0].message.content
+            if content:
+                logging.info("Successfully parsed response content.")
+                return content.strip()
+            else:
+                logging.warning("Response received, but content is empty.")
+                return "Error: Received an empty response from the language model."
+        else:
+            logging.warning("Response received, but no choices found.")
+            return "Error: No response choices found in the language model output."
+    except AttributeError as e:
+        # This might catch cases where response looks like the object but lacks expected attributes
+        logging.error(
+            f"Attribute error parsing response: {e}. Response structure might be unexpected."
+        )
+        logging.error(f"Raw response object: {response}")
+        return "Error: Could not parse the structure of the language model response."
+    except Exception as e:
+        logging.error(f"An unexpected error occurred parsing the response: {e}")
+        return "Error: An unexpected error occurred while parsing the language model response."
+
+
+# Example Usage (for testing - requires .env setup and potentially prompts.py)
+# if __name__ == '__main__':
+#     # This example assumes you have a prompts.py that can generate a test prompt
+#     try:
+#         from prompts import format_code_for_analysis
+#         # Create a dummy prompt for testing
+#         test_files = {"app.py": "print('hello')"}
+#         test_prompt = format_code_for_analysis("test/minimal", test_files)
+#         print("--- Sending Test Prompt ---")
+#         print(test_prompt)
+#         api_response = query_qwen_endpoint(test_prompt)
+#         print("\n--- Raw API Response ---")
+#         print(api_response)
+#         print("\n--- Parsed Response ---")
+#         parsed_text = parse_qwen_response(api_response)
+#         print(parsed_text)
+#     except ImportError:
+#         print("Could not import prompts.py for testing. Run this test from the project root.")
+#     except Exception as e:
+#         print(f"An error occurred during testing: {e}")
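For reference, a minimal live-call sketch against this module's public API (the message content is illustrative, and the .env variables above must be set):

from llm_interface import parse_qwen_response, query_qwen_endpoint

# The same chat-completion message shape that prompts.py produces.
messages = [
    {"role": "system", "content": "You are a code reviewer."},
    {"role": "user", "content": "Describe what this does: print('hello')"},
]
response = query_qwen_endpoint(messages, max_tokens=256)
print(parse_qwen_response(response))  # generated text, or an "Error: ..." string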
prompts.py
ADDED
@@ -0,0 +1,267 @@
+import logging
+from collections import defaultdict
+
+MAX_FILE_CONTENT_LENGTH = 10000  # Limit the length of individual file contents
+MAX_MODEL_DESC_LENGTH = 1500  # Limit the length of fetched model descriptions
+MAX_PROMPT_CHARS = 110000  # Approx < 30k tokens (using ~4 chars/token heuristic)
+
+# Prompt for the first LLM call: Detailed Privacy Analysis
+PRIVACY_SYSTEM_PROMPT = (
+    "You are a helpful AI assistant specialized in analyzing Hugging Face Spaces code for privacy concerns. "
+    "Your goal is to identify data flows and potential privacy risks based *only* on the provided code files. "
+    "Analyze the following aspects and provide relevant code snippets (formatted as Markdown code blocks) as evidence for each point. "
+    "**Crucially, include the filename for each code snippet.** Example: `(filename.py)`\n\n"
+    "**Note:** If the app uses externally defined or unaccessible code to upload or process data, say so.\n\n"
+    "1. **Data Inputs:**\n"
+    " - What types of user data does the application accept as input (e.g., text, images, audio, files)?\n"
+    " - Where in the code are these inputs defined (e.g., Gradio input widgets, file uploads)? Provide the filename and code snippet.\n\n"
+    "2. **Processing Services & Data Transmission:**\n"
+    " - What specific internal or external APIs, models, or services are used to process the input data?\n"
+    " - What specific AI models or services are used to process the input data? Are any of these Hugging Face-hosted models?\n"
+    " - Where in the code are these services called (e.g., `requests.post`, `InferenceClient`, specific API endpoint URLs) or defined (e.g., `transformers` library)? Provide the filename and code snippet.\n"
+    " - Is it likely that user data is transmitted to these external services, and what kind of data is transmitted by each service or API? Mention if the services are known (like Hugging Face Inference API/Endpoints) or potentially unknown third parties.\n\n"
+    "3. **Execution Environment & Potential Local Processing:**\n"
+    " - Does the code indicate that models or significant processing might run *locally* within the Space container? Provide the filename and code snippet.\n"
+    " - Does the code explicitly use external *inference services* to query AI models? If so, reiterate the relevant code snippet from point 2 with filename.\n\n"
+    " - Does the code mention interactions with remote databases (e.g., `sqlite`, `postgres`, `mysql`, `redis`, `mongodb`, etc.), storage (e.g., `s3`, `gcs`, `azure`, etc.), or Cloud-based data services? If so, provide the filename and code snippet.\n\n"
+    "4. **Explicit Data Storage/Logging:**\n"
+    " - Is there any code that explicitly stores user input or results to files, databases, or external logging services? Provide the filename and code snippet.\n\n"
+    "5. **Overall Privacy Risk Summary:**\n"
+    " - Based ONLY on the evidence from the code snippets above, provide a concise summary paragraph highlighting the main potential privacy considerations or risks.\n\n"
+    "Format your entire response clearly using Markdown. Ensure all code snippets include filename and are properly formatted."
+)
+
+# Prompt for the second LLM call: Space Summary + Privacy Highlights
+SUMMARY_HIGHLIGHTS_SYSTEM_PROMPT = (
+    "You are an AI assistant reviewing a Hugging Face Space. You have been provided with: "
+    "(1) the application code, and "
+    "(2) a detailed preliminary privacy analysis report."
+    "Your task is to generate a summary report containing two parts:\n\n"
+    "**Part 1: Space Summary**\n"
+    "- Based on the code and privacy analysis report, provide a concise summary (4-6 sentences max) of what the application does from a user's perspective.\n\n"
+    "**Part 2: Privacy Highlights**\n"
+    "- Using information from the preliminary privacy report (cross-referencing code/descriptions as needed), list the following key privacy aspects:\n"
+    " 1. **Data Inputs:** List the main types of data provided to the application with a brief description for each. List where the data is used or stored by the application.\n"
+    " 2. **AI Models/Services:** List the core AI models or services used. For each, specify: Is it run locally or remotely? What library or service is used, or is the code defined within the app?\n"
+    " 3. **Other Remote Data or Dataset Calls:** List any other identified remote data calls that might upload or transmit data outside of the app (e.g., to databases, external APIs not covered above, cloud storage).\n"
+    " 4. **Libraries Suggesting Data Transmission:** List libraries used (e.g., `requests`, `gradio[sharing]`) that might implicitly or explicitly transmit data, suggesting where users might look for more details (e.g., library documentation, specific code sections).\n\n"
+    "Format the entire response clearly using Markdown. Do not include the preliminary privacy report itself in your output."
+)
+
+
+def _generate_file_structure(code_files: dict[str, str]) -> str:
+    """Generates a tree-like textual representation of the file structure."""
+    tree = defaultdict(dict)
+    files = sorted(code_files.keys())
+
+    for fpath in files:
+        parts = fpath.split("/")
+        node = tree
+        for i, part in enumerate(parts):
+            if i == len(parts) - 1:  # It's a file
+                node[part] = None  # Mark as file
+            else:  # It's a directory
+                if part not in node:
+                    node[part] = defaultdict(dict)  # Create dir node if not exists
+                # Check if we previously marked this as a file (edge case where dir name = file name at higher level)
+                elif node[part] is None:
+                    node[part] = defaultdict(dict)  # Convert file marker to dir
+                node = node[part]  # Move deeper
+
+    output_lines = ["Project File Structure:"]
+
+    def build_tree_lines(node, prefix=""):
+        # Sort items: directories first (defaultdict instances), then files (keys with None value)
+        items = sorted(
+            node.items(),
+            key=lambda item: isinstance(item[1], defaultdict),
+            reverse=True,
+        )
+
+        pointers = ["├── " for _ in range(len(items) - 1)] + ["└── "]
+        for pointer, (name, sub_node) in zip(pointers, items):
+            output_lines.append(prefix + pointer + name)
+            if isinstance(sub_node, defaultdict):  # It's a directory
+                extension = "│   " if pointer == "├── " else "    "
+                build_tree_lines(sub_node, prefix + extension)
+
+    build_tree_lines(tree)
+    print("\n".join(output_lines))
+    return "\n".join(output_lines)
+
+
+def _format_code_files_for_prompt(code_files: dict[str, str]) -> str:
+    """Formats the code files into a single string for the prompt, sorted by depth and path."""
+
+    def sort_key(filepath):
+        parts = filepath.split("/")
+        depth = len(parts) - 1
+        dir_path = "/".join(parts[:-1]) if depth > 0 else ""
+        filename = parts[-1]
+        return (depth, dir_path, filename)
+
+    sorted_filenames = sorted(code_files.keys(), key=sort_key)
+
+    output_parts = []
+    for filename in sorted_filenames:
+        content = code_files[filename]
+        print(f"--- File: {filename} ---")
+        print(content[:128])
+        output_parts.append(
+            f"--- File: {filename} ---\n```\n{content[:MAX_FILE_CONTENT_LENGTH]}{'\n... [truncated]' if len(content) > MAX_FILE_CONTENT_LENGTH else ''}\n```"
+        )
+
+    return "\n".join(output_parts)
+
+
+def format_privacy_prompt(
+    space_id: str, code_files: dict[str, str]
+) -> tuple[list[dict[str, str]], bool]:
+    """
+    Formats the prompt for the initial detailed privacy analysis task.
+    Returns messages list and a boolean indicating if truncation occurred.
+    """
+    was_truncated = False
+    file_structure = _generate_file_structure(code_files)
+    formatted_code = _format_code_files_for_prompt(code_files)
+
+    # Define components for length calculation
+    prompt_header = f"Please perform a detailed privacy analysis for the Hugging Face Space '{space_id}'.\n\n{file_structure}\n\nCode Files Content:\n"
+    base_length = len(prompt_header) + len(PRIVACY_SYSTEM_PROMPT)
+
+    # Check if formatted code needs truncation for the overall prompt
+    available_chars_for_code = MAX_PROMPT_CHARS - base_length
+    if available_chars_for_code < 0:  # Header itself is too long (unlikely)
+        available_chars_for_code = 0
+        was_truncated = True
+
+    if len(formatted_code) > available_chars_for_code:
+        formatted_code = (
+            formatted_code[:available_chars_for_code]
+            + "\n... [Code Section Truncated Due to Overall Prompt Length] ..."
+        )
+        was_truncated = True
+        logging.warning(
+            f"Privacy prompt code section truncated for Space ID {space_id} due to overall length."
+        )
+
+    user_content = prompt_header + formatted_code
+
+    messages = [
+        {"role": "system", "content": PRIVACY_SYSTEM_PROMPT},
+        {"role": "user", "content": user_content},
+    ]
+    return messages, was_truncated
+
+
+def format_summary_highlights_prompt(
+    space_id: str, code_files: dict[str, str], detailed_privacy_report: str
+) -> tuple[list[dict[str, str]], bool]:
+    """
+    Formats the prompt for the final summary + highlights report.
+    Returns messages list and a boolean indicating if truncation occurred.
+    """
+    was_truncated = False
+    file_structure = _generate_file_structure(code_files)
+    formatted_code = _format_code_files_for_prompt(code_files)
+
+    # Define components for length calculation
+    prompt_header = f"Please generate a final summary and privacy highlights report for the Hugging Face Space '{space_id}'.\n\n"
+    report_header = "**Preliminary Detailed Privacy Report:**\n---\n"
+    report_footer = "\n---\n\n"
+    support_header = f"**Supporting Information:**\n{file_structure}\n\n"
+    code_header = "**Original Code Files Content:**\n"
+
+    base_length = (
+        len(prompt_header)
+        + len(report_header)
+        + len(report_footer)
+        + len(support_header)
+        + len(code_header)
+        + len(SUMMARY_HIGHLIGHTS_SYSTEM_PROMPT)
+    )
+    available_chars_total = MAX_PROMPT_CHARS - base_length
+
+    if available_chars_total < 0:  # Base structure is too long
+        logging.error(
+            f"Base prompt structure for summary highlights exceeds limit for Space ID {space_id}. Cannot proceed effectively."
+        )
+        # Return minimal user content to avoid errors, but flag truncation heavily
+        user_content = (
+            prompt_header
+            + report_header
+            + "[TRUNCATED DUE TO LENGTH]"
+            + report_footer
+            + support_header
+            + code_header
+            + "[TRUNCATED DUE TO LENGTH]"
+        )
+        was_truncated = True
+    else:
+        # Prioritize truncating the detailed report first
+        available_chars_for_report = available_chars_total - len(
+            formatted_code
+        )  # Reserve space for code
+        if available_chars_for_report < 0:
+            available_chars_for_report = 0  # Cannot fit report
+
+        if len(detailed_privacy_report) > available_chars_for_report:
+            detailed_privacy_report = (
+                detailed_privacy_report[:available_chars_for_report]
+                + "\n... [Detailed Privacy Report Truncated Due to Overall Prompt Length] ..."
+            )
+            was_truncated = True
+            logging.warning(
+                f"Summary prompt detailed report section truncated for Space ID {space_id}."
+            )
+
+        # Now check code length again with (potentially truncated) report length
+        available_chars_for_code = available_chars_total - len(detailed_privacy_report)
+        if available_chars_for_code < 0:
+            available_chars_for_code = 0  # Cannot fit code
+
+        if len(formatted_code) > available_chars_for_code:
+            formatted_code = (
+                formatted_code[:available_chars_for_code]
+                + "\n... [Code Section Truncated Due to Overall Prompt Length] ..."
+            )
+            was_truncated = True
+            logging.warning(
+                f"Summary prompt code section truncated for Space ID {space_id}."
+            )
+
+        # Assemble the final user content
+        user_content = (
+            prompt_header
+            + report_header
+            + detailed_privacy_report
+            + report_footer
+            + support_header
+            + code_header
+            + formatted_code
+        )
+
+    messages = [
+        {"role": "system", "content": SUMMARY_HIGHLIGHTS_SYSTEM_PROMPT},
+        {"role": "user", "content": user_content},
+    ]
+    return messages, was_truncated
+
+
+# Example usage (for testing)
+# if __name__ == '__main__':
+#     test_files = {
+#         "app.py": "import gradio as gr\n\ndef greet(name):\n # Potentially send data to external service?\n # requests.post('http://example.com/log', json={'user': name})\n return f'Hello {name}!'",
+#         "requirements.txt": "gradio\nrequests",
+#         "nested/utils.py": "def helper():\n pass",
+#         "README.md": "This should be ignored.",  # Example of a file that *should* be filtered out before reaching here
+#         "very_long_file.py": "print('hello' * 5000)"  # Test truncation
+#     }
+#     # Typically, files like README.md would be filtered by get_space_code_files in utils.py
+#     # We include it here just for demo purposes if you were to test prompts.py directly.
+#     filtered_test_files = {k: v for k, v in test_files.items() if not k.endswith('.md')}
+#     prompt_messages = format_code_for_analysis("test/space", filtered_test_files)
+#     print("--- System Prompt ---")
+#     print(prompt_messages[0]['content'])
+#     print("\n--- User Prompt ---")
+#     print(prompt_messages[1]['content'])
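For orientation, this is roughly how app.py drives the two prompt builders; the file contents below are illustrative only:

from prompts import format_privacy_prompt, format_summary_highlights_prompt

code_files = {"app.py": "import gradio as gr\n"}  # normally from utils.get_space_code_files
messages, truncated = format_privacy_prompt("owner/space-name", code_files)
# messages[0] is the system prompt, messages[1] the user prompt carrying the code;
# `truncated` reports whether content was cut to respect MAX_PROMPT_CHARS.
summary_messages, _ = format_summary_highlights_prompt(
    "owner/space-name", code_files, "(detailed privacy report text)"
)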
requirements.txt
ADDED
@@ -0,0 +1,4 @@
+gradio
+huggingface-hub
+python-dotenv
+requests
utils.py
ADDED
@@ -0,0 +1,507 @@
1 |
+
import logging
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
import tempfile
|
5 |
+
|
6 |
+
from huggingface_hub import HfApi, hf_hub_download
|
7 |
+
from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError
|
8 |
+
|
9 |
+
# Configure logging
|
10 |
+
logging.basicConfig(
|
11 |
+
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
|
12 |
+
)
|
13 |
+
|
14 |
+
# Files/extensions to definitely include
|
15 |
+
INCLUDE_PATTERNS = [
|
16 |
+
".py",
|
17 |
+
"requirements.txt",
|
18 |
+
"Dockerfile",
|
19 |
+
".js",
|
20 |
+
".jsx",
|
21 |
+
".ts",
|
22 |
+
".tsx",
|
23 |
+
".html",
|
24 |
+
".css",
|
25 |
+
".svelte",
|
26 |
+
".vue",
|
27 |
+
".json",
|
28 |
+
".yaml",
|
29 |
+
".yml",
|
30 |
+
".toml",
|
31 |
+
"Procfile",
|
32 |
+
".sh",
|
33 |
+
]
|
34 |
+
|
35 |
+
# Files/extensions/folders to ignore
|
36 |
+
IGNORE_PATTERNS = [
|
37 |
+
".git",
|
38 |
+
".hfignore",
|
39 |
+
"README.md",
|
40 |
+
"LICENSE",
|
41 |
+
"__pycache__",
|
42 |
+
".ipynb_checkpoints",
|
43 |
+
".png",
|
44 |
+
".jpg",
|
45 |
+
".jpeg",
|
46 |
+
".gif",
|
47 |
+
".svg",
|
48 |
+
".ico",
|
49 |
+
".mp3",
|
50 |
+
".wav",
|
51 |
+
".mp4",
|
52 |
+
".mov",
|
53 |
+
".avi",
|
54 |
+
".onnx",
|
55 |
+
".pt",
|
56 |
+
".pth",
|
57 |
+
".bin",
|
58 |
+
".safetensors",
|
59 |
+
".tflite",
|
60 |
+
".pickle",
|
61 |
+
".pkl",
|
62 |
+
".joblib",
|
63 |
+
".parquet",
|
64 |
+
".csv",
|
65 |
+
".tsv",
|
66 |
+
".zip",
|
67 |
+
".tar.gz",
|
68 |
+
".gz",
|
69 |
+
".ipynb",
|
70 |
+
".DS_Store",
|
71 |
+
"node_modules",
|
72 |
+
]
|
73 |
+
|
74 |
+
# Regex to find potential Hugging Face model IDs (e.g., "org/model-name", "user/model-name")
|
75 |
+
# This is a simple heuristic and might catch non-model strings or miss complex cases.
|
76 |
+
HF_MODEL_ID_PATTERN = re.compile(r"([\"\'])([\w\-.]+/[\w\-\.]+)\1\'")
|
77 |
+
|
78 |
+
# Max length for model descriptions to keep prompts manageable
|
79 |
+
MAX_MODEL_DESC_LENGTH = 1500
|
80 |
+
|
81 |
+
SUMMARY_FILENAME = "summary_highlights.md"
|
82 |
+
PRIVACY_FILENAME = "privacy_report.md"
|
83 |
+
|
84 |
+
|
85 |
+
def _is_relevant_file(filename):
|
86 |
+
"""Check if a file should be included based on patterns."""
|
87 |
+
# Ignore files matching ignore patterns (case-insensitive check for some)
|
88 |
+
lower_filename = filename.lower()
|
89 |
+
if any(
|
90 |
+
pattern in lower_filename
|
91 |
+
for pattern in [".git", ".hfignore", "readme.md", "license"]
|
92 |
+
):
|
93 |
+
return False
|
94 |
+
if any(
|
95 |
+
filename.endswith(ext) for ext in IGNORE_PATTERNS if ext.startswith(".")
|
96 |
+
): # Check extensions
|
97 |
+
return False
|
98 |
+
if any(
|
99 |
+
part == pattern
|
100 |
+
for part in filename.split("/")
|
101 |
+
for pattern in IGNORE_PATTERNS
|
102 |
+
if "." not in pattern and "/" not in pattern
|
103 |
+
): # Check directory/file names
|
104 |
+
return False
|
105 |
+
if filename in IGNORE_PATTERNS: # Check full filenames
|
106 |
+
return False
|
107 |
+
|
108 |
+
# Include files matching include patterns
|
109 |
+
if any(filename.endswith(ext) for ext in INCLUDE_PATTERNS if ext.startswith(".")):
|
110 |
+
return True
|
111 |
+
if any(filename == pattern for pattern in INCLUDE_PATTERNS if "." not in pattern):
|
112 |
+
return True
|
113 |
+
|
114 |
+
# Default to False if not explicitly included (safer)
|
115 |
+
# logging.debug(f"File '{filename}' excluded by default.")
|
116 |
+
return False
|
117 |
+
|
118 |
+
|
119 |
+
def get_space_code_files(space_id: str) -> dict[str, str]:
    """
    Downloads relevant code and configuration files from a Hugging Face Space.

    Args:
        space_id: The ID of the Hugging Face Space (e.g., 'gradio/hello_world').

    Returns:
        A dictionary where keys are filenames and values are file contents as strings.
        Returns an empty dictionary if the space is not found or has no relevant files.
    """
    code_files = {}
    api = HfApi()

    try:
        logging.info(f"Fetching file list for Space: {space_id}")
        repo_files = api.list_repo_files(repo_id=space_id, repo_type="space")
        logging.info(f"Found {len(repo_files)} total files in {space_id}.")

        relevant_files = [f for f in repo_files if _is_relevant_file(f)]
        logging.info(f"Identified {len(relevant_files)} relevant files for download.")

        for filename in relevant_files:
            try:
                logging.debug(f"Downloading {filename} from {space_id}...")
                file_path = hf_hub_download(
                    repo_id=space_id,
                    filename=filename,
                    repo_type="space",
                    # Consider passing token=os.getenv("HF_TOKEN") if accessing private Spaces later
                )
                with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                    content = f.read()
                code_files[filename] = content
                logging.debug(f"Successfully read content of {filename}")
            except EntryNotFoundError:
                logging.warning(
                    f"File {filename} listed but not found in repo {space_id}."
                )
            except UnicodeDecodeError:
                logging.warning(
                    f"Could not decode file {filename} from {space_id} as UTF-8. Skipping."
                )
            except OSError as e:
                logging.warning(f"OS error reading file {filename} from cache: {e}")
            except Exception as e:
                logging.error(
                    f"Unexpected error downloading or reading file {filename} from {space_id}: {e}"
                )

    except RepositoryNotFoundError:
        logging.error(f"Space repository '{space_id}' not found.")
        return {}
    except Exception as e:
        logging.error(f"Failed to list or process files for space {space_id}: {e}")
        return {}

    logging.info(
        f"Successfully retrieved content for {len(code_files)} files from {space_id}."
    )
    return code_files


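# Illustrative usage (not executed on import; "gradio/hello_world" is a real
# public Space used purely as an example):
# files = get_space_code_files("gradio/hello_world")
# print(sorted(files))  # e.g. ['app.py', 'requirements.txt', ...]

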
def extract_hf_model_ids(code_files: dict[str, str]) -> set[str]:
    """
    Extracts potential Hugging Face model IDs mentioned in code files.

    Args:
        code_files: Dictionary of {filename: content}.

    Returns:
        A set of unique potential model IDs found.
    """
    potential_ids = set()
    for filename, content in code_files.items():
        # Limit search to relevant file types
        if filename.endswith((".py", ".json", ".yaml", ".yml", ".toml", ".md")):
            try:
                matches = HF_MODEL_ID_PATTERN.findall(content)
                for _, model_id in matches:
                    # Basic validation: must contain exactly one '/'
                    if model_id.count("/") == 1:
                        # Avoid adding common paths/URLs that merely look like IDs
                        # (this also drops IDs containing '.', trading recall for precision)
                        if not any(
                            part in model_id.lower()
                            for part in ["http", "www", "@", " ", ".", ":"]
                        ):
                            if len(model_id) < 100:  # Avoid overly long strings
                                potential_ids.add(model_id)
            except Exception as e:
                logging.warning(f"Regex error processing file {filename}: {e}")

    logging.info(f"Extracted {len(potential_ids)} potential model IDs.")
    # Add a simple filter for very common false positives if needed, e.g.:
    # potential_ids = {id for id in potential_ids if id not in ['user/repo']}
    return potential_ids


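# Illustrative behavior of the heuristic (hypothetical snippet):
# code = {"app.py": 'pipe = pipeline(model="facebook/bart-large-cnn")'}
# extract_hf_model_ids(code)  # -> {"facebook/bart-large-cnn"}
# IDs embedded in URLs ("https://hf.co/org/model") or containing '.' are filtered out.

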
def get_model_descriptions(model_ids: set[str]) -> dict[str, str]:
    """
    Fetches the README.md content (description) for a set of model IDs.

    Args:
        model_ids: A set of Hugging Face model IDs.

    Returns:
        A dictionary mapping model_id to its description string (or an error message).
    """
    descriptions = {}
    if not model_ids:
        return descriptions

    logging.info(f"Fetching descriptions for {len(model_ids)} models...")
    for model_id in model_ids:
        try:
            # Check if the model exists first (optional but good practice)
            # api.model_info(model_id)

            # Download README.md; hf_hub_download raises EntryNotFoundError
            # if the file does not exist in the repo.
            readme_path = hf_hub_download(
                repo_id=model_id,
                filename="README.md",
                repo_type="model",
                # Add token=os.getenv("HF_TOKEN") if private/gated models ever
                # need to be read - unlikely for Space analysis.
            )
            with open(readme_path, "r", encoding="utf-8", errors="ignore") as f:
                description = f.read()
            descriptions[model_id] = description[:MAX_MODEL_DESC_LENGTH] + (
                "... [truncated]" if len(description) > MAX_MODEL_DESC_LENGTH else ""
            )
            logging.debug(f"Successfully fetched description for {model_id}")
        except RepositoryNotFoundError:
            logging.warning(f"Model repository '{model_id}' not found.")
            descriptions[model_id] = "[Model repository not found]"
        except EntryNotFoundError:
            logging.warning(f"README.md not found in model repository '{model_id}'.")
            descriptions[model_id] = "[README.md not found in model repository]"
        except Exception as e:
            logging.error(f"Error fetching description for model '{model_id}': {e}")
            descriptions[model_id] = f"[Error fetching description: {e}]"

    logging.info(f"Finished fetching descriptions for {len(descriptions)} models.")
    return descriptions


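# Illustrative call (the model ID below is a real public model, used only as an example);
# long READMEs are clipped to MAX_MODEL_DESC_LENGTH plus the "... [truncated]" suffix:
# descs = get_model_descriptions({"facebook/bart-large-cnn"})
# assert len(descs["facebook/bart-large-cnn"]) <= MAX_MODEL_DESC_LENGTH + len("... [truncated]")

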
def list_cached_spaces(dataset_id: str, hf_token: str | None) -> list[str]:
    """Lists the space IDs (owner/name) that have cached reports in the dataset repository."""
    if not hf_token:
        logging.warning("HF Token not provided, cannot list cached spaces.")
        return []
    try:
        api = HfApi(token=hf_token)
        # Get all filenames in the dataset repository
        all_files = api.list_repo_files(repo_id=dataset_id, repo_type="dataset")

        # Extract unique directory paths that look like owner/space_name
        # by checking if they contain our specific report files.
        space_ids = set()
        for f_path in all_files:
            # Check if the file is one of our report files
            if f_path.endswith(f"/{PRIVACY_FILENAME}") or f_path.endswith(
                f"/{SUMMARY_FILENAME}"
            ):
                # Extract the directory path part (owner/space_name)
                parts = f_path.split("/")
                if len(parts) == 3:  # Expecting owner/space_name/filename.md
                    owner_slash_space_name = "/".join(parts[:-1])
                    # Basic validation: owner and space name shouldn't start with '.'
                    if not parts[0].startswith(".") and not parts[1].startswith("."):
                        space_ids.add(owner_slash_space_name)

        sorted_space_ids = sorted(space_ids)
        logging.info(
            f"Found {len(sorted_space_ids)} cached space reports in {dataset_id} via HfApi."
        )
        return sorted_space_ids

    except RepositoryNotFoundError:
        logging.warning(
            f"Dataset {dataset_id} not found or empty when listing cached spaces."
        )
        return []
    except Exception as e:
        logging.error(f"Error listing cached spaces in {dataset_id} via HfApi: {e}")
        return []  # Return empty list on error


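# Expected dataset layout (illustrative; "example-org/some-space" is hypothetical):
#   example-org/some-space/summary_highlights.md
#   example-org/some-space/privacy_report.md
# list_cached_spaces(dataset_id, token) would then return ["example-org/some-space"].

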
def check_report_exists(space_id: str, dataset_id: str, hf_token: str | None) -> bool:
    """Checks if report files already exist in the target dataset repo using HfApi."""
    print(
        f"[Debug Cache Check] Checking for space_id: '{space_id}' in dataset: '{dataset_id}'"
    )  # DEBUG
    if not hf_token:
        logging.warning("HF Token not provided, cannot check dataset cache.")
        print("[Debug Cache Check] No HF Token, returning False.")  # DEBUG
        return False
    try:
        api = HfApi(token=hf_token)
        # List ALL files in the repo
        print(f"[Debug Cache Check] Listing ALL files in repo '{dataset_id}'")  # DEBUG
        all_repo_files = api.list_repo_files(repo_id=dataset_id, repo_type="dataset")
        # DEBUG: Optionally print a subset if the list is huge
        # print(f"[Debug Cache Check] First 10 files returned by API: {all_repo_files[:10]}")

        # Construct the exact paths we expect for the target space_id
        expected_summary_path = f"{space_id}/{SUMMARY_FILENAME}"
        expected_privacy_path = f"{space_id}/{PRIVACY_FILENAME}"
        print(
            f"[Debug Cache Check] Expecting summary file: '{expected_summary_path}'"
        )  # DEBUG
        print(
            f"[Debug Cache Check] Expecting privacy file: '{expected_privacy_path}'"
        )  # DEBUG

        # Check if both expected paths exist in the full list of files
        summary_exists = expected_summary_path in all_repo_files
        privacy_exists = expected_privacy_path in all_repo_files
        exists = summary_exists and privacy_exists
        print(
            f"[Debug Cache Check] Summary exists in full list: {summary_exists}"
        )  # DEBUG
        print(
            f"[Debug Cache Check] Privacy exists in full list: {privacy_exists}"
        )  # DEBUG
        print(f"[Debug Cache Check] Overall exists check result: {exists}")  # DEBUG
        return exists

    except RepositoryNotFoundError:
        logging.warning(
            f"Dataset repository {dataset_id} not found or not accessible during check."
        )
        print(
            f"[Debug Cache Check] Repository {dataset_id} not found, returning False."
        )  # DEBUG
        return False
    except Exception as e:
        print(f"[Debug Cache Check] Exception caught: {type(e).__name__}: {e}")  # DEBUG
        # A missing repo is surfaced as RepositoryNotFoundError above; any other
        # exception is treated as a cache miss.
        logging.error(
            f"Error checking dataset {dataset_id} for {space_id} via HfApi: {e}"
        )
        print("[Debug Cache Check] Other exception, returning False.")  # DEBUG
        return False  # Treat errors as cache miss


def download_cached_reports(
    space_id: str, dataset_id: str, hf_token: str | None
) -> dict[str, str]:
    """Downloads cached reports from the dataset repo. Raises an error on failure."""
    if not hf_token:
        raise ValueError("HF Token required to download cached reports.")

    logging.info(
        f"Attempting to download cached reports for {space_id} from {dataset_id}..."
    )
    reports = {}
    # Define paths relative to dataset root for hf_hub_download
    summary_repo_path = f"{space_id}/{SUMMARY_FILENAME}"
    privacy_repo_path = f"{space_id}/{PRIVACY_FILENAME}"
    try:
        # Download summary
        summary_path_local = hf_hub_download(
            repo_id=dataset_id,
            filename=summary_repo_path,
            repo_type="dataset",
            token=hf_token,
        )
        with open(summary_path_local, "r", encoding="utf-8") as f:
            reports["summary"] = f.read()
        logging.info(f"Successfully downloaded cached summary for {space_id}.")

        # Download privacy report
        privacy_path_local = hf_hub_download(
            repo_id=dataset_id,
            filename=privacy_repo_path,
            repo_type="dataset",
            token=hf_token,
        )
        with open(privacy_path_local, "r", encoding="utf-8") as f:
            reports["privacy"] = f.read()
        logging.info(f"Successfully downloaded cached privacy report for {space_id}.")

        return reports

    except EntryNotFoundError as e:
        # Report which of the two files failed to download
        missing_file = (
            summary_repo_path if summary_repo_path in str(e) else privacy_repo_path
        )
        logging.error(
            f"Cache download error: Report file {missing_file} not found for {space_id} in {dataset_id}. {e}"
        )
        raise FileNotFoundError(
            f"Cached report file {missing_file} not found for {space_id}"
        ) from e
    except RepositoryNotFoundError as e:
        logging.error(f"Cache download error: Dataset repo {dataset_id} not found. {e}")
        raise FileNotFoundError(f"Dataset repo {dataset_id} not found") from e
    except Exception as e:
        logging.error(
            f"Unexpected error downloading cached reports for {space_id} from {dataset_id}: {e}"
        )
        raise IOError(f"Failed to download cached reports for {space_id}") from e


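# Illustrative caller pattern (space and dataset IDs are hypothetical):
# try:
#     cached = download_cached_reports("owner/space", "owner/reports-dataset", token)
#     summary, privacy = cached["summary"], cached["privacy"]
# except (FileNotFoundError, IOError):
#     ...  # fall back to generating fresh reports

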
def upload_reports_to_dataset(
    space_id: str,
    summary_report: str,
    detailed_report: str,
    dataset_id: str,
    hf_token: str | None,
):
    """Uploads the generated reports to the specified dataset repository."""
    if not hf_token:
        logging.warning("HF Token not provided, skipping dataset report upload.")
        return

    logging.info(
        f"Attempting to upload reports for {space_id} to dataset {dataset_id}..."
    )
    api = HfApi(token=hf_token)

    # Sanitize space_id for path safety (though HF Hub usually handles this)
    safe_space_id = space_id.replace("..", "")

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            summary_path_local = os.path.join(tmpdir, SUMMARY_FILENAME)
            privacy_path_local = os.path.join(tmpdir, PRIVACY_FILENAME)

            with open(summary_path_local, "w", encoding="utf-8") as f:
                f.write(summary_report)
            with open(privacy_path_local, "w", encoding="utf-8") as f:
                f.write(detailed_report)

            commit_message = f"Add privacy analysis reports for Space: {safe_space_id}"
            repo_url = api.create_repo(
                repo_id=dataset_id,
                repo_type="dataset",
                exist_ok=True,
            )
            logging.info(f"Ensured dataset repo {repo_url} exists.")

            api.upload_file(
                path_or_fileobj=summary_path_local,
                path_in_repo=f"{safe_space_id}/{SUMMARY_FILENAME}",
                repo_id=dataset_id,
                repo_type="dataset",
                commit_message=commit_message,
            )
            logging.info(f"Successfully uploaded summary report for {safe_space_id}.")

            api.upload_file(
                path_or_fileobj=privacy_path_local,
                path_in_repo=f"{safe_space_id}/{PRIVACY_FILENAME}",
                repo_id=dataset_id,
                repo_type="dataset",
                commit_message=commit_message,
            )
            logging.info(
                f"Successfully uploaded detailed privacy report for {safe_space_id}."
            )

    except Exception as e:
        logging.error(
            f"Failed to upload reports for {safe_space_id} to dataset {dataset_id}: {e}"
        )


# Example usage (for testing)
# if __name__ == '__main__':
#     # Make sure HF_TOKEN is set if accessing private spaces or for higher rate limits
#     from dotenv import load_dotenv
#     load_dotenv()
#     # test_space = "gradio/hello_world"
#     test_space = "huggingface-projects/diffusers-gallery"  # A more complex example
#     # test_space = "nonexistent/space"  # Test the not-found case
#     files_content = get_space_code_files(test_space)
#     if files_content:
#         print(f"\n--- Files retrieved from {test_space} ---")
#         for name in files_content.keys():
#             print(f"- {name}")
#         # print("\n--- Content of app.py (first 200 chars) ---")
#         # print(files_content.get("app.py", "app.py not found")[:200])
#     else:
#         print(f"Could not retrieve files from {test_space}")
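
# Cache round-trip sketch (for testing; the dataset ID below is hypothetical):
# if __name__ == '__main__':
#     token = os.getenv("HF_TOKEN")
#     dataset_id = "my-org/space-privacy-reports"
#     space = "gradio/hello_world"
#     if check_report_exists(space, dataset_id, token):
#         reports = download_cached_reports(space, dataset_id, token)
#     else:
#         upload_reports_to_dataset(space, "summary md", "privacy md", dataset_id, token)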