Spaces:
Running
Yacine Jernite committed
Commit 36de078 · 1 Parent(s): d6d8868
added TLDR functionality
Files changed:
- analysis_utils.py +684 -0
- app.py +548 -349
- llm_interface.py +1 -0
- utils.py +86 -34
analysis_utils.py
ADDED
@@ -0,0 +1,684 @@
import json  # Added for TLDR JSON parsing
import logging
import os
import tempfile

from huggingface_hub import HfApi
from huggingface_hub.inference._generated.types import \
    ChatCompletionOutput  # Added for type hinting

# Imports from other project modules
from llm_interface import (ERROR_503_DICT, parse_qwen_response,
                           query_qwen_endpoint)
from prompts import format_privacy_prompt, format_summary_highlights_prompt
from utils import (PRIVACY_FILENAME,  # Import constants for filenames
                   SUMMARY_FILENAME, TLDR_FILENAME, check_report_exists,
                   download_cached_reports, get_space_code_files)

# Configure logging (can inherit from app.py if called from there, but good practice)
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Load environment variables - redundant if always called by app.py, which already loads them
# load_dotenv()

# Constants needed by helper functions (can be passed as args too)
# Consider passing these from app.py if they might change or for clarity
CACHE_INFO_MSG = "\n\n*(Report retrieved from cache)*"
TRUNCATION_WARNING = """**⚠️ Warning:** The input data (code and/or prior analysis) was too long for the AI model's context limit and had to be truncated. The analysis below may be incomplete or based on partial information.\n\n---\n\n"""

# --- Constants for TLDR Generation ---
TLDR_SYSTEM_PROMPT = (
    "You are an AI assistant specialized in summarizing privacy analysis reports for Hugging Face Spaces. "
    "You will receive two reports: a detailed privacy analysis and a summary/highlights report. "
    "Based **only** on the content of these two reports, generate a concise JSON object containing a structured TLDR (Too Long; Didn't Read). "
    "Do not use any information not present in the provided reports. "
    "The JSON object must have the following keys:\n"
    '- "app_description": A 1-2 sentence summary of what the application does from a user\'s perspective.\n'
    '- "privacy_tldr": A 2-3 sentence high-level overview of privacy. Mention if the analysis was conclusive based on available code, if data processing is local, or if/what data goes to external services.\n'
    '- "data_types": A list of JSON objects, where each object has two keys: \'name\' (a short, unique identifier string for the data type, e.g., "User Text") and \'description\' (a brief string explaining the data type in context, max 6-8 words, e.g., "Text prompt entered by the user").\n'
    "- \"user_input_data\": A list of strings, where each string is the 'name' of a data type defined in 'data_types' that is provided by the user to the app.\n"
    "- \"local_processing\": A list of strings describing data processed locally. Each string should start with the 'name' of a data type defined in 'data_types', followed by details (like the processing model) in parentheses if mentioned in the reports. Example: \"User Text (Local Model XYZ)\".\n"
    "- \"remote_processing\": A list of strings describing data sent to remote services. Each string should start with the 'name' of a data type defined in 'data_types', followed by the service/model name in parentheses if mentioned in the reports. Example: \"User Text (HF Inference API)\".\n"
    "- \"external_logging\": A list of strings describing data logged or saved externally. Each string should start with the 'name' of a data type defined in 'data_types', followed by the location/service in parentheses if mentioned. Example: \"User Text (External DB)\".\n"
    "Ensure the output is **only** a valid JSON object, starting with `{` and ending with `}`. Ensure all listed data types in the processing/logging lists exactly match a 'name' defined in the 'data_types' list."
)
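
# Illustrative example of the JSON object the system prompt above requests
# (hypothetical values, not taken from any real report):
# {
#   "app_description": "Lets users caption images with a vision-language model.",
#   "privacy_tldr": "Analysis was conclusive based on available code. User images are sent to a remote inference service; nothing is logged externally.",
#   "data_types": [{"name": "User Image", "description": "Image uploaded by the user"}],
#   "user_input_data": ["User Image"],
#   "local_processing": [],
#   "remote_processing": ["User Image (HF Inference API)"],
#   "external_logging": []
# }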

# --- Analysis Pipeline Helper Functions ---


def check_cache_and_download(space_id: str, dataset_id: str, hf_token: str | None):
    """Checks cache and downloads if reports exist."""
    logging.info(f"Checking cache for '{space_id}'...")
    found_in_cache = False
    if hf_token:
        try:
            found_in_cache = check_report_exists(space_id, dataset_id, hf_token)
        except Exception as e:
            logging.warning(f"Cache check failed for {space_id}: {e}. Proceeding.")
            # Return cache_miss even if check failed, proceed to live analysis
            return {"status": "cache_miss", "error_message": f"Cache check failed: {e}"}

    if found_in_cache:
        logging.info(f"Cache hit for {space_id}. Downloading.")
        try:
            cached_reports = download_cached_reports(space_id, dataset_id, hf_token)
            summary_report = (
                cached_reports.get("summary", "Error: Cached summary not found.")
                + CACHE_INFO_MSG
            )
            privacy_report = (
                cached_reports.get("privacy", "Error: Cached privacy report not found.")
                + CACHE_INFO_MSG
            )
            logging.info(f"Successfully downloaded cached reports for {space_id}.")
            return {
                "status": "cache_hit",
                "summary": summary_report,
                "privacy": privacy_report,
                "tldr_json_str": cached_reports.get("tldr_json_str"),
            }
        except Exception as e:
            error_msg = f"Cache download failed for {space_id}: {e}"
            logging.warning(f"{error_msg}. Proceeding with live analysis.")
            # Return error, but let caller decide if live analysis proceeds
            return {"status": "cache_error", "ui_message": error_msg}
    else:
        logging.info(f"Cache miss for {space_id}. Performing live analysis.")
        return {"status": "cache_miss"}


def check_endpoint_status(
    endpoint_name: str, hf_token: str | None, error_503_user_message: str
):
    """Checks the status of the inference endpoint."""
    logging.info(f"Checking endpoint status for '{endpoint_name}'...")
    if not hf_token:
        # Allow proceeding if token missing, maybe endpoint is public
        logging.warning("HF_TOKEN not set, cannot check endpoint status definitively.")
        return {"status": "ready", "warning": "HF_TOKEN not set"}

    try:
        api = HfApi(token=hf_token)
        endpoint = api.get_inference_endpoint(name=endpoint_name)
        status = endpoint.status
        logging.info(f"Endpoint '{endpoint_name}' status: {status}")

        if status == "running":
            return {"status": "ready"}
        else:
            logging.warning(
                f"Endpoint '{endpoint_name}' is not ready (Status: {status})."
            )
            if status == "scaledToZero":
                logging.info(
                    f"Endpoint '{endpoint_name}' is scaled to zero. Attempting to resume..."
                )
                try:
                    endpoint.resume()
                    # Still return an error message suggesting retry, as resume takes time
                    # Keep this message concise as the action is specific (wait)
                    msg = f"**Endpoint Resuming:** The analysis endpoint ('{endpoint_name}') was scaled to zero and is now restarting.\n\n{error_503_user_message}"
                    return {"status": "error", "ui_message": msg}
                except Exception as resume_error:
                    # Resume failed, provide detailed message
                    logging.error(
                        f"Failed to resume endpoint {endpoint_name}: {resume_error}"
                    )
                    # Construct detailed message including full explanation
                    msg = f"**Endpoint Issue:** The analysis endpoint ('{endpoint_name}') is currently {status} and an attempt to resume it failed ({resume_error}).\n\n{error_503_user_message}"
                    return {"status": "error", "ui_message": msg}
            else:  # Paused, failed, pending etc.
                # Construct detailed message including full explanation
                msg = f"**Endpoint Issue:** The analysis endpoint ('{endpoint_name}') status is currently <span style='color:red'>**{status}**</span>.\n\n{error_503_user_message}"
                return {"status": "error", "ui_message": msg}

    except Exception as e:
        error_msg = f"Error checking analysis endpoint status for {endpoint_name}: {e}"
        logging.error(error_msg)
        # Let analysis stop if endpoint check fails critically
        return {"status": "error", "ui_message": f"Error checking endpoint status: {e}"}


def fetch_and_validate_code(space_id: str):
    """Fetches and validates code files for the space."""
    logging.info(f"Fetching code files for {space_id}...")
    code_files = get_space_code_files(space_id)
    if not code_files:
        error_msg = f"Could not retrieve code files for '{space_id}'. Check ID and ensure it's a public Space."
        logging.warning(error_msg)
        return {
            "status": "error",
            "ui_message": f"**Error:**\n{error_msg}\nAnalysis Canceled.",
        }
    logging.info(f"Successfully fetched {len(code_files)} files for {space_id}.")
    return {"status": "success", "code_files": code_files}


def generate_detailed_report(
    space_id: str, code_files: dict, error_503_user_message: str
):
    """Generates the detailed privacy report using the LLM."""
    logging.info("Generating detailed privacy analysis report...")
    privacy_prompt_messages, privacy_truncated = format_privacy_prompt(
        space_id, code_files
    )

    privacy_api_response = query_qwen_endpoint(privacy_prompt_messages, max_tokens=3072)

    if privacy_api_response == ERROR_503_DICT:
        logging.warning("LLM Call 1 (Privacy) failed with 503.")
        return {"status": "error", "ui_message": error_503_user_message}

    detailed_privacy_report = parse_qwen_response(privacy_api_response)

    if "Error:" in detailed_privacy_report:
        error_msg = (
            f"Failed to generate detailed privacy report: {detailed_privacy_report}"
        )
        logging.error(error_msg)
        return {
            "status": "error",
            "ui_message": f"**Error Generating Detailed Privacy Report:**\n{detailed_privacy_report}\nAnalysis Halted.",
        }

    if privacy_truncated:
        detailed_privacy_report = TRUNCATION_WARNING + detailed_privacy_report

    logging.info("Successfully generated detailed privacy report.")
    return {
        "status": "success",
        "report": detailed_privacy_report,
        "truncated": privacy_truncated,
    }


def generate_summary_report(
    space_id: str,
    code_files: dict,
    detailed_privacy_report: str,
    error_503_user_message: str,
):
    """Generates the summary & highlights report using the LLM."""
    logging.info("Generating summary and highlights report...")
    # Remove potential truncation warning from detailed report before sending to next LLM
    clean_detailed_report = detailed_privacy_report.replace(TRUNCATION_WARNING, "")

    summary_highlights_prompt_messages, summary_truncated = (
        format_summary_highlights_prompt(space_id, code_files, clean_detailed_report)
    )

    summary_highlights_api_response = query_qwen_endpoint(
        summary_highlights_prompt_messages, max_tokens=2048
    )

    if summary_highlights_api_response == ERROR_503_DICT:
        logging.warning("LLM Call 2 (Summary) failed with 503.")
        # Return specific status to indicate partial success
        return {"status": "error_503_summary", "ui_message": error_503_user_message}

    summary_highlights_report = parse_qwen_response(summary_highlights_api_response)

    if "Error:" in summary_highlights_report:
        error_msg = (
            f"Failed to generate summary/highlights report: {summary_highlights_report}"
        )
        logging.error(error_msg)
        # Return specific status to indicate partial success
        return {
            "status": "error_summary",
            "ui_message": f"**Error Generating Summary/Highlights:**\n{summary_highlights_report}",
        }

    if summary_truncated:
        summary_highlights_report = TRUNCATION_WARNING + summary_highlights_report

    logging.info("Successfully generated summary & highlights report.")
    return {
        "status": "success",
        "report": summary_highlights_report,
        "truncated": summary_truncated,
    }


def upload_results(
    space_id: str,
    summary_report: str,
    detailed_report: str,
    dataset_id: str,
    hf_token: str | None,
    tldr_json_data: dict | None = None,
):
    """Uploads the generated reports (Markdown and optional JSON TLDR) to the specified dataset repository."""
    if not hf_token:
        logging.warning("HF Token not provided, skipping dataset report upload.")
        return {"status": "skipped", "reason": "HF_TOKEN not set"}
    if "Error:" in detailed_report or "Error:" in summary_report:
        msg = "Skipping cache upload due to errors in generated reports."
        logging.warning(msg)
        return {"status": "skipped", "reason": msg}

    safe_space_id = space_id.replace("..", "")

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            # Define local paths
            summary_path_local = os.path.join(tmpdir, SUMMARY_FILENAME)
            privacy_path_local = os.path.join(tmpdir, PRIVACY_FILENAME)
            tldr_json_path_local = os.path.join(tmpdir, TLDR_FILENAME)

            # Write Markdown reports
            with open(summary_path_local, "w", encoding="utf-8") as f:
                f.write(summary_report)
            with open(privacy_path_local, "w", encoding="utf-8") as f:
                f.write(detailed_report)

            # Prepare commit message
            commit_message = f"Add analysis reports for Space: {safe_space_id}"
            if tldr_json_data:
                commit_message += " (including TLDR JSON)"
                print(f"Successfully wrote TLDR JSON locally for {safe_space_id}.")
                # Write JSON TLDR data if available
                try:
                    with open(tldr_json_path_local, "w", encoding="utf-8") as f:
                        json.dump(tldr_json_data, f, indent=2, ensure_ascii=False)
                    logging.info(
                        f"Successfully wrote TLDR JSON locally for {safe_space_id}."
                    )
                except Exception as json_err:
                    logging.error(
                        f"Failed to write TLDR JSON locally for {safe_space_id}: {json_err}"
                    )
                    tldr_json_data = None  # Prevent upload attempt if writing failed

            # Ensure repo exists
            api = HfApi(token=hf_token)
            repo_url = api.create_repo(
                repo_id=dataset_id,
                repo_type="dataset",
                exist_ok=True,
            )
            logging.info(f"Ensured dataset repo {repo_url} exists.")

            # Upload summary report
            api.upload_file(
                path_or_fileobj=summary_path_local,
                path_in_repo=f"{safe_space_id}/{SUMMARY_FILENAME}",
                repo_id=dataset_id,
                repo_type="dataset",
                commit_message=commit_message,
            )
            logging.info(f"Successfully uploaded summary report for {safe_space_id}.")

            # Upload privacy report
            api.upload_file(
                path_or_fileobj=privacy_path_local,
                path_in_repo=f"{safe_space_id}/{PRIVACY_FILENAME}",
                repo_id=dataset_id,
                repo_type="dataset",
                commit_message=commit_message,
            )
            logging.info(
                f"Successfully uploaded detailed privacy report for {safe_space_id}."
            )
            # print(f"Successfully uploaded detailed privacy report for {safe_space_id}.")  # Keep if needed for debug

            # Upload JSON TLDR if it was successfully written locally
            if tldr_json_data and os.path.exists(tldr_json_path_local):
                api.upload_file(
                    path_or_fileobj=tldr_json_path_local,
                    path_in_repo=f"{safe_space_id}/{TLDR_FILENAME}",
                    repo_id=dataset_id,
                    repo_type="dataset",
                    commit_message=commit_message,  # Can reuse commit message or make specific
                )
                logging.info(f"Successfully uploaded TLDR JSON for {safe_space_id}.")
                print(f"Successfully uploaded TLDR JSON for {safe_space_id}.")

        # Return success if all uploads finished without error
        return {"status": "success"}

    except Exception as e:
        error_msg = f"Non-critical error during report upload for {safe_space_id}: {e}"
        logging.error(error_msg)
        print(error_msg)
        return {"status": "error", "message": error_msg}

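
# Quick reference (illustrative summary, derived from the helpers above): each
# helper returns a small status dict rather than raising, e.g.
#   check_cache_and_download -> {"status": "cache_hit" | "cache_miss" | "cache_error", ...}
#   check_endpoint_status    -> {"status": "ready" | "error", ...}
#   fetch_and_validate_code  -> {"status": "success" | "error", ...}
#   generate_*_report        -> {"status": ..., "report": ..., "truncated": ...}
#   upload_results           -> {"status": "success" | "skipped" | "error", ...}
# so app.py can branch on "status" and surface "ui_message" in the UI.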

# --- New TLDR Generation Functions ---


def format_tldr_prompt(
    detailed_report: str, summary_report: str
) -> list[dict[str, str]]:
    """Formats the prompt for the TLDR generation task."""
    # Clean potential cache/truncation markers from input reports for the LLM
    cleaned_detailed = detailed_report.replace(CACHE_INFO_MSG, "").replace(
        TRUNCATION_WARNING, ""
    )
    cleaned_summary = summary_report.replace(CACHE_INFO_MSG, "").replace(
        TRUNCATION_WARNING, ""
    )

    user_content = (
        "Please generate a structured JSON TLDR based on the following reports:\n\n"
        "--- DETAILED PRIVACY ANALYSIS REPORT START ---\n"
        f"{cleaned_detailed}\n"
        "--- DETAILED PRIVACY ANALYSIS REPORT END ---\n\n"
        "--- SUMMARY & HIGHLIGHTS REPORT START ---\n"
        f"{cleaned_summary}\n"
        "--- SUMMARY & HIGHLIGHTS REPORT END ---"
    )

    # Note: We are not handling truncation here, assuming the input reports
    # are already reasonably sized from the previous steps.
    # If reports could be extremely long, add truncation logic similar to other format_* functions.

    messages = [
        {"role": "system", "content": TLDR_SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]
    return messages


def parse_tldr_json_response(
    response: ChatCompletionOutput | dict | None,
) -> dict | None:
    """Parses the LLM response, expecting JSON content for the TLDR."""
    if response is None:
        logging.error("TLDR Generation: Failed to get response from LLM.")
        return None

    # Check for 503 error dict first
    if isinstance(response, dict) and response.get("error_type") == "503":
        logging.error(f"TLDR Generation: Received 503 error: {response.get('message')}")
        return None  # Treat 503 as failure for this specific task

    # --- Direct Content Extraction (Replaces call to parse_qwen_response) ---
    raw_content = ""
    try:
        # Check if it's likely the expected ChatCompletionOutput structure
        if not hasattr(response, "choices"):
            logging.error(
                f"TLDR Generation: Unexpected response type received: {type(response)}. Content: {response}"
            )
            return None  # Return None if not the expected structure

        # Access the generated content according to the ChatCompletionOutput structure
        if response.choices and len(response.choices) > 0:
            content = response.choices[0].message.content
            if content:
                raw_content = content.strip()
                logging.info(
                    "TLDR Generation: Successfully extracted raw content from response."
                )
            else:
                logging.warning(
                    "TLDR Generation: Response received, but content is empty."
                )
                return None
        else:
            logging.warning("TLDR Generation: Response received, but no choices found.")
            return None
    except AttributeError as e:
        # This might catch cases where response looks like the object but lacks expected attributes
        logging.error(
            f"TLDR Generation: Attribute error parsing response object: {e}. Response structure might be unexpected. Response: {response}"
        )
        return None
    except Exception as e:
        logging.error(
            f"TLDR Generation: Unexpected error extracting content from response object: {e}"
        )
        return None
    # --- End Direct Content Extraction ---

    # --- JSON Parsing Logic ---
    if not raw_content:  # Should be caught by checks above, but belts and suspenders
        logging.error("TLDR Generation: Raw content is empty after extraction attempt.")
        return None

    try:
        # Clean potential markdown code block formatting
        if raw_content.strip().startswith("```json"):
            raw_content = raw_content.strip()[7:-3].strip()
        elif raw_content.strip().startswith("```"):
            raw_content = raw_content.strip()[3:-3].strip()

        tldr_data = json.loads(raw_content)

        # Validate structure: Check if it's a dict and has all required keys
        required_keys = [
            "app_description",
            "privacy_tldr",
            "data_types",
            "user_input_data",
            "local_processing",
            "remote_processing",
            "external_logging",
        ]
        if not isinstance(tldr_data, dict):
            logging.error(
                f"TLDR Generation: Parsed content is not a dictionary. Content: {raw_content[:500]}..."
            )
            return None
        if not all(key in tldr_data for key in required_keys):
            missing_keys = [key for key in required_keys if key not in tldr_data]
            logging.error(
                f"TLDR Generation: Parsed JSON is missing required keys: {missing_keys}. Content: {raw_content[:500]}..."
            )
            return None

        # --- Add validation for the new data_types structure ---
        data_types_list = tldr_data.get("data_types")
        if not isinstance(data_types_list, list):
            logging.error(
                f"TLDR Generation: 'data_types' is not a list. Content: {data_types_list}"
            )
            return None
        for item in data_types_list:
            if (
                not isinstance(item, dict)
                or "name" not in item
                or "description" not in item
            ):
                logging.error(
                    f"TLDR Generation: Invalid item found in 'data_types' list: {item}. Must be dict with 'name' and 'description'."
                )
                return None
            if not isinstance(item["name"], str) or not isinstance(
                item["description"], str
            ):
                logging.error(
                    f"TLDR Generation: Invalid types for name/description in 'data_types' item: {item}. Must be strings."
                )
                return None
        # --- End validation for data_types ---

        # Basic validation for other lists (should contain strings)
        validation_passed = True
        for key in [
            "user_input_data",
            "local_processing",
            "remote_processing",
            "external_logging",
        ]:
            data_list = tldr_data.get(key)
            # Add more detailed check and logging
            if not isinstance(data_list, list):
                logging.error(
                    f"TLDR Generation Validation Error: Key '{key}' is not a list. Found type: {type(data_list)}, Value: {data_list}"
                )
                validation_passed = False
                # Allow continuing validation for other keys, but mark as failed
            elif not all(isinstance(x, str) for x in data_list):
                # This check might be too strict if LLM includes non-strings, but keep for now
                logging.warning(
                    f"TLDR Generation Validation Warning: Not all items in list '{key}' are strings. Content: {data_list}"
                )
                # Decide if this should cause failure - currently it doesn't, just warns

        if not validation_passed:
            logging.error(
                "TLDR Generation: Validation failed due to incorrect list types."
            )
            return None  # Ensure failure if any key wasn't a list

        logging.info("Successfully parsed and validated TLDR JSON response.")
        return tldr_data

    except json.JSONDecodeError as e:
        logging.error(
            f"TLDR Generation: Failed to decode JSON response: {e}. Content: {raw_content[:500]}..."
        )
        return None
    except Exception as e:
        logging.error(f"TLDR Generation: Unexpected error parsing JSON response: {e}")
        return None


def render_tldr_markdown(tldr_data: dict | None, space_id: str | None = None) -> str:
    """Renders the top-level TLDR (description, privacy) data into a Markdown string.

    (Does not include the data lists)
    """
    if not tldr_data:
        # Return a more specific message for this part
        return "*TLDR Summary could not be generated.*\n"

    output = []

    # Add Space link if space_id is provided
    if space_id:
        output.append(
            f"**Source Space:** [`{space_id}`](https://huggingface.co/spaces/{space_id})\n"
        )

    output.append(f"**App Description:** {tldr_data.get('app_description', 'N/A')}\n")
    privacy_summary = tldr_data.get("privacy_tldr", "N/A")
    output.append(f"**Privacy TLDR:** {privacy_summary}")  # Removed extra newline

    # Removed data list rendering from this function

    return "\n".join(output)


def render_data_details_markdown(tldr_data: dict | None) -> str:
    """Renders the data lists (types, input, processing, logging) from TLDR data."""
    if not tldr_data:
        return "*Data details could not be generated.*\n"

    output = []
    # Get defined names for formatting
    defined_names = sorted(
        [
            dt.get("name", "")
            for dt in tldr_data.get("data_types", [])
            if dt.get("name")
        ],
        key=len,
        reverse=True,
    )

    output.append("**Data Types Defined:**")  # Renamed slightly for clarity
    data_types = tldr_data.get("data_types")
    if data_types and isinstance(data_types, list):
        if not data_types:
            output.append("- None identified.")
        else:
            for item in data_types:
                name = item.get("name", "Unnamed")
                desc = item.get("description", "No description")
                output.append(f"- `{name}`: {desc}")
    else:
        output.append("- (Error loading data types)")
    output.append("")  # Add newline for spacing

    # Reusable helper for rendering lists
    def render_list(title, key):
        output.append(f"**{title}:**")
        data_list = tldr_data.get(key)
        if isinstance(data_list, list):
            if not data_list:
                output.append("- None identified.")
            else:
                for item_str in data_list:
                    formatted_item = item_str  # Default
                    found_match = False
                    for name in defined_names:
                        if item_str == name:
                            formatted_item = f"`{name}`"
                            found_match = True
                            break
                        elif item_str.startswith(name + " "):
                            formatted_item = f"`{name}`{item_str[len(name):]}"
                            found_match = True
                            break
                    if (
                        not found_match
                        and " " not in item_str
                        and not item_str.startswith("`")
                    ):
                        formatted_item = f"`{item_str}`"
                    output.append(f"- {formatted_item}")
        else:
            output.append("- (Error loading list)")
        output.append("")

    render_list("Data Sent by User to App", "user_input_data")
    render_list("Data Processed Locally within App", "local_processing")
    render_list("Data Processed Remotely", "remote_processing")
    render_list("Data Logged/Saved Externally", "external_logging")

    # Remove the last empty line
    if output and output[-1] == "":
        output.pop()

    return "\n".join(output)

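
# For the illustrative TLDR JSON shown earlier, render_data_details_markdown
# would emit Markdown along these lines (assumed example, traced from the logic above):
#   **Data Types Defined:**
#   - `User Image`: Image uploaded by the user
#
#   **Data Sent by User to App:**
#   - `User Image`
#
#   **Data Processed Remotely:**
#   - `User Image` (HF Inference API)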

# --- Combined TLDR Generation Function ---


def generate_and_parse_tldr(detailed_report: str, summary_report: str) -> dict | None:
    """Formats prompt, queries LLM, and parses JSON response for TLDR.

    Args:
        detailed_report: The detailed privacy report content.
        summary_report: The summary & highlights report content.

    Returns:
        A dictionary with the parsed TLDR data, or None if any step fails.
    """
    logging.info("Starting TLDR generation and parsing...")
    try:
        # Format
        tldr_prompt_messages = format_tldr_prompt(detailed_report, summary_report)
        if not tldr_prompt_messages:
            logging.error("TLDR Generation: Failed to format prompt.")
            return None

        # Query (using existing import within analysis_utils)
        # Use slightly smaller max_tokens
        llm_response = query_qwen_endpoint(tldr_prompt_messages, max_tokens=1024)
        if llm_response is None:  # Check if query itself failed critically
            logging.error("TLDR Generation: LLM query returned None.")
            return None
        # 503 handled within parse function below

        # Parse
        parsed_data = parse_tldr_json_response(llm_response)
        if parsed_data:
            logging.info("Successfully generated and parsed TLDR.")
            return parsed_data
        else:
            logging.error("TLDR Generation: Failed to parse JSON response.")
            return None

    except Exception as e:
        logging.error(
            f"TLDR Generation: Unexpected error in generate_and_parse_tldr: {e}",
            exc_info=True,
        )
        return None
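
A minimal usage sketch of the new pipeline (illustrative only; the space id below is hypothetical, and app.py wires these same helpers into Gradio callbacks rather than a linear script):

    # Assumes HF_TOKEN is set and the Qwen endpoint is reachable.
    code = fetch_and_validate_code("some-user/some-space")
    if code["status"] == "success":
        detailed = generate_detailed_report(
            "some-user/some-space", code["code_files"], "Endpoint unavailable."
        )
        if detailed["status"] == "success":
            summary = generate_summary_report(
                "some-user/some-space",
                code["code_files"],
                detailed["report"],
                "Endpoint unavailable.",
            )
            if summary["status"] == "success":
                tldr = generate_and_parse_tldr(detailed["report"], summary["report"])
                if tldr:
                    print(render_tldr_markdown(tldr, "some-user/some-space"))
                    print(render_data_details_markdown(tldr))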
app.py
CHANGED
@@ -1,25 +1,37 @@
 import logging
 import os
 
 import gradio as gr
 from dotenv import load_dotenv
-
 from huggingface_hub import HfApi
 
-
-from
 
-# Updated prompt imports for new order
-from prompts import format_privacy_prompt, format_summary_highlights_prompt
 
-
-
-from utils import (
-)
 
 # Configure logging
 logging.basicConfig(
@@ -34,10 +46,13 @@ load_dotenv()
 HF_TOKEN = os.getenv("HF_TOKEN")
 ENDPOINT_NAME = "qwen2-5-coder-32b-instruct-pmf"
 DATASET_ID = "yjernite/spaces-privacy-reports"
-CACHE_INFO_MSG =
 DEFAULT_SELECTION = "HuggingFaceTB/SmolVLM2"
 
-TRUNCATION_WARNING
 
 ERROR_503_USER_MESSAGE = """It appears that the analysis model endpoint is currently down or starting up.
 
@@ -49,419 +64,582 @@ You have a few options:
 """
 
 
-def
-    selected_cached_space: str | None,
-    new_space_id: str | None,
-    progress=gr.Progress(track_tqdm=True),
-):
     """
-    Handles the logic based on Dropdown and Textbox inputs.
     Yields tuples of Gradio updates.
     """
 
-    #
-        target_space_id = new_space_id.strip()
-        if target_space_id == selected_cached_space:
-            source = "dropdown_match"  # User typed ID that exists in dropdown
-        else:
-            source = "new"
-    elif selected_cached_space:
-        target_space_id = selected_cached_space
-        source = "dropdown"
 
-        )
 
         gr.update(
-            value=f"
             visible=True,
         ),
-        gr.update(value="", visible=False),
-        gr.update(visible=True, open=True),
-        gr.update(visible=False),
-    )
-    logging.info(f"Request received for: '{target_space_id}' (Source: {source})")
-    # --- Cache Handling ---
-    # If the user explicitly selected from the dropdown, try to fetch it directly.
-    if source == "dropdown":
-        progress(
-            0.1, desc="Fetching cached report..."
-        )  # Simple progress for cache fetch
-        yield (
-            gr.update(value="Fetching selected cached report...", visible=True),
             gr.update(value="", visible=True),
-            gr.update(visible=True, open=
             gr.update(visible=True, open=False),
         )
-            cached_reports = download_cached_reports(
-                target_space_id, DATASET_ID, HF_TOKEN
-            )
-            summary_report = (
-                cached_reports.get("summary", "Error: Cached summary not found.")
-                + CACHE_INFO_MSG
-            )
-            privacy_report = (
-                cached_reports.get("privacy", "Error: Cached privacy report not found.")
-                + CACHE_INFO_MSG
-            )
-            logging.info(
-                f"Successfully displayed cached reports for selected '{target_space_id}'."
-            )
-            progress(1.0, desc="Complete (from cache)")
-            yield (
-                gr.update(value=summary_report, visible=True),
-                gr.update(value=privacy_report, visible=True),
-                gr.update(visible=True, open=True),
-                gr.update(visible=True, open=True),
-            )
-        except Exception as e:
-            error_msg = f"Failed to download cached report for selected '{target_space_id}': {e}"
-            logging.error(error_msg)
-            progress(1.0, desc="Error")
-            yield (
-                gr.update(value=error_msg, visible=True),
-                gr.update(value="", visible=False),
-                gr.update(visible=True, open=True),
-                gr.update(visible=False),
-            )
-    # --- Live Analysis or Check Cache for New Input ---
-    # If it came from the textbox OR was a dropdown match, we first check cache, then run live.
-    else:  # source == "new" or source == "dropdown_match"
-        # This generator now performs the full analysis if needed
-        # Yield intermediate updates from the generator
-        # Important: Need to use a loop to consume the generator
-        final_update = None
-        for update_tuple in _run_live_analysis(target_space_id, progress):
-            yield update_tuple
-            final_update = update_tuple  # Keep track of the last update
-        yield final_update  # Return the very last state
 
-    """
-    steps = 8  # Steps for the full pipeline
-    privacy_truncated = False
-    summary_truncated = False
-    # --- Step 1: Check Cache --- (Check again for new/matched input)
-    progress(1 / steps, desc="Step 1/8: Checking cache...")
-    logging.info(f"Step 1/8: Checking cache for '{space_id}'...")
     yield (
-        gr.update(value=
         gr.update(value="", visible=True),
-        gr.update(visible=True, open=
         gr.update(visible=True, open=False),
     )
-        found_in_cache = check_report_exists(space_id, DATASET_ID, HF_TOKEN)
-    except Exception as e:
-        logging.warning(f"Cache check failed: {e}. Proceeding.")
-        yield (
-            gr.update(
-                value="Cache check failed, proceeding with live analysis...",
-                visible=True,
-            ),
-            gr.update(value="", visible=True),
-            gr.update(visible=True, open=True),
-            gr.update(visible=True, open=False),
-        )
 
-    if
        yield (
-            gr.update(value=
-            gr.update(value="", visible=
-            gr.update(
-            gr.update(
-            summary_report = (
-                cached_reports.get("summary", "Error: Cached summary not found.")
-                + CACHE_INFO_MSG
-            )
-            privacy_report = (
-                cached_reports.get("privacy", "Error: Cached privacy report not found.")
-                + CACHE_INFO_MSG
-            )
-            logging.info(f"Successfully displayed cached reports for {space_id}.")
-            progress(8 / steps, desc="Complete (from cache)")
-            yield (
-                gr.update(value=summary_report, visible=True),
-                gr.update(value=privacy_report, visible=True),
-                gr.update(visible=True, open=True),
-                gr.update(visible=True, open=True),
-            )
-            return  # End generation here if cache successful
-        except Exception as e:
-            logging.warning(f"Cache download failed for {space_id}: {e}. Proceeding.")
-            yield (
-                gr.update(
-                    value="Cache download failed, proceeding with live analysis...",
-                    visible=True,
-                ),
-                gr.update(value="", visible=True),
-                gr.update(visible=True, open=True),
-                gr.update(visible=True, open=False),
-            )
-    else:
-        logging.info(f"Cache miss for {space_id}. Performing live analysis.")
-        yield (
-            gr.update(value="Cache miss. Fetching code...", visible=True),
-            gr.update(value="", visible=True),
-            gr.update(visible=True, open=True),
-            gr.update(visible=True, open=False),
        )
 
-    # --- Step
     yield (
-        gr.update(value=
         gr.update(value="", visible=True),
-        gr.update(visible=True, open=
         gr.update(visible=True, open=False),
     )
 
-    logging.info(f"Endpoint '{ENDPOINT_NAME}' status: {status}")
-    if status == 'running':
-        endpoint_ready = True
-    else:
-        logging.warning(f"Endpoint '{ENDPOINT_NAME}' is not ready (Status: {status}).")
-        if status == 'scaledToZero':
-            logging.info(f"Endpoint '{ENDPOINT_NAME}' is scaled to zero. Attempting to resume...")
-            endpoint.resume()
-            msg_503 = f"**Full Service Temporarily Unavailable**: but you can **browse existing reports** or **check back later!**\n\n The status of the Qwen2.5-Coder-32B-Instruct endpoint powering the analysis is currently: <span style='color:red'>**{status}**</span>\n\n" + ERROR_503_USER_MESSAGE
-            yield (
-                gr.update(value=msg_503, visible=True),
-                gr.update(value="", visible=False),
-                gr.update(visible=True, open=True),
-                gr.update(visible=False)
-            )
-            return  # Stop analysis, user needs to retry
-    except Exception as e:
-        logging.error(f"Error checking endpoint status for {ENDPOINT_NAME}: {e}")
-        yield (
-            gr.update(value=f"Error checking analysis endpoint status: {e}", visible=True),
-            gr.update(value="", visible=False),
-            gr.update(visible=True, open=True),
-            gr.update(visible=False)
-        )
-        return  # Stop analysis
 
-    # --- Step 3: Fetch Code Files (if not cached) ---
-    progress(3 / steps, desc="Step 3/8: Fetching code files...")
-    logging.info("Step 3/8: Fetching code files...")
-    code_files = get_space_code_files(space_id)
-    if not code_files:
-        error_msg = f"Could not retrieve code files for '{space_id}'. Check ID and ensure it's a public Space."
-        logging.warning(error_msg)
        yield (
-            gr.update(value=
             gr.update(value="Analysis Canceled", visible=True),
-            gr.update(visible=
             gr.update(visible=True, open=False),
        )
-        return
 
 # --- Step 4: Generate DETAILED Privacy Report (LLM Call 1) ---
     )
     yield (
-        gr.update(value=
         gr.update(value="Generating detailed privacy report via AI...", visible=True),
-        gr.update(visible=True, open=
         gr.update(visible=True, open=True),
     )
-        space_id, code_files
     )
 
     yield (
-        gr.update(
-        ),
-        gr.update(value="", visible=False),
-        gr.update(visible=
-        gr.update(visible=False),
     )
-    return
 
     )
     yield (
-        gr.update(value=
         gr.update(
-            value=f"
             visible=True,
         ),
-        gr.update(
         gr.update(visible=True, open=True),
     )
-    return
 
     yield (
-        gr.update(value=
-        gr.update(value=
-        gr.update(
         gr.update(visible=True, open=True),
     )
 
-    #
-    progress(5 / steps, desc="Step 5/8: Fetching model descriptions...")
-    logging.info("Step 5/8: Fetching model descriptions...")
     yield (
-        gr.update(value=
         gr.update(),
         gr.update(),
         gr.update(),
     )
     yield (
-        gr.update(value=
         gr.update(),
         gr.update(),
         gr.update(),
     )
 
     )
 
-    #
     )
 
     yield (
         gr.update(
-            value=
-        ),
-        gr.update(visible=True, open=
-        gr.update(visible=
     )
-    return
-    summary_highlights_report = parse_qwen_response(summary_highlights_api_response)
 
-    if "
-        f"Failed to generate summary/highlights report: {summary_highlights_report}"
-    )
     yield (
         gr.update(
-            value=f"
             visible=True,
         ),
-        gr.update(value=
-        gr.update(visible=True, open=
-        gr.update(visible=True, open=
     )
-    return
-    if summary_truncated:
-        summary_highlights_report = TRUNCATION_WARNING + summary_highlights_report
 
-    yield (
-        gr.update(value=summary_highlights_report, visible=True),
-        gr.update(value=detailed_privacy_report, visible=True),
-        gr.update(visible=True, open=True),
-        gr.update(visible=True, open=True),
-    )
 
-    upload_reports_to_dataset(
-        space_id=space_id,
-        summary_report=summary_to_save,
-        detailed_report=privacy_to_save,
-        dataset_id=DATASET_ID,
-        hf_token=HF_TOKEN,
     )
 
-    # ---
 
 # --- Load Initial Data Function (for demo.load) ---
@@ -511,7 +689,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
         with gr.Column(scale=1):  # Left column for inputs
             description_accordion = gr.Accordion(
-                "What Privacy Questions do 🤗 Spaces Raise? Click here for Demo Description 👇",
             )
             with description_accordion:
                 gr.Markdown(DESCRIPTION)
@@ -532,12 +712,28 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     analyze_button = gr.Button("Get Space Report", variant="primary", scale=1)
 
     with gr.Column(scale=1):  # Right column for outputs
-        # Define
         summary_accordion = gr.Accordion(
-            "Summary & Privacy Highlights",
         )
         privacy_accordion = gr.Accordion(
-            "Detailed Privacy Analysis Report",
         )
         with summary_accordion:
             summary_markdown = gr.Markdown(
@@ -559,8 +755,11 @@
     fn=get_space_report_wrapper,
     inputs=[cached_spaces_dropdown, space_id_input],
     outputs=[
         summary_markdown,
         privacy_markdown,
         summary_accordion,
         privacy_accordion,
     ],
1 |
+
import json
|
2 |
import logging
|
3 |
import os
|
4 |
|
5 |
import gradio as gr
|
6 |
from dotenv import load_dotenv
|
|
|
7 |
from huggingface_hub import HfApi
|
8 |
|
9 |
+
# Import analysis pipeline helpers
|
10 |
+
from analysis_utils import (check_cache_and_download, check_endpoint_status,
|
11 |
+
fetch_and_validate_code, format_tldr_prompt,
|
12 |
+
generate_and_parse_tldr, generate_detailed_report,
|
13 |
+
generate_summary_report, parse_tldr_json_response,
|
14 |
+
render_data_details_markdown, render_tldr_markdown,
|
15 |
+
upload_results)
|
16 |
+
# Import general utils
|
17 |
+
from utils import list_cached_spaces # Added import
|
18 |
+
|
19 |
+
# Removed LLM interface imports, handled by analysis_utils
|
20 |
+
# from llm_interface import ERROR_503_DICT
|
21 |
+
# from llm_interface import parse_qwen_response, query_qwen_endpoint
|
22 |
+
|
23 |
+
# Removed prompts import, handled by analysis_utils
|
24 |
+
# from prompts import format_privacy_prompt, format_summary_highlights_prompt
|
25 |
|
|
|
|
|
26 |
|
27 |
+
|
28 |
+
# Removed specific utils imports now handled via analysis_utils
|
29 |
+
# from utils import (
|
30 |
+
# check_report_exists,
|
31 |
+
# download_cached_reports,
|
32 |
+
# get_space_code_files,
|
33 |
+
# upload_reports_to_dataset,
|
34 |
+
# )
|
35 |
|
36 |
# Configure logging
|
37 |
logging.basicConfig(
|
|
|
46 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
47 |
ENDPOINT_NAME = "qwen2-5-coder-32b-instruct-pmf"
|
48 |
DATASET_ID = "yjernite/spaces-privacy-reports"
|
49 |
+
CACHE_INFO_MSG = (
|
50 |
+
"\n\n*(Report retrieved from cache)*" # Still needed for dropdown cache hit message
|
51 |
+
)
|
52 |
DEFAULT_SELECTION = "HuggingFaceTB/SmolVLM2"
|
53 |
|
54 |
+
# TRUNCATION_WARNING now defined and used within analysis_utils
|
55 |
+
# TRUNCATION_WARNING = """**⚠️ Warning:** The input data (code and/or prior analysis) was too long for the AI model's context limit and had to be truncated. The analysis below may be incomplete or based on partial information.\n\n---\n\n"""
|
56 |
|
57 |
ERROR_503_USER_MESSAGE = """It appears that the analysis model endpoint is currently down or starting up.
|
58 |
|
|
|
64 |
"""
|
65 |
|
66 |
|
67 |
+
def _run_live_analysis(space_id: str, progress=gr.Progress(track_tqdm=True)):
|
|
|
|
|
|
|
|
|
68 |
"""
|
69 |
+
Performs the full analysis pipeline using helper functions from analysis_utils.
|
|
|
70 |
Yields tuples of Gradio updates.
|
71 |
"""
|
72 |
+
total_steps = 9 # Increased step count for TLDR generation
|
73 |
+
current_step = 0
|
74 |
+
summary_report = ""
|
75 |
+
privacy_report = ""
|
76 |
+
tldr_data = None
|
77 |
+
tldr_markdown_content = "*TLDR loading...*"
|
78 |
+
data_details_content = (
|
79 |
+
"*Data details loading...*" # Default message for new component
|
80 |
+
)
|
81 |
|
82 |
+
# Initial message before first step
|
83 |
+
tldr_status_message = "*Starting analysis...*"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
85 |
+
# --- Step 1: Check Cache ---
|
86 |
+
current_step += 1
|
87 |
+
progress_desc = f"Step {current_step}/{total_steps}: Checking cache..."
|
88 |
+
progress(current_step / total_steps, desc=progress_desc)
|
89 |
+
tldr_status_message = f"*{progress_desc}*"
|
90 |
+
yield (
|
91 |
+
gr.update(value=tldr_status_message, visible=True), # TLDR shows progress
|
92 |
+
gr.update(value="*Checking cache...*", visible=True),
|
93 |
+
gr.update(value="Checking cache for existing reports...", visible=True),
|
94 |
+
gr.update(value="", visible=True),
|
95 |
+
gr.update(visible=True, open=False),
|
96 |
+
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=False),
        )
        cache_result = check_cache_and_download(space_id, DATASET_ID, HF_TOKEN)

        if cache_result["status"] == "cache_hit":
            progress(total_steps / total_steps, desc="Complete (from cache)")
            # Try to parse and render TLDR from cache
            tldr_json_str = cache_result.get("tldr_json_str")
            rendered_tldr = "*TLDR not found in cache.*"
            rendered_data_details = "*Data details not found in cache.*"
            if tldr_json_str:
                try:
                    cached_tldr_data = json.loads(tldr_json_str)
                    # Render both parts
                    rendered_tldr = render_tldr_markdown(cached_tldr_data, space_id)
                    rendered_data_details = render_data_details_markdown(cached_tldr_data)
                except Exception as parse_err:
                    logging.warning(
                        f"Failed to parse cached TLDR JSON for {space_id}: {parse_err}"
                    )
                    rendered_tldr = "*Error parsing cached TLDR.*"
                    rendered_data_details = (
                        "*Could not load data details due to parsing error.*"
                    )

            yield (
                gr.update(value=rendered_tldr, visible=True),
                gr.update(value=rendered_data_details, visible=True),
                gr.update(value=cache_result["summary"], visible=True),
                gr.update(value=cache_result["privacy"], visible=True),
                gr.update(visible=True, open=False),
                gr.update(visible=True, open=False),
                gr.update(visible=True, open=False),
            )
            return  # End generation successfully from cache
        elif cache_result["status"] == "cache_error":
            # Display final error in TLDR field
            tldr_status_message = (
                f"*Cache download failed. {cache_result.get('ui_message', '')}*"
            )
            data_details_content = "*Data details unavailable due to cache error.*"
            yield (
                gr.update(value=tldr_status_message, visible=True),
                gr.update(value=data_details_content, visible=True),
                gr.update(value=cache_result["ui_message"], visible=True),
                gr.update(value="", visible=True),
                gr.update(visible=True, open=False),
                gr.update(visible=True, open=False),
                gr.update(visible=True, open=False),
            )
            # Still continue to live analysis if cache download fails
        elif cache_result["status"] == "cache_miss":
            tldr_status_message = f"*{progress_desc} - Cache miss.*"  # Update status
            data_details_content = "*Generating report...*"
            yield (
                gr.update(value=tldr_status_message, visible=True),
                gr.update(value=data_details_content, visible=True),
                gr.update(value="Cache miss. Starting live analysis...", visible=True),
                gr.update(value="", visible=True),
                gr.update(visible=True, open=False),
                gr.update(visible=True, open=False),
                gr.update(visible=True, open=False),
            )
        elif "error_message" in cache_result:
            # Display final error in TLDR field
            tldr_status_message = (
                f"*Cache check failed. {cache_result.get('error_message', '')}*"
            )
            data_details_content = "*Data details unavailable due to cache error.*"
            yield (
                gr.update(value=tldr_status_message, visible=True),
                gr.update(value=data_details_content, visible=True),
                gr.update(
                    value=f"Cache check failed: {cache_result.get('error_message', 'Unknown error')}. Proceeding with live analysis...",
                    visible=True,
                ),
                gr.update(value="", visible=True),
                gr.update(visible=True, open=False),
                gr.update(visible=True, open=False),
                gr.update(visible=True, open=False),
            )
            # Still continue if cache check fails

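The four branches above encode the contract this handler expects from check_cache_and_download; the helper itself is defined earlier in the module, so the shapes below are inferred from these call sites only, not from that helper's source:

        # Result shapes assumed by the branches above (inferred, not authoritative):
        #   {"status": "cache_hit", "summary": str, "privacy": str, "tldr_json_str": str | None}
        #   {"status": "cache_error", "ui_message": str}   # report found but download failed
        #   {"status": "cache_miss"}                       # no cached report yet
        #   {"status": ..., "error_message": str}          # the cache lookup itself failed
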
        # --- Step 2: Check Endpoint Status ---
        current_step += 1
        progress_desc = f"Step {current_step}/{total_steps}: Checking endpoint..."
        progress(current_step / total_steps, desc=progress_desc)
        tldr_status_message = f"*{progress_desc}*"
        yield (
            gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
            gr.update(),
            gr.update(value="Checking analysis model endpoint status...", visible=True),
            gr.update(value="", visible=True),
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=False),
        )
        endpoint_result = check_endpoint_status(
            ENDPOINT_NAME, HF_TOKEN, ERROR_503_USER_MESSAGE
        )

        if endpoint_result["status"] == "error":
            progress(total_steps / total_steps, desc="Endpoint Error")
            # Display final error in TLDR field
            tldr_markdown_content = endpoint_result["ui_message"]
            yield (
                gr.update(value=tldr_markdown_content, visible=True),
                gr.update(value="", visible=False),
                gr.update(value="", visible=False),
                gr.update(value="", visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
            )
            return

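A minimal sketch of a helper matching this call site, assuming it wraps huggingface_hub.get_inference_endpoint (an illustration under stated assumptions, not the actual implementation):

        from huggingface_hub import get_inference_endpoint

        def check_endpoint_status_sketch(endpoint_name, hf_token, error_503_user_message):
            # Assumed contract, matching the call site above:
            #   {"status": "ready" | "error", "ui_message": str}
            try:
                endpoint = get_inference_endpoint(endpoint_name, token=hf_token)
                if endpoint.status == "running":
                    return {"status": "ready", "ui_message": ""}
                # A paused or scaled-to-zero endpoint is what users experience as a 503.
                return {"status": "error", "ui_message": error_503_user_message}
            except Exception as e:
                return {"status": "error", "ui_message": f"Could not check endpoint: {e}"}
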
        # --- Step 3: Fetch Code Files ---
        current_step += 1
        progress_desc = f"Step {current_step}/{total_steps}: Fetching code..."
        progress(current_step / total_steps, desc=progress_desc)
        tldr_status_message = f"*{progress_desc}*"
        yield (
            gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
            gr.update(),
            gr.update(value="Fetching code files from the Space...", visible=True),
            gr.update(value="", visible=True),
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=False),
        )
        code_result = fetch_and_validate_code(space_id)

        if code_result["status"] == "error":
            progress(total_steps / total_steps, desc="Code Fetch Error")
            # Display final error in TLDR field
            tldr_markdown_content = (
                f"**Error:** {code_result.get('ui_message', 'Failed to fetch code.')}"
            )
            yield (
                gr.update(value=tldr_markdown_content, visible=True),
                gr.update(value="", visible=False),
                gr.update(value="", visible=False),
                gr.update(value="Analysis Canceled", visible=True),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=True, open=False),
            )
            return
        code_files = code_result["code_files"]

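fetch_and_validate_code gathers the Space's source files before any LLM call; a rough sketch under stated assumptions (only _is_relevant_file from utils.py and the 'status'/'code_files'/'ui_message' keys are taken from this diff, the rest is illustrative):

        from huggingface_hub import HfApi, hf_hub_download

        def fetch_and_validate_code_sketch(space_id):
            try:
                filenames = HfApi().list_repo_files(space_id, repo_type="space")
                code_files = {}
                for name in filenames:
                    if not _is_relevant_file(name):  # relevance filter defined in utils.py
                        continue
                    local_path = hf_hub_download(repo_id=space_id, filename=name, repo_type="space")
                    with open(local_path, "r", encoding="utf-8", errors="ignore") as f:
                        code_files[name] = f.read()
                if not code_files:
                    return {"status": "error", "ui_message": "No relevant code files found."}
                return {"status": "success", "code_files": code_files}
            except Exception as e:
                return {"status": "error", "ui_message": f"Failed to fetch code files: {e}"}
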
        # --- Step 4: Generate DETAILED Privacy Report (LLM Call 1) ---
        current_step += 1
        progress_desc = (
            f"Step {current_step}/{total_steps}: Generating privacy report (AI Call 1)..."
        )
        progress(current_step / total_steps, desc=progress_desc)
        tldr_status_message = f"*{progress_desc}*"
        yield (
            gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
            gr.update(),
            gr.update(
                value="Generating detailed privacy report (AI Call 1)...", visible=True
            ),
            gr.update(value="Generating detailed privacy report via AI...", visible=True),
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=True),
        )
        privacy_result = generate_detailed_report(
            space_id, code_files, ERROR_503_USER_MESSAGE
        )

        if privacy_result["status"] == "error":
            progress(total_steps / total_steps, desc="Privacy Report Error")
            # Display final error in TLDR field
            tldr_markdown_content = f"**Error:** {privacy_result.get('ui_message', 'Failed during detailed report generation.')}"
            yield (
                gr.update(value=tldr_markdown_content, visible=True),
                gr.update(value="", visible=False),
                gr.update(value="", visible=False),
                gr.update(value="", visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
            )
            return
        privacy_report = privacy_result["report"]

        # Update UI with successful detailed report
        yield (
            gr.update(value=tldr_status_message, visible=True),  # Still show progress
            gr.update(),
            gr.update(
                value="Detailed privacy report generated. Proceeding...", visible=True
            ),
            gr.update(value=privacy_report, visible=True),
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=True),
        )

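generate_detailed_report presumably chains the prompt/query/parse helpers imported at the top of analysis_utils.py; a sketch of that wiring (the imported names are real, the body and argument order are assumptions). Step 6's generate_summary_report most likely mirrors the same shape, swapping in format_summary_highlights_prompt and adding the privacy report to the context:

        def generate_detailed_report_sketch(space_id, code_files, error_503_user_message):
            # Assumed contract: {"status": "success", "report": str}
            #                or {"status": "error", "ui_message": str}
            messages = format_privacy_prompt(space_id, code_files)  # argument order assumed
            response = query_qwen_endpoint(messages)
            if response is None:
                return {"status": "error", "ui_message": "AI model query failed."}
            if response == ERROR_503_DICT:  # sentinel returned when the endpoint is scaled to zero
                return {"status": "error", "ui_message": error_503_user_message}
            return {"status": "success", "report": parse_qwen_response(response)}
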
        # --- Step 5: Fetch Model Descriptions (Placeholder/Optional) ---
        current_step += 1
        progress_desc = f"Step {current_step}/{total_steps}: Extracting model info..."
        progress(current_step / total_steps, desc=progress_desc)
        tldr_status_message = f"*{progress_desc}*"
        logging.info(progress_desc + " (Placeholder)")
        yield (
            gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
            gr.update(),
            gr.update(value="Extracting model info...", visible=True),
            gr.update(),
            gr.update(),
            gr.update(),
            gr.update(),
        )
        # model_ids = extract_hf_model_ids(code_files)  # utils function not imported
        # model_descriptions = get_model_descriptions(model_ids)  # utils function not imported
        # Add model_descriptions to context if needed for summary prompt later

        # --- Step 6: Generate Summary + Highlights Report (LLM Call 2) ---
        current_step += 1
        progress_desc = (
            f"Step {current_step}/{total_steps}: Generating summary (AI Call 2)..."
        )
        progress(current_step / total_steps, desc=progress_desc)
        tldr_status_message = f"*{progress_desc}*"
        yield (
            gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
            gr.update(),
            gr.update(value="Generating summary & highlights (AI Call 2)...", visible=True),
            gr.update(),
            gr.update(),
            gr.update(),
            gr.update(),
        )
        summary_result = generate_summary_report(
            space_id, code_files, privacy_report, ERROR_503_USER_MESSAGE
        )

        if (
            summary_result["status"] == "error_503_summary"
            or summary_result["status"] == "error_summary"
        ):
            progress(total_steps / total_steps, desc="Summary Report Error")
            # Display error in TLDR, show partial results below
            tldr_markdown_content = f"**Error:** {summary_result.get('ui_message', 'Failed during summary generation.')}"
            data_details_content = "*Data details may be incomplete.*"
            yield (
                gr.update(value=tldr_markdown_content, visible=True),
                gr.update(value=data_details_content, visible=True),
                gr.update(value=summary_result["ui_message"], visible=True),
                gr.update(value=privacy_report, visible=True),
                gr.update(visible=True, open=False),
                gr.update(visible=True, open=False),
                gr.update(visible=True, open=True),
            )
            return
        elif summary_result["status"] != "success":
            progress(total_steps / total_steps, desc="Summary Report Error")
            # Display error in TLDR, show partial results below
            tldr_markdown_content = f"**Error:** Unexpected error generating summary: {summary_result.get('ui_message', 'Unknown')}"
            data_details_content = "*Data details unavailable.*"
            yield (
                gr.update(value=tldr_markdown_content, visible=True),
                gr.update(value=data_details_content, visible=True),
                gr.update(
                    value=f"Unexpected error generating summary: {summary_result.get('ui_message', 'Unknown')}",
                    visible=True,
                ),
                gr.update(value=privacy_report, visible=True),
                gr.update(visible=True, open=False),
                gr.update(visible=True, open=False),
                gr.update(visible=True, open=True),
            )
            return

        summary_report = summary_result["report"]

        # Update UI with successful summary report before TLDR generation
        tldr_status_message = (
            f"*{progress_desc} - Success. Generating TLDR...*"  # Update status
        )
        data_details_content = "*Generating data details...*"
        yield (
            gr.update(value=tldr_status_message, visible=True),
            gr.update(value=data_details_content, visible=True),
            gr.update(value=summary_report, visible=True),
            gr.update(value=privacy_report, visible=True),
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=True),
        )

        # --- Step 7: Generate TLDR --- (New Step)
        current_step += 1
        progress_desc = f"Step {current_step}/{total_steps}: Generating TLDR summary..."
        progress(current_step / total_steps, desc=progress_desc)
        tldr_status_message = f"*{progress_desc}*"
        yield (
            gr.update(value=tldr_status_message, visible=True),
            gr.update(),
            gr.update(),
            gr.update(),
            gr.update(),
            gr.update(),
            gr.update(),
        )
        tldr_data = None  # Reset tldr_data before attempt
        try:
            # Call the combined helper function from analysis_utils
            tldr_data = generate_and_parse_tldr(privacy_report, summary_report)

            if tldr_data:
                logging.info(f"Successfully generated and parsed TLDR for {space_id}.")
                tldr_markdown_content = render_tldr_markdown(tldr_data, space_id)
                data_details_content = render_data_details_markdown(tldr_data)
            else:
                logging.warning(
                    f"Failed to generate or parse TLDR for {space_id}. Proceeding without it."
                )
                tldr_markdown_content = "*TLDR generation failed.*"
                data_details_content = "*Data details generation failed.*"
        except Exception as tldr_err:
            # This catch block might be redundant now if generate_and_parse_tldr handles its errors
            logging.error(
                f"Unexpected error during TLDR generation step call for {space_id}: {tldr_err}"
            )
            tldr_markdown_content = "*Error during TLDR generation step.*"
            data_details_content = "*Error generating data details.*"
            tldr_data = None  # Ensure it's None on error

        # Update UI including the generated (or failed) TLDR before upload
        yield (
            gr.update(value=tldr_markdown_content, visible=True),
            gr.update(value=data_details_content, visible=True),
            gr.update(),
            gr.update(),
            gr.update(visible=True, open=False),
            gr.update(),
            gr.update(),
        )

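generate_and_parse_tldr combines the third LLM call with JSON parsing; a sketch assuming a hypothetical format_tldr_prompt helper in prompts.py (the real prompt helper and its name are not shown in this diff):

        def generate_and_parse_tldr_sketch(privacy_report, summary_report):
            # Assumed contract: parsed dict on success, None on any failure.
            try:
                messages = format_tldr_prompt(privacy_report, summary_report)  # hypothetical
                raw = parse_qwen_response(query_qwen_endpoint(messages))
                # The model is asked for JSON; strip Markdown fences before parsing.
                raw = raw.strip().removeprefix("```json").removesuffix("```").strip()
                return json.loads(raw)
            except Exception as err:
                logging.error(f"TLDR generation or parsing failed: {err}")
                return None
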
        # --- Step 8: Upload to Cache --- (Old Step 7)
        current_step += 1
        progress_desc = f"Step {current_step}/{total_steps}: Uploading to cache..."
        progress(current_step / total_steps, desc=progress_desc)
        tldr_status_message = f"*{progress_desc}*"  # Display final action in TLDR field
        yield (
            gr.update(value=tldr_status_message, visible=True),
            gr.update(),
            gr.update(value="Uploading results to cache...", visible=True),
            gr.update(),
            gr.update(),
            gr.update(),
            gr.update(),
        )
        upload_needed = (
            cache_result["status"] != "cache_hit"
            and cache_result["status"] != "cache_error"
        )
        if upload_needed:
            # Call imported function, now passing tldr_data
            upload_result = upload_results(
                space_id,
                summary_report,
                privacy_report,
                DATASET_ID,
                HF_TOKEN,
                tldr_json_data=tldr_data,
            )
            if upload_result["status"] == "error":
                # Ensure logging uses f-string if adding step count here
                logging.error(
                    f"Cache upload failed: {upload_result.get('message', 'Unknown error')}"
                )
                # Non-critical, don't stop the UI, just log
            elif upload_result["status"] == "skipped":
                logging.info(f"Cache upload skipped: {upload_result.get('reason', '')}")
        else:
            logging.info(
                "Skipping cache upload as results were loaded from cache or cache check failed."
            )

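upload_results now persists three artifacts per Space; a sketch of what that might look like, using only the filename constants from utils.py and the HfApi/tempfile imports already at the top of this module (the body itself is an assumption):

        def upload_results_sketch(space_id, summary_report, privacy_report,
                                  dataset_id, hf_token, tldr_json_data=None):
            # Assumed contract: {"status": "success" | "error" | "skipped", ...}
            if not hf_token:
                return {"status": "skipped", "reason": "No HF token configured."}
            try:
                api = HfApi(token=hf_token)
                to_upload = {SUMMARY_FILENAME: summary_report, PRIVACY_FILENAME: privacy_report}
                if tldr_json_data is not None:
                    to_upload[TLDR_FILENAME] = json.dumps(tldr_json_data, indent=2)
                with tempfile.TemporaryDirectory() as tmp_dir:
                    for filename, content in to_upload.items():
                        local_path = os.path.join(tmp_dir, filename)
                        with open(local_path, "w", encoding="utf-8") as f:
                            f.write(content)
                        api.upload_file(
                            path_or_fileobj=local_path,
                            path_in_repo=f"{space_id}/{filename}",
                            repo_id=dataset_id,
                            repo_type="dataset",
                        )
                return {"status": "success"}
            except Exception as e:
                return {"status": "error", "message": str(e)}
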
        # Re-yield the full result set now that the upload step has finished
        # Yield 7 updates
        yield (
            gr.update(value=tldr_markdown_content, visible=True),
            gr.update(value=data_details_content, visible=True),
            gr.update(value=summary_report, visible=True),
            gr.update(value=privacy_report, visible=True),
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=False),
        )

        # --- Step 9: Final Update --- (Old Step 8)
        current_step += 1
        progress_desc = f"Step {current_step}/{total_steps}: Analysis Complete!"
        progress(current_step / total_steps, desc=progress_desc)
        logging.info(f"{progress_desc} Analysis complete for {space_id}.")
        # Yield final state again to ensure UI is correct after potential upload messages
        # Display final generated TLDR and Data Details
        yield (
            gr.update(value=tldr_markdown_content, visible=True),
            gr.update(value=data_details_content, visible=True),
            gr.update(value=summary_report, visible=True),
            gr.update(value=privacy_report, visible=True),
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=False),
        )

# --- Original Input Handling Wrapper (updated yields for initial errors) ---
def get_space_report_wrapper(
    selected_cached_space: str | None,
    new_space_id: str | None,
    progress=gr.Progress(track_tqdm=True),
):
    """
    Wrapper function to decide whether to fetch cache or run live analysis.
    Handles the logic based on Dropdown and Textbox inputs.
    Yields tuples of Gradio updates.
    """
    target_space_id = None
    source = "new"  # Assume new input unless the dropdown is chosen

    # Prioritize new_space_id if provided
    if new_space_id and new_space_id.strip():
        target_space_id = new_space_id.strip()
        if target_space_id == selected_cached_space:
            source = "dropdown_match"  # User typed an ID that exists in the dropdown
        else:
            source = "new"
    elif selected_cached_space:
        target_space_id = selected_cached_space
        source = "dropdown"

    if not target_space_id:
        # Yield 7 updates
        yield (
            gr.update(value="*Please provide a Space ID.*", visible=True),
            gr.update(value="", visible=False),
            gr.update(
                value="Please select an existing report or enter a new Space ID.",
                visible=True,
            ),
            gr.update(value="", visible=False),
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=False),
            gr.update(visible=False),
        )
        return

    if "/" not in target_space_id:
        # Yield 7 updates
        yield (
            gr.update(value="*Invalid Space ID format.*", visible=True),
            gr.update(value="", visible=False),
            gr.update(
                value=f"Invalid Space ID format: '{target_space_id}'. Use 'owner/name'.",
                visible=True,
            ),
            gr.update(value="", visible=False),
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=False),
            gr.update(visible=False),
        )
        return

    logging.info(f"Request received for: '{target_space_id}' (Source: {source})")

    if source == "dropdown":
        progress(0.1, desc="Fetching selected cached report...")
        # Yield 7 updates (initial placeholder)
        yield (
            gr.update(value="*Loading TLDR...*", visible=True),
            gr.update(value="*Loading data details...*", visible=True),
            gr.update(value="Fetching selected cached report...", visible=True),
            gr.update(value="", visible=True),
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=False),
            gr.update(visible=True, open=False),
        )
        cache_result = check_cache_and_download(target_space_id, DATASET_ID, HF_TOKEN)
        if cache_result["status"] == "cache_hit":
            logging.info(
                f"Successfully displayed cached reports for selected '{target_space_id}'."
            )
            progress(1.0, desc="Complete (from cache)")
            # Use the cached report text directly here; adding the cache message is done within the helper now.
            # Parse and render TLDR if available
            tldr_json_str = cache_result.get("tldr_json_str")
            rendered_tldr = "*TLDR not found in cache.*"
            rendered_data_details = "*Data details not found in cache.*"
            if tldr_json_str:
                try:
                    cached_tldr_data = json.loads(tldr_json_str)
                    rendered_tldr = render_tldr_markdown(
                        cached_tldr_data, target_space_id
                    )
                    rendered_data_details = render_data_details_markdown(
                        cached_tldr_data
                    )
                except Exception as parse_err:
                    logging.warning(
                        f"Failed to parse cached TLDR JSON for {target_space_id}: {parse_err}"
                    )
                    rendered_tldr = "*Error parsing cached TLDR.*"
                    rendered_data_details = (
                        "*Could not load data details due to parsing error.*"
                    )

            yield (
                gr.update(value=rendered_tldr, visible=True),
                gr.update(value=rendered_data_details, visible=True),
                gr.update(value=cache_result["summary"], visible=True),
                gr.update(value=cache_result["privacy"], visible=True),
                gr.update(visible=True, open=False),
                gr.update(visible=True, open=False),
                gr.update(visible=True, open=False),
            )
        else:  # Cache miss or error for a dropdown selection is an error state
            error_msg = cache_result.get(
                "ui_message",
                f"Failed to find or download cached report for selected '{target_space_id}'.",
            )
            logging.error(error_msg)
            progress(1.0, desc="Error")
            yield (
                gr.update(value="*TLDR load failed.*", visible=True),
                gr.update(value="*Data details load failed.*", visible=True),
                gr.update(value=error_msg, visible=True),
                gr.update(value="", visible=False),
                gr.update(visible=True, open=False),
                gr.update(visible=True, open=False),
                gr.update(visible=False),
            )
        return  # Stop after handling dropdown source

    # --- Live Analysis or Check Cache for New Input ---
    # If it came from the textbox OR was a dropdown match, run the full live analysis
    # pipeline, which includes its own cache check at the beginning.
    else:  # source == "new" or source == "dropdown_match"
        # Yield intermediate updates from the generator by iterating through it
        for update_tuple in _run_live_analysis(target_space_id, progress):
            yield update_tuple
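The input-resolution rules at the top of the wrapper can be summarized with a few concrete cases (the IDs are illustrative placeholders):

    # (selected_cached_space, new_space_id) -> (target_space_id, source)
    #   (None,        "owner/app") -> ("owner/app", "new")             # textbox wins
    #   ("owner/app", "owner/app") -> ("owner/app", "dropdown_match")  # typed ID matches dropdown
    #   ("owner/app", None)        -> ("owner/app", "dropdown")        # cached-report path
    #   (None,        None)        -> error yield asking for a Space ID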

# --- Load Initial Data Function (for demo.load) ---

    with gr.Row():
        with gr.Column(scale=1):  # Left column for inputs
            description_accordion = gr.Accordion(
                "What Privacy Questions do 🤗 Spaces Raise? Click here for Demo Description 👇",
                open=False,
                visible=True,
            )
            with description_accordion:
                gr.Markdown(DESCRIPTION)

            analyze_button = gr.Button("Get Space Report", variant="primary", scale=1)

        with gr.Column(scale=1):  # Right column for outputs
            # Define TLDR Markdown component first, always visible
            gr.Markdown("### Privacy TLDR 🕵️\n", visible=True)
            tldr_markdown = gr.Markdown(
                "*Select or enter a Space ID to get started.*", visible=True
            )

            # Define Accordions next, closed by default, visible
            data_types_accordion = gr.Accordion(
                "Data Types at Play", open=False, visible=True
            )
            with data_types_accordion:
                data_details_markdown = gr.Markdown("*Data details will appear here.*")

            summary_accordion = gr.Accordion(
                "Summary & Privacy Highlights",
                open=False,  # Changed to open=False
                visible=True,
            )
            privacy_accordion = gr.Accordion(
                "Detailed Privacy Analysis Report",
                open=False,  # Changed to open=False
                visible=True,
            )
            with summary_accordion:
                summary_markdown = gr.Markdown(

        fn=get_space_report_wrapper,
        inputs=[cached_spaces_dropdown, space_id_input],
        outputs=[
            tldr_markdown,
            data_details_markdown,  # Added data details output
            summary_markdown,
            privacy_markdown,
            data_types_accordion,  # Added data details accordion output
            summary_accordion,
            privacy_accordion,
        ],
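Every yield in the handlers wired here must therefore be a 7-tuple in exactly this component order; as a reading aid (derived from the outputs list above):

        # Yield position -> component in outputs=[...]:
        #   1 tldr_markdown        2 data_details_markdown   3 summary_markdown
        #   4 privacy_markdown     5 data_types_accordion    6 summary_accordion
        #   7 privacy_accordion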
llm_interface.py
CHANGED
@@ -79,6 +79,7 @@ def query_qwen_endpoint(
         return None  # Return None for other HTTP errors
     except Exception as e:
         logging.error(f"An unexpected error occurred querying Inference Endpoint: {e}")
+        print(f"An unexpected error occurred querying Inference Endpoint: {e}")
         return None

utils.py
CHANGED
@@ -80,6 +80,7 @@ MAX_MODEL_DESC_LENGTH = 1500

 SUMMARY_FILENAME = "summary_highlights.md"
 PRIVACY_FILENAME = "privacy_report.md"
+TLDR_FILENAME = "tldr_summary.json"


 def _is_relevant_file(filename):
@@ -367,7 +368,13 @@ def check_report_exists(space_id: str, dataset_id: str, hf_token: str | None) ->
 def download_cached_reports(
     space_id: str, dataset_id: str, hf_token: str | None
 ) -> dict[str, str]:
-    """Downloads cached reports from the dataset repo.
+    """Downloads cached reports (summary, privacy, tldr json) from the dataset repo.
+
+    Returns:
+        Dict containing report contents keyed by 'summary', 'privacy', 'tldr_json_str'.
+        Keys will be missing if a specific file is not found.
+        Raises error on critical download failures (repo not found, etc.).
+    """
     if not hf_token:
         raise ValueError("HF Token required to download cached reports.")

@@ -378,50 +385,95 @@ def download_cached_reports(
     # Define paths relative to dataset root for hf_hub_download
     summary_repo_path = f"{space_id}/{SUMMARY_FILENAME}"
     privacy_repo_path = f"{space_id}/{PRIVACY_FILENAME}"
+    tldr_repo_path = f"{space_id}/{TLDR_FILENAME}"  # Path for TLDR JSON
+
     try:
         # Download summary
+        try:
+            summary_path_local = hf_hub_download(
+                repo_id=dataset_id,
+                filename=summary_repo_path,
+                repo_type="dataset",
+                token=hf_token,
+            )
+            with open(summary_path_local, "r", encoding="utf-8") as f:
+                reports["summary"] = f.read()
+            logging.info(f"Successfully downloaded cached summary for {space_id}.")
+        except EntryNotFoundError:
+            logging.warning(
+                f"Cached summary file {summary_repo_path} not found for {space_id}."
+            )
+        except Exception as e_summary:
+            logging.error(
+                f"Error downloading cached summary for {space_id}: {e_summary}"
+            )
+            # Decide if this is critical - for now, we warn and continue

         # Download privacy report
+        try:
+            privacy_path_local = hf_hub_download(
+                repo_id=dataset_id,
+                filename=privacy_repo_path,
+                repo_type="dataset",
+                token=hf_token,
+            )
+            with open(privacy_path_local, "r", encoding="utf-8") as f:
+                reports["privacy"] = f.read()
+            logging.info(
+                f"Successfully downloaded cached privacy report for {space_id}."
+            )
+        except EntryNotFoundError:
+            logging.warning(
+                f"Cached privacy file {privacy_repo_path} not found for {space_id}."
+            )
+        except Exception as e_privacy:
+            logging.error(
+                f"Error downloading cached privacy report for {space_id}: {e_privacy}"
+            )
+            # Decide if this is critical - for now, we warn and continue
+
+        # Download TLDR JSON
+        try:
+            tldr_path_local = hf_hub_download(
+                repo_id=dataset_id,
+                filename=tldr_repo_path,
+                repo_type="dataset",
+                token=hf_token,
+            )
+            with open(tldr_path_local, "r", encoding="utf-8") as f:
+                reports["tldr_json_str"] = f.read()  # Store raw string content
+            logging.info(f"Successfully downloaded cached TLDR JSON for {space_id}.")
+        except EntryNotFoundError:
+            logging.warning(
+                f"Cached TLDR file {tldr_repo_path} not found for {space_id}."
+            )
+            # Don't treat TLDR absence as an error, just won't be in the dict
+        except Exception as e_tldr:
+            logging.error(
+                f"Error downloading cached TLDR JSON for {space_id}: {e_tldr}"
+            )
+            # Don't treat TLDR download error as critical, just won't be included
+
+        # Check if at least one report was downloaded successfully
+        if not reports.get("summary") and not reports.get("privacy"):
+            raise FileNotFoundError(
+                f"Failed to download *any* primary cache files (summary/privacy) for {space_id}"
+            )

         return reports

-    except EntryNotFoundError as e:
-        # More specific error based on which file failed
-        missing_file = (
-            summary_repo_path if summary_repo_path in str(e) else privacy_repo_path
-        )
-        logging.error(
-            f"Cache download error:
-        )
-        raise FileNotFoundError(
-        ) from e
-    except RepositoryNotFoundError as e:
-        logging.error(f"Cache download error: Dataset repo {dataset_id} not found. {e}")
-        raise FileNotFoundError(f"Dataset repo {dataset_id} not found") from e
-    except Exception as e:
-        logging.error(
-            f"Unexpected error downloading cached reports for {space_id} from {dataset_id}: {e}"
-        )
-        raise IOError(
+    except RepositoryNotFoundError as e_repo:
+        logging.error(
+            f"Cache download error: Dataset repo {dataset_id} not found. {e_repo}"
+        )
+        raise FileNotFoundError(f"Dataset repo {dataset_id} not found") from e_repo
+    except Exception as e_critical:  # Catch other potential critical errors
+        logging.error(
+            f"Unexpected critical error downloading cached reports for {space_id} from {dataset_id}: {e_critical}"
+        )
+        raise IOError(
+            f"Failed critically during cached report download for {space_id}"
+        ) from e_critical


 def upload_reports_to_dataset(
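A usage sketch for callers of the updated helper (the dataset and Space IDs are placeholders; check_cache_and_download in app.py is presumably one such caller):

    try:
        reports = download_cached_reports(
            "owner/some-space", "org/space-report-cache", os.environ.get("HF_TOKEN")
        )
        summary = reports.get("summary")         # may be absent
        tldr_raw = reports.get("tldr_json_str")  # absent for caches written before this commit
        tldr = json.loads(tldr_raw) if tldr_raw else None
    except FileNotFoundError:
        reports = None  # treat as a cache miss
    except (ValueError, IOError):
        reports = None  # treat as a cache error (missing token, repo unavailable, etc.)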