Yacine Jernite committed
Commit 36de078 · 1 Parent(s): d6d8868

added TLDR functionality

Files changed (4)
  1. analysis_utils.py +684 -0
  2. app.py +548 -349
  3. llm_interface.py +1 -0
  4. utils.py +86 -34
analysis_utils.py ADDED
@@ -0,0 +1,684 @@
+import json  # Added for TLDR JSON parsing
+import logging
+import os
+import tempfile
+
+from huggingface_hub import HfApi
+from huggingface_hub.inference._generated.types import \
+    ChatCompletionOutput  # Added for type hinting
+
+# Imports from other project modules
+from llm_interface import (ERROR_503_DICT, parse_qwen_response,
+                           query_qwen_endpoint)
+from prompts import format_privacy_prompt, format_summary_highlights_prompt
+from utils import (PRIVACY_FILENAME,  # Import constants for filenames
+                   SUMMARY_FILENAME, TLDR_FILENAME, check_report_exists,
+                   download_cached_reports, get_space_code_files)
+
+# Configure logging (can inherit from app.py if called from there, but good practice)
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+# Load environment variables - redundant if always called by app.py which already loads them
+# load_dotenv()
+
+# Constants needed by helper functions (can be passed as args too)
+# Consider passing these from app.py if they might change or for clarity
+CACHE_INFO_MSG = "\n\n*(Report retrieved from cache)*"
+TRUNCATION_WARNING = """**⚠️ Warning:** The input data (code and/or prior analysis) was too long for the AI model's context limit and had to be truncated. The analysis below may be incomplete or based on partial information.\n\n---\n\n"""
+
+# --- Constants for TLDR Generation ---
+TLDR_SYSTEM_PROMPT = (
+    "You are an AI assistant specialized in summarizing privacy analysis reports for Hugging Face Spaces. "
+    "You will receive two reports: a detailed privacy analysis and a summary/highlights report. "
+    "Based **only** on the content of these two reports, generate a concise JSON object containing a structured TLDR (Too Long; Didn't Read). "
+    "Do not use any information not present in the provided reports. "
+    "The JSON object must have the following keys:\n"
+    '- "app_description": A 1-2 sentence summary of what the application does from a user\'s perspective.\n'
+    '- "privacy_tldr": A 2-3 sentence high-level overview of privacy. Mention if the analysis was conclusive based on available code, if data processing is local, or if/what data goes to external services.\n'
+    '- "data_types": A list of JSON objects, where each object has two keys: \'name\' (a short, unique identifier string for the data type, e.g., "User Text") and \'description\' (a brief string explaining the data type in context, max 6-8 words, e.g., "Text prompt entered by the user").\n'
+    "- \"user_input_data\": A list of strings, where each string is the 'name' of a data type defined in 'data_types' that is provided by the user to the app.\n"
+    "- \"local_processing\": A list of strings describing data processed locally. Each string should start with the 'name' of a data type defined in 'data_types', followed by details (like the processing model) in parentheses if mentioned in the reports. Example: \"User Text (Local Model XYZ)\".\n"
+    "- \"remote_processing\": A list of strings describing data sent to remote services. Each string should start with the 'name' of a data type defined in 'data_types', followed by the service/model name in parentheses if mentioned in the reports. Example: \"User Text (HF Inference API)\".\n"
+    "- \"external_logging\": A list of strings describing data logged or saved externally. Each string should start with the 'name' of a data type defined in 'data_types', followed by the location/service in parentheses if mentioned. Example: \"User Text (External DB)\".\n"
+    "Ensure the output is **only** a valid JSON object, starting with `{` and ending with `}`. Ensure all listed data types in the processing/logging lists exactly match a 'name' defined in the 'data_types' list."
+)
+
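For reference, a response that satisfies this schema would be a single JSON object along the following lines; the values are invented for illustration (not part of this commit), but the key names and nesting come directly from the prompt above:

```python
# Illustrative TLDR object matching the schema requested by TLDR_SYSTEM_PROMPT.
# All values are invented; keys and structure follow the prompt exactly.
example_tldr = {
    "app_description": "Lets the user enter a text prompt and generates an image.",
    "privacy_tldr": (
        "The analysis was conclusive based on the available code. Prompts are "
        "sent to a remote inference service; no external logging was found."
    ),
    "data_types": [
        {"name": "User Text", "description": "Text prompt entered by the user"},
        {"name": "Generated Image", "description": "Image produced from the prompt"},
    ],
    "user_input_data": ["User Text"],
    "local_processing": [],
    "remote_processing": ["User Text (HF Inference API)"],
    "external_logging": [],
}
```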
+# --- Analysis Pipeline Helper Functions ---
+
+
+def check_cache_and_download(space_id: str, dataset_id: str, hf_token: str | None):
+    """Checks cache and downloads if reports exist."""
+    logging.info(f"Checking cache for '{space_id}'...")
+    found_in_cache = False
+    if hf_token:
+        try:
+            found_in_cache = check_report_exists(space_id, dataset_id, hf_token)
+        except Exception as e:
+            logging.warning(f"Cache check failed for {space_id}: {e}. Proceeding.")
+            # Return cache_miss even if check failed, proceed to live analysis
+            return {"status": "cache_miss", "error_message": f"Cache check failed: {e}"}
+
+    if found_in_cache:
+        logging.info(f"Cache hit for {space_id}. Downloading.")
+        try:
+            cached_reports = download_cached_reports(space_id, dataset_id, hf_token)
+            summary_report = (
+                cached_reports.get("summary", "Error: Cached summary not found.")
+                + CACHE_INFO_MSG
+            )
+            privacy_report = (
+                cached_reports.get("privacy", "Error: Cached privacy report not found.")
+                + CACHE_INFO_MSG
+            )
+            logging.info(f"Successfully downloaded cached reports for {space_id}.")
+            return {
+                "status": "cache_hit",
+                "summary": summary_report,
+                "privacy": privacy_report,
+                "tldr_json_str": cached_reports.get("tldr_json_str"),
+            }
+        except Exception as e:
+            error_msg = f"Cache download failed for {space_id}: {e}"
+            logging.warning(f"{error_msg}. Proceeding with live analysis.")
+            # Return error, but let caller decide if live analysis proceeds
+            return {"status": "cache_error", "ui_message": error_msg}
+    else:
+        logging.info(f"Cache miss for {space_id}. Performing live analysis.")
+        return {"status": "cache_miss"}
+
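A minimal sketch of how a caller can branch on the status contract above (the statuses and keys come from check_cache_and_download itself; show_reports and run_live_analysis are hypothetical stand-ins):

```python
result = check_cache_and_download("owner/space", "yjernite/spaces-privacy-reports", hf_token)
if result["status"] == "cache_hit":
    show_reports(result["summary"], result["privacy"])  # hypothetical display helper
elif result["status"] == "cache_error":
    logging.warning(result["ui_message"])  # download failed; fall back to a live run
    run_live_analysis()  # hypothetical
else:  # "cache_miss", possibly with "error_message" if the check itself failed
    run_live_analysis()  # hypothetical
```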
+def check_endpoint_status(
+    endpoint_name: str, hf_token: str | None, error_503_user_message: str
+):
+    """Checks the status of the inference endpoint."""
+    logging.info(f"Checking endpoint status for '{endpoint_name}'...")
+    if not hf_token:
+        # Allow proceeding if token missing, maybe endpoint is public
+        logging.warning("HF_TOKEN not set, cannot check endpoint status definitively.")
+        return {"status": "ready", "warning": "HF_TOKEN not set"}
+
+    try:
+        api = HfApi(token=hf_token)
+        endpoint = api.get_inference_endpoint(name=endpoint_name)
+        status = endpoint.status
+        logging.info(f"Endpoint '{endpoint_name}' status: {status}")
+
+        if status == "running":
+            return {"status": "ready"}
+        else:
+            logging.warning(
+                f"Endpoint '{endpoint_name}' is not ready (Status: {status})."
+            )
+            if status == "scaledToZero":
+                logging.info(
+                    f"Endpoint '{endpoint_name}' is scaled to zero. Attempting to resume..."
+                )
+                try:
+                    endpoint.resume()
+                    # Still return an error message suggesting retry, as resume takes time
+                    # Keep this message concise as the action is specific (wait)
+                    msg = f"**Endpoint Resuming:** The analysis endpoint ('{endpoint_name}') was scaled to zero and is now restarting.\n\n{error_503_user_message}"
+                    return {"status": "error", "ui_message": msg}
+                except Exception as resume_error:
+                    # Resume failed, provide detailed message
+                    logging.error(
+                        f"Failed to resume endpoint {endpoint_name}: {resume_error}"
+                    )
+                    # Construct detailed message including full explanation
+                    msg = f"**Endpoint Issue:** The analysis endpoint ('{endpoint_name}') is currently {status} and an attempt to resume it failed ({resume_error}).\n\n{error_503_user_message}"
+                    return {"status": "error", "ui_message": msg}
+            else:  # Paused, failed, pending, etc.
+                # Construct detailed message including full explanation
+                msg = f"**Endpoint Issue:** The analysis endpoint ('{endpoint_name}') status is currently <span style='color:red'>**{status}**</span>.\n\n{error_503_user_message}"
+                return {"status": "error", "ui_message": msg}
+
+    except Exception as e:
+        error_msg = f"Error checking analysis endpoint status for {endpoint_name}: {e}"
+        logging.error(error_msg)
+        # Let analysis stop if endpoint check fails critically
+        return {"status": "error", "ui_message": f"Error checking endpoint status: {e}"}
+
+
+def fetch_and_validate_code(space_id: str):
+    """Fetches and validates code files for the space."""
+    logging.info(f"Fetching code files for {space_id}...")
+    code_files = get_space_code_files(space_id)
+    if not code_files:
+        error_msg = f"Could not retrieve code files for '{space_id}'. Check ID and ensure it's a public Space."
+        logging.warning(error_msg)
+        return {
+            "status": "error",
+            "ui_message": f"**Error:**\n{error_msg}\nAnalysis Canceled.",
+        }
+    logging.info(f"Successfully fetched {len(code_files)} files for {space_id}.")
+    return {"status": "success", "code_files": code_files}
+
+
+def generate_detailed_report(
+    space_id: str, code_files: dict, error_503_user_message: str
+):
+    """Generates the detailed privacy report using the LLM."""
+    logging.info("Generating detailed privacy analysis report...")
+    privacy_prompt_messages, privacy_truncated = format_privacy_prompt(
+        space_id, code_files
+    )
+
+    privacy_api_response = query_qwen_endpoint(privacy_prompt_messages, max_tokens=3072)
+
+    if privacy_api_response == ERROR_503_DICT:
+        logging.warning("LLM Call 1 (Privacy) failed with 503.")
+        return {"status": "error", "ui_message": error_503_user_message}
+
+    detailed_privacy_report = parse_qwen_response(privacy_api_response)
+
+    if "Error:" in detailed_privacy_report:
+        error_msg = (
+            f"Failed to generate detailed privacy report: {detailed_privacy_report}"
+        )
+        logging.error(error_msg)
+        return {
+            "status": "error",
+            "ui_message": f"**Error Generating Detailed Privacy Report:**\n{detailed_privacy_report}\nAnalysis Halted.",
+        }
+
+    if privacy_truncated:
+        detailed_privacy_report = TRUNCATION_WARNING + detailed_privacy_report
+
+    logging.info("Successfully generated detailed privacy report.")
+    return {
+        "status": "success",
+        "report": detailed_privacy_report,
+        "truncated": privacy_truncated,
+    }
+
+
+def generate_summary_report(
+    space_id: str,
+    code_files: dict,
+    detailed_privacy_report: str,
+    error_503_user_message: str,
+):
+    """Generates the summary & highlights report using the LLM."""
+    logging.info("Generating summary and highlights report...")
+    # Remove potential truncation warning from detailed report before sending to next LLM
+    clean_detailed_report = detailed_privacy_report.replace(TRUNCATION_WARNING, "")
+
+    summary_highlights_prompt_messages, summary_truncated = (
+        format_summary_highlights_prompt(space_id, code_files, clean_detailed_report)
+    )
+
+    summary_highlights_api_response = query_qwen_endpoint(
+        summary_highlights_prompt_messages, max_tokens=2048
+    )
+
+    if summary_highlights_api_response == ERROR_503_DICT:
+        logging.warning("LLM Call 2 (Summary) failed with 503.")
+        # Return specific status to indicate partial success
+        return {"status": "error_503_summary", "ui_message": error_503_user_message}
+
+    summary_highlights_report = parse_qwen_response(summary_highlights_api_response)
+
+    if "Error:" in summary_highlights_report:
+        error_msg = (
+            f"Failed to generate summary/highlights report: {summary_highlights_report}"
+        )
+        logging.error(error_msg)
+        # Return specific status to indicate partial success
+        return {
+            "status": "error_summary",
+            "ui_message": f"**Error Generating Summary/Highlights:**\n{summary_highlights_report}",
+        }
+
+    if summary_truncated:
+        summary_highlights_report = TRUNCATION_WARNING + summary_highlights_report
+
+    logging.info("Successfully generated summary & highlights report.")
+    return {
+        "status": "success",
+        "report": summary_highlights_report,
+        "truncated": summary_truncated,
+    }
+
+
+def upload_results(
+    space_id: str,
+    summary_report: str,
+    detailed_report: str,
+    dataset_id: str,
+    hf_token: str | None,
+    tldr_json_data: dict | None = None,
+):
+    """Uploads the generated reports (Markdown and optional JSON TLDR) to the specified dataset repository."""
+    if not hf_token:
+        logging.warning("HF Token not provided, skipping dataset report upload.")
+        return {"status": "skipped", "reason": "HF_TOKEN not set"}
+    if "Error:" in detailed_report or "Error:" in summary_report:
+        msg = "Skipping cache upload due to errors in generated reports."
+        logging.warning(msg)
+        return {"status": "skipped", "reason": msg}
+
+    safe_space_id = space_id.replace("..", "")
+
+    try:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Define local paths
+            summary_path_local = os.path.join(tmpdir, SUMMARY_FILENAME)
+            privacy_path_local = os.path.join(tmpdir, PRIVACY_FILENAME)
+            tldr_json_path_local = os.path.join(tmpdir, TLDR_FILENAME)
+
+            # Write Markdown reports
+            with open(summary_path_local, "w", encoding="utf-8") as f:
+                f.write(summary_report)
+            with open(privacy_path_local, "w", encoding="utf-8") as f:
+                f.write(detailed_report)
+
+            # Prepare commit message
+            commit_message = f"Add analysis reports for Space: {safe_space_id}"
+            if tldr_json_data:
+                commit_message += " (including TLDR JSON)"
+                # Write JSON TLDR data if available
+                try:
+                    with open(tldr_json_path_local, "w", encoding="utf-8") as f:
+                        json.dump(tldr_json_data, f, indent=2, ensure_ascii=False)
+                    logging.info(
+                        f"Successfully wrote TLDR JSON locally for {safe_space_id}."
+                    )
+                except Exception as json_err:
+                    logging.error(
+                        f"Failed to write TLDR JSON locally for {safe_space_id}: {json_err}"
+                    )
+                    tldr_json_data = None  # Prevent upload attempt if writing failed
+
+            # Ensure repo exists
+            api = HfApi(token=hf_token)
+            repo_url = api.create_repo(
+                repo_id=dataset_id,
+                repo_type="dataset",
+                exist_ok=True,
+            )
+            logging.info(f"Ensured dataset repo {repo_url} exists.")
+
+            # Upload summary report
+            api.upload_file(
+                path_or_fileobj=summary_path_local,
+                path_in_repo=f"{safe_space_id}/{SUMMARY_FILENAME}",
+                repo_id=dataset_id,
+                repo_type="dataset",
+                commit_message=commit_message,
+            )
+            logging.info(f"Successfully uploaded summary report for {safe_space_id}.")
+
+            # Upload privacy report
+            api.upload_file(
+                path_or_fileobj=privacy_path_local,
+                path_in_repo=f"{safe_space_id}/{PRIVACY_FILENAME}",
+                repo_id=dataset_id,
+                repo_type="dataset",
+                commit_message=commit_message,
+            )
+            logging.info(
+                f"Successfully uploaded detailed privacy report for {safe_space_id}."
+            )
+            # print(f"Successfully uploaded detailed privacy report for {safe_space_id}.")  # Keep if needed for debug
+
+            # Upload JSON TLDR if it was successfully written locally
+            if tldr_json_data and os.path.exists(tldr_json_path_local):
+                api.upload_file(
+                    path_or_fileobj=tldr_json_path_local,
+                    path_in_repo=f"{safe_space_id}/{TLDR_FILENAME}",
+                    repo_id=dataset_id,
+                    repo_type="dataset",
+                    commit_message=commit_message,  # Can reuse commit message or make specific
+                )
+                logging.info(f"Successfully uploaded TLDR JSON for {safe_space_id}.")
+                print(f"Successfully uploaded TLDR JSON for {safe_space_id}.")
+
+            # Return success if all uploads finished without error
+            return {"status": "success"}
+
+    except Exception as e:
+        error_msg = f"Non-critical error during report upload for {safe_space_id}: {e}"
+        logging.error(error_msg)
+        print(error_msg)
+        return {"status": "error", "message": error_msg}
+
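An illustrative call under assumed inputs (summary_md, privacy_md, and tldr are placeholders for previously generated content); the reports are cached under a "safe_space_id/" prefix in the dataset repo, with the TLDR JSON alongside them:

```python
result = upload_results(
    space_id="owner/space",
    summary_report=summary_md,
    detailed_report=privacy_md,
    dataset_id="yjernite/spaces-privacy-reports",
    hf_token=os.environ.get("HF_TOKEN"),
    tldr_json_data=tldr,  # optional; omit to upload only the Markdown reports
)
assert result["status"] in {"success", "skipped", "error"}
```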
+
+# --- New TLDR Generation Functions ---
+
+
+def format_tldr_prompt(
+    detailed_report: str, summary_report: str
+) -> list[dict[str, str]]:
+    """Formats the prompt for the TLDR generation task."""
+    # Clean potential cache/truncation markers from input reports for the LLM
+    cleaned_detailed = detailed_report.replace(CACHE_INFO_MSG, "").replace(
+        TRUNCATION_WARNING, ""
+    )
+    cleaned_summary = summary_report.replace(CACHE_INFO_MSG, "").replace(
+        TRUNCATION_WARNING, ""
+    )
+
+    user_content = (
+        "Please generate a structured JSON TLDR based on the following reports:\n\n"
+        "--- DETAILED PRIVACY ANALYSIS REPORT START ---\n"
+        f"{cleaned_detailed}\n"
+        "--- DETAILED PRIVACY ANALYSIS REPORT END ---\n\n"
+        "--- SUMMARY & HIGHLIGHTS REPORT START ---\n"
+        f"{cleaned_summary}\n"
+        "--- SUMMARY & HIGHLIGHTS REPORT END ---"
+    )
+
+    # Note: We are not handling truncation here, assuming the input reports
+    # are already reasonably sized from the previous steps.
+    # If reports could be extremely long, add truncation logic similar to other format_* functions.
+
+    messages = [
+        {"role": "system", "content": TLDR_SYSTEM_PROMPT},
+        {"role": "user", "content": user_content},
+    ]
+    return messages
+
+
+def parse_tldr_json_response(
+    response: ChatCompletionOutput | dict | None,
+) -> dict | None:
+    """Parses the LLM response, expecting JSON content for the TLDR."""
+    if response is None:
+        logging.error("TLDR Generation: Failed to get response from LLM.")
+        return None
+
+    # Check for 503 error dict first
+    if isinstance(response, dict) and response.get("error_type") == "503":
+        logging.error(f"TLDR Generation: Received 503 error: {response.get('message')}")
+        return None  # Treat 503 as failure for this specific task
+
+    # --- Direct Content Extraction (Replaces call to parse_qwen_response) ---
+    raw_content = ""
+    try:
+        # Check if it's likely the expected ChatCompletionOutput structure
+        if not hasattr(response, "choices"):
+            logging.error(
+                f"TLDR Generation: Unexpected response type received: {type(response)}. Content: {response}"
+            )
+            return None  # Return None if not the expected structure
+
+        # Access the generated content according to the ChatCompletionOutput structure
+        if response.choices and len(response.choices) > 0:
+            content = response.choices[0].message.content
+            if content:
+                raw_content = content.strip()
+                logging.info(
+                    "TLDR Generation: Successfully extracted raw content from response."
+                )
+            else:
+                logging.warning(
+                    "TLDR Generation: Response received, but content is empty."
+                )
+                return None
+        else:
+            logging.warning("TLDR Generation: Response received, but no choices found.")
+            return None
+    except AttributeError as e:
+        # This might catch cases where response looks like the object but lacks expected attributes
+        logging.error(
+            f"TLDR Generation: Attribute error parsing response object: {e}. Response structure might be unexpected. Response: {response}"
+        )
+        return None
+    except Exception as e:
+        logging.error(
+            f"TLDR Generation: Unexpected error extracting content from response object: {e}"
+        )
+        return None
+    # --- End Direct Content Extraction ---
+
+    # --- JSON Parsing Logic ---
+    if not raw_content:  # Should be caught by checks above, but belts and suspenders
+        logging.error("TLDR Generation: Raw content is empty after extraction attempt.")
+        return None
+
+    try:
+        # Clean potential markdown code block formatting
+        if raw_content.strip().startswith("```json"):
+            raw_content = raw_content.strip()[7:-3].strip()
+        elif raw_content.strip().startswith("```"):
+            raw_content = raw_content.strip()[3:-3].strip()
+
+        tldr_data = json.loads(raw_content)
+
+        # Validate structure: Check if it's a dict and has all required keys
+        required_keys = [
+            "app_description",
+            "privacy_tldr",
+            "data_types",
+            "user_input_data",
+            "local_processing",
+            "remote_processing",
+            "external_logging",
+        ]
+        if not isinstance(tldr_data, dict):
+            logging.error(
+                f"TLDR Generation: Parsed content is not a dictionary. Content: {raw_content[:500]}..."
+            )
+            return None
+        if not all(key in tldr_data for key in required_keys):
+            missing_keys = [key for key in required_keys if key not in tldr_data]
+            logging.error(
+                f"TLDR Generation: Parsed JSON is missing required keys: {missing_keys}. Content: {raw_content[:500]}..."
+            )
+            return None
+
+        # --- Add validation for the new data_types structure ---
+        data_types_list = tldr_data.get("data_types")
+        if not isinstance(data_types_list, list):
+            logging.error(
+                f"TLDR Generation: 'data_types' is not a list. Content: {data_types_list}"
+            )
+            return None
+        for item in data_types_list:
+            if (
+                not isinstance(item, dict)
+                or "name" not in item
+                or "description" not in item
+            ):
+                logging.error(
+                    f"TLDR Generation: Invalid item found in 'data_types' list: {item}. Must be dict with 'name' and 'description'."
+                )
+                return None
+            if not isinstance(item["name"], str) or not isinstance(
+                item["description"], str
+            ):
+                logging.error(
+                    f"TLDR Generation: Invalid types for name/description in 'data_types' item: {item}. Must be strings."
+                )
+                return None
+        # --- End validation for data_types ---
+
+        # Basic validation for other lists (should contain strings)
+        validation_passed = True
+        for key in [
+            "user_input_data",
+            "local_processing",
+            "remote_processing",
+            "external_logging",
+        ]:
+            data_list = tldr_data.get(key)
+            # Add more detailed check and logging
+            if not isinstance(data_list, list):
+                logging.error(
+                    f"TLDR Generation Validation Error: Key '{key}' is not a list. Found type: {type(data_list)}, Value: {data_list}"
+                )
+                validation_passed = False
+                # Allow continuing validation for other keys, but mark as failed
+            elif not all(isinstance(x, str) for x in data_list):
+                # This check might be too strict if LLM includes non-strings, but keep for now
+                logging.warning(
+                    f"TLDR Generation Validation Warning: Not all items in list '{key}' are strings. Content: {data_list}"
+                )
+                # Decide if this should cause failure - currently it doesn't, just warns
+
+        if not validation_passed:
+            logging.error(
+                "TLDR Generation: Validation failed due to incorrect list types."
+            )
+            return None  # Ensure failure if any key wasn't a list
+
+        logging.info("Successfully parsed and validated TLDR JSON response.")
+        return tldr_data
+
+    except json.JSONDecodeError as e:
+        logging.error(
+            f"TLDR Generation: Failed to decode JSON response: {e}. Content: {raw_content[:500]}..."
+        )
+        return None
+    except Exception as e:
+        logging.error(f"TLDR Generation: Unexpected error parsing JSON response: {e}")
+        return None
+
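A small test-style sketch of the happy path, using types.SimpleNamespace to stand in for a ChatCompletionOutput object (an assumption made for illustration only); it also exercises the markdown-fence stripping above:

```python
import json
from types import SimpleNamespace

payload = {
    "app_description": "Captions images uploaded by the user.",
    "privacy_tldr": "Analysis was conclusive; images go to a remote API.",
    "data_types": [
        {"name": "User Image", "description": "Image uploaded by the user"}
    ],
    "user_input_data": ["User Image"],
    "local_processing": [],
    "remote_processing": ["User Image (HF Inference API)"],
    "external_logging": [],
}
# The model often wraps its JSON in a ```json fence; the parser strips it.
content = "```json\n" + json.dumps(payload) + "\n```"
fake_response = SimpleNamespace(
    choices=[SimpleNamespace(message=SimpleNamespace(content=content))]
)
assert parse_tldr_json_response(fake_response) == payload
```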
+
+def render_tldr_markdown(tldr_data: dict | None, space_id: str | None = None) -> str:
+    """Renders the top-level TLDR (description, privacy) data into a Markdown string.
+
+    (Does not include the data lists)
+    """
+    if not tldr_data:
+        # Return a more specific message for this part
+        return "*TLDR Summary could not be generated.*\n"
+
+    output = []
+
+    # Add Space link if space_id is provided
+    if space_id:
+        output.append(
+            f"**Source Space:** [`{space_id}`](https://huggingface.co/spaces/{space_id})\n"
+        )
+
+    output.append(f"**App Description:** {tldr_data.get('app_description', 'N/A')}\n")
+    privacy_summary = tldr_data.get("privacy_tldr", "N/A")
+    output.append(f"**Privacy TLDR:** {privacy_summary}")  # Removed extra newline
+
+    # Removed data list rendering from this function
+
+    return "\n".join(output)
+
+
+def render_data_details_markdown(tldr_data: dict | None) -> str:
+    """Renders the data lists (types, input, processing, logging) from TLDR data."""
+    if not tldr_data:
+        return "*Data details could not be generated.*\n"
+
+    output = []
+    # Get defined names for formatting
+    defined_names = sorted(
+        [
+            dt.get("name", "")
+            for dt in tldr_data.get("data_types", [])
+            if dt.get("name")
+        ],
+        key=len,
+        reverse=True,
+    )
+
+    output.append("**Data Types Defined:**")  # Renamed slightly for clarity
+    data_types = tldr_data.get("data_types")
+    if data_types and isinstance(data_types, list):
+        if not data_types:
+            output.append("- None identified.")
+        else:
+            for item in data_types:
+                name = item.get("name", "Unnamed")
+                desc = item.get("description", "No description")
+                output.append(f"- `{name}`: {desc}")
+    else:
+        output.append("- (Error loading data types)")
+    output.append("")  # Add newline for spacing
+
+    # Reusable helper for rendering lists
+    def render_list(title, key):
+        output.append(f"**{title}:**")
+        data_list = tldr_data.get(key)
+        if isinstance(data_list, list):
+            if not data_list:
+                output.append("- None identified.")
+            else:
+                for item_str in data_list:
+                    formatted_item = item_str  # Default
+                    found_match = False
+                    for name in defined_names:
+                        if item_str == name:
+                            formatted_item = f"`{name}`"
+                            found_match = True
+                            break
+                        elif item_str.startswith(name + " "):
+                            formatted_item = f"`{name}`{item_str[len(name):]}"
+                            found_match = True
+                            break
+                    if (
+                        not found_match
+                        and " " not in item_str
+                        and not item_str.startswith("`")
+                    ):
+                        formatted_item = f"`{item_str}`"
+                    output.append(f"- {formatted_item}")
+        else:
+            output.append("- (Error loading list)")
+        output.append("")
+
+    render_list("Data Sent by User to App", "user_input_data")
+    render_list("Data Processed Locally within App", "local_processing")
+    render_list("Data Processed Remotely", "remote_processing")
+    render_list("Data Logged/Saved Externally", "external_logging")
+
+    # Remove the last empty line
+    if output and output[-1] == "":
+        output.pop()
+
+    return "\n".join(output)
+
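Fed the example object from earlier, the two renderers produce markdown along these lines (a sketch; exact spacing follows from the code above):

```python
print(render_tldr_markdown(example_tldr, space_id="owner/space"))
# **Source Space:** [`owner/space`](https://huggingface.co/spaces/owner/space)
#
# **App Description:** Lets the user enter a text prompt and generates an image.
#
# **Privacy TLDR:** The analysis was conclusive based on the available code. ...

print(render_data_details_markdown(example_tldr))
# **Data Types Defined:**
# - `User Text`: Text prompt entered by the user
# - `Generated Image`: Image produced from the prompt
#
# **Data Sent by User to App:**
# - `User Text`
# ...
```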
+
+# --- Combined TLDR Generation Function ---
+
+
+def generate_and_parse_tldr(detailed_report: str, summary_report: str) -> dict | None:
+    """Formats prompt, queries LLM, and parses JSON response for TLDR.
+
+    Args:
+        detailed_report: The detailed privacy report content.
+        summary_report: The summary & highlights report content.
+
+    Returns:
+        A dictionary with the parsed TLDR data, or None if any step fails.
+    """
+    logging.info("Starting TLDR generation and parsing...")
+    try:
+        # Format
+        tldr_prompt_messages = format_tldr_prompt(detailed_report, summary_report)
+        if not tldr_prompt_messages:
+            logging.error("TLDR Generation: Failed to format prompt.")
+            return None
+
+        # Query (using existing import within analysis_utils)
+        # Use slightly smaller max_tokens
+        llm_response = query_qwen_endpoint(tldr_prompt_messages, max_tokens=1024)
+        if llm_response is None:  # Check if query itself failed critically
+            logging.error("TLDR Generation: LLM query returned None.")
+            return None
+        # 503 handled within parse function below
+
+        # Parse
+        parsed_data = parse_tldr_json_response(llm_response)
+        if parsed_data:
+            logging.info("Successfully generated and parsed TLDR.")
+            return parsed_data
+        else:
+            logging.error("TLDR Generation: Failed to parse JSON response.")
+            return None
+
+    except Exception as e:
+        logging.error(
+            f"TLDR Generation: Unexpected error in generate_and_parse_tldr: {e}",
+            exc_info=True,
+        )
+        return None
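Taken together, the helpers above form a pipeline. A rough sketch of how a caller like app.py chains them for one Space (error handling elided; the message strings are placeholders, not part of this commit):

```python
def analyze_space(space_id: str, dataset_id: str, hf_token: str | None) -> dict | None:
    cached = check_cache_and_download(space_id, dataset_id, hf_token)
    if cached["status"] == "cache_hit":
        return cached  # summary, privacy, and optional tldr_json_str

    code = fetch_and_validate_code(space_id)
    if code["status"] != "success":
        return None

    detailed = generate_detailed_report(space_id, code["code_files"], "Endpoint busy.")
    if detailed["status"] != "success":
        return None
    summary = generate_summary_report(
        space_id, code["code_files"], detailed["report"], "Endpoint busy."
    )
    if summary["status"] != "success":
        return None

    tldr = generate_and_parse_tldr(detailed["report"], summary["report"])  # may be None
    upload_results(
        space_id, summary["report"], detailed["report"], dataset_id, hf_token,
        tldr_json_data=tldr,
    )
    return {"summary": summary["report"], "privacy": detailed["report"], "tldr": tldr}
```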
app.py CHANGED
@@ -1,25 +1,37 @@
 import logging
 import os
 
 import gradio as gr
 from dotenv import load_dotenv
-
 from huggingface_hub import HfApi
 
-from llm_interface import ERROR_503_DICT  # Import error dict
-from llm_interface import parse_qwen_response, query_qwen_endpoint
 
-# Updated prompt imports for new order
-from prompts import format_privacy_prompt, format_summary_highlights_prompt
 
-# Import helper functions from other modules
-from utils import list_cached_spaces  # Added import
-from utils import (
-    check_report_exists,
-    download_cached_reports,
-    get_space_code_files,
-    upload_reports_to_dataset,
-)
 
 # Configure logging
 logging.basicConfig(
@@ -34,10 +46,13 @@ load_dotenv()
 HF_TOKEN = os.getenv("HF_TOKEN")
 ENDPOINT_NAME = "qwen2-5-coder-32b-instruct-pmf"
 DATASET_ID = "yjernite/spaces-privacy-reports"
-CACHE_INFO_MSG = "\n\n*(Report retrieved from cache)*"
 DEFAULT_SELECTION = "HuggingFaceTB/SmolVLM2"
 
-TRUNCATION_WARNING = """**⚠️ Warning:** The input data (code and/or prior analysis) was too long for the AI model's context limit and had to be truncated. The analysis below may be incomplete or based on partial information.\n\n---\n\n"""
 
 ERROR_503_USER_MESSAGE = """It appears that the analysis model endpoint is currently down or starting up.
 
@@ -49,419 +64,582 @@ You have a few options:
 """
 
 
-def get_space_report_wrapper(
-    selected_cached_space: str | None,
-    new_space_id: str | None,
-    progress=gr.Progress(track_tqdm=True),
-):
     """
-    Wrapper function to decide whether to fetch cache or run live analysis.
-    Handles the logic based on Dropdown and Textbox inputs.
     Yields tuples of Gradio updates.
     """
-    target_space_id = None
-    source = "new"  # Assume new input unless dropdown is chosen
 
-    # Prioritize new_space_id if provided
-    if new_space_id and new_space_id.strip():
-        target_space_id = new_space_id.strip()
-        if target_space_id == selected_cached_space:
-            source = "dropdown_match"  # User typed ID that exists in dropdown
-        else:
-            source = "new"
-    elif selected_cached_space:
-        target_space_id = selected_cached_space
-        source = "dropdown"
 
-    if not target_space_id:
-        # No input provided
-        return (
-            gr.update(
-                value="Please select an existing report or enter a new Space ID.",
-                visible=True,
-            ),
-            gr.update(value="", visible=False),
-            gr.update(visible=True, open=True),
-            gr.update(visible=False),
-        )
 
-    # Validate format
-    if "/" not in target_space_id:
-        return (
             gr.update(
-                value=f"Invalid Space ID format: '{target_space_id}'. Use 'owner/name'.",
                 visible=True,
             ),
-            gr.update(value="", visible=False),
-            gr.update(visible=True, open=True),
-            gr.update(visible=False),
-        )
-
-    logging.info(f"Request received for: '{target_space_id}' (Source: {source})")
-
-    # --- Cache Handling ---
-    # If the user explicitly selected from the dropdown, try to fetch it directly.
-    if source == "dropdown":
-        progress(
-            0.1, desc="Fetching cached report..."
-        )  # Simple progress for cache fetch
-        yield (
-            gr.update(value="Fetching selected cached report...", visible=True),
             gr.update(value="", visible=True),
-            gr.update(visible=True, open=True),
             gr.update(visible=True, open=False),
         )
-        try:
-            cached_reports = download_cached_reports(
-                target_space_id, DATASET_ID, HF_TOKEN
-            )
-            summary_report = (
-                cached_reports.get("summary", "Error: Cached summary not found.")
-                + CACHE_INFO_MSG
-            )
-            privacy_report = (
-                cached_reports.get("privacy", "Error: Cached privacy report not found.")
-                + CACHE_INFO_MSG
-            )
-            logging.info(
-                f"Successfully displayed cached reports for selected '{target_space_id}'."
-            )
-            progress(1.0, desc="Complete (from cache)")
-            yield (
-                gr.update(value=summary_report, visible=True),
-                gr.update(value=privacy_report, visible=True),
-                gr.update(visible=True, open=True),
-                gr.update(visible=True, open=True),
-            )
-        except Exception as e:
-            error_msg = f"Failed to download cached report for selected '{target_space_id}': {e}"
-            logging.error(error_msg)
-            progress(1.0, desc="Error")
-            yield (
-                gr.update(value=error_msg, visible=True),
-                gr.update(value="", visible=False),
-                gr.update(visible=True, open=True),
-                gr.update(visible=False),
-            )
-
-    # --- Live Analysis or Check Cache for New Input ---
-    # If it came from the textbox OR was a dropdown match, we first check cache, then run live.
-    else:  # source == "new" or source == "dropdown_match"
-        # This generator now performs the full analysis if needed
-        # Yield intermediate updates from the generator
-        # Important: Need to use a loop to consume the generator
-        final_update = None
-        for update_tuple in _run_live_analysis(target_space_id, progress):
-            yield update_tuple
-            final_update = update_tuple  # Keep track of the last update
-        yield final_update  # Return the very last state
-
 
-def _run_live_analysis(space_id: str, progress=gr.Progress(track_tqdm=True)):
-    """
-    Performs the full analysis pipeline: cache check, code fetch, LLM calls, upload.
-    Yields tuples of Gradio updates.
-    (This contains the logic previously in analyze_space_privacy, minus initial input handling)
-    """
-    steps = 8  # Steps for the full pipeline
-    privacy_truncated = False
-    summary_truncated = False
-
-    # --- Step 1: Check Cache --- (Check again for new/matched input)
-    progress(1 / steps, desc="Step 1/8: Checking cache...")
-    logging.info(f"Step 1/8: Checking cache for '{space_id}'...")
     yield (
-        gr.update(value="Checking cache for existing reports...", visible=True),
         gr.update(value="", visible=True),
-        gr.update(visible=True, open=True),
         gr.update(visible=True, open=False),
     )
-    found_in_cache = False
-    if HF_TOKEN:
-        try:
-            found_in_cache = check_report_exists(space_id, DATASET_ID, HF_TOKEN)
-        except Exception as e:
-            logging.warning(f"Cache check failed: {e}. Proceeding.")
-            yield (
-                gr.update(
-                    value="Cache check failed, proceeding with live analysis...",
-                    visible=True,
-                ),
-                gr.update(value="", visible=True),
-                gr.update(visible=True, open=True),
-                gr.update(visible=True, open=False),
-            )
 
-    if found_in_cache:
-        logging.info(f"Cache hit for {space_id}. Downloading.")
-        progress(2 / steps, desc="Step 2/8: Cache hit! Downloading reports...")
         yield (
-            gr.update(value="Cache hit! Downloading reports...", visible=True),
-            gr.update(value="", visible=True),
-            gr.update(visible=True, open=True),
-            gr.update(visible=True, open=False),
-        )
-        try:
-            cached_reports = download_cached_reports(space_id, DATASET_ID, HF_TOKEN)
-            summary_report = (
-                cached_reports.get("summary", "Error: Cached summary not found.")
-                + CACHE_INFO_MSG
-            )
-            privacy_report = (
-                cached_reports.get("privacy", "Error: Cached privacy report not found.")
-                + CACHE_INFO_MSG
-            )
-            logging.info(f"Successfully displayed cached reports for {space_id}.")
-            progress(8 / steps, desc="Complete (from cache)")
-            yield (
-                gr.update(value=summary_report, visible=True),
-                gr.update(value=privacy_report, visible=True),
-                gr.update(visible=True, open=True),
-                gr.update(visible=True, open=True),
-            )
-            return  # End generation here if cache successful
-        except Exception as e:
-            logging.warning(f"Cache download failed for {space_id}: {e}. Proceeding.")
-            yield (
-                gr.update(
-                    value="Cache download failed, proceeding with live analysis...",
-                    visible=True,
-                ),
-                gr.update(value="", visible=True),
-                gr.update(visible=True, open=True),
-                gr.update(visible=True, open=False),
-            )
-    else:
-        logging.info(f"Cache miss for {space_id}. Performing live analysis.")
-        yield (
-            gr.update(value="Cache miss. Fetching code...", visible=True),
-            gr.update(value="", visible=True),
-            gr.update(visible=True, open=True),
-            gr.update(visible=True, open=False),
-        )
 
-    # --- Step 2: Check Endpoint Status ---
-    progress(2 / steps, desc="Step 2/8: Checking endpoint status...")
-    logging.info("Step 2/8: Checking endpoint status...")
     yield (
-        gr.update(value="Checking whether model endpoint is active...", visible=True),
         gr.update(value="", visible=True),
-        gr.update(visible=True, open=True),
         gr.update(visible=True, open=False),
     )
 
-    endpoint_ready = False
-    if HF_TOKEN:
-        try:
-            api = HfApi(token=HF_TOKEN)
-            endpoint = api.get_inference_endpoint(name=ENDPOINT_NAME)
-            status = endpoint.status
-
-            logging.info(f"Endpoint '{ENDPOINT_NAME}' status: {status}")
-
-            if status == 'running':
-                endpoint_ready = True
-            else:
-                logging.warning(f"Endpoint '{ENDPOINT_NAME}' is not ready (Status: {status}).")
-                if status == 'scaledToZero':
-                    logging.info(f"Endpoint '{ENDPOINT_NAME}' is scaled to zero. Attempting to resume...")
-                    endpoint.resume()
-                msg_503 = f"**Full Service Temporarily Unavailable**: but you can **browse existing reports** or **check back later!**\n\n The status of the Qwen2.5-Coder-32B-Instruct endpoint powering the analysis is currently: <span style='color:red'>**{status}**</span>\n\n" + ERROR_503_USER_MESSAGE
-                yield (
-                    gr.update(value=msg_503, visible=True),
-                    gr.update(value="", visible=False),
-                    gr.update(visible=True, open=True),
-                    gr.update(visible=False)
-                )
-                return  # Stop analysis, user needs to retry
-        except Exception as e:
-            logging.error(f"Error checking endpoint status for {ENDPOINT_NAME}: {e}")
-            yield (
-                gr.update(value=f"Error checking analysis endpoint status: {e}", visible=True),
-                gr.update(value="", visible=False),
-                gr.update(visible=True, open=True),
-                gr.update(visible=False)
-            )
-            return  # Stop analysis
-
-    # --- Step 3: Fetch Code Files (if not cached) ---
-    progress(3 / steps, desc="Step 3/8: Fetching code files...")
-    logging.info("Step 3/8: Fetching code files...")
-    code_files = get_space_code_files(space_id)
-    if not code_files:
-        error_msg = f"Could not retrieve code files for '{space_id}'. Check ID and ensure it's a public Space."
-        logging.warning(error_msg)
         yield (
-            gr.update(value=f"**Error:**\n{error_msg}", visible=True),
             gr.update(value="Analysis Canceled", visible=True),
-            gr.update(visible=True, open=True),
             gr.update(visible=True, open=False),
         )
-        return  # End generation on error
 
     # --- Step 4: Generate DETAILED Privacy Report (LLM Call 1) ---
-    progress(
-        4 / steps, desc="Step 4/8: Generating detailed privacy report (AI Call 1)..."
     )
-    logging.info("Step 4/8: Generating detailed privacy analysis report...")
     yield (
-        gr.update(value="Generating detailed privacy report...", visible=True),
         gr.update(value="Generating detailed privacy report via AI...", visible=True),
-        gr.update(visible=True, open=True),
         gr.update(visible=True, open=True),
     )
-    privacy_prompt_messages, privacy_truncated = format_privacy_prompt(
-        space_id, code_files
     )
 
-    # --- Check for 503 after query ---
-    privacy_api_response = query_qwen_endpoint(privacy_prompt_messages, max_tokens=3072)
-    if privacy_api_response == ERROR_503_DICT:
-        logging.warning("LLM Call 1 failed with 503.")
     yield (
-        gr.update(
-            value=ERROR_503_USER_MESSAGE, visible=True
-        ),  # Show 503 message in summary area
-        gr.update(value="", visible=False),  # Clear privacy area
-        gr.update(visible=True, open=True),  # Keep summary open
-        gr.update(visible=False),  # Hide privacy accordion
     )
-    return  # Stop analysis
 
-    detailed_privacy_report = parse_qwen_response(privacy_api_response)
 
-    if "Error:" in detailed_privacy_report:
-        logging.error(
-            f"Failed to generate detailed privacy report: {detailed_privacy_report}"
     )
     yield (
-        gr.update(value="Analysis Halted due to Error", visible=True),
         gr.update(
-            value=f"**Error Generating Detailed Privacy Report:**\n{detailed_privacy_report}",
             visible=True,
         ),
-        gr.update(visible=True, open=True),
         gr.update(visible=True, open=True),
     )
-    return  # End generation on error
-    if privacy_truncated:
-        detailed_privacy_report = TRUNCATION_WARNING + detailed_privacy_report
 
     yield (
-        gr.update(value="Extracting model info...", visible=True),
-        gr.update(value=detailed_privacy_report, visible=True),
-        gr.update(visible=True, open=True),
         gr.update(visible=True, open=True),
     )
 
-    # --- Step 5: Fetch Model Descriptions ---
-    progress(5 / steps, desc="Step 5/8: Fetching model descriptions...")
-    logging.info("Step 5/8: Fetching model descriptions...")
     yield (
-        gr.update(value="Fetching model descriptions...", visible=True),
         gr.update(),
         gr.update(),
         gr.update(),
     )
-    # --- Step 6: Generate Summary + Highlights Report (LLM Call 2) ---
-    progress(6 / steps, desc="Step 6/8: Generating summary & highlights (AI Call 2)...")
-    logging.info("Step 6/8: Generating summary and highlights report...")
     yield (
-        gr.update(value="Generating summary & highlights via AI...", visible=True),
         gr.update(),
         gr.update(),
         gr.update(),
     )
-    summary_highlights_prompt_messages, summary_truncated = (
-        format_summary_highlights_prompt(space_id, code_files, detailed_privacy_report)
     )
 
-    # --- Check for 503 after query ---
-    summary_highlights_api_response = query_qwen_endpoint(
-        summary_highlights_prompt_messages, max_tokens=2048
     )
-    if summary_highlights_api_response == ERROR_503_DICT:
-        logging.warning("LLM Call 2 failed with 503.")
     yield (
         gr.update(
-            value=ERROR_503_USER_MESSAGE, visible=True
-        ),  # Show 503 message in summary area
-        gr.update(
-            value=detailed_privacy_report, visible=True
-        ),  # Keep previous report visible
-        gr.update(visible=True, open=True),  # Keep summary open
-        gr.update(visible=True, open=True),  # Keep privacy open
     )
-    return  # Stop analysis
-
-    summary_highlights_report = parse_qwen_response(summary_highlights_api_response)
 
-    if "Error:" in summary_highlights_report:
-        logging.error(
-            f"Failed to generate summary/highlights report: {summary_highlights_report}"
-        )
     yield (
         gr.update(
-            value=f"**Error Generating Summary/Highlights:**\n{summary_highlights_report}",
             visible=True,
         ),
-        gr.update(value=detailed_privacy_report, visible=True),
-        gr.update(visible=True, open=True),
-        gr.update(visible=True, open=True),
     )
-    return  # End generation on error
-    if summary_truncated:
-        summary_highlights_report = TRUNCATION_WARNING + summary_highlights_report
 
-    # Yield summary report before attempting upload
-    yield (
-        gr.update(value=summary_highlights_report, visible=True),
-        gr.update(value=detailed_privacy_report, visible=True),
-        gr.update(visible=True, open=True),
-        gr.update(visible=True, open=True),
-    )
 
-    # --- Step 7: Upload to Cache ---
-    progress(7 / steps, desc="Step 7/8: Uploading results to cache...")
-    logging.info("Step 7/8: Attempting to upload results to dataset cache...")
-    try:
-        if (
-            HF_TOKEN
-            and not found_in_cache
-            and "Error:" not in detailed_privacy_report
-            and "Error:" not in summary_highlights_report
-        ):
-            summary_to_save = summary_highlights_report.replace(
-                TRUNCATION_WARNING, ""
-            ).replace(CACHE_INFO_MSG, "")
-            privacy_to_save = detailed_privacy_report.replace(
-                TRUNCATION_WARNING, ""
-            ).replace(CACHE_INFO_MSG, "")
-            upload_reports_to_dataset(
-                space_id=space_id,
-                summary_report=summary_to_save,
-                detailed_report=privacy_to_save,
-                dataset_id=DATASET_ID,
-                hf_token=HF_TOKEN,
             )
-        elif not HF_TOKEN:
-            logging.warning("Skipping cache upload as HF_TOKEN is not set.")
-        elif found_in_cache:
-            logging.info("Skipping cache upload as results were loaded from cache.")
-    except Exception as e:
-        logging.error(f"Non-critical error during report upload: {e}")
 
-    logging.info("Step 8/8: Analysis complete.")
-    progress(8 / steps, desc="Step 8/8: Analysis Complete!")
 
-    # --- Step 8: Yield Final Results --- (Ensure final state is correct)
-    yield (
-        gr.update(value=summary_highlights_report, visible=True),
-        gr.update(value=detailed_privacy_report, visible=True),
-        gr.update(visible=True, open=True),
-        gr.update(visible=True, open=True),
-    )
 
 # --- Load Initial Data Function (for demo.load) ---
@@ -511,7 +689,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
        with gr.Column(scale=1):  # Left column for inputs
            description_accordion = gr.Accordion(
-                "What Privacy Questions do 🤗 Spaces Raise? Click here for Demo Description 👇", open=False, visible=True
            )
            with description_accordion:
                gr.Markdown(DESCRIPTION)
@@ -532,12 +712,28 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
        analyze_button = gr.Button("Get Space Report", variant="primary", scale=1)
 
    with gr.Column(scale=1):  # Right column for outputs
-        # Define Accordions first, open by default, hidden initially
        summary_accordion = gr.Accordion(
-            "Summary & Privacy Highlights", open=True, visible=True
        )
        privacy_accordion = gr.Accordion(
-            "Detailed Privacy Analysis Report", open=False, visible=True
        )
        with summary_accordion:
            summary_markdown = gr.Markdown(
@@ -559,8 +755,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
        fn=get_space_report_wrapper,
        inputs=[cached_spaces_dropdown, space_id_input],
        outputs=[
            summary_markdown,
            privacy_markdown,
            summary_accordion,
            privacy_accordion,
        ],
1
+ import json
2
  import logging
3
  import os
4
 
5
  import gradio as gr
6
  from dotenv import load_dotenv
 
7
  from huggingface_hub import HfApi
8
 
9
+ # Import analysis pipeline helpers
10
+ from analysis_utils import (check_cache_and_download, check_endpoint_status,
11
+ fetch_and_validate_code, format_tldr_prompt,
12
+ generate_and_parse_tldr, generate_detailed_report,
13
+ generate_summary_report, parse_tldr_json_response,
14
+ render_data_details_markdown, render_tldr_markdown,
15
+ upload_results)
16
+ # Import general utils
17
+ from utils import list_cached_spaces # Added import
18
+
19
+ # Removed LLM interface imports, handled by analysis_utils
20
+ # from llm_interface import ERROR_503_DICT
21
+ # from llm_interface import parse_qwen_response, query_qwen_endpoint
22
+
23
+ # Removed prompts import, handled by analysis_utils
24
+ # from prompts import format_privacy_prompt, format_summary_highlights_prompt
25
 
 
 
26
 
27
+
28
+ # Removed specific utils imports now handled via analysis_utils
29
+ # from utils import (
30
+ # check_report_exists,
31
+ # download_cached_reports,
32
+ # get_space_code_files,
33
+ # upload_reports_to_dataset,
34
+ # )
35
 
36
  # Configure logging
37
  logging.basicConfig(
 
46
  HF_TOKEN = os.getenv("HF_TOKEN")
47
  ENDPOINT_NAME = "qwen2-5-coder-32b-instruct-pmf"
48
  DATASET_ID = "yjernite/spaces-privacy-reports"
49
+ CACHE_INFO_MSG = (
50
+ "\n\n*(Report retrieved from cache)*" # Still needed for dropdown cache hit message
51
+ )
52
  DEFAULT_SELECTION = "HuggingFaceTB/SmolVLM2"
53
 
54
+ # TRUNCATION_WARNING now defined and used within analysis_utils
55
+ # TRUNCATION_WARNING = """**⚠️ Warning:** The input data (code and/or prior analysis) was too long for the AI model's context limit and had to be truncated. The analysis below may be incomplete or based on partial information.\n\n---\n\n"""
56
 
57
  ERROR_503_USER_MESSAGE = """It appears that the analysis model endpoint is currently down or starting up.
58
 
 
64
  """
65
 
66
 
67
+ def _run_live_analysis(space_id: str, progress=gr.Progress(track_tqdm=True)):
 
 
 
 
68
  """
69
+ Performs the full analysis pipeline using helper functions from analysis_utils.
 
70
  Yields tuples of Gradio updates.
71
  """
72
+ total_steps = 9 # Increased step count for TLDR generation
73
+ current_step = 0
74
+ summary_report = ""
75
+ privacy_report = ""
76
+ tldr_data = None
77
+ tldr_markdown_content = "*TLDR loading...*"
78
+ data_details_content = (
79
+ "*Data details loading...*" # Default message for new component
80
+ )
81
 
82
+ # Initial message before first step
83
+ tldr_status_message = "*Starting analysis...*"
 
 
 
 
 
 
 
 
84
 
85
+ # --- Step 1: Check Cache ---
86
+ current_step += 1
87
+ progress_desc = f"Step {current_step}/{total_steps}: Checking cache..."
88
+ progress(current_step / total_steps, desc=progress_desc)
89
+ tldr_status_message = f"*{progress_desc}*"
90
+ yield (
91
+ gr.update(value=tldr_status_message, visible=True), # TLDR shows progress
92
+ gr.update(value="*Checking cache...*", visible=True),
93
+ gr.update(value="Checking cache for existing reports...", visible=True),
94
+ gr.update(value="", visible=True),
95
+ gr.update(visible=True, open=False),
96
+ gr.update(visible=True, open=False),
97
+ gr.update(visible=True, open=False),
98
+ )
99
+ cache_result = check_cache_and_download(space_id, DATASET_ID, HF_TOKEN)
100
+
101
+ if cache_result["status"] == "cache_hit":
102
+ progress(total_steps / total_steps, desc="Complete (from cache)")
103
+ # Try to parse and render TLDR from cache
104
+ tldr_json_str = cache_result.get("tldr_json_str")
105
+ rendered_tldr = "*TLDR not found in cache.*"
106
+ if tldr_json_str:
107
+ try:
108
+ cached_tldr_data = json.loads(tldr_json_str)
109
+ # Render both parts
110
+ rendered_tldr = render_tldr_markdown(cached_tldr_data, space_id)
111
+ rendered_data_details = render_data_details_markdown(cached_tldr_data)
112
+ except Exception as parse_err:
113
+ logging.warning(
114
+ f"Failed to parse cached TLDR JSON for {space_id}: {parse_err}"
115
+ )
116
+ rendered_tldr = "*Error parsing cached TLDR.*"
117
+ rendered_data_details = (
118
+ "*Could not load data details due to parsing error.*"
119
+ )
120
 
121
+ yield (
122
+ gr.update(value=rendered_tldr, visible=True),
123
+ gr.update(value=rendered_data_details, visible=True),
124
+ gr.update(value=cache_result["summary"], visible=True),
125
+ gr.update(value=cache_result["privacy"], visible=True),
126
+ gr.update(visible=True, open=False),
127
+ gr.update(visible=True, open=False),
128
+ gr.update(visible=True, open=False),
129
+ )
130
+ return # End generation successfully from cache
131
+ elif cache_result["status"] == "cache_error":
132
+ # Display final error in TLDR field
133
+ tldr_status_message = (
134
+ f"*Cache download failed. {cache_result.get('ui_message', '')}*"
135
+ )
136
+ data_details_content = "*Data details unavailable due to cache error.*"
137
+ yield (
138
+ gr.update(value=tldr_status_message, visible=True),
139
+ gr.update(value=data_details_content, visible=True),
140
+ gr.update(value=cache_result["ui_message"], visible=True),
141
+ gr.update(value="", visible=True),
142
+ gr.update(visible=True, open=False),
143
+ gr.update(visible=True, open=False),
144
+ gr.update(visible=True, open=False),
145
+ )
146
+ # Still continue to live analysis if cache download fails
147
+ elif cache_result["status"] == "cache_miss":
148
+ tldr_status_message = f"*{progress_desc} - Cache miss.*" # Update status
149
+ data_details_content = "*Generating report...*"
150
+ yield (
151
+ gr.update(value=tldr_status_message, visible=True),
152
+ gr.update(value=data_details_content, visible=True),
153
+ gr.update(value="Cache miss. Starting live analysis...", visible=True),
154
+ gr.update(value="", visible=True),
155
+ gr.update(visible=True, open=False),
156
+ gr.update(visible=True, open=False),
157
+ gr.update(visible=True, open=False),
158
+ )
159
+ elif "error_message" in cache_result:
160
+ # Display final error in TLDR field
161
+ tldr_status_message = (
162
+ f"*Cache check failed. {cache_result.get('error_message', '')}*"
163
+ )
164
+ data_details_content = "*Data details unavailable due to cache error.*"
165
+ yield (
166
+ gr.update(value=tldr_status_message, visible=True),
167
+ gr.update(value=data_details_content, visible=True),
168
  gr.update(
169
+ value=f"Cache check failed: {cache_result.get('error_message', 'Unknown error')}. Proceeding with live analysis...",
170
  visible=True,
171
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  gr.update(value="", visible=True),
173
+ gr.update(visible=True, open=False),
174
+ gr.update(visible=True, open=False),
175
  gr.update(visible=True, open=False),
176
  )
177
+ # Still continue if cache check fails
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
+    # --- Step 2: Check Endpoint Status ---
+    current_step += 1
+    progress_desc = f"Step {current_step}/{total_steps}: Checking endpoint..."
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
     yield (
+        gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
+        gr.update(),
+        gr.update(value="Checking analysis model endpoint status...", visible=True),
         gr.update(value="", visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
         gr.update(visible=True, open=False),
     )
+    endpoint_result = check_endpoint_status(
+        ENDPOINT_NAME, HF_TOKEN, ERROR_503_USER_MESSAGE
+    )
 
+    if endpoint_result["status"] == "error":
+        progress(total_steps / total_steps, desc="Endpoint Error")
+        # Display final error in TLDR field
+        tldr_markdown_content = endpoint_result["ui_message"]
         yield (
+            gr.update(value=tldr_markdown_content, visible=True),
+            gr.update(value="", visible=False),
+            gr.update(value="", visible=False),
+            gr.update(value="", visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
         )
+        return
 
+    # --- Step 3: Fetch Code Files ---
+    current_step += 1
+    progress_desc = f"Step {current_step}/{total_steps}: Fetching code..."
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
     yield (
+        gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
+        gr.update(),
+        gr.update(value="Fetching code files from the Space...", visible=True),
         gr.update(value="", visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
         gr.update(visible=True, open=False),
     )
+    code_result = fetch_and_validate_code(space_id)
 
+    if code_result["status"] == "error":
+        progress(total_steps / total_steps, desc="Code Fetch Error")
+        # Display final error in TLDR field
+        tldr_markdown_content = (
+            f"**Error:** {code_result.get('ui_message', 'Failed to fetch code.')}"
+        )
         yield (
+            gr.update(value=tldr_markdown_content, visible=True),
+            gr.update(value="", visible=False),
+            gr.update(value="", visible=False),
             gr.update(value="Analysis Canceled", visible=True),
+            gr.update(visible=False),
+            gr.update(visible=False),
             gr.update(visible=True, open=False),
         )
+        return
+    code_files = code_result["code_files"]
 
     # --- Step 4: Generate DETAILED Privacy Report (LLM Call 1) ---
+    current_step += 1
+    progress_desc = (
+        f"Step {current_step}/{total_steps}: Generating privacy report (AI Call 1)..."
     )
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
     yield (
+        gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
+        gr.update(),
+        gr.update(
+            value="Generating detailed privacy report (AI Call 1)...", visible=True
+        ),
         gr.update(value="Generating detailed privacy report via AI...", visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
         gr.update(visible=True, open=True),
     )
+    privacy_result = generate_detailed_report(
+        space_id, code_files, ERROR_503_USER_MESSAGE
     )
 
+    if privacy_result["status"] == "error":
+        progress(total_steps / total_steps, desc="Privacy Report Error")
+        # Display final error in TLDR field
+        tldr_markdown_content = f"**Error:** {privacy_result.get('ui_message', 'Failed during detailed report generation.')}"
         yield (
+            gr.update(value=tldr_markdown_content, visible=True),
+            gr.update(value="", visible=False),
+            gr.update(value="", visible=False),
+            gr.update(value="", visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
         )
+        return
+    privacy_report = privacy_result["report"]
+
+    # Update UI with successful detailed report
+    yield (
+        gr.update(value=tldr_status_message, visible=True),  # Still show progress
+        gr.update(),
+        gr.update(
+            value="Detailed privacy report generated. Proceeding...", visible=True
+        ),
+        gr.update(value=privacy_report, visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=True),
+    )
 
+    # --- Step 5: Fetch Model Descriptions (Placeholder/Optional) ---
+    current_step += 1
+    progress_desc = f"Step {current_step}/{total_steps}: Extracting model info..."
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
+    logging.info(progress_desc + " (Placeholder)")
+    yield (
+        gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
+        gr.update(),
+        gr.update(value="Extracting model info...", visible=True),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+    )
+    # model_ids = extract_hf_model_ids(code_files)  # utils function not imported
+    # model_descriptions = get_model_descriptions(model_ids)  # utils function not imported
+    # Add model_descriptions to context if needed for summary prompt later
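+    # (Enabling the two commented-out calls above would also require importing
+    # extract_hf_model_ids and get_model_descriptions from utils.)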
 
+    # --- Step 6: Generate Summary + Highlights Report (LLM Call 2) ---
+    current_step += 1
+    progress_desc = (
+        f"Step {current_step}/{total_steps}: Generating summary (AI Call 2)..."
+    )
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
+    yield (
+        gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
+        gr.update(),
+        gr.update(value="Generating summary & highlights (AI Call 2)...", visible=True),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+    )
+    summary_result = generate_summary_report(
+        space_id, code_files, privacy_report, ERROR_503_USER_MESSAGE
+    )
+
+    if (
+        summary_result["status"] == "error_503_summary"
+        or summary_result["status"] == "error_summary"
+    ):
+        progress(total_steps / total_steps, desc="Summary Report Error")
+        # Display error in TLDR, show partial results below
+        tldr_markdown_content = f"**Error:** {summary_result.get('ui_message', 'Failed during summary generation.')}"
+        data_details_content = "*Data details may be incomplete.*"
+        yield (
+            gr.update(value=tldr_markdown_content, visible=True),
+            gr.update(value=data_details_content, visible=True),
+            gr.update(value=summary_result["ui_message"], visible=True),
+            gr.update(value=privacy_report, visible=True),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=True),
         )
+        return
+    elif summary_result["status"] != "success":
+        progress(total_steps / total_steps, desc="Summary Report Error")
+        # Display error in TLDR, show partial results below
+        tldr_markdown_content = f"**Error:** Unexpected error generating summary: {summary_result.get('ui_message', 'Unknown')}"
+        data_details_content = "*Data details unavailable.*"
         yield (
+            gr.update(value=tldr_markdown_content, visible=True),
+            gr.update(value=data_details_content, visible=True),
             gr.update(
+                value=f"Unexpected error generating summary: {summary_result.get('ui_message', 'Unknown')}",
                 visible=True,
             ),
+            gr.update(value=privacy_report, visible=True),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
             gr.update(visible=True, open=True),
         )
+        return
+
+    summary_report = summary_result["report"]
 
+    # Update UI with successful summary report before TLDR generation
+    tldr_status_message = (
+        f"*{progress_desc} - Success. Generating TLDR...*"  # Update status
+    )
+    data_details_content = "*Generating data details...*"
     yield (
+        gr.update(value=tldr_status_message, visible=True),
+        gr.update(value=data_details_content, visible=True),
+        gr.update(value=summary_report, visible=True),
+        gr.update(value=privacy_report, visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
         gr.update(visible=True, open=True),
     )
 
+    # --- Step 7: Generate TLDR --- (New Step)
+    current_step += 1
+    progress_desc = f"Step {current_step}/{total_steps}: Generating TLDR summary..."
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
+    yield (
+        gr.update(value=tldr_status_message, visible=True),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+    )
+    tldr_data = None  # Reset tldr_data before attempt
+    try:
+        # Call the combined helper function from analysis_utils
+        tldr_data = generate_and_parse_tldr(privacy_report, summary_report)
+
+        if tldr_data:
+            logging.info(f"Successfully generated and parsed TLDR for {space_id}.")
+            tldr_markdown_content = render_tldr_markdown(tldr_data, space_id)
+            data_details_content = render_data_details_markdown(tldr_data)
+        else:
+            logging.warning(
+                f"Failed to generate or parse TLDR for {space_id}. Proceeding without it."
+            )
+            tldr_markdown_content = "*TLDR generation failed.*"
+            data_details_content = "*Data details generation failed.*"
+    except Exception as tldr_err:
+        # This catch block might be redundant now if generate_and_parse_tldr handles its errors
+        logging.error(
+            f"Unexpected error during TLDR generation step call for {space_id}: {tldr_err}"
+        )
+        tldr_markdown_content = "*Error during TLDR generation step.*"
+        data_details_content = "*Error generating data details.*"
+        tldr_data = None  # Ensure it's None on error
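+    # tldr_data (possibly None) is passed to upload_results in Step 8 as
+    # tldr_json_data, so a failed TLDR does not prevent the reports from being cached.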
 
+    # Update UI including the generated (or failed) TLDR before upload
     yield (
+        gr.update(value=tldr_markdown_content, visible=True),
+        gr.update(value=data_details_content, visible=True),
         gr.update(),
         gr.update(),
+        gr.update(visible=True, open=False),
+        gr.update(),
         gr.update(),
     )
+
+    # --- Step 8: Upload to Cache --- (Old Step 7)
+    current_step += 1
+    progress_desc = f"Step {current_step}/{total_steps}: Uploading to cache..."
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"  # Display final action in TLDR field
     yield (
+        gr.update(value=tldr_status_message, visible=True),
+        gr.update(),
+        gr.update(value="Uploading results to cache...", visible=True),
+        gr.update(),
         gr.update(),
         gr.update(),
         gr.update(),
     )
+    upload_needed = (
+        cache_result["status"] != "cache_hit"
+        and cache_result["status"] != "cache_error"
     )
+    if upload_needed:
+        # Call imported function, now passing tldr_data
+        upload_result = upload_results(
+            space_id,
+            summary_report,
+            privacy_report,
+            DATASET_ID,
+            HF_TOKEN,
+            tldr_json_data=tldr_data,
+        )
+        if upload_result["status"] == "error":
+            # Ensure logging uses f-string if adding step count here
+            logging.error(
+                f"Cache upload failed: {upload_result.get('message', 'Unknown error')}"
+            )
+            # Non-critical, don't stop the UI, just log
+        elif upload_result["status"] == "skipped":
+            logging.info(f"Cache upload skipped: {upload_result.get('reason', '')}")
+    else:
+        logging.info(
+            "Skipping cache upload as results were loaded from cache or cache check failed."
+        )
 
+    # Update UI with the final TLDR and data details after the upload step
+    # Yield 7 updates
+    yield (
+        gr.update(value=tldr_markdown_content, visible=True),
+        gr.update(value=data_details_content, visible=True),
+        gr.update(value=summary_report, visible=True),
+        gr.update(value=privacy_report, visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
+    )
+
+    # --- Step 9: Final Update --- (Old Step 8)
+    current_step += 1
+    progress_desc = f"Step {current_step}/{total_steps}: Analysis Complete!"
+    progress(current_step / total_steps, desc=progress_desc)
+    logging.info(progress_desc + f" Analysis complete for {space_id}.")
+    # Yield final state again to ensure UI is correct after potential upload messages
+    # Display final generated TLDR and Data Details
+    yield (
+        gr.update(value=tldr_markdown_content, visible=True),
+        gr.update(value=data_details_content, visible=True),
+        gr.update(value=summary_report, visible=True),
+        gr.update(value=privacy_report, visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
     )
+
+
+# --- Original Input Handling Wrapper (updated yields for initial errors) ---
+def get_space_report_wrapper(
+    selected_cached_space: str | None,
+    new_space_id: str | None,
+    progress=gr.Progress(track_tqdm=True),
+):
+    """
+    Wrapper function to decide whether to fetch cache or run live analysis.
+    Handles the logic based on Dropdown and Textbox inputs.
+    Yields tuples of Gradio updates.
+    """
+    target_space_id = None
+    source = "new"  # Assume new input unless dropdown is chosen
+
+    # Prioritize new_space_id if provided
+    if new_space_id and new_space_id.strip():
+        target_space_id = new_space_id.strip()
+        if target_space_id == selected_cached_space:
+            source = "dropdown_match"  # User typed ID that exists in dropdown
+        else:
+            source = "new"
+    elif selected_cached_space:
+        target_space_id = selected_cached_space
+        source = "dropdown"
+
+    if not target_space_id:
+        # Yield 7 updates
         yield (
+            gr.update(value="*Please provide a Space ID.*", visible=True),
+            gr.update(value="", visible=False),
             gr.update(
+                value="Please select an existing report or enter a new Space ID.",
+                visible=True,
+            ),
+            gr.update(value="", visible=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=False),
         )
+        return
 
+    if "/" not in target_space_id:
+        # Yield 7 updates
         yield (
+            gr.update(value="*Invalid Space ID format.*", visible=True),
+            gr.update(value="", visible=False),
             gr.update(
+                value=f"Invalid Space ID format: '{target_space_id}'. Use 'owner/name'.",
                 visible=True,
             ),
+            gr.update(value="", visible=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=False),
         )
+        return
 
+    logging.info(f"Request received for: '{target_space_id}' (Source: {source})")
 
+    if source == "dropdown":
+        progress(0.1, desc="Fetching selected cached report...")
+        # Yield 7 updates (initial placeholder)
+        yield (
+            gr.update(value="*Loading TLDR...*", visible=True),
+            gr.update(value="*Loading data details...*", visible=True),
+            gr.update(value="Fetching selected cached report...", visible=True),
+            gr.update(value="", visible=True),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+        )
+        cache_result = check_cache_and_download(target_space_id, DATASET_ID, HF_TOKEN)
+        if cache_result["status"] == "cache_hit":
+            logging.info(
+                f"Successfully displayed cached reports for selected '{target_space_id}'."
             )
+            progress(1.0, desc="Complete (from cache)")
+            # Use the cached report text directly here, adding the cache message is done within the helper now.
+            # Parse and render TLDR if available
+            tldr_json_str = cache_result.get("tldr_json_str")
+            rendered_tldr = "*TLDR not found in cache.*"
+            rendered_data_details = "*Data details not found in cache.*"
+            if tldr_json_str:
+                try:
+                    cached_tldr_data = json.loads(tldr_json_str)
+                    rendered_tldr = render_tldr_markdown(
+                        cached_tldr_data, target_space_id
+                    )
+                    rendered_data_details = render_data_details_markdown(
+                        cached_tldr_data
+                    )
+                except Exception as parse_err:
+                    logging.warning(
+                        f"Failed to parse cached TLDR JSON for {target_space_id}: {parse_err}"
+                    )
+                    rendered_tldr = "*Error parsing cached TLDR.*"
+                    rendered_data_details = (
+                        "*Could not load data details due to parsing error.*"
+                    )
 
+            yield (
+                gr.update(value=rendered_tldr, visible=True),
+                gr.update(value=rendered_data_details, visible=True),
+                gr.update(value=cache_result["summary"], visible=True),
+                gr.update(value=cache_result["privacy"], visible=True),
+                gr.update(visible=True, open=False),
+                gr.update(visible=True, open=False),
+                gr.update(visible=True, open=False),
+            )
+        else:  # Cache miss or error for a dropdown selection is an error state
+            error_msg = cache_result.get(
+                "ui_message",
+                f"Failed to find or download cached report for selected '{target_space_id}'.",
+            )
+            logging.error(error_msg)
+            progress(1.0, desc="Error")
+            yield (
+                gr.update(value="*TLDR load failed.*", visible=True),
+                gr.update(value="*Data details load failed.*", visible=True),
+                gr.update(value=error_msg, visible=True),
+                gr.update(value="", visible=False),
+                gr.update(visible=True, open=False),
+                gr.update(visible=True, open=False),
+                gr.update(visible=False),
+            )
+        return  # Stop after handling dropdown source
 
+    # --- Live Analysis or Check Cache for New Input ---
+    # If it came from the textbox OR was a dropdown match, run the full live analysis pipeline
+    # which includes its own cache check at the beginning.
+    else:  # source == "new" or source == "dropdown_match"
+        # Yield intermediate updates from the generator by iterating through it
+        for update_tuple in _run_live_analysis(target_space_id, progress):
+            yield update_tuple
 
 
 # --- Load Initial Data Function (for demo.load) ---
 
     with gr.Row():
         with gr.Column(scale=1):  # Left column for inputs
             description_accordion = gr.Accordion(
+                "What Privacy Questions do 🤗 Spaces Raise? Click here for Demo Description 👇",
+                open=False,
+                visible=True,
             )
             with description_accordion:
                 gr.Markdown(DESCRIPTION)
 
             analyze_button = gr.Button("Get Space Report", variant="primary", scale=1)
 
         with gr.Column(scale=1):  # Right column for outputs
+            # Define TLDR Markdown component first, always visible
+            gr.Markdown("### Privacy TLDR 🕵️\n", visible=True)
+            tldr_markdown = gr.Markdown(
+                "*Select or enter a Space ID to get started.*", visible=True
+            )
+
+            # Define Accordions next, closed by default, visible
+            data_types_accordion = gr.Accordion(
+                "Data Types at Play", open=False, visible=True
+            )
+            with data_types_accordion:
+                data_details_markdown = gr.Markdown("*Data details will appear here.*")
+
             summary_accordion = gr.Accordion(
+                "Summary & Privacy Highlights",
+                open=False,  # Changed to open=False
+                visible=True,
             )
             privacy_accordion = gr.Accordion(
+                "Detailed Privacy Analysis Report",
+                open=False,  # Changed to open=False
+                visible=True,
             )
             with summary_accordion:
                 summary_markdown = gr.Markdown(
 
         fn=get_space_report_wrapper,
         inputs=[cached_spaces_dropdown, space_id_input],
         outputs=[
+            tldr_markdown,
+            data_details_markdown,  # Added data details output
             summary_markdown,
             privacy_markdown,
+            data_types_accordion,  # Added data details accordion output
             summary_accordion,
             privacy_accordion,
         ],
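+        # NOTE: the order of outputs above must match the 7-tuples yielded by
+        # get_space_report_wrapper / _run_live_analysis.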
llm_interface.py CHANGED
@@ -79,6 +79,7 @@ def query_qwen_endpoint(
         return None  # Return None for other HTTP errors
     except Exception as e:
         logging.error(f"An unexpected error occurred querying Inference Endpoint: {e}")
+        print(f"An unexpected error occurred querying Inference Endpoint: {e}")
         return None
 
 
utils.py CHANGED
@@ -80,6 +80,7 @@ MAX_MODEL_DESC_LENGTH = 1500
 
 SUMMARY_FILENAME = "summary_highlights.md"
 PRIVACY_FILENAME = "privacy_report.md"
+TLDR_FILENAME = "tldr_summary.json"
 
 
 def _is_relevant_file(filename):
@@ -367,7 +368,13 @@ def check_report_exists(space_id: str, dataset_id: str, hf_token: str | None) ->
 def download_cached_reports(
     space_id: str, dataset_id: str, hf_token: str | None
 ) -> dict[str, str]:
-    """Downloads cached reports from the dataset repo. Raises error on failure."""
+    """Downloads cached reports (summary, privacy, tldr json) from the dataset repo.
+
+    Returns:
+        Dict containing report contents keyed by 'summary', 'privacy', 'tldr_json_str'.
+        Keys will be missing if a specific file is not found.
+        Raises error on critical download failures (repo not found, etc.).
+    """
     if not hf_token:
         raise ValueError("HF Token required to download cached reports.")
 
@@ -378,50 +385,95 @@ def download_cached_reports(
     # Define paths relative to dataset root for hf_hub_download
     summary_repo_path = f"{space_id}/{SUMMARY_FILENAME}"
     privacy_repo_path = f"{space_id}/{PRIVACY_FILENAME}"
+    tldr_repo_path = f"{space_id}/{TLDR_FILENAME}"  # Path for TLDR JSON
+
     try:
         # Download summary
-        summary_path_local = hf_hub_download(
-            repo_id=dataset_id,
-            filename=summary_repo_path,
-            repo_type="dataset",
-            token=hf_token,
-        )
-        with open(summary_path_local, "r", encoding="utf-8") as f:
-            reports["summary"] = f.read()
-        logging.info(f"Successfully downloaded cached summary for {space_id}.")
+        try:
+            summary_path_local = hf_hub_download(
+                repo_id=dataset_id,
+                filename=summary_repo_path,
+                repo_type="dataset",
+                token=hf_token,
+            )
+            with open(summary_path_local, "r", encoding="utf-8") as f:
+                reports["summary"] = f.read()
+            logging.info(f"Successfully downloaded cached summary for {space_id}.")
+        except EntryNotFoundError:
+            logging.warning(
+                f"Cached summary file {summary_repo_path} not found for {space_id}."
+            )
+        except Exception as e_summary:
+            logging.error(
+                f"Error downloading cached summary for {space_id}: {e_summary}"
+            )
+            # Decide if this is critical - for now, we warn and continue
 
         # Download privacy report
-        privacy_path_local = hf_hub_download(
-            repo_id=dataset_id,
-            filename=privacy_repo_path,
-            repo_type="dataset",
-            token=hf_token,
-        )
-        with open(privacy_path_local, "r", encoding="utf-8") as f:
-            reports["privacy"] = f.read()
-        logging.info(f"Successfully downloaded cached privacy report for {space_id}.")
+        try:
+            privacy_path_local = hf_hub_download(
+                repo_id=dataset_id,
+                filename=privacy_repo_path,
+                repo_type="dataset",
+                token=hf_token,
+            )
+            with open(privacy_path_local, "r", encoding="utf-8") as f:
+                reports["privacy"] = f.read()
+            logging.info(
+                f"Successfully downloaded cached privacy report for {space_id}."
+            )
+        except EntryNotFoundError:
+            logging.warning(
+                f"Cached privacy file {privacy_repo_path} not found for {space_id}."
+            )
+        except Exception as e_privacy:
+            logging.error(
+                f"Error downloading cached privacy report for {space_id}: {e_privacy}"
+            )
+            # Decide if this is critical - for now, we warn and continue
+
+        # Download TLDR JSON
+        try:
+            tldr_path_local = hf_hub_download(
+                repo_id=dataset_id,
+                filename=tldr_repo_path,
+                repo_type="dataset",
+                token=hf_token,
+            )
+            with open(tldr_path_local, "r", encoding="utf-8") as f:
+                reports["tldr_json_str"] = f.read()  # Store raw string content
+            logging.info(f"Successfully downloaded cached TLDR JSON for {space_id}.")
+        except EntryNotFoundError:
+            logging.warning(
+                f"Cached TLDR file {tldr_repo_path} not found for {space_id}."
+            )
+            # Don't treat TLDR absence as an error, just won't be in the dict
+        except Exception as e_tldr:
+            logging.error(
+                f"Error downloading cached TLDR JSON for {space_id}: {e_tldr}"
+            )
+            # Don't treat TLDR download error as critical, just won't be included
+
+        # Check if at least one report was downloaded successfully
+        if not reports.get("summary") and not reports.get("privacy"):
+            raise FileNotFoundError(
+                f"Failed to download *any* primary cache files (summary/privacy) for {space_id}"
+            )
 
         return reports
 
-    except EntryNotFoundError as e:
-        # More specific error based on which file failed
-        missing_file = (
-            summary_repo_path if summary_repo_path in str(e) else privacy_repo_path
-        )
+    except RepositoryNotFoundError as e_repo:
         logging.error(
-            f"Cache download error: Report file {missing_file} not found for {space_id} in {dataset_id}. {e}"
+            f"Cache download error: Dataset repo {dataset_id} not found. {e_repo}"
        )
-        raise FileNotFoundError(
-            f"Cached report file {missing_file} not found for {space_id}"
-        ) from e
-    except RepositoryNotFoundError as e:
-        logging.error(f"Cache download error: Dataset repo {dataset_id} not found. {e}")
-        raise FileNotFoundError(f"Dataset repo {dataset_id} not found") from e
-    except Exception as e:
+        raise FileNotFoundError(f"Dataset repo {dataset_id} not found") from e_repo
+    except Exception as e_critical:  # Catch other potential critical errors
         logging.error(
-            f"Unexpected error downloading cached reports for {space_id} from {dataset_id}: {e}"
+            f"Unexpected critical error downloading cached reports for {space_id} from {dataset_id}: {e_critical}"
         )
-        raise IOError(f"Failed to download cached reports for {space_id}") from e
+        raise IOError(
+            f"Failed critically during cached report download for {space_id}"
+        ) from e_critical
 
 
 def upload_reports_to_dataset(