import json import logging import os import gradio as gr from dotenv import load_dotenv from huggingface_hub import HfApi # Import analysis pipeline helpers from analysis_utils import (check_cache_and_download, check_endpoint_status, fetch_and_validate_code, format_tldr_prompt, generate_and_parse_tldr, generate_detailed_report, generate_summary_report, parse_tldr_json_response, render_data_details_markdown, render_tldr_markdown, upload_results) # Import general utils from utils import list_cached_spaces # Added import # Removed LLM interface imports, handled by analysis_utils # from llm_interface import ERROR_503_DICT # from llm_interface import parse_qwen_response, query_qwen_endpoint # Removed prompts import, handled by analysis_utils # from prompts import format_privacy_prompt, format_summary_highlights_prompt # Removed specific utils imports now handled via analysis_utils # from utils import ( # check_report_exists, # download_cached_reports, # get_space_code_files, # upload_reports_to_dataset, # ) # Configure logging logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ) # Load environment variables from .env file # This is important to ensure API keys and endpoints are loaded before use load_dotenv() # --- Constants --- HF_TOKEN = os.getenv("HF_TOKEN") ENDPOINT_NAME = "qwen2-5-coder-32b-instruct-pmf" DATASET_ID = "yjernite/spaces-privacy-reports" CACHE_INFO_MSG = ( "\n\n*(Report retrieved from cache)*" # Still needed for dropdown cache hit message ) DEFAULT_SELECTION = "HuggingFaceTB/SmolVLM2" # TRUNCATION_WARNING now defined and used within analysis_utils # TRUNCATION_WARNING = """**⚠️ Warning:** The input data (code and/or prior analysis) was too long for the AI model's context limit and had to be truncated. The analysis below may be incomplete or based on partial information.\n\n---\n\n""" ERROR_503_USER_MESSAGE = """It appears that the analysis model endpoint is currently down or starting up. You have a few options: * **Wait & Retry:** Try clicking "Get Space Report" again in ~3-5 minutes. Endpoints often scale down to save resources and take a short time to wake up. * **Select Cached Report:** Use the dropdown above to view a report for a Space that has already been analyzed. * **Request Analysis:** If the error persists, please [open an issue or discussion](https://huggingface.co/spaces/yjernite/space-privacy/discussions) in the Space's Community tab requesting analysis for your target Space ID. We can run the job manually when the endpoint is available. """ def _run_live_analysis(space_id: str, progress=gr.Progress(track_tqdm=True)): """ Performs the full analysis pipeline using helper functions from analysis_utils. Yields tuples of Gradio updates. """ total_steps = 9 # Increased step count for TLDR generation current_step = 0 summary_report = "" privacy_report = "" tldr_data = None tldr_markdown_content = "*TLDR loading...*" data_details_content = ( "*Data details loading...*" # Default message for new component ) # Initial message before first step tldr_status_message = "*Starting analysis...*" # --- Step 1: Check Cache --- current_step += 1 progress_desc = f"Step {current_step}/{total_steps}: Checking cache..." progress(current_step / total_steps, desc=progress_desc) tldr_status_message = f"*{progress_desc}*" yield ( gr.update(value=tldr_status_message, visible=True), # TLDR shows progress gr.update(value="*Checking cache...*", visible=True), gr.update(value="Checking cache for existing reports...", visible=True), gr.update(value="", visible=True), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=True, open=False), ) cache_result = check_cache_and_download(space_id, DATASET_ID, HF_TOKEN) if cache_result["status"] == "cache_hit": progress(total_steps / total_steps, desc="Complete (from cache)") # Try to parse and render TLDR from cache tldr_json_str = cache_result.get("tldr_json_str") rendered_tldr = "*TLDR not found in cache.*" if tldr_json_str: try: cached_tldr_data = json.loads(tldr_json_str) # Render both parts rendered_tldr = render_tldr_markdown(cached_tldr_data, space_id) rendered_data_details = render_data_details_markdown(cached_tldr_data) except Exception as parse_err: logging.warning( f"Failed to parse cached TLDR JSON for {space_id}: {parse_err}" ) rendered_tldr = "*Error parsing cached TLDR.*" rendered_data_details = ( "*Could not load data details due to parsing error.*" ) yield ( gr.update(value=rendered_tldr, visible=True), gr.update(value=rendered_data_details, visible=True), gr.update(value=cache_result["summary"], visible=True), gr.update(value=cache_result["privacy"], visible=True), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=True, open=False), ) return # End generation successfully from cache elif cache_result["status"] == "cache_error": # Display final error in TLDR field tldr_status_message = ( f"*Cache download failed. {cache_result.get('ui_message', '')}*" ) data_details_content = "*Data details unavailable due to cache error.*" yield ( gr.update(value=tldr_status_message, visible=True), gr.update(value=data_details_content, visible=True), gr.update(value=cache_result["ui_message"], visible=True), gr.update(value="", visible=True), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=True, open=False), ) # Still continue to live analysis if cache download fails elif cache_result["status"] == "cache_miss": tldr_status_message = f"*{progress_desc} - Cache miss.*" # Update status data_details_content = "*Generating report...*" yield ( gr.update(value=tldr_status_message, visible=True), gr.update(value=data_details_content, visible=True), gr.update(value="Cache miss. Starting live analysis...", visible=True), gr.update(value="", visible=True), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=True, open=False), ) elif "error_message" in cache_result: # Display final error in TLDR field tldr_status_message = ( f"*Cache check failed. {cache_result.get('error_message', '')}*" ) data_details_content = "*Data details unavailable due to cache error.*" yield ( gr.update(value=tldr_status_message, visible=True), gr.update(value=data_details_content, visible=True), gr.update( value=f"Cache check failed: {cache_result.get('error_message', 'Unknown error')}. Proceeding with live analysis...", visible=True, ), gr.update(value="", visible=True), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=True, open=False), ) # Still continue if cache check fails # --- Step 2: Check Endpoint Status --- current_step += 1 progress_desc = f"Step {current_step}/{total_steps}: Checking endpoint..." progress(current_step / total_steps, desc=progress_desc) tldr_status_message = f"*{progress_desc}*" yield ( gr.update(value=tldr_status_message, visible=True), # TLDR shows progress gr.update(), gr.update(value="Checking analysis model endpoint status...", visible=True), gr.update(value="", visible=True), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=True, open=False), ) endpoint_result = check_endpoint_status( ENDPOINT_NAME, HF_TOKEN, ERROR_503_USER_MESSAGE ) if endpoint_result["status"] == "error": progress(total_steps / total_steps, desc="Endpoint Error") # Display final error in TLDR field tldr_markdown_content = endpoint_result["ui_message"] yield ( gr.update(value=tldr_markdown_content, visible=True), gr.update(value="", visible=False), gr.update(value="", visible=False), gr.update(value="", visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), ) return # --- Step 3: Fetch Code Files --- current_step += 1 progress_desc = f"Step {current_step}/{total_steps}: Fetching code..." progress(current_step / total_steps, desc=progress_desc) tldr_status_message = f"*{progress_desc}*" yield ( gr.update(value=tldr_status_message, visible=True), # TLDR shows progress gr.update(), gr.update(value="Fetching code files from the Space...", visible=True), gr.update(value="", visible=True), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=True, open=False), ) code_result = fetch_and_validate_code(space_id) if code_result["status"] == "error": progress(total_steps / total_steps, desc="Code Fetch Error") # Display final error in TLDR field tldr_markdown_content = ( f"**Error:** {code_result.get('ui_message', 'Failed to fetch code.')}" ) yield ( gr.update(value=tldr_markdown_content, visible=True), gr.update(value="", visible=False), gr.update(value="", visible=False), gr.update(value="Analysis Canceled", visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True, open=False), ) return code_files = code_result["code_files"] # --- Step 4: Generate DETAILED Privacy Report (LLM Call 1) --- current_step += 1 progress_desc = ( f"Step {current_step}/{total_steps}: Generating privacy report (AI Call 1)..." ) progress(current_step / total_steps, desc=progress_desc) tldr_status_message = f"*{progress_desc}*" yield ( gr.update(value=tldr_status_message, visible=True), # TLDR shows progress gr.update(), gr.update( value="Generating detailed privacy report (AI Call 1)...", visible=True ), gr.update(value="Generating detailed privacy report via AI...", visible=True), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=True, open=True), ) privacy_result = generate_detailed_report( space_id, code_files, ERROR_503_USER_MESSAGE ) if privacy_result["status"] == "error": progress(total_steps / total_steps, desc="Privacy Report Error") # Display final error in TLDR field tldr_markdown_content = f"**Error:** {privacy_result.get('ui_message', 'Failed during detailed report generation.')}" yield ( gr.update(value=tldr_markdown_content, visible=True), gr.update(value="", visible=False), gr.update(value="", visible=False), gr.update(value="", visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), ) return privacy_report = privacy_result["report"] # Update UI with successful detailed report yield ( gr.update(value=tldr_status_message, visible=True), # Still show progress gr.update(), gr.update( value="Detailed privacy report generated. Proceeding...", visible=True ), gr.update(value=privacy_report, visible=True), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=True, open=True), ) # --- Step 5: Fetch Model Descriptions (Placeholder/Optional) --- current_step += 1 progress_desc = f"Step {current_step}/{total_steps}: Extracting model info..." progress(current_step / total_steps, desc=progress_desc) tldr_status_message = f"*{progress_desc}*" logging.info(progress_desc + " (Placeholder)") yield ( gr.update(value=tldr_status_message, visible=True), # TLDR shows progress gr.update(), gr.update(value="Extracting model info...", visible=True), gr.update(), gr.update(), gr.update(), gr.update(), ) # model_ids = extract_hf_model_ids(code_files) # utils function not imported # model_descriptions = get_model_descriptions(model_ids) # utils function not imported # Add model_descriptions to context if needed for summary prompt later # --- Step 6: Generate Summary + Highlights Report (LLM Call 2) --- current_step += 1 progress_desc = ( f"Step {current_step}/{total_steps}: Generating summary (AI Call 2)..." ) progress(current_step / total_steps, desc=progress_desc) tldr_status_message = f"*{progress_desc}*" yield ( gr.update(value=tldr_status_message, visible=True), # TLDR shows progress gr.update(), gr.update(value="Generating summary & highlights (AI Call 2)...", visible=True), gr.update(), gr.update(), gr.update(), gr.update(), ) summary_result = generate_summary_report( space_id, code_files, privacy_report, ERROR_503_USER_MESSAGE ) if ( summary_result["status"] == "error_503_summary" or summary_result["status"] == "error_summary" ): progress(total_steps / total_steps, desc="Summary Report Error") # Display error in TLDR, show partial results below tldr_markdown_content = f"**Error:** {summary_result.get('ui_message', 'Failed during summary generation.')}" data_details_content = "*Data details may be incomplete.*" yield ( gr.update(value=tldr_markdown_content, visible=True), gr.update(value=data_details_content, visible=True), gr.update(value=summary_result["ui_message"], visible=True), gr.update(value=privacy_report, visible=True), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=True, open=True), ) return elif summary_result["status"] != "success": progress(total_steps / total_steps, desc="Summary Report Error") # Display error in TLDR, show partial results below tldr_markdown_content = f"**Error:** Unexpected error generating summary: {summary_result.get('ui_message', 'Unknown')}" data_details_content = "*Data details unavailable.*" yield ( gr.update(value=tldr_markdown_content, visible=True), gr.update(value=data_details_content, visible=True), gr.update( value=f"Unexpected error generating summary: {summary_result.get('ui_message', 'Unknown')}", visible=True, ), gr.update(value=privacy_report, visible=True), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=True, open=True), ) return summary_report = summary_result["report"] # Update UI with successful summary report before TLDR generation tldr_status_message = ( f"*{progress_desc} - Success. Generating TLDR...*" # Update status ) data_details_content = "*Generating data details...*" yield ( gr.update(value=tldr_status_message, visible=True), gr.update(value=data_details_content, visible=True), gr.update(value=summary_report, visible=True), gr.update(value=privacy_report, visible=True), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=True, open=True), ) # --- Step 7: Generate TLDR --- (New Step) current_step += 1 progress_desc = f"Step {current_step}/{total_steps}: Generating TLDR summary..." progress(current_step / total_steps, desc=progress_desc) tldr_status_message = f"*{progress_desc}*" yield ( gr.update(value=tldr_status_message, visible=True), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), ) tldr_data = None # Reset tldr_data before attempt try: # Call the combined helper function from analysis_utils tldr_data = generate_and_parse_tldr(privacy_report, summary_report) if tldr_data: logging.info(f"Successfully generated and parsed TLDR for {space_id}.") tldr_markdown_content = render_tldr_markdown(tldr_data, space_id) data_details_content = render_data_details_markdown(tldr_data) else: logging.warning( f"Failed to generate or parse TLDR for {space_id}. Proceeding without it." ) tldr_markdown_content = "*TLDR generation failed.*" data_details_content = "*Data details generation failed.*" except Exception as tldr_err: # This catch block might be redundant now if generate_and_parse_tldr handles its errors logging.error( f"Unexpected error during TLDR generation step call for {space_id}: {tldr_err}" ) tldr_markdown_content = "*Error during TLDR generation step.*" data_details_content = "*Error generating data details.*" tldr_data = None # Ensure it's None on error # Update UI including the generated (or failed) TLDR before upload yield ( gr.update(value=tldr_markdown_content, visible=True), gr.update(value=data_details_content, visible=True), gr.update(), gr.update(), gr.update(visible=True, open=False), gr.update(), gr.update(), ) # --- Step 8: Upload to Cache --- (Old Step 7) current_step += 1 progress_desc = f"Step {current_step}/{total_steps}: Uploading to cache..." progress(current_step / total_steps, desc=progress_desc) tldr_status_message = f"*{progress_desc}*" # Display final action in TLDR field yield ( gr.update(value=tldr_status_message, visible=True), gr.update(), gr.update(value="Uploading results to cache...", visible=True), gr.update(), gr.update(), gr.update(), gr.update(), ) upload_needed = ( cache_result["status"] != "cache_hit" and cache_result["status"] != "cache_error" ) if upload_needed: # Call imported function, now passing tldr_data upload_result = upload_results( space_id, summary_report, privacy_report, DATASET_ID, HF_TOKEN, tldr_json_data=tldr_data, ) if upload_result["status"] == "error": # Ensure logging uses f-string if adding step count here logging.error( f"Cache upload failed: {upload_result.get('message', 'Unknown error')}" ) # Non-critical, don't stop the UI, just log elif upload_result["status"] == "skipped": logging.info(f"Cache upload skipped: {upload_result.get('reason', '')}") else: logging.info( "Skipping cache upload as results were loaded from cache or cache check failed." ) # Update UI including the generated (or failed) TLDR before upload # Yield 7 updates yield ( gr.update(value=tldr_markdown_content, visible=True), gr.update(value=data_details_content, visible=True), gr.update(value=summary_report, visible=True), gr.update(value=privacy_report, visible=True), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=True, open=False), ) # --- Step 9: Final Update --- (Old Step 8) current_step += 1 progress_desc = f"Step {current_step}/{total_steps}: Analysis Complete!" progress(current_step / total_steps, desc=progress_desc) logging.info(progress_desc + f" Analysis complete for {space_id}.") # Yield final state again to ensure UI is correct after potential upload messages # Display final generated TLDR and Data Details yield ( gr.update(value=tldr_markdown_content, visible=True), gr.update(value=data_details_content, visible=True), gr.update(value=summary_report, visible=True), gr.update(value=privacy_report, visible=True), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=True, open=False), ) # --- Original Input Handling Wrapper (updated yields for initial errors) --- def get_space_report_wrapper( selected_cached_space: str | None, new_space_id: str | None, progress=gr.Progress(track_tqdm=True), ): """ Wrapper function to decide whether to fetch cache or run live analysis. Handles the logic based on Dropdown and Textbox inputs. Yields tuples of Gradio updates. """ target_space_id = None source = "new" # Assume new input unless dropdown is chosen # Prioritize new_space_id if provided if new_space_id and new_space_id.strip(): target_space_id = new_space_id.strip() if target_space_id == selected_cached_space: source = "dropdown_match" # User typed ID that exists in dropdown else: source = "new" elif selected_cached_space: target_space_id = selected_cached_space source = "dropdown" if not target_space_id: # Yield 7 updates yield ( gr.update(value="*Please provide a Space ID.*", visible=True), gr.update(value="", visible=False), gr.update( value="Please select an existing report or enter a new Space ID.", visible=True, ), gr.update(value="", visible=False), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=False), ) return if "/" not in target_space_id: # Yield 7 updates yield ( gr.update(value="*Invalid Space ID format.*", visible=True), gr.update(value="", visible=False), gr.update( value=f"Invalid Space ID format: '{target_space_id}'. Use 'owner/name'.", visible=True, ), gr.update(value="", visible=False), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=False), ) return logging.info(f"Request received for: '{target_space_id}' (Source: {source})") if source == "dropdown": progress(0.1, desc="Fetching selected cached report...") # Yield 7 updates (initial placeholder) yield ( gr.update(value="*Loading TLDR...*", visible=True), gr.update(value="*Loading data details...*", visible=True), gr.update(value="Fetching selected cached report...", visible=True), gr.update(value="", visible=True), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=True, open=False), ) cache_result = check_cache_and_download(target_space_id, DATASET_ID, HF_TOKEN) if cache_result["status"] == "cache_hit": logging.info( f"Successfully displayed cached reports for selected '{target_space_id}'." ) progress(1.0, desc="Complete (from cache)") # Use the cached report text directly here, adding the cache message is done within the helper now. # Parse and render TLDR if available tldr_json_str = cache_result.get("tldr_json_str") rendered_tldr = "*TLDR not found in cache.*" if tldr_json_str: try: cached_tldr_data = json.loads(tldr_json_str) rendered_tldr = render_tldr_markdown( cached_tldr_data, target_space_id ) rendered_data_details = render_data_details_markdown( cached_tldr_data ) except Exception as parse_err: logging.warning( f"Failed to parse cached TLDR JSON for {target_space_id}: {parse_err}" ) rendered_tldr = "*Error parsing cached TLDR.*" rendered_data_details = ( "*Could not load data details due to parsing error.*" ) yield ( gr.update(value=rendered_tldr, visible=True), gr.update(value=rendered_data_details, visible=True), gr.update(value=cache_result["summary"], visible=True), gr.update(value=cache_result["privacy"], visible=True), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=True, open=False), ) else: # Cache miss or error for a dropdown selection is an error state error_msg = cache_result.get( "ui_message", f"Failed to find or download cached report for selected '{target_space_id}'.", ) logging.error(error_msg) progress(1.0, desc="Error") yield ( gr.update(value="*TLDR load failed.*", visible=True), gr.update(value="*Data details load failed.*", visible=True), gr.update(value=error_msg, visible=True), gr.update(value="", visible=False), gr.update(visible=True, open=False), gr.update(visible=True, open=False), gr.update(visible=False), ) return # Stop after handling dropdown source # --- Live Analysis or Check Cache for New Input --- # If it came from the textbox OR was a dropdown match, run the full live analysis pipeline # which includes its own cache check at the beginning. else: # source == "new" or source == "dropdown_match" # Yield intermediate updates from the generator by iterating through it for update_tuple in _run_live_analysis(target_space_id, progress): yield update_tuple # --- Load Initial Data Function (for demo.load) --- def load_cached_list(): """Fetches the list of cached spaces and determines the default selection.""" print("Running demo.load: Fetching list of cached spaces...") # Use os.getenv here directly as HF_TOKEN might be loaded after initial import token = os.getenv("HF_TOKEN") cached_list = list_cached_spaces(DATASET_ID, token) default_value = DEFAULT_SELECTION if DEFAULT_SELECTION in cached_list else None if not cached_list: print( "WARNING: No cached spaces found or failed to fetch list during demo.load." ) # Return an update object for the dropdown using gr.update() return gr.update(choices=cached_list, value=default_value) # --- Gradio Interface Definition --- # Use HTML/CSS for centering the title TITLE = "