import logging
from collections import defaultdict

MAX_FILE_CONTENT_LENGTH = 10000  # Limit the length of individual file contents
MAX_MODEL_DESC_LENGTH = 1500  # Limit the length of fetched model descriptions
MAX_PROMPT_CHARS = 110000  # Approx < 30k tokens (using ~4 chars/token heuristic)

# Prompt for the first LLM call: Detailed Privacy Analysis
PRIVACY_SYSTEM_PROMPT = (
    "You are a helpful AI assistant specialized in analyzing Hugging Face Spaces code for privacy concerns. "
    "Your goal is to identify data flows and potential privacy risks based *only* on the provided code files. "
    "Analyze the following aspects and provide relevant code snippets (formatted as Markdown code blocks) as evidence for each point. "
    "**Crucially, include the filename for each code snippet.** Example: `(filename.py)`\n\n"
    "**Note:** If the app uses externally defined or inaccessible code to upload or process data, say so.\n\n"
    "1.  **Data Inputs:**\n"
    "    - What types of user data does the application accept as input (e.g., text, images, audio, files)?\n"
    "    - Where in the code are these inputs defined (e.g., Gradio input widgets, file uploads)? Provide the filename and code snippet.\n\n"
    "2.  **Processing Services & Data Transmission:**\n"
    "    - What specific internal or external APIs, models, or services are used to process the input data?\n"
    "    - What specific AI models or services are used to process the input data? Are any of these Hugging Face-hosted models?\n"
    "    - Where in the code are these services called (e.g., `requests.post`, `InferenceClient`, specific API endpoint URLs) or defined (e.g., `transformers` library)? Provide the filename and code snippet.\n"
    "    - Is it likely that user data is transmitted to these external services, and what kind of data is transmitted by each service or API? Mention if the services are known (like Hugging Face Inference API/Endpoints) or potentially unknown third parties.\n\n"
    "3.  **Execution Environment & Potential Local Processing:**\n"
    "    - Does the code indicate that models or significant processing might run *locally* within the Space container? Provide the filename and code snippet.\n"
    "    - Does the code explicitly use external *inference services* to query AI models? If so, reiterate the relevant code snippet from point 2 with filename.\n"
    "    - Does the code mention interactions with remote databases (e.g., `sqlite`, `postgres`, `mysql`, `redis`, `mongodb`, etc.), storage (e.g., `s3`, `gcs`, `azure`, etc.), or Cloud-based data services? If so, provide the filename and code snippet.\n\n"
    "4.  **Explicit Data Storage/Logging:**\n"
    "    - Is there any code that explicitly stores user input or results to files, databases, or external logging services? Provide the filename and code snippet.\n\n"
    "5.  **Overall Privacy Risk Summary:**\n"
    "    - Based ONLY on the evidence from the code snippets above, provide a concise summary paragraph highlighting the main potential privacy considerations or risks.\n\n"
    "Format your entire response clearly using Markdown. Ensure all code snippets include filename and are properly formatted."
)

# Prompt for the second LLM call: Space Summary + Privacy Highlights
SUMMARY_HIGHLIGHTS_SYSTEM_PROMPT = (
    "You are an AI assistant reviewing a Hugging Face Space. You have been provided with: "
    "(1) the application code, and "
    "(2) a detailed preliminary privacy analysis report.\n\n"
    "Your task is to generate a summary report containing two parts:\n\n"
    "**Part 1: Space Summary**\n"
    "- Based on the code and privacy analysis report, provide a concise summary (4-6 sentences max) of what the application does from a user's perspective.\n\n"
    "**Part 2: Privacy Highlights**\n"
    "- Using information from the preliminary privacy report (cross-referencing code/descriptions as needed), list the following key privacy aspects:\n"
    "  1.  **Data Inputs:** List the main types of data provided to the application with a brief description for each. List where the data is used or stored by the application.\n"
    "  2.  **AI Models/Services:** List the core AI models or services used. For each, specify: Is it run locally or remotely? What library or service is used, or is the code defined within the app?\n"
    "  3.  **Other Remote Data or Dataset Calls:** List any other identified remote data calls that might upload or transmit data outside of the app (e.g., to databases, external APIs not covered above, cloud storage).\n"
    "  4.  **Libraries Suggesting Data Transmission:** List libraries used (e.g., `requests`, `gradio[sharing]`) that might implicitly or explicitly transmit data, suggesting where users might look for more details (e.g., library documentation, specific code sections).\n\n"
    "Format the entire response clearly using Markdown. Do not include the preliminary privacy report itself in your output."
)


def _generate_file_structure(code_files: dict[str, str]) -> str:
    """Generates a tree-like textual representation of the file structure."""
    tree = defaultdict(dict)
    files = sorted(code_files.keys())

    for fpath in files:
        parts = fpath.split("/")
        node = tree
        for i, part in enumerate(parts):
            if i == len(parts) - 1:  # It's a file
                node[part] = None  # Mark as file
            else:  # It's a directory
                if part not in node:
                    node[part] = defaultdict(dict)  # Create dir node if not exists
                # Check if we previously marked this as a file (edge case where dir name = file name at higher level)
                elif node[part] is None:
                    node[part] = defaultdict(dict)  # Convert file marker to dir
                node = node[part]  # Move deeper

    output_lines = ["Project File Structure:"]

    def build_tree_lines(node, prefix=""):
        # Sort items: directories first, then files, each group alphabetically
        items = sorted(
            node.items(),
            key=lambda item: (not isinstance(item[1], dict), item[0]),
        )

        pointers = ["├── " for _ in range(len(items) - 1)] + ["└── "]
        for pointer, (name, sub_node) in zip(pointers, items):
            output_lines.append(prefix + pointer + name)
            if isinstance(sub_node, defaultdict):  # It's a directory
                extension = "│   " if pointer == "├── " else "    "
                build_tree_lines(sub_node, prefix + extension)

    build_tree_lines(tree)
    return "\n".join(output_lines)
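The connector choice inside `build_tree_lines` can be sketched in isolation: every entry but the last gets a tee, the final entry gets an elbow (the entry names below are hypothetical).

```python
# Standalone sketch of the tree-connector logic used in build_tree_lines:
# all entries but the last get "├── ", the final one gets "└── ".
items = ["nested", "app.py", "README.md"]
pointers = ["├── "] * (len(items) - 1) + ["└── "]
lines = [pointer + name for pointer, name in zip(pointers, items)]
print(lines)
# ['├── nested', '├── app.py', '└── README.md']
```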


def _format_code_files_for_prompt(code_files: dict[str, str]) -> str:
    """Formats the code files into a single string for the prompt, sorted by depth and path."""

    def sort_key(filepath):
        parts = filepath.split("/")
        depth = len(parts) - 1
        dir_path = "/".join(parts[:-1]) if depth > 0 else ""
        filename = parts[-1]
        return (depth, dir_path, filename)

    sorted_filenames = sorted(code_files.keys(), key=sort_key)

    output_parts = []
    for filename in sorted_filenames:
        content = code_files[filename]
        truncated = content[:MAX_FILE_CONTENT_LENGTH]
        suffix = "... [truncated]" if len(content) > MAX_FILE_CONTENT_LENGTH else ""
        output_parts.append(f"--- File: {filename} ---\n```\n{truncated}{suffix}\n```")

    return "\n".join(output_parts)
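The ordering produced by `sort_key` can be checked on its own: root-level files come first, deeper files later, with ties broken by directory path and then filename (the file names below are hypothetical).

```python
# Standalone sketch of the depth-then-path ordering used by
# _format_code_files_for_prompt's sort_key.
def sort_key(filepath: str) -> tuple[int, str, str]:
    parts = filepath.split("/")
    depth = len(parts) - 1
    dir_path = "/".join(parts[:-1]) if depth > 0 else ""
    return (depth, dir_path, parts[-1])

files = ["nested/utils.py", "requirements.txt", "app.py", "nested/deep/helper.py"]
print(sorted(files, key=sort_key))
# ['app.py', 'requirements.txt', 'nested/utils.py', 'nested/deep/helper.py']
```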


def format_privacy_prompt(
    space_id: str, code_files: dict[str, str]
) -> tuple[list[dict[str, str]], bool]:
    """
    Formats the prompt for the initial detailed privacy analysis task.
    Returns messages list and a boolean indicating if truncation occurred.
    """
    was_truncated = False
    file_structure = _generate_file_structure(code_files)
    formatted_code = _format_code_files_for_prompt(code_files)

    # Define components for length calculation
    prompt_header = f"Please perform a detailed privacy analysis for the Hugging Face Space '{space_id}'.\n\n{file_structure}\n\nCode Files Content:\n"
    base_length = len(prompt_header) + len(PRIVACY_SYSTEM_PROMPT)

    # Check if formatted code needs truncation for the overall prompt
    available_chars_for_code = MAX_PROMPT_CHARS - base_length
    if available_chars_for_code < 0:  # Header itself is too long (unlikely)
        available_chars_for_code = 0
        was_truncated = True

    if len(formatted_code) > available_chars_for_code:
        formatted_code = (
            formatted_code[:available_chars_for_code]
            + "\n... [Code Section Truncated Due to Overall Prompt Length] ..."
        )
        was_truncated = True
        logging.warning(
            f"Privacy prompt code section truncated for Space ID {space_id} due to overall length."
        )

    user_content = prompt_header + formatted_code

    messages = [
        {"role": "system", "content": PRIVACY_SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]
    return messages, was_truncated
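The clamping in `format_privacy_prompt` amounts to a single budget check: reserve the header and system prompt, then cut the code section to whatever room is left. A minimal standalone version (`fit_code` is a hypothetical helper name, toy numbers):

```python
# Sketch of the single-budget truncation used above: clamp the code section
# so that header + code never exceeds the overall character budget.
def fit_code(code: str, budget: int, base_len: int) -> tuple[str, bool]:
    room = max(budget - base_len, 0)
    if len(code) <= room:
        return code, False
    return code[:room] + "\n... [truncated] ...", True

text, truncated = fit_code("x" * 120, budget=100, base_len=40)
print(truncated, text.startswith("x" * 60))
```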


def format_summary_highlights_prompt(
    space_id: str, code_files: dict[str, str], detailed_privacy_report: str
) -> tuple[list[dict[str, str]], bool]:
    """
    Formats the prompt for the final summary + highlights report.
    Returns messages list and a boolean indicating if truncation occurred.
    """
    was_truncated = False
    file_structure = _generate_file_structure(code_files)
    formatted_code = _format_code_files_for_prompt(code_files)

    # Define components for length calculation
    prompt_header = f"Please generate a final summary and privacy highlights report for the Hugging Face Space '{space_id}'.\n\n"
    report_header = "**Preliminary Detailed Privacy Report:**\n---\n"
    report_footer = "\n---\n\n"
    support_header = f"**Supporting Information:**\n{file_structure}\n\n"
    code_header = "**Original Code Files Content:**\n"

    base_length = (
        len(prompt_header)
        + len(report_header)
        + len(report_footer)
        + len(support_header)
        + len(code_header)
        + len(SUMMARY_HIGHLIGHTS_SYSTEM_PROMPT)
    )
    available_chars_total = MAX_PROMPT_CHARS - base_length

    if available_chars_total < 0:  # Base structure is too long
        logging.error(
            f"Base prompt structure for summary highlights exceeds limit for Space ID {space_id}. Cannot proceed effectively."
        )
        # Return minimal user content to avoid errors, but flag truncation heavily
        user_content = (
            prompt_header
            + report_header
            + "[TRUNCATED DUE TO LENGTH]"
            + report_footer
            + support_header
            + code_header
            + "[TRUNCATED DUE TO LENGTH]"
        )
        was_truncated = True
    else:
        # Prioritize truncating the detailed report first
        available_chars_for_report = available_chars_total - len(
            formatted_code
        )  # Reserve space for code
        if available_chars_for_report < 0:
            available_chars_for_report = 0  # Cannot fit report

        if len(detailed_privacy_report) > available_chars_for_report:
            detailed_privacy_report = (
                detailed_privacy_report[:available_chars_for_report]
                + "\n... [Detailed Privacy Report Truncated Due to Overall Prompt Length] ..."
            )
            was_truncated = True
            logging.warning(
                f"Summary prompt detailed report section truncated for Space ID {space_id}."
            )

        # Now check code length again with (potentially truncated) report length
        available_chars_for_code = available_chars_total - len(detailed_privacy_report)
        if available_chars_for_code < 0:
            available_chars_for_code = 0  # Cannot fit code

        if len(formatted_code) > available_chars_for_code:
            formatted_code = (
                formatted_code[:available_chars_for_code]
                + "\n... [Code Section Truncated Due to Overall Prompt Length] ..."
            )
            was_truncated = True
            logging.warning(
                f"Summary prompt code section truncated for Space ID {space_id}."
            )

        # Assemble the final user content
        user_content = (
            prompt_header
            + report_header
            + detailed_privacy_report
            + report_footer
            + support_header
            + code_header
            + formatted_code
        )

    messages = [
        {"role": "system", "content": SUMMARY_HIGHLIGHTS_SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]
    return messages, was_truncated
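The two-stage allocation above (report truncated first while reserving room for the code, then code truncated against whatever the shortened report left over) reduces to simple arithmetic. A sketch with a hypothetical helper name and toy numbers:

```python
# Sketch of the two-stage budget split used in format_summary_highlights_prompt:
# the report yields space to the code first; the code then fills the remainder.
def split_budget(total: int, report_len: int, code_len: int) -> tuple[int, int]:
    report_fit = min(report_len, max(total - code_len, 0))
    code_fit = min(code_len, max(total - report_fit, 0))
    return report_fit, code_fit

print(split_budget(100, 80, 50))  # report shortened to 50, code keeps all 50
print(split_budget(100, 30, 50))  # both fit untouched
```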


# Example usage (for testing)
# if __name__ == '__main__':
#     test_files = {
#         "app.py": "import gradio as gr\n\ndef greet(name):\n    # Potentially send data to external service?\n    # requests.post('http://example.com/log', json={'user': name})\n    return f'Hello {name}!'",
#         "requirements.txt": "gradio\nrequests",
#         "nested/utils.py": "def helper():\n    pass",
#         "README.md": "This should be ignored.", # Example of a file that *should* be filtered out before reaching here
#         "very_long_file.py": "print('hello' * 5000)" # Test truncation
#     }
#     # Typically, files like README.md would be filtered by get_space_code_files in utils.py
#     # We include it here just for demo purposes if you were to test prompts.py directly.
#     filtered_test_files = {k: v for k, v in test_files.items() if not k.endswith('.md')}
#     prompt_messages, was_truncated = format_privacy_prompt("test/space", filtered_test_files)
#     print("--- System Prompt ---")
#     print(prompt_messages[0]['content'])
#     print("\n--- User Prompt ---")
#     print(prompt_messages[1]['content'])