Commit 912f746 · Parent: 0866aba
add first version of agent

Files changed:
- .gitignore +2 -0
- agent.py +65 -8
- app.py +10 -5
- src/file_handler/get_file.py +25 -0
- src/file_handler/handlers.py +227 -0
- src/file_handler/parse.py +52 -0
- src/tracing.py +33 -0
.gitignore CHANGED
@@ -1,3 +1,5 @@
 answer_cache.json
 uv.lock
 .venv
+.env
+__pycache__
agent.py CHANGED
@@ -1,10 +1,67 @@
-
-
+import os
+
+import PIL.Image
+from dotenv import load_dotenv
+from loguru import logger
+from smolagents import AzureOpenAIServerModel, CodeAgent
+
+from src.file_handler.parse import parse_file
+
+load_dotenv()
+
+
 class Agent:
     def __init__(self):
-
-
-
-
-
-
+        model = AzureOpenAIServerModel(
+            model_id=os.getenv("AZURE_OPENAI_MODEL_ID"),
+            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
+            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
+            api_version=os.getenv("OPENAI_API_VERSION"),
+        )
+        self.agent = CodeAgent(
+            tools=[],
+            model=model,
+            add_base_tools=True,  # include smolagents' default toolbox
+            # planning_interval=3,
+        )
+        logger.info("Agent initialized.")
+
+    def __call__(
+        self,
+        question: str,
+        file_name: str,
+        task_id: str = "",
+        api_url: str = "https://agents-course-unit4-scoring.hf.space",
+    ) -> str:
+        logger.info(f"Agent received question (first 50 chars): {question[:50]}...")
+        images = None
+
+        if file_name:
+            # task_id and api_url identify the attachment on the scoring API
+            content = parse_file(task_id, file_name, api_url)
+            if content:
+                if isinstance(content, PIL.Image.Image):  # pass images separately
+                    images = [content]
+                else:  # append textual content to the question
+                    question += f"\n\nAttached content:\n{content}"
+                    logger.info(f"Question with content: {question}")
+
+        answer = self.agent.run(question, images=images)
+        logger.info(f"Agent returning answer: {answer}")
+        return answer
+
+
+if __name__ == "__main__":
+    import requests
+
+    api_url = "https://agents-course-unit4-scoring.hf.space"
+    question_url = f"{api_url}/random-question"
+
+    data = requests.get(question_url).json()
+    agent = Agent()
+
+    task_id = data["task_id"]
+    question = data["question"]
+    file_name = data["file_name"]
+    logger.info(
+        f"Task ID: {task_id}\nQuestion: {question}\nFile Name: {file_name}\n\n"
+    )
+
+    answer = agent(question, file_name, task_id, api_url)
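A note on configuration: the constructor above reads four variables from the environment (loaded via `load_dotenv()`). A minimal sanity check before constructing the agent might look like the sketch below; the variable names are taken from the diff, the check itself is illustrative:

```python
import os

from dotenv import load_dotenv

load_dotenv()

# Names as read by Agent.__init__ above; fail fast if any is missing.
REQUIRED_VARS = [
    "AZURE_OPENAI_MODEL_ID",
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_API_KEY",
    "OPENAI_API_VERSION",
]

missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
if missing:
    raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")
```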
app.py CHANGED
@@ -8,6 +8,9 @@ import gradio as gr
 import pandas as pd
 
 from agent import Agent
+from src.tracing import add_tracing
+
+add_tracing()
 
 # --- Constants --------------------------------------------------------------
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
@@ -55,6 +58,7 @@ async def _run_agent_async(
     agent: Agent,
     question: str,
     task_id: str | int,
+    file_name: str,
     cache: dict[str, str],
     semaphore: asyncio.Semaphore,
 ) -> tuple[str | int, str]:
@@ -68,7 +72,7 @@ async def _run_agent_async(
     loop = asyncio.get_running_loop()
     async with semaphore:
         answer = await loop.run_in_executor(
-            None, agent, question
+            None, agent, question, file_name, task_id, DEFAULT_API_URL
         )  # execute in default thread-pool
     cache[str(task_id)] = answer
     return task_id, answer
@@ -109,7 +113,9 @@ async def _async_main(profile: gr.OAuthProfile | None):
     cache = load_cache()
     sem = asyncio.Semaphore(MAX_CONCURRENCY)
     coros = [
-        _run_agent_async(agent, q["question"], q["task_id"], cache, sem)
+        _run_agent_async(
+            agent, q["question"], q["task_id"], q["file_name"], cache, sem
+        )
        for q in questions
        if q.get("task_id") and q.get("question") is not None
     ]
@@ -185,9 +191,8 @@ with gr.Blocks() as demo:
     """
     **Quick-start**
 
-    1.
-    2.
-    3. Hit **Run Evaluation & Submit All Answers** – answers are cached
+    1. Log in with the HF button (needed for ranking).
+    2. Hit **Run Evaluation & Submit All Answers** – answers are cached
        locally so reruns are instant; agent calls & HTTP are parallel.
     """
 )
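The pattern in `_run_agent_async` — a blocking callable fanned out through `loop.run_in_executor` with an `asyncio.Semaphore` cap — can be exercised in isolation. A minimal sketch; the `slow_agent` stand-in is hypothetical:

```python
import asyncio
import time

MAX_CONCURRENCY = 2  # mirrors the constant used in app.py


def slow_agent(question: str) -> str:
    # Hypothetical stand-in for the blocking Agent.__call__.
    time.sleep(1)
    return f"answer to {question!r}"


async def run_one(question: str, sem: asyncio.Semaphore) -> str:
    loop = asyncio.get_running_loop()
    async with sem:  # at most MAX_CONCURRENCY executor jobs at once
        return await loop.run_in_executor(None, slow_agent, question)


async def main() -> None:
    sem = asyncio.Semaphore(MAX_CONCURRENCY)
    answers = await asyncio.gather(*(run_one(f"q{i}", sem) for i in range(5)))
    print(answers)


asyncio.run(main())
```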
src/file_handler/get_file.py ADDED
@@ -0,0 +1,25 @@
+import aiohttp
+
+
+async def download_file_for_task(task_id: str, api_base_url: str) -> bytes:
+    """
+    Asynchronously downloads the file associated with a given task ID.
+    This function performs a GET request to the endpoint /files/{task_id}.
+
+    Args:
+        task_id: The identifier of the task for which to download the file.
+        api_base_url: The base URL of the API.
+
+    Returns:
+        The content of the file as bytes.
+
+    Raises:
+        aiohttp.ClientResponseError: If the API returns an error status
+            (e.g., 404 Not Found, 500 Internal Server Error).
+        aiohttp.ClientError: For other client-side errors, such as network
+            connection issues.
+    """
+    url = f"{api_base_url}/files/{task_id}"
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url) as response:
+            response.raise_for_status()  # raise on HTTP 4xx/5xx
+            file_bytes = await response.read()
+            return file_bytes
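Usage from synchronous code is a one-liner; a sketch with a placeholder task ID (the `/files/{task_id}` endpoint shape comes from the function above):

```python
import asyncio

from src.file_handler.get_file import download_file_for_task

API_URL = "https://agents-course-unit4-scoring.hf.space"

# "some-task-id" is a placeholder; real IDs come from /random-question.
data = asyncio.run(download_file_for_task("some-task-id", API_URL))
print(f"downloaded {len(data)} bytes")
```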
src/file_handler/handlers.py ADDED
@@ -0,0 +1,227 @@
+import io
+import json
+from typing import Optional
+
+import pandas as pd
+import pdfminer.high_level
+import PIL.Image
+from docx import Document
+from docx.opc.exceptions import PackageNotFoundError
+from pdfminer.pdfparser import PDFSyntaxError
+
+from src.file_handler.get_file import download_file_for_task
+
+
+async def convert_excel_bytes_to_llm_format(task_id: str, url: str) -> Optional[str]:
+    """
+    Downloads an Excel file using download_file_for_task, removes empty rows
+    from each sheet, and converts its content to an LLM-friendly format.
+
+    Args:
+        task_id (str): The identifier for the task, used by download_file_for_task.
+        url (str): The base URL of the API from which the file is downloaded.
+
+    Returns:
+        str: A JSON string mapping sheet names to lists of dictionaries
+            (each dictionary representing a row, with column headers as keys).
+        Returns None if a critical error occurs (e.g., download failure,
+            file unparseable).
+        Returns an empty JSON object if the Excel file is valid but contains
+            no sheets.
+    """
+    try:
+        file_bytes = await download_file_for_task(task_id, url)
+
+        if not file_bytes:
+            print(f"Info [{task_id}]: No content downloaded from URL '{url}'.")
+            # Returning None indicates a problem preventing processing.
+            return None
+
+        # Use io.BytesIO to treat the bytes as a file-like object for pandas
+        excel_buffer = io.BytesIO(file_bytes)
+
+        # pd.ExcelFile efficiently parses Excel files, especially with multiple
+        # sheets. It raises (e.g., ValueError, various zipfile/xlrd/openpyxl
+        # errors) if the file is not a valid Excel format or is corrupted.
+        xls = pd.ExcelFile(excel_buffer)
+
+        if not xls.sheet_names:
+            print(
+                f"Info [{task_id}]: Excel file from URL '{url}' has no sheets."
+            )
+            return json.dumps({})  # no sheets means no data to process
+
+        all_sheets_data = {}
+        for sheet_name in xls.sheet_names:
+            # Parse the current sheet into a DataFrame
+            df = xls.parse(sheet_name)
+
+            # Remove rows where all cells are NaN (these are considered empty rows)
+            df.dropna(how="all", inplace=True)
+
+            # Convert the cleaned DataFrame to a list of dictionaries (records
+            # format). A sheet left empty after dropna correctly produces an
+            # empty list for that sheet's data.
+            all_sheets_data[sheet_name] = df.to_dict(orient="records")
+
+        return json.dumps(all_sheets_data, ensure_ascii=False)
+
+    except pd.errors.ParserError as e:
+        # Handles errors during the parsing of sheet data by pandas.
+        print(
+            f"Error [{task_id}]: Pandas parsing error for Excel file from '{url}': {e}"
+        )
+        return None
+    except ValueError as e:
+        # Catches errors like "Excel file format cannot be determined..." from
+        # pd.ExcelFile, or other value-related issues during parsing.
+        print(
+            f"Error [{task_id}]: Value error processing Excel file from '{url}': {e}"
+        )
+        return None
+    except Exception as e:
+        # Catch-all for other unexpected errors. In a real app, log the full
+        # traceback:
+        # import traceback
+        # traceback.print_exc()
+        print(
+            f"Error [{task_id}]: Unexpected error processing Excel file from '{url}': {e}"
+        )
+        return None
+
+
+# ---------------------------------------------------------------------------
+# 1. Image → PIL Image
+# ---------------------------------------------------------------------------
+async def convert_image_to_pillow(
+    task_id: str, url: str
+) -> Optional[PIL.Image.Image]:
+    """
+    Downloads an image file and returns a PIL Image object.
+    Returns None on failure.
+
+    Args:
+        task_id (str): The ID of the task.
+        url (str): The base URL of the API from which the file is downloaded.
+
+    Returns:
+        Optional[PIL.Image.Image]: The PIL Image object, or None on failure.
+    """
+    try:
+        raw = await download_file_for_task(task_id, url)
+        if not raw:
+            print(f"Info [{task_id}]: No bytes downloaded from '{url}'.")
+            return None
+
+        return PIL.Image.open(io.BytesIO(raw))
+
+    except Exception as e:
+        print(f"Error [{task_id}]: converting image from '{url}' to PIL: {e}")
+        return None
+
+
+# ---------------------------------------------------------------------------
+# 2. File → UTF-8 string
+# ---------------------------------------------------------------------------
+async def convert_file_to_string(task_id: str, url: str) -> Optional[str]:
+    """
+    Downloads a file and returns its text (UTF-8, errors replaced).
+    """
+    try:
+        raw = await download_file_for_task(task_id, url)
+        if not raw:
+            print(f"Info [{task_id}]: No bytes downloaded from '{url}'.")
+            return None
+
+        return raw.decode("utf-8", errors="replace")
+
+    except Exception as e:
+        print(f"Error [{task_id}]: decoding file from '{url}': {e}")
+        return None
+
+
+# ---------------------------------------------------------------------------
+# 3. DOCX → Markdown
+# ---------------------------------------------------------------------------
+def _runs_to_md(runs):
+    """Helper – convert a list of runs to Markdown inline text."""
+    out = []
+    for run in runs:
+        text = run.text.replace("\n", " ")
+        if not text:
+            continue
+        if run.bold:
+            text = f"**{text}**"
+        if run.italic:
+            text = f"*{text}*"
+        out.append(text)
+    return "".join(out)
+
+
+async def convert_docx_to_markdown(task_id: str, url: str) -> Optional[str]:
+    """
+    Converts a Word document to *simple* Markdown.
+    """
+    try:
+        raw = await download_file_for_task(task_id, url)
+        if not raw:
+            print(f"Info [{task_id}]: No bytes downloaded from '{url}'.")
+            return None
+
+        doc = Document(io.BytesIO(raw))
+
+        md_lines = []
+        for p in doc.paragraphs:
+            style = (p.style.name or "").lower()
+            text = _runs_to_md(p.runs).strip()
+            if not text:
+                continue
+
+            if "heading" in style:
+                # e.g. 'Heading 1' → level 1, 'Heading 2' → level 2, etc.
+                level = int("".join(filter(str.isdigit, style)) or 1)
+                md_lines.append(f"{'#' * level} {text}")
+            else:
+                md_lines.append(text)
+
+        return "\n\n".join(md_lines)
+
+    except PackageNotFoundError:
+        print(f"Error [{task_id}]: file from '{url}' is not a valid DOCX.")
+        return None
+    except Exception as e:
+        print(f"Error [{task_id}]: DOCX→MD conversion failed for '{url}': {e}")
+        return None
+
+
+# ---------------------------------------------------------------------------
+# 4. PDF → Markdown (really, plain text with paragraph breaks)
+# ---------------------------------------------------------------------------
+async def convert_pdf_to_markdown(task_id: str, url: str) -> Optional[str]:
+    """
+    Extracts text from a PDF and returns it as Markdown (plain paragraphs).
+    """
+    try:
+        raw = await download_file_for_task(task_id, url)
+        if not raw:
+            print(f"Info [{task_id}]: No bytes downloaded from '{url}'.")
+            return None
+
+        text = pdfminer.high_level.extract_text(io.BytesIO(raw))
+        if not text.strip():
+            print(f"Info [{task_id}]: PDF at '{url}' produced no text.")
+            return ""
+
+        # Very light Markdown: treat each non-empty line as its own paragraph
+        paragraphs = [p.strip() for p in text.splitlines() if p.strip()]
+        return "\n\n".join(paragraphs)
+
+    except (PDFSyntaxError, ValueError) as e:
+        print(f"Error [{task_id}]: PDF syntax error for '{url}': {e}")
+        return None
+    except Exception as e:
+        print(f"Error [{task_id}]: PDF→MD conversion failed for '{url}': {e}")
+        return None
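The Excel handler's core transformation — drop all-NaN rows, then serialize each sheet as a list of row dictionaries — can be checked offline without the download step. A small sketch of just that step, using a hand-built DataFrame in place of `xls.parse()`:

```python
import json

import pandas as pd

# A sheet with one entirely empty row, as it might come out of xls.parse().
df = pd.DataFrame(
    {
        "name": ["alice", float("nan"), "bob"],
        "score": [1.0, float("nan"), 2.0],
    }
)

df.dropna(how="all", inplace=True)  # removes the all-NaN middle row

# Same records format as all_sheets_data[sheet_name] above.
payload = json.dumps({"Sheet1": df.to_dict(orient="records")}, ensure_ascii=False)
print(payload)
# {"Sheet1": [{"name": "alice", "score": 1.0}, {"name": "bob", "score": 2.0}]}
```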
src/file_handler/parse.py ADDED
@@ -0,0 +1,52 @@
+import asyncio
+
+import PIL.Image
+
+from src.file_handler.handlers import (
+    convert_docx_to_markdown,
+    convert_excel_bytes_to_llm_format,
+    convert_file_to_string,
+    convert_image_to_pillow,
+    convert_pdf_to_markdown,
+)
+
+
+async def aparse_file(
+    task_id: str, file_name: str, api_base_url: str
+) -> str | PIL.Image.Image | None:
+    """
+    Parses a file and returns its content in a format suitable for LLMs.
+
+    Args:
+        task_id (str): The ID of the task.
+        file_name (str): The name of the file.
+        api_base_url (str): The base URL of the API.
+
+    Returns:
+        The content of the file: a string for text-like formats, a PIL image
+        for images, or None for unsupported types (e.g. mp3) and failures.
+    """
+    file_extension = file_name.split(".")[-1].lower()
+
+    if file_extension == "xlsx":
+        return await convert_excel_bytes_to_llm_format(task_id, api_base_url)
+    elif file_extension == "docx":
+        return await convert_docx_to_markdown(task_id, api_base_url)
+    elif file_extension in ["jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp"]:
+        return await convert_image_to_pillow(task_id, api_base_url)
+    elif file_extension == "pdf":
+        return await convert_pdf_to_markdown(task_id, api_base_url)
+    elif file_extension == "mp3":
+        return None  # audio is not handled yet
+    else:
+        return await convert_file_to_string(task_id, api_base_url)
+
+
+def parse_file(
+    task_id: str, file_name: str, api_base_url: str
+) -> str | PIL.Image.Image | None:
+    """
+    Synchronous wrapper around aparse_file.
+
+    Note: asyncio.run() starts a fresh event loop, so this must not be called
+    from code that is already running inside an event loop.
+    """
+    return asyncio.run(aparse_file(task_id, file_name, api_base_url))
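Because `parse_file` calls `asyncio.run`, it only works from synchronous code (such as `Agent.__call__` running in `run_in_executor`'s worker thread); async callers should await `aparse_file` directly. A sketch of both call sites, with a placeholder task ID:

```python
import asyncio

from src.file_handler.parse import aparse_file, parse_file

API_URL = "https://agents-course-unit4-scoring.hf.space"

# Synchronous call site (e.g. inside Agent.__call__):
content = parse_file("some-task-id", "table.xlsx", API_URL)  # placeholder ID


# An async call site must use the coroutine to avoid nesting event loops:
async def main() -> None:
    content = await aparse_file("some-task-id", "table.xlsx", API_URL)
    print(type(content))


asyncio.run(main())
```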
src/tracing.py ADDED
@@ -0,0 +1,33 @@
+import base64
+import os
+
+from dotenv import load_dotenv
+from openinference.instrumentation.smolagents import SmolagentsInstrumentor
+from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
+    OTLPSpanExporter,
+)
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+
+load_dotenv()
+
+LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY")
+LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY")
+
+LANGFUSE_AUTH = base64.b64encode(
+    f"{LANGFUSE_PUBLIC_KEY}:{LANGFUSE_SECRET_KEY}".encode()
+).decode()
+
+os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = (
+    "https://cloud.langfuse.com/api/public/otel"  # EU data region
+)
+os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = (
+    f"Authorization=Basic {LANGFUSE_AUTH}"
+)
+
+
+def add_tracing():
+    trace_provider = TracerProvider()
+    trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))
+
+    SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)
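As wired up in app.py above, `add_tracing()` should run once at process start, before any agent work, so the SmolagentsInstrumentor patches smolagents before the first span is produced. A minimal usage sketch, assuming LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY are set in .env:

```python
from agent import Agent
from src.tracing import add_tracing

add_tracing()  # instrument smolagents before creating or running agents

agent = Agent()
answer = agent("What is 2 + 2?", file_name="")  # no attachment for this question
print(answer)
```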