mhattingpete committed
Commit 912f746 · 1 Parent(s): 0866aba

add first version of agent
.gitignore CHANGED
@@ -1,3 +1,5 @@
 answer_cache.json
 uv.lock
 .venv
+.env
+__pycache__
agent.py CHANGED
@@ -1,10 +1,67 @@
-# --- Basic Agent Definition ---
-# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
+import os
+
+import PIL.Image
+from dotenv import load_dotenv
+from loguru import logger
+from smolagents import AzureOpenAIServerModel, CodeAgent
+
+from src.file_handler.parse import parse_file
+
+load_dotenv()
+
+
 class Agent:
     def __init__(self):
-        print("BasicAgent initialized.")
-    def __call__(self, question: str) -> str:
-        print(f"Agent received question (first 50 chars): {question[:50]}...")
-        fixed_answer = "This is a default answer."
-        print(f"Agent returning fixed answer: {fixed_answer}")
-        return fixed_answer
+        model = AzureOpenAIServerModel(
+            model_id=os.getenv("AZURE_OPENAI_MODEL_ID"),
+            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
+            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
+            api_version=os.getenv("OPENAI_API_VERSION"),
+        )
+        self.agent = CodeAgent(
+            tools=[],
+            model=model,
+            add_base_tools=True,  # include the smolagents base toolset
+            # planning_interval=3,
+        )
+        logger.info("BasicAgent initialized.")
+
+    def __call__(self, question: str, file_name: str) -> str:
+        logger.info(
+            f"Agent received question (first 50 chars): {question[:50]}..."
+        )
+        images = None
+
+        if file_name:
+            content = parse_file(task_id, file_name, api_url)
+            if content:
+                if isinstance(
+                    content, PIL.Image.Image
+                ):  # pass the image to the model directly
+                    images = [content]
+                else:  # append parsed content to the question
+                    question += f"\n\nAttached content:\n{content}"
+                    logger.info(f"Question with content: {question}")
+
+        answer = self.agent.run(question, images=images)
+        logger.info(f"Agent returning answer: {answer}")
+        return answer
+
+
+if __name__ == "__main__":
+    import requests
+
+    api_url = "https://agents-course-unit4-scoring.hf.space"
+    question_url = f"{api_url}/random-question"
+
+    data = requests.get(question_url).json()
+    agent = Agent()
+
+    task_id = data["task_id"]
+    question = data["question"]
+    file_name = data["file_name"]
+    logger.info(
+        f"Task ID: {task_id}\nQuestion: {question}\nFile Name: {file_name}\n\n"
+    )
+
+    answer = agent(question, file_name)
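Note: `__call__` reads `task_id` and `api_url`, which are bound only at module scope by the `__main__` block, so importing `Agent` from app.py and passing a `file_name` raises `NameError`. One hypothetical way to close the gap, not part of this commit, is to thread the task context through explicitly; `load_attachment` and its default URL below are illustrative:

from typing import Optional, Union

import PIL.Image

from src.file_handler.parse import parse_file

# Hypothetical default; app.py exposes the same URL as DEFAULT_API_URL.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"


def load_attachment(
    task_id: str, file_name: str, api_url: str = DEFAULT_API_URL
) -> Optional[Union[str, PIL.Image.Image]]:
    """Fetch and parse a task's attachment without relying on module globals."""
    if not file_name:
        return None
    return parse_file(task_id, file_name, api_url)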
app.py CHANGED
@@ -8,6 +8,9 @@ import gradio as gr
 import pandas as pd
 
 from agent import Agent
+from src.tracing import add_tracing
+
+add_tracing()
 
 # --- Constants --------------------------------------------------------------
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
@@ -55,6 +58,7 @@ async def _run_agent_async(
     agent: Agent,
     question: str,
     task_id: str | int,
+    file_name: str,
     cache: dict[str, str],
     semaphore: asyncio.Semaphore,
 ) -> tuple[str | int, str]:
@@ -68,7 +72,7 @@ async def _run_agent_async(
     loop = asyncio.get_running_loop()
     async with semaphore:
         answer = await loop.run_in_executor(
-            None, agent, question
+            None, agent, question, file_name
         )  # execute in default thread‑pool
         cache[str(task_id)] = answer
         return task_id, answer
@@ -109,7 +113,9 @@ async def _async_main(profile: gr.OAuthProfile | None):
     cache = load_cache()
     sem = asyncio.Semaphore(MAX_CONCURRENCY)
     coros = [
-        _run_agent_async(agent, q["question"], q["task_id"], cache, sem)
+        _run_agent_async(
+            agent, q["question"], q["task_id"], q["file_name"], cache, sem
+        )
         for q in questions
         if q.get("task_id") and q.get("question") is not None
     ]
@@ -185,9 +191,8 @@ with gr.Blocks() as demo:
         """
         **Quick‑start**
 
-        1. Fork this space, bring your own `Agent` in `agent.py`.
-        2. Log in with the HF button (needed for ranking).
-        3. Hit **Run Evaluation & Submit All Answers** – answers are cached
+        1. Log in with the HF button (needed for ranking).
+        2. Hit **Run Evaluation & Submit All Answers** – answers are cached
            locally so reruns are instant; agent calls & HTTP are parallel.
         """
     )
src/file_handler/get_file.py ADDED
@@ -0,0 +1,25 @@
+import aiohttp
+
+
+async def download_file_for_task(task_id: str, api_base_url: str) -> bytes:
+    """
+    Asynchronously downloads a specific file associated with a given task ID.
+    This function performs a GET request to the endpoint /files/{task_id}.
+
+    Args:
+        task_id: The identifier of the task for which to download the file.
+        api_base_url: The base URL of the API.
+
+    Returns:
+        The content of the file as bytes.
+
+    Raises:
+        aiohttp.ClientResponseError: If the API returns an error status (e.g., 404 Not Found, 500 Internal Server Error).
+        aiohttp.ClientError: For other client-side errors, such as network connection issues.
+    """
+    url = f"{api_base_url}/files/{task_id}"
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url) as response:
+            response.raise_for_status()  # Raise an exception for HTTP error codes (4xx or 5xx)
+            file_bytes = await response.read()
+            return file_bytes
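For a quick sanity check, the coroutine can be driven from synchronous code with asyncio.run; the task ID below is a placeholder, real IDs come from the scoring API:

import asyncio

from src.file_handler.get_file import download_file_for_task

api_url = "https://agents-course-unit4-scoring.hf.space"
# "some-task-id" is a placeholder; real IDs come from /random-question.
file_bytes = asyncio.run(download_file_for_task("some-task-id", api_url))
print(f"Downloaded {len(file_bytes)} bytes")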
src/file_handler/handlers.py ADDED
@@ -0,0 +1,227 @@
+import io
+import json
+from typing import Optional
+
+import pandas as pd
+import pdfminer.high_level
+import PIL.Image
+from docx import Document
+from docx.opc.exceptions import PackageNotFoundError
+from pdfminer.pdfparser import PDFSyntaxError
+
+from src.file_handler.get_file import download_file_for_task
+
+
+async def convert_excel_bytes_to_llm_format(task_id: str, url: str) -> Optional[str]:
+    """
+    Downloads an Excel file using download_file_for_task, removes empty rows
+    from each sheet, and converts its content to an LLM-friendly JSON string.
+
+    Args:
+        task_id (str): The identifier for the task, used by download_file_for_task.
+        url (str): The base URL of the API the file is downloaded from.
+
+    Returns:
+        str: A JSON string whose keys are sheet names and whose values are
+            lists of dictionaries (each dictionary representing a row, with
+            column headers as keys).
+        Returns None if a critical error occurs (e.g., download failure,
+            file unparseable).
+        Returns an empty JSON object if the Excel file is valid but contains
+            no sheets or no data after cleaning.
+    """
+    try:
+        file_bytes = await download_file_for_task(task_id, url)
+
+        if not file_bytes:
+            print(f"Info [{task_id}]: No content downloaded from URL '{url}'.")
+            # Depending on desired behavior, could return "{}" or raise an error.
+            # Returning None indicates a problem preventing processing.
+            return None
+
+        # Use io.BytesIO to treat the bytes as a file-like object for pandas
+        excel_buffer = io.BytesIO(file_bytes)
+
+        # Use pd.ExcelFile to efficiently parse Excel files, especially with multiple sheets.
+        # This will raise an error (e.g., ValueError, various zipfile/xlrd/openpyxl errors)
+        # if the file is not a valid Excel format or is corrupted.
+        xls = pd.ExcelFile(excel_buffer)
+
+        if not xls.sheet_names:
+            print(
+                f"Info [{task_id}]: Excel file from URL '{url}' has no sheets."
+            )
+            return "{}"  # No sheets means no data to process
+
+        all_sheets_data = {}
+        for sheet_name in xls.sheet_names:
+            # Parse the current sheet into a DataFrame
+            df = xls.parse(sheet_name)
+
+            # Remove rows where all cells are NaN (these are considered empty rows)
+            df.dropna(how="all", inplace=True)
+
+            # Convert the cleaned DataFrame to a list of dictionaries (records format).
+            # If a sheet becomes empty after dropna, to_dict(orient='records')
+            # will correctly produce an empty list for that sheet's data.
+            all_sheets_data[sheet_name] = df.to_dict(orient="records")
+
+        return json.dumps(all_sheets_data, ensure_ascii=False)
+
+    except pd.errors.ParserError as e:
+        # Handles errors during the parsing of sheet data by pandas.
+        print(
+            f"Error [{task_id}]: Pandas parsing error for Excel file from '{url}': {e}"
+        )
+        return None
+    except ValueError as e:
+        # Catches errors like "Excel file format cannot be determined..." from pd.ExcelFile
+        # or other value-related issues during parsing.
+        print(
+            f"Error [{task_id}]: Value error processing Excel file from '{url}': {e}"
+        )
+        return None
+    except Exception as e:
+        # Catch-all for other unexpected errors (e.g., network issues raised by
+        # download_file_for_task, or other pandas/library issues).
+        # It's good practice to log the full traceback for unexpected errors in a real app:
+        # import traceback
+        # traceback.print_exc()
+        print(
+            f"Error [{task_id}]: Unexpected error processing Excel file from '{url}': {e}"
+        )
+        return None
+
+
+# ---------------------------------------------------------------------------
+# 1. Image → PIL Image
+# ---------------------------------------------------------------------------
+async def convert_image_to_pillow(
+    task_id: str, url: str
+) -> Optional[PIL.Image.Image]:
+    """
+    Downloads an image file and returns a PIL Image object.
+    Returns None on failure.
+
+    Args:
+        task_id (str): The ID of the task.
+        url (str): The base URL of the API the file is downloaded from.
+
+    Returns:
+        Optional[PIL.Image.Image]: The PIL Image object or None on failure.
+    """
+    try:
+        raw = await download_file_for_task(task_id, url)
+        if not raw:
+            print(f"Info [{task_id}]: No bytes downloaded from '{url}'.")
+            return None
+
+        return PIL.Image.open(io.BytesIO(raw))
+
+    except Exception as e:
+        print(f"Error [{task_id}]: converting image from '{url}' → PIL image: {e}")
+        return None
+
+
+# ---------------------------------------------------------------------------
+# 2. File → UTF‑8 string
+# ---------------------------------------------------------------------------
+async def convert_file_to_string(task_id: str, url: str) -> Optional[str]:
+    """
+    Downloads a file and returns its text (UTF‑8, errors replaced).
+    """
+    try:
+        raw = await download_file_for_task(task_id, url)
+        if not raw:
+            print(f"Info [{task_id}]: No bytes downloaded from '{url}'.")
+            return None
+
+        return raw.decode("utf-8", errors="replace")
+
+    except Exception as e:
+        print(f"Error [{task_id}]: decoding file from '{url}': {e}")
+        return None
+
+
+# ---------------------------------------------------------------------------
+# 3. DOCX → Markdown
+# ---------------------------------------------------------------------------
+def _runs_to_md(runs):
+    """Helper – convert a list of runs to markdown inline‑text."""
+    out = []
+    for run in runs:
+        text = run.text.replace("\n", " ")
+        if not text:
+            continue
+        if run.bold:
+            text = f"**{text}**"
+        if run.italic:
+            text = f"*{text}*"
+        out.append(text)
+    return "".join(out)
+
+
+async def convert_docx_to_markdown(task_id: str, url: str) -> Optional[str]:
+    """
+    Converts a Word document to *simple* Markdown.
+    """
+    try:
+        raw = await download_file_for_task(task_id, url)
+        if not raw:
+            print(f"Info [{task_id}]: No bytes downloaded from '{url}'.")
+            return None
+
+        doc = Document(io.BytesIO(raw))
+
+        md_lines = []
+        for p in doc.paragraphs:
+            style = (p.style.name or "").lower()
+            text = _runs_to_md(p.runs).strip()
+            if not text:
+                continue
+
+            if "heading" in style:
+                # e.g. 'Heading 1' → level 1, 'Heading 2' → level 2, etc.
+                level = int("".join(filter(str.isdigit, style)) or 1)
+                md_lines.append(f"{'#' * level} {text}")
+            else:
+                md_lines.append(text)
+
+        return "\n\n".join(md_lines)
+
+    except PackageNotFoundError:
+        print(f"Error [{task_id}]: file from '{url}' is not a valid DOCX.")
+        return None
+    except Exception as e:
+        print(f"Error [{task_id}]: DOCX→MD conversion failed for '{url}': {e}")
+        return None
+
+
+# ---------------------------------------------------------------------------
+# 4. PDF → Markdown (really, plain text with paragraph breaks)
+# ---------------------------------------------------------------------------
+async def convert_pdf_to_markdown(task_id: str, url: str) -> Optional[str]:
+    """
+    Extracts text from a PDF and returns it as Markdown (plain paragraphs).
+    """
+    try:
+        raw = await download_file_for_task(task_id, url)
+        if not raw:
+            print(f"Info [{task_id}]: No bytes downloaded from '{url}'.")
+            return None
+
+        text = pdfminer.high_level.extract_text(io.BytesIO(raw))
+        if not text.strip():
+            print(f"Info [{task_id}]: PDF at '{url}' produced no text.")
+            return ""
+
+        # Very light Markdown: emit each non-empty line as its own paragraph
+        paragraphs = [p.strip() for p in text.splitlines() if p.strip()]
+        return "\n\n".join(paragraphs)
+
+    except (PDFSyntaxError, ValueError) as e:
+        print(f"Error [{task_id}]: PDF syntax error for '{url}': {e}")
+        return None
+    except Exception as e:
+        print(f"Error [{task_id}]: PDF→MD conversion failed for '{url}': {e}")
+        return None
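To illustrate the shape convert_excel_bytes_to_llm_format returns, here is the same per-sheet transformation applied to a toy DataFrame; the sheet name and values are invented for illustration:

import json

import pandas as pd

# Toy data standing in for a downloaded workbook; the converter applies
# dropna(how="all") and to_dict(orient="records") to each sheet.
df = pd.DataFrame({"item": ["pen", "book"], "price": [1.5, 12.0]})
df.dropna(how="all", inplace=True)
print(json.dumps({"Sales": df.to_dict(orient="records")}, ensure_ascii=False))
# {"Sales": [{"item": "pen", "price": 1.5}, {"item": "book", "price": 12.0}]}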
src/file_handler/parse.py ADDED
@@ -0,0 +1,60 @@
+import asyncio
+
+import PIL.Image
+
+from src.file_handler.handlers import (
+    convert_docx_to_markdown,
+    convert_excel_bytes_to_llm_format,
+    convert_file_to_string,
+    convert_image_to_pillow,
+    convert_pdf_to_markdown,
+)
+
+
+async def aparse_file(
+    task_id: str, file_name: str, api_base_url: str
+) -> str | PIL.Image.Image | None:
+    """
+    Parses a file and returns its content in a format suitable for LLMs.
+
+    Args:
+        task_id (str): The ID of the task.
+        file_name (str): The name of the file.
+        api_base_url (str): The base URL of the API.
+
+    Returns:
+        The file content as text for most formats, a PIL image for image
+        files, or None if the format is unsupported or parsing fails.
+    """
+    file_extension = file_name.split(".")[-1]
+
+    if file_extension == "xlsx":
+        return await convert_excel_bytes_to_llm_format(task_id, api_base_url)
+    elif file_extension == "docx":
+        return await convert_docx_to_markdown(task_id, api_base_url)
+    elif file_extension in ["jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp"]:
+        return await convert_image_to_pillow(task_id, api_base_url)
+    elif file_extension == "pdf":
+        return await convert_pdf_to_markdown(task_id, api_base_url)
+    elif file_extension == "mp3":
+        return None  # audio files are not handled yet
+    else:
+        return await convert_file_to_string(task_id, api_base_url)
+
+
+def parse_file(
+    task_id: str, file_name: str, api_base_url: str
+) -> str | PIL.Image.Image | None:
+    """
+    Synchronous wrapper around aparse_file.
+
+    Args:
+        task_id (str): The ID of the task.
+        file_name (str): The name of the file.
+        api_base_url (str): The base URL of the API.
+
+    Returns:
+        The file content as text for most formats, a PIL image for image
+        files, or None if the format is unsupported or parsing fails.
+    """
+    return asyncio.run(aparse_file(task_id, file_name, api_base_url))
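A short usage sketch of the dispatcher; the task ID and file names are placeholders, and since parse_file wraps asyncio.run it must be called from synchronous code with no event loop running (as app.py's thread-pool workers do):

import PIL.Image

from src.file_handler.parse import parse_file

api_url = "https://agents-course-unit4-scoring.hf.space"

text = parse_file("some-task-id", "report.pdf", api_url)   # Markdown-ish text
image = parse_file("some-task-id", "chart.png", api_url)   # PIL.Image.Image
audio = parse_file("some-task-id", "clip.mp3", api_url)    # None (mp3 unsupported)

if isinstance(image, PIL.Image.Image):
    print(image.size)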
src/tracing.py ADDED
@@ -0,0 +1,33 @@
+import base64
+import os
+
+from dotenv import load_dotenv
+from openinference.instrumentation.smolagents import SmolagentsInstrumentor
+from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
+    OTLPSpanExporter,
+)
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+
+load_dotenv()
+
+LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY")
+LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY")
+
+LANGFUSE_AUTH = base64.b64encode(
+    f"{LANGFUSE_PUBLIC_KEY}:{LANGFUSE_SECRET_KEY}".encode()
+).decode()
+
+os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = (
+    "https://cloud.langfuse.com/api/public/otel"  # EU data region
+)
+os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = (
+    f"Authorization=Basic {LANGFUSE_AUTH}"
+)
+
+
+def add_tracing():
+    trace_provider = TracerProvider()
+    trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))
+
+    SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)
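A minimal sketch of the intended call order, mirroring what app.py does at import time (assumes LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY are set in .env):

from src.tracing import add_tracing
from agent import Agent

add_tracing()  # install the OTLP exporter and smolagents instrumentor first
agent = Agent()  # agent runs from here on are exported to Langfuse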