Commit 912f746 · Parent: 0866aba
add first version of agent

Files changed:
- .gitignore +2 -0
- agent.py +65 -8
- app.py +10 -5
- src/file_handler/get_file.py +25 -0
- src/file_handler/handlers.py +227 -0
- src/file_handler/parse.py +52 -0
- src/tracing.py +33 -0
.gitignore CHANGED
@@ -1,3 +1,5 @@
 answer_cache.json
 uv.lock
 .venv
+.env
+__pycache__
agent.py CHANGED
@@ -1,10 +1,67 @@
-
-
+import os
+
+import PIL.Image
+from dotenv import load_dotenv
+from loguru import logger
+from smolagents import AzureOpenAIServerModel, CodeAgent
+
+from src.file_handler.parse import parse_file
+
+load_dotenv()
+
+
 class Agent:
     def __init__(self):
-
-
-
-
-
-
+        model = AzureOpenAIServerModel(
+            model_id=os.getenv("AZURE_OPENAI_MODEL_ID"),
+            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
+            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
+            api_version=os.getenv("OPENAI_API_VERSION"),
+        )
+        self.agent = CodeAgent(
+            tools=[],
+            model=model,
+            add_base_tools=True,  # include smolagents' default toolbox
+            # planning_interval=3,
+        )
+        logger.info("Agent initialized.")
+
+    def __call__(
+        self,
+        question: str,
+        file_name: str,
+        task_id: str = "",
+        api_url: str = "https://agents-course-unit4-scoring.hf.space",
+    ) -> str:
+        logger.info(f"Agent received question (first 50 chars): {question[:50]}...")
+        images = None
+
+        if file_name:
+            # task_id and api_url identify the attachment on the scoring API
+            content = parse_file(task_id, file_name, api_url)
+            if content:
+                if isinstance(content, PIL.Image.Image):  # pass images separately
+                    images = [content]
+                else:  # append textual content to the question
+                    question += f"\n\nAttached content:\n{content}"
+                    logger.info(f"Question with content: {question}")
+
+        answer = self.agent.run(question, images=images)
+        logger.info(f"Agent returning answer: {answer}")
+        return answer
+
+
+if __name__ == "__main__":
+    import requests
+
+    api_url = "https://agents-course-unit4-scoring.hf.space"
+    question_url = f"{api_url}/random-question"
+
+    data = requests.get(question_url).json()
+    agent = Agent()
+
+    task_id = data["task_id"]
+    question = data["question"]
+    file_name = data["file_name"]
+    logger.info(
+        f"Task ID: {task_id}\nQuestion: {question}\nFile Name: {file_name}\n\n"
+    )
+
+    answer = agent(question, file_name, task_id, api_url)
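A note on configuration: the constructor above reads four variables from the environment (loaded via `load_dotenv()`). A minimal sanity check before constructing the agent might look like the sketch below; the variable names are taken from the diff, the check itself is illustrative:

```python
import os

from dotenv import load_dotenv

load_dotenv()

# Names as read by Agent.__init__ above; fail fast if any is missing.
REQUIRED_VARS = [
    "AZURE_OPENAI_MODEL_ID",
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_API_KEY",
    "OPENAI_API_VERSION",
]

missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
if missing:
    raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")
```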
app.py CHANGED
@@ -8,6 +8,9 @@ import gradio as gr
 import pandas as pd
 
 from agent import Agent
+from src.tracing import add_tracing
+
+add_tracing()
 
 # --- Constants --------------------------------------------------------------
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
@@ -55,6 +58,7 @@ async def _run_agent_async(
     agent: Agent,
     question: str,
     task_id: str | int,
+    file_name: str,
     cache: dict[str, str],
     semaphore: asyncio.Semaphore,
 ) -> tuple[str | int, str]:
@@ -68,7 +72,7 @@ async def _run_agent_async(
     loop = asyncio.get_running_loop()
     async with semaphore:
         answer = await loop.run_in_executor(
-            None, agent, question
+            None, agent, question, file_name, task_id, DEFAULT_API_URL
         )  # execute in default thread-pool
     cache[str(task_id)] = answer
     return task_id, answer
@@ -109,7 +113,9 @@ async def _async_main(profile: gr.OAuthProfile | None):
     cache = load_cache()
     sem = asyncio.Semaphore(MAX_CONCURRENCY)
     coros = [
-        _run_agent_async(agent, q["question"], q["task_id"], cache, sem)
+        _run_agent_async(
+            agent, q["question"], q["task_id"], q["file_name"], cache, sem
+        )
        for q in questions
        if q.get("task_id") and q.get("question") is not None
     ]
@@ -185,9 +191,8 @@ with gr.Blocks() as demo:
     """
     **Quick-start**
 
-    1.
-    2.
-    3. Hit **Run Evaluation & Submit All Answers** – answers are cached
+    1. Log in with the HF button (needed for ranking).
+    2. Hit **Run Evaluation & Submit All Answers** – answers are cached
        locally so reruns are instant; agent calls & HTTP are parallel.
     """
 )
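The pattern in `_run_agent_async` — a blocking callable fanned out through `loop.run_in_executor` with an `asyncio.Semaphore` cap — can be exercised in isolation. A minimal sketch; the `slow_agent` stand-in is hypothetical:

```python
import asyncio
import time

MAX_CONCURRENCY = 2  # mirrors the constant used in app.py


def slow_agent(question: str) -> str:
    # Hypothetical stand-in for the blocking Agent.__call__.
    time.sleep(1)
    return f"answer to {question!r}"


async def run_one(question: str, sem: asyncio.Semaphore) -> str:
    loop = asyncio.get_running_loop()
    async with sem:  # at most MAX_CONCURRENCY executor jobs at once
        return await loop.run_in_executor(None, slow_agent, question)


async def main() -> None:
    sem = asyncio.Semaphore(MAX_CONCURRENCY)
    answers = await asyncio.gather(*(run_one(f"q{i}", sem) for i in range(5)))
    print(answers)


asyncio.run(main())
```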
src/file_handler/get_file.py ADDED
@@ -0,0 +1,25 @@
+import aiohttp
+
+
+async def download_file_for_task(task_id: str, api_base_url: str) -> bytes:
+    """
+    Asynchronously downloads the file associated with a given task ID.
+    This function performs a GET request to the endpoint /files/{task_id}.
+
+    Args:
+        task_id: The identifier of the task for which to download the file.
+        api_base_url: The base URL of the API.
+
+    Returns:
+        The content of the file as bytes.
+
+    Raises:
+        aiohttp.ClientResponseError: If the API returns an error status
+            (e.g., 404 Not Found, 500 Internal Server Error).
+        aiohttp.ClientError: For other client-side errors, such as network
+            connection issues.
+    """
+    url = f"{api_base_url}/files/{task_id}"
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url) as response:
+            response.raise_for_status()  # raise on HTTP 4xx/5xx
+            file_bytes = await response.read()
+            return file_bytes
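Usage from synchronous code is a one-liner; a sketch with a placeholder task ID (the `/files/{task_id}` endpoint shape comes from the function above):

```python
import asyncio

from src.file_handler.get_file import download_file_for_task

API_URL = "https://agents-course-unit4-scoring.hf.space"

# "some-task-id" is a placeholder; real IDs come from /random-question.
data = asyncio.run(download_file_for_task("some-task-id", API_URL))
print(f"downloaded {len(data)} bytes")
```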
src/file_handler/handlers.py ADDED
@@ -0,0 +1,227 @@
+import io
+import json
+from typing import Optional
+
+import pandas as pd
+import pdfminer.high_level
+import PIL.Image
+from docx import Document
+from docx.opc.exceptions import PackageNotFoundError
+from pdfminer.pdfparser import PDFSyntaxError
+
+from src.file_handler.get_file import download_file_for_task
+
+
+async def convert_excel_bytes_to_llm_format(task_id: str, url: str) -> Optional[str]:
+    """
+    Downloads an Excel file using download_file_for_task, removes empty rows
+    from each sheet, and converts its content to an LLM-friendly format.
+
+    Args:
+        task_id (str): The identifier for the task, used by download_file_for_task.
+        url (str): The base URL of the API from which the file is downloaded.
+
+    Returns:
+        str: A JSON string mapping sheet names to lists of dictionaries
+            (each dictionary representing a row, with column headers as keys).
+        Returns None if a critical error occurs (e.g., download failure,
+            file unparseable).
+        Returns an empty JSON object if the Excel file is valid but contains
+            no sheets.
+    """
+    try:
+        file_bytes = await download_file_for_task(task_id, url)
+
+        if not file_bytes:
+            print(f"Info [{task_id}]: No content downloaded from URL '{url}'.")
+            # Returning None indicates a problem preventing processing.
+            return None
+
+        # Use io.BytesIO to treat the bytes as a file-like object for pandas
+        excel_buffer = io.BytesIO(file_bytes)
+
+        # pd.ExcelFile efficiently parses Excel files, especially with multiple
+        # sheets. It raises (e.g., ValueError, various zipfile/xlrd/openpyxl
+        # errors) if the file is not a valid Excel format or is corrupted.
+        xls = pd.ExcelFile(excel_buffer)
+
+        if not xls.sheet_names:
+            print(
+                f"Info [{task_id}]: Excel file from URL '{url}' has no sheets."
+            )
+            return json.dumps({})  # no sheets means no data to process
+
+        all_sheets_data = {}
+        for sheet_name in xls.sheet_names:
+            # Parse the current sheet into a DataFrame
+            df = xls.parse(sheet_name)
+
+            # Remove rows where all cells are NaN (these are considered empty rows)
+            df.dropna(how="all", inplace=True)
+
+            # Convert the cleaned DataFrame to a list of dictionaries (records
+            # format). A sheet left empty after dropna correctly produces an
+            # empty list for that sheet's data.
+            all_sheets_data[sheet_name] = df.to_dict(orient="records")
+
+        return json.dumps(all_sheets_data, ensure_ascii=False)
+
+    except pd.errors.ParserError as e:
+        # Handles errors during the parsing of sheet data by pandas.
+        print(
+            f"Error [{task_id}]: Pandas parsing error for Excel file from '{url}': {e}"
+        )
+        return None
+    except ValueError as e:
+        # Catches errors like "Excel file format cannot be determined..." from
+        # pd.ExcelFile, or other value-related issues during parsing.
+        print(
+            f"Error [{task_id}]: Value error processing Excel file from '{url}': {e}"
+        )
+        return None
+    except Exception as e:
+        # Catch-all for other unexpected errors. In a real app, log the full
+        # traceback:
+        # import traceback
+        # traceback.print_exc()
+        print(
+            f"Error [{task_id}]: Unexpected error processing Excel file from '{url}': {e}"
+        )
+        return None
+
+
+# ---------------------------------------------------------------------------
+# 1. Image → PIL Image
+# ---------------------------------------------------------------------------
+async def convert_image_to_pillow(
+    task_id: str, url: str
+) -> Optional[PIL.Image.Image]:
+    """
+    Downloads an image file and returns a PIL Image object.
+    Returns None on failure.
+
+    Args:
+        task_id (str): The ID of the task.
+        url (str): The base URL of the API from which the file is downloaded.
+
+    Returns:
+        Optional[PIL.Image.Image]: The PIL Image object, or None on failure.
+    """
+    try:
+        raw = await download_file_for_task(task_id, url)
+        if not raw:
+            print(f"Info [{task_id}]: No bytes downloaded from '{url}'.")
+            return None
+
+        return PIL.Image.open(io.BytesIO(raw))
+
+    except Exception as e:
+        print(f"Error [{task_id}]: converting image from '{url}' to PIL: {e}")
+        return None
+
+
+# ---------------------------------------------------------------------------
+# 2. File → UTF-8 string
+# ---------------------------------------------------------------------------
+async def convert_file_to_string(task_id: str, url: str) -> Optional[str]:
+    """
+    Downloads a file and returns its text (UTF-8, errors replaced).
+    """
+    try:
+        raw = await download_file_for_task(task_id, url)
+        if not raw:
+            print(f"Info [{task_id}]: No bytes downloaded from '{url}'.")
+            return None
+
+        return raw.decode("utf-8", errors="replace")
+
+    except Exception as e:
+        print(f"Error [{task_id}]: decoding file from '{url}': {e}")
+        return None
+
+
+# ---------------------------------------------------------------------------
+# 3. DOCX → Markdown
+# ---------------------------------------------------------------------------
+def _runs_to_md(runs):
+    """Helper – convert a list of runs to Markdown inline text."""
+    out = []
+    for run in runs:
+        text = run.text.replace("\n", " ")
+        if not text:
+            continue
+        if run.bold:
+            text = f"**{text}**"
+        if run.italic:
+            text = f"*{text}*"
+        out.append(text)
+    return "".join(out)
+
+
+async def convert_docx_to_markdown(task_id: str, url: str) -> Optional[str]:
+    """
+    Converts a Word document to *simple* Markdown.
+    """
+    try:
+        raw = await download_file_for_task(task_id, url)
+        if not raw:
+            print(f"Info [{task_id}]: No bytes downloaded from '{url}'.")
+            return None
+
+        doc = Document(io.BytesIO(raw))
+
+        md_lines = []
+        for p in doc.paragraphs:
+            style = (p.style.name or "").lower()
+            text = _runs_to_md(p.runs).strip()
+            if not text:
+                continue
+
+            if "heading" in style:
+                # e.g. 'Heading 1' → level 1, 'Heading 2' → level 2, etc.
+                level = int("".join(filter(str.isdigit, style)) or 1)
+                md_lines.append(f"{'#' * level} {text}")
+            else:
+                md_lines.append(text)
+
+        return "\n\n".join(md_lines)
+
+    except PackageNotFoundError:
+        print(f"Error [{task_id}]: file from '{url}' is not a valid DOCX.")
+        return None
+    except Exception as e:
+        print(f"Error [{task_id}]: DOCX→MD conversion failed for '{url}': {e}")
+        return None
+
+
+# ---------------------------------------------------------------------------
+# 4. PDF → Markdown (really, plain text with paragraph breaks)
+# ---------------------------------------------------------------------------
+async def convert_pdf_to_markdown(task_id: str, url: str) -> Optional[str]:
+    """
+    Extracts text from a PDF and returns it as Markdown (plain paragraphs).
+    """
+    try:
+        raw = await download_file_for_task(task_id, url)
+        if not raw:
+            print(f"Info [{task_id}]: No bytes downloaded from '{url}'.")
+            return None
+
+        text = pdfminer.high_level.extract_text(io.BytesIO(raw))
+        if not text.strip():
+            print(f"Info [{task_id}]: PDF at '{url}' produced no text.")
+            return ""
+
+        # Very light Markdown: treat each non-empty line as its own paragraph
+        paragraphs = [p.strip() for p in text.splitlines() if p.strip()]
+        return "\n\n".join(paragraphs)
+
+    except (PDFSyntaxError, ValueError) as e:
+        print(f"Error [{task_id}]: PDF syntax error for '{url}': {e}")
+        return None
+    except Exception as e:
+        print(f"Error [{task_id}]: PDF→MD conversion failed for '{url}': {e}")
+        return None
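The Excel handler's core transformation — drop all-NaN rows, then serialize each sheet as a list of row dictionaries — can be checked offline without the download step. A small sketch of just that step, using a hand-built DataFrame in place of `xls.parse()`:

```python
import json

import pandas as pd

# A sheet with one entirely empty row, as it might come out of xls.parse().
df = pd.DataFrame(
    {
        "name": ["alice", float("nan"), "bob"],
        "score": [1.0, float("nan"), 2.0],
    }
)

df.dropna(how="all", inplace=True)  # removes the all-NaN middle row

# Same records format as all_sheets_data[sheet_name] above.
payload = json.dumps({"Sheet1": df.to_dict(orient="records")}, ensure_ascii=False)
print(payload)
# {"Sheet1": [{"name": "alice", "score": 1.0}, {"name": "bob", "score": 2.0}]}
```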
src/file_handler/parse.py ADDED
@@ -0,0 +1,52 @@
+import asyncio
+
+import PIL.Image
+
+from src.file_handler.handlers import (
+    convert_docx_to_markdown,
+    convert_excel_bytes_to_llm_format,
+    convert_file_to_string,
+    convert_image_to_pillow,
+    convert_pdf_to_markdown,
+)
+
+
+async def aparse_file(
+    task_id: str, file_name: str, api_base_url: str
+) -> str | PIL.Image.Image | None:
+    """
+    Parses a file and returns its content in a format suitable for LLMs.
+
+    Args:
+        task_id (str): The ID of the task.
+        file_name (str): The name of the file.
+        api_base_url (str): The base URL of the API.
+
+    Returns:
+        The content of the file: a string for text-like formats, a PIL image
+        for images, or None for unsupported types (e.g. mp3) and failures.
+    """
+    file_extension = file_name.split(".")[-1].lower()
+
+    if file_extension == "xlsx":
+        return await convert_excel_bytes_to_llm_format(task_id, api_base_url)
+    elif file_extension == "docx":
+        return await convert_docx_to_markdown(task_id, api_base_url)
+    elif file_extension in ["jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp"]:
+        return await convert_image_to_pillow(task_id, api_base_url)
+    elif file_extension == "pdf":
+        return await convert_pdf_to_markdown(task_id, api_base_url)
+    elif file_extension == "mp3":
+        return None  # audio is not handled yet
+    else:
+        return await convert_file_to_string(task_id, api_base_url)
+
+
+def parse_file(
+    task_id: str, file_name: str, api_base_url: str
+) -> str | PIL.Image.Image | None:
+    """
+    Synchronous wrapper around aparse_file.
+
+    Note: asyncio.run() starts a fresh event loop, so this must not be called
+    from code that is already running inside an event loop.
+    """
+    return asyncio.run(aparse_file(task_id, file_name, api_base_url))
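Because `parse_file` calls `asyncio.run`, it only works from synchronous code (such as `Agent.__call__` running in `run_in_executor`'s worker thread); async callers should await `aparse_file` directly. A sketch of both call sites, with a placeholder task ID:

```python
import asyncio

from src.file_handler.parse import aparse_file, parse_file

API_URL = "https://agents-course-unit4-scoring.hf.space"

# Synchronous call site (e.g. inside Agent.__call__):
content = parse_file("some-task-id", "table.xlsx", API_URL)  # placeholder ID


# An async call site must use the coroutine to avoid nesting event loops:
async def main() -> None:
    content = await aparse_file("some-task-id", "table.xlsx", API_URL)
    print(type(content))


asyncio.run(main())
```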
src/tracing.py ADDED
@@ -0,0 +1,33 @@
+import base64
+import os
+
+from dotenv import load_dotenv
+from openinference.instrumentation.smolagents import SmolagentsInstrumentor
+from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
+    OTLPSpanExporter,
+)
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+
+load_dotenv()
+
+LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY")
+LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY")
+
+LANGFUSE_AUTH = base64.b64encode(
+    f"{LANGFUSE_PUBLIC_KEY}:{LANGFUSE_SECRET_KEY}".encode()
+).decode()
+
+os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = (
+    "https://cloud.langfuse.com/api/public/otel"  # EU data region
+)
+os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = (
+    f"Authorization=Basic {LANGFUSE_AUTH}"
+)
+
+
+def add_tracing():
+    trace_provider = TracerProvider()
+    trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))
+
+    SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)
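As wired up in app.py above, `add_tracing()` should run once at process start, before any agent work, so the SmolagentsInstrumentor patches smolagents before the first span is produced. A minimal usage sketch, assuming LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY are set in .env:

```python
from agent import Agent
from src.tracing import add_tracing

add_tracing()  # instrument smolagents before creating or running agents

agent = Agent()
answer = agent("What is 2 + 2?", file_name="")  # no attachment for this question
print(answer)
```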