import io
import re
from typing import Any, Dict, List
import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from smolagents import tool
from smolagents.default_tools import DuckDuckGoSearchTool, VisitWebpageTool
@tool
def web_search(query: str) -> str:
    """
    Search the web for information.

    Args:
        query: Search query to find information

    Returns:
        Search results as text
    """
    # Use the built-in DuckDuckGo search tool from smolagents.
    # smolagents Tool instances are invoked by calling them directly.
    search_tool = DuckDuckGoSearchTool(max_results=3)
    results = search_tool(query)
    return results
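# Hypothetical usage of web_search (the query is illustrative, not from the
# source); returns the top three DuckDuckGo results as a single text blob:
#   web_search("current stable Python version")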
@tool
def browse_webpage(url: str) -> Dict[str, Any]:
    """
    Browse a webpage and extract its content.

    Args:
        url: URL of the webpage to browse

    Returns:
        Dictionary containing title, text content, and links from the webpage
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Extract the page title
        title = soup.title.string if soup.title else "No title found"
        # Extract the main text content from paragraph tags
        paragraphs = soup.find_all("p")
        text_content = "\n".join([p.get_text().strip() for p in paragraphs])
        # Collect absolute links with their anchor text
        links = []
        for link in soup.find_all("a", href=True):
            href = link["href"]
            text = link.get_text().strip()
            if href.startswith("http"):
                links.append({"text": text, "href": href})
        return {"title": title, "content": text_content, "links": links}
    except Exception as e:
        return {"error": str(e)}
@tool
def analyze_image(image_url: str) -> Dict[str, Any]:
    """
    Analyze an image and extract information from it.

    Args:
        image_url: URL of the image to analyze

    Returns:
        Dictionary containing information about the image
    """
    try:
        # Download the image
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()
        # Open the image from the in-memory bytes
        img = Image.open(io.BytesIO(response.content))
        # Extract basic image metadata
        width, height = img.size
        format_type = img.format
        mode = img.mode
        return {
            "width": width,
            "height": height,
            "format": format_type,
            "mode": mode,
            "aspect_ratio": width / height,
        }
    except Exception as e:
        return {"error": str(e)}
@tool
def read_pdf(pdf_url: str) -> str:
    """
    Extract text content from a PDF document.

    Args:
        pdf_url: URL of the PDF to read

    Returns:
        Text content extracted from the PDF
    """
    try:
        # Download the PDF
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        # Parse with pypdf (the maintained successor to PyPDF2); assumes the
        # pypdf package is installed. pdfplumber would work here as well.
        from pypdf import PdfReader

        reader = PdfReader(io.BytesIO(response.content))
        pages = [page.extract_text() or "" for page in reader.pages]
        return "\n".join(pages)
    except Exception as e:
        return f"Error: {str(e)}"
@tool
def parse_csv(csv_url: str) -> Dict[str, Any]:
    """
    Parse a CSV file and return its content as structured data.

    Args:
        csv_url: URL of the CSV file to parse

    Returns:
        Dictionary containing parsed CSV data
    """
    try:
        # Download the CSV
        response = requests.get(csv_url, timeout=30)
        response.raise_for_status()
        # Parse the CSV into a DataFrame
        df = pd.read_csv(io.StringIO(response.text))
        # Convert to dictionary format
        columns = df.columns.tolist()
        data = df.to_dict(orient="records")
        # Return basic statistics and a preview of the first rows
        return {
            "columns": columns,
            "row_count": len(data),
            "preview": data[:5],
            "column_dtypes": {col: str(df[col].dtype) for col in columns},
        }
    except Exception as e:
        return {"error": str(e)}
@tool
def find_in_page(page_content: Dict[str, Any], query: str) -> List[str]:
    """
    Find occurrences of a query string in page content.

    Args:
        page_content: Page content returned by browse_webpage
        query: String to search for in the page

    Returns:
        List of sentences or sections containing the query
    """
    results = []
    if "content" in page_content:
        content = page_content["content"]
        # Split content into sentences on terminal punctuation
        sentences = re.split(r"(?<=[.!?])\s+", content)
        # Keep sentences containing the query (case-insensitive)
        for sentence in sentences:
            if query.lower() in sentence.lower():
                results.append(sentence)
    return results
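# find_in_page is meant to be chained after browse_webpage, e.g.
# (hypothetical URL and query):
#   page = browse_webpage("https://example.com")
#   hits = find_in_page(page, "population")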
@tool
def extract_dates(text: str) -> List[str]:
    """
    Extract dates from text content.

    Args:
        text: Text content to extract dates from

    Returns:
        List of date strings found in the text
    """
    # Simple regex patterns for date extraction;
    # these patterns can be expanded for better coverage.
    date_patterns = [
        r"\d{1,2}/\d{1,2}/\d{2,4}",  # MM/DD/YYYY or DD/MM/YYYY
        r"\d{1,2}-\d{1,2}-\d{2,4}",  # MM-DD-YYYY or DD-MM-YYYY
        r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b",  # Month DD, YYYY
        r"\b\d{1,2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}\b",  # DD Month YYYY
    ]
    results = []
    for pattern in date_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        results.extend(matches)
    return results
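# Worked example of extract_dates:
#   extract_dates("Filed 12/01/2022, amended March 5, 2021.")
# returns ['12/01/2022', 'March 5, 2021'] (slash dates are matched by the
# first pattern, so they appear first regardless of position in the text).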
@tool
def perform_calculation(expression: str) -> Dict[str, Any]:
    """
    Evaluate a mathematical expression in a restricted namespace.

    Args:
        expression: Mathematical expression to evaluate

    Returns:
        Dictionary containing the result or error message
    """
    try:
        import math

        # Restrict eval() to an allowlist of names and strip builtins.
        # This reduces the attack surface but is not a full sandbox; a
        # production implementation should use a dedicated math expression
        # parser instead.
        allowed_names = {
            "abs": abs,
            "round": round,
            "min": min,
            "max": max,
            "sum": sum,
            "len": len,
            "pow": pow,
            "math": math,
        }
        # Clean the expression
        cleaned_expr = expression.strip()
        result = eval(cleaned_expr, {"__builtins__": {}}, allowed_names)
        return {"result": result}
    except Exception as e:
        return {"error": str(e)}
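if __name__ == "__main__":
    # Minimal smoke-test sketch using only offline-safe tools; the inputs
    # are illustrative, not from the original source.
    print(perform_calculation("2 ** 10"))  # {'result': 1024}
    print(extract_dates("Launched 4 Jul 2023."))  # ['4 Jul 2023']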