Update SmartSearchTool to use the Wikipedia API for content retrieval, improving search quality. Add methods for cleaning and formatting Wikipedia markup, and update requirements.txt with new dependencies (beautifulsoup4, markdown, numpy, sentence-transformers, hf-xet).
Files changed:
- requirements.txt (+5, -0)
- tools/smart_search/tool.py (+160, -24)
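
The heart of the change is swapping page scraping for the MediaWiki Action API: the tool still uses DuckDuckGo to locate the right Wikipedia article, but then fetches its raw wiki markup directly. As a standalone illustration of the request that the new get_wikipedia_page method (in the tool.py diff below) issues, a minimal sketch; the page title is an arbitrary example, and production callers should send a descriptive User-Agent as the diff does:

import requests

# Mirrors the parameters used by SmartSearchTool.get_wikipedia_page below.
params = {
    "action": "query",        # query module
    "prop": "revisions",      # fetch page revisions
    "rvprop": "content",      # include revision content
    "rvslots": "main",        # main content slot
    "format": "json",
    "titles": "Python (programming language)",  # arbitrary example title
    "redirects": 1,           # follow redirects to the canonical page
}
response = requests.get("https://en.wikipedia.org/w/api.php", params=params)
response.raise_for_status()
for page in response.json()["query"]["pages"].values():
    if "revisions" in page:
        print(page["revisions"][0]["slots"]["main"]["*"][:300])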
requirements.txt

@@ -1,3 +1,4 @@
+beautifulsoup4>=4.13.4
 duckduckgo-search>=8.0.1
 gradio[oauth]>=5.26.0
 isort>=6.0.1
@@ -9,13 +10,17 @@ litellm>=1.10.0
 llama-index>=0.12.33
 llama-index-embeddings-huggingface>=0.5.3
 llama-index-readers-wikipedia>=0.3.0
+markdown>=3.8
 mlcroissant>=1.0.17
+numpy>=2.2.5
 pandas>=2.0.0
 pytest>=8.3.5
 pytest-cov>=6.1.1
 python-dotenv>=1.0.0
 requests>=2.32.3
+sentence-transformers>=4.1.0
 smolagents[litellm,telemetry]>=1.14.0
 typing-extensions>=4.5.0
+hf-xet>=1.0.5
 wikipedia>=1.4.0
 wikipedia-api>=0.8.1
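
Of the five new pins, only beautifulsoup4 is imported by the updated tool (as bs4); markdown, numpy, sentence-transformers, and hf-xet presumably serve other parts of the Space. A quick sanity-check sketch that the new packages resolve after rebuilding the environment (note that some import names differ from package names, and hf-xet is a download backend for huggingface_hub that is not normally imported directly):

# Import names for the newly added packages; failures here usually mean
# the environment was not rebuilt after this commit.
import bs4                     # provided by beautifulsoup4
import markdown
import numpy
import sentence_transformers   # provided by sentence-transformers

print(bs4.__version__, markdown.__version__, numpy.__version__)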
tools/smart_search/tool.py

@@ -1,49 +1,185 @@
 import logging
 import re
+from typing import List, Dict, Optional
 from smolagents import Tool
 from smolagents.default_tools import DuckDuckGoSearchTool
+import requests
+from bs4 import BeautifulSoup

 logger = logging.getLogger(__name__)


 class SmartSearchTool(Tool):
     name = "smart_search"
-    description = """A smart search tool that
+    description = """A smart search tool that searches Wikipedia for information."""
     inputs = {"query": {"type": "string", "description": "The search query to find information"}}
     output_type = "string"

     def __init__(self):
         super().__init__()
         self.web_search_tool = DuckDuckGoSearchTool(max_results=1)
-        self.
+        self.api_url = "https://en.wikipedia.org/w/api.php"
+        self.headers = {
+            'User-Agent': 'SmartSearchTool/1.0 (https://github.com/yourusername/yourproject; [email protected])'
+        }
+
+    def get_wikipedia_page(self, title: str) -> Optional[str]:
+        """Get the raw wiki markup of a Wikipedia page."""
+        try:
+            params = {
+                'action': 'query',
+                'prop': 'revisions',
+                'rvprop': 'content',
+                'rvslots': 'main',
+                'format': 'json',
+                'titles': title,
+                'redirects': 1
+            }
+            response = requests.get(self.api_url, params=params, headers=self.headers)
+            response.raise_for_status()
+            data = response.json()
+
+            # Extract page content
+            pages = data.get('query', {}).get('pages', {})
+            for page_id, page_data in pages.items():
+                if 'revisions' in page_data:
+                    return page_data['revisions'][0]['slots']['main']['*']
+            return None
+        except Exception as e:
+            logger.error(f"Error getting Wikipedia page: {e}")
+            return None
+
+    def clean_wiki_content(self, content: str) -> str:
+        """Clean Wikipedia content by removing markup and formatting."""
+        # Remove citations
+        content = re.sub(r'\[\d+\]', '', content)
+        # Remove edit links
+        content = re.sub(r'\[edit\]', '', content)
+        # Remove file links
+        content = re.sub(r'\[\[File:.*?\]\]', '', content)
+        # Convert [[target|label]] and [[target]] links to just text
+        content = re.sub(r'\[\[(?:[^|\]]*\|)?([^\]]+)\]\]', r'\1', content)
+        # Remove HTML comments
+        content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
+        # Remove templates
+        content = re.sub(r'\{\{.*?\}\}', '', content)
+        # Remove small tags
+        content = re.sub(r'<small>.*?</small>', '', content)
+        # Normalize whitespace
+        content = re.sub(r'\n\s*\n', '\n\n', content)
+        return content.strip()
+
+    def format_wiki_table(self, table_content: str) -> str:
+        """Format a Wikipedia table into readable text."""
+        # Split into rows
+        rows = table_content.strip().split('\n')
+        formatted_rows = []
+        current_row = []
+
+        for row in rows:
+            # Skip empty rows and table structure markers
+            if not row.strip() or row.startswith('|-') or row.startswith('|+'):
+                if current_row:
+                    formatted_rows.append('\t'.join(current_row))
+                    current_row = []
+                continue
+
+            # Extract cells
+            cells = []
+            # Split the row into cells using | or ! as separators
+            cell_parts = re.split(r'[|!]', row)
+            for cell in cell_parts[1:]:  # Skip the first empty part
+                # Clean up the cell content
+                cell = cell.strip()
+                # Remove any remaining markup
+                cell = re.sub(r'<.*?>', '', cell)  # Remove HTML tags
+                cell = re.sub(r'\[\[.*?\|(.*?)\]\]', r'\1', cell)  # Convert piped links
+                cell = re.sub(r'\[\[(.*?)\]\]', r'\1', cell)  # Convert simple links
+                cell = re.sub(r'\{\{.*?\}\}', '', cell)  # Remove templates
+                cell = re.sub(r'<small>.*?</small>', '', cell)  # Remove small tags
+                cell = re.sub(r'rowspan="\d+"', '', cell)  # Remove rowspan
+                cell = re.sub(r'colspan="\d+"', '', cell)  # Remove colspan
+                cell = re.sub(r'class=".*?"', '', cell)  # Remove class attributes
+                cell = re.sub(r'style=".*?"', '', cell)  # Remove style attributes
+                cell = re.sub(r'align=".*?"', '', cell)  # Remove align attributes
+                cell = re.sub(r'width=".*?"', '', cell)  # Remove width attributes
+                cell = re.sub(r'bgcolor=".*?"', '', cell)  # Remove bgcolor attributes
+                cell = re.sub(r'valign=".*?"', '', cell)  # Remove valign attributes
+                cell = re.sub(r'border=".*?"', '', cell)  # Remove border attributes
+                cell = re.sub(r'cellpadding=".*?"', '', cell)  # Remove cellpadding attributes
+                cell = re.sub(r'cellspacing=".*?"', '', cell)  # Remove cellspacing attributes
+                cell = re.sub(r'<ref.*?</ref>', '', cell)  # Remove references
+                cell = re.sub(r'<ref.*?/>', '', cell)  # Remove empty references
+                cell = re.sub(r'<br\s*/?>', ' ', cell)  # Replace line breaks with spaces
+                cell = re.sub(r'\s+', ' ', cell)  # Normalize whitespace
+                cells.append(cell)
+
+            if cells:
+                current_row.extend(cells)
+
+        if current_row:
+            formatted_rows.append('\t'.join(current_row))
+
+        if formatted_rows:
+            return '\n'.join(formatted_rows)
+        return ''
+
+    def extract_wikipedia_title(self, search_result: str) -> Optional[str]:
+        """Extract Wikipedia page title from search result."""
+        # Look for Wikipedia links in the format [Title - Wikipedia](url)
+        wiki_match = re.search(r'\[([^\]]+)\s*-\s*Wikipedia\]\(https://en\.wikipedia\.org/wiki/[^)]+\)', search_result)
+        if wiki_match:
+            return wiki_match.group(1).strip()
+        return None

     def forward(self, query: str) -> str:
         logger.info(f"Starting smart search for query: {query}")

-        logger.info(f"Web search results: {
-        for url in urls:
-            logger.info(f"Visiting URL: {url}")
-            try:
-                content = self.visit_webpage_tool.forward(url)
-                if content:
-                    contents.append(f"\nContent from {url}:\n{content}")
-            except Exception as e:
-                logger.warning(f"Error visiting {url}: {e}")
-                contents.append(f"\nError visiting {url}: {e}")
+        # First do a web search to find the Wikipedia page
+        search_result = self.web_search_tool.forward(query)
+        logger.info(f"Web search results: {search_result[:100]}...")
+
+        # Extract Wikipedia page title from search results
+        wiki_title = self.extract_wikipedia_title(search_result)
+        if not wiki_title:
+            return f"Could not find Wikipedia page in search results for '{query}'."
+
+        # Get Wikipedia page content
+        page_content = self.get_wikipedia_page(wiki_title)
+        if not page_content:
+            return f"Could not find Wikipedia page for '{wiki_title}'."
+
+        # Format tables and content
+        formatted_content = []
+        current_section = []
+        in_table = False
+        table_content = []
+
+        for line in page_content.split('\n'):
+            if line.startswith('{|'):
+                in_table = True
+                table_content = [line]
+            elif line.startswith('|}'):
+                in_table = False
+                table_content.append(line)
+                formatted_table = self.format_wiki_table('\n'.join(table_content))
+                if formatted_table:
+                    current_section.append(formatted_table)
+            elif in_table:
+                table_content.append(line)
+            else:
+                if line.strip():
+                    current_section.append(line)
+                elif current_section:
+                    formatted_content.append('\n'.join(current_section))
+                    current_section = []
+
+        if current_section:
+            formatted_content.append('\n'.join(current_section))
+
+        # Clean and return the formatted content
+        cleaned_content = self.clean_wiki_content('\n\n'.join(formatted_content))
+        return f"Wikipedia content for '{wiki_title}':\n\n{cleaned_content}"


 def main(query: str) -> str:
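
End to end, the updated tool can be exercised directly; a minimal usage sketch, assuming the repository layout above and network access:

from tools.smart_search.tool import SmartSearchTool

tool = SmartSearchTool()
# forward() runs a DuckDuckGo search, extracts the first Wikipedia title,
# fetches that page's wiki markup via the API, flattens tables, and
# returns cleaned plain text.
result = tool.forward("Mercedes Sosa discography")  # arbitrary example query
print(result[:500])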