mjschock committed
Commit 0305659 · unverified · 1 parent: 4e02cb8

Update SmartSearchTool to use the Wikipedia API for content retrieval, enhancing search functionality. Add methods for cleaning and formatting Wikipedia content, and update requirements.txt with new dependencies such as beautifulsoup4 and markdown.
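For reference, a minimal usage sketch of the refactored tool (not part of the commit; the query string is a hypothetical example, and it assumes the repository root is on sys.path):

    import logging
    from tools.smart_search.tool import SmartSearchTool

    logging.basicConfig(level=logging.INFO)
    tool = SmartSearchTool()
    # On success this prints "Wikipedia content for '<title>':" followed by the
    # cleaned page text; otherwise one of the error strings from forward().
    result = tool.forward("capital of Australia")
    print(result)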

Files changed (2):
  1. requirements.txt (+5 −0)
  2. tools/smart_search/tool.py (+160 −24)
requirements.txt CHANGED

@@ -1,3 +1,4 @@
+beautifulsoup4>=4.13.4
 duckduckgo-search>=8.0.1
 gradio[oauth]>=5.26.0
 isort>=6.0.1
@@ -9,13 +10,17 @@ litellm>=1.10.0
 llama-index>=0.12.33
 llama-index-embeddings-huggingface>=0.5.3
 llama-index-readers-wikipedia>=0.3.0
+markdown>=3.8
 mlcroissant>=1.0.17
+numpy>=2.2.5
 pandas>=2.0.0
 pytest>=8.3.5
 pytest-cov>=6.1.1
 python-dotenv>=1.0.0
 requests>=2.32.3
+sentence-transformers>=4.1.0
 smolagents[litellm,telemetry]>=1.14.0
 typing-extensions>=4.5.0
+hf-xet>=1.0.5
 wikipedia>=1.4.0
 wikipedia-api>=0.8.1
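The new pins can be sanity-checked after a `pip install -r requirements.txt`; a minimal smoke test (a sketch, not part of the commit; note that beautifulsoup4 installs as the bs4 module, and sentence-transformers/hf-xet are omitted here to keep the check light):

    import bs4
    import markdown
    import numpy

    print(bs4.__version__)       # expect >= 4.13.4
    print(markdown.__version__)  # expect >= 3.8
    print(numpy.__version__)     # expect >= 2.2.5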
tools/smart_search/tool.py CHANGED

@@ -1,49 +1,185 @@
 import logging
 import re
+from typing import List, Dict, Optional
 from smolagents import Tool
-from smolagents.default_tools import DuckDuckGoSearchTool, VisitWebpageTool
+from smolagents.default_tools import DuckDuckGoSearchTool
+import requests
+from bs4 import BeautifulSoup
 
 logger = logging.getLogger(__name__)
 
 
 class SmartSearchTool(Tool):
     name = "smart_search"
-    description = """A smart search tool that first performs a web search and then visits each URL to get its content."""
+    description = """A smart search tool that searches Wikipedia for information."""
     inputs = {"query": {"type": "string", "description": "The search query to find information"}}
     output_type = "string"
 
     def __init__(self):
         super().__init__()
         self.web_search_tool = DuckDuckGoSearchTool(max_results=1)
-        self.visit_webpage_tool = VisitWebpageTool(max_output_length=-1)
+        self.api_url = "https://en.wikipedia.org/w/api.php"
+        self.headers = {
+            'User-Agent': 'SmartSearchTool/1.0 (https://github.com/yourusername/yourproject; [email protected])'
+        }
+
+    def get_wikipedia_page(self, title: str) -> Optional[str]:
+        """Get the raw wiki markup of a Wikipedia page."""
+        try:
+            params = {
+                'action': 'query',
+                'prop': 'revisions',
+                'rvprop': 'content',
+                'rvslots': 'main',
+                'format': 'json',
+                'titles': title,
+                'redirects': 1
+            }
+            response = requests.get(self.api_url, params=params, headers=self.headers)
+            response.raise_for_status()
+            data = response.json()
+
+            # Extract page content
+            pages = data.get('query', {}).get('pages', {})
+            for page_id, page_data in pages.items():
+                if 'revisions' in page_data:
+                    return page_data['revisions'][0]['slots']['main']['*']
+            return None
+        except Exception as e:
+            logger.error(f"Error getting Wikipedia page: {e}")
+            return None
+
+    def clean_wiki_content(self, content: str) -> str:
+        """Clean Wikipedia content by removing markup and formatting."""
+        # Remove citations
+        content = re.sub(r'\[\d+\]', '', content)
+        # Remove edit links
+        content = re.sub(r'\[edit\]', '', content)
+        # Remove file links
+        content = re.sub(r'\[\[File:.*?\]\]', '', content)
+        # Convert links to just text
+        content = re.sub(r'\[\[(?:[^|\]]*\|)?([^\]]+)\]\]', r'\1', content)
+        # Remove HTML comments
+        content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
+        # Remove templates
+        content = re.sub(r'\{\{.*?\}\}', '', content)
+        # Remove small tags
+        content = re.sub(r'<small>.*?</small>', '', content)
+        # Normalize whitespace
+        content = re.sub(r'\n\s*\n', '\n\n', content)
+        return content.strip()
+
+    def format_wiki_table(self, table_content: str) -> str:
+        """Format a Wikipedia table into readable text."""
+        # Split into rows
+        rows = table_content.strip().split('\n')
+        formatted_rows = []
+        current_row = []
+
+        for row in rows:
+            # Skip empty rows and table structure markers
+            if not row.strip() or row.startswith('|-') or row.startswith('|+'):
+                if current_row:
+                    formatted_rows.append('\t'.join(current_row))
+                    current_row = []
+                continue
+
+            # Extract cells
+            cells = []
+            # Split the row into cells using | or ! as separators
+            cell_parts = re.split(r'[|!]', row)
+            for cell in cell_parts[1:]:  # Skip the first empty part
+                # Clean up the cell content
+                cell = cell.strip()
+                # Remove any remaining markup
+                cell = re.sub(r'<.*?>', '', cell)  # Remove HTML tags
+                cell = re.sub(r'\[\[.*?\|(.*?)\]\]', r'\1', cell)  # Convert links
+                cell = re.sub(r'\[\[(.*?)\]\]', r'\1', cell)  # Convert simple links
+                cell = re.sub(r'\{\{.*?\}\}', '', cell)  # Remove templates
+                cell = re.sub(r'<small>.*?</small>', '', cell)  # Remove small tags
+                cell = re.sub(r'rowspan="\d+"', '', cell)  # Remove rowspan
+                cell = re.sub(r'colspan="\d+"', '', cell)  # Remove colspan
+                cell = re.sub(r'class=".*?"', '', cell)  # Remove class attributes
+                cell = re.sub(r'style=".*?"', '', cell)  # Remove style attributes
+                cell = re.sub(r'align=".*?"', '', cell)  # Remove align attributes
+                cell = re.sub(r'width=".*?"', '', cell)  # Remove width attributes
+                cell = re.sub(r'bgcolor=".*?"', '', cell)  # Remove bgcolor attributes
+                cell = re.sub(r'valign=".*?"', '', cell)  # Remove valign attributes
+                cell = re.sub(r'border=".*?"', '', cell)  # Remove border attributes
+                cell = re.sub(r'cellpadding=".*?"', '', cell)  # Remove cellpadding attributes
+                cell = re.sub(r'cellspacing=".*?"', '', cell)  # Remove cellspacing attributes
+                cell = re.sub(r'<ref.*?</ref>', '', cell)  # Remove references
+                cell = re.sub(r'<ref.*?/>', '', cell)  # Remove empty references
+                cell = re.sub(r'<br\s*/?>', ' ', cell)  # Replace line breaks with spaces
+                cell = re.sub(r'\s+', ' ', cell)  # Normalize whitespace
+                cells.append(cell)
+
+            if cells:
+                current_row.extend(cells)
+
+        if current_row:
+            formatted_rows.append('\t'.join(current_row))
+
+        if formatted_rows:
+            return '\n'.join(formatted_rows)
+        return ''
+
+    def extract_wikipedia_title(self, search_result: str) -> Optional[str]:
+        """Extract Wikipedia page title from search result."""
+        # Look for Wikipedia links in the format [Title - Wikipedia](url)
+        wiki_match = re.search(r'\[([^\]]+)\s*-\s*Wikipedia\]\(https://en\.wikipedia\.org/wiki/[^)]+\)', search_result)
+        if wiki_match:
+            return wiki_match.group(1).strip()
+        return None
 
     def forward(self, query: str) -> str:
         logger.info(f"Starting smart search for query: {query}")
 
-        # Get web search results
-        web_search_results = self.web_search_tool.forward(query)
-        logger.info(f"Web search results: {web_search_results[:100]}...")
+        # First do a web search to find the Wikipedia page
+        search_result = self.web_search_tool.forward(query)
+        logger.info(f"Web search results: {search_result[:100]}...")
+
+        # Extract Wikipedia page title from search results
+        wiki_title = self.extract_wikipedia_title(search_result)
+        if not wiki_title:
+            return f"Could not find Wikipedia page in search results for '{query}'."
+
+        # Get Wikipedia page content
+        page_content = self.get_wikipedia_page(wiki_title)
+        if not page_content:
+            return f"Could not find Wikipedia page for '{wiki_title}'."
+
+        # Format tables and content
+        formatted_content = []
+        current_section = []
+        in_table = False
+        table_content = []
 
-        # Extract URLs from the web search result
-        urls = re.findall(r'https?://[^\s)]+', web_search_results)
-        if not urls:
-            logger.info("No URLs found in web search result")
-            return f"Web search results:\n{web_search_results}"
+        for line in page_content.split('\n'):
+            if line.startswith('{|'):
+                in_table = True
+                table_content = [line]
+            elif line.startswith('|}'):
+                in_table = False
+                table_content.append(line)
+                formatted_table = self.format_wiki_table('\n'.join(table_content))
+                if formatted_table:
+                    current_section.append(formatted_table)
+            elif in_table:
+                table_content.append(line)
+            else:
+                if line.strip():
+                    current_section.append(line)
+                elif current_section:
+                    formatted_content.append('\n'.join(current_section))
+                    current_section = []
 
-        # Visit each URL and get its content
-        contents = []
-        for url in urls:
-            logger.info(f"Visiting URL: {url}")
-            try:
-                content = self.visit_webpage_tool.forward(url)
-                if content:
-                    contents.append(f"\nContent from {url}:\n{content}")
-            except Exception as e:
-                logger.warning(f"Error visiting {url}: {e}")
-                contents.append(f"\nError visiting {url}: {e}")
+        if current_section:
+            formatted_content.append('\n'.join(current_section))
 
-        # Combine all results
-        return f"Web search results:\n{web_search_results}\n" + "\n".join(contents)
+        # Clean and return the formatted content
+        cleaned_content = self.clean_wiki_content('\n\n'.join(formatted_content))
+        return f"Wikipedia content for '{wiki_title}':\n\n{cleaned_content}"
 
 
 def main(query: str) -> str:
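To inspect what get_wikipedia_page() receives from the MediaWiki Action API, the same request can be issued standalone; a sketch using the exact parameters from the diff (the page title and User-Agent here are placeholders):

    import requests

    params = {
        'action': 'query',
        'prop': 'revisions',
        'rvprop': 'content',
        'rvslots': 'main',
        'format': 'json',
        'titles': 'Python (programming language)',  # hypothetical title
        'redirects': 1,
    }
    headers = {'User-Agent': 'SmartSearchTool/1.0 (placeholder contact info)'}
    response = requests.get('https://en.wikipedia.org/w/api.php', params=params, headers=headers)
    response.raise_for_status()
    pages = response.json()['query']['pages']
    for page_id, page_data in pages.items():
        if 'revisions' in page_data:
            # Raw wikitext lives under the 'main' slot, keyed '*' in this legacy format
            print(page_data['revisions'][0]['slots']['main']['*'][:300])

The 'redirects': 1 parameter matters here: DuckDuckGo result titles often point at redirect pages, and without it the API would return the redirect stub rather than the target article's wikitext.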