mjschock committed
Commit 0305659 · unverified · 1 parent: 4e02cb8

Update SmartSearchTool to use the Wikipedia API for content retrieval, enhancing search functionality. Add methods for cleaning and formatting Wikipedia content, and update requirements.txt with new dependencies such as beautifulsoup4 and markdown.
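For reference, a minimal usage sketch of the refactored tool (not part of the commit; the query string is a hypothetical example, and it assumes the repository root is on sys.path):

    import logging
    from tools.smart_search.tool import SmartSearchTool

    logging.basicConfig(level=logging.INFO)
    tool = SmartSearchTool()
    # On success this prints "Wikipedia content for '<title>':" followed by the
    # cleaned page text; otherwise one of the error strings from forward().
    result = tool.forward("capital of Australia")
    print(result)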

Files changed (2):
  1. requirements.txt (+5 −0)
  2. tools/smart_search/tool.py (+160 −24)
requirements.txt CHANGED

@@ -1,3 +1,4 @@
+beautifulsoup4>=4.13.4
 duckduckgo-search>=8.0.1
 gradio[oauth]>=5.26.0
 isort>=6.0.1
@@ -9,13 +10,17 @@ litellm>=1.10.0
 llama-index>=0.12.33
 llama-index-embeddings-huggingface>=0.5.3
 llama-index-readers-wikipedia>=0.3.0
+markdown>=3.8
 mlcroissant>=1.0.17
+numpy>=2.2.5
 pandas>=2.0.0
 pytest>=8.3.5
 pytest-cov>=6.1.1
 python-dotenv>=1.0.0
 requests>=2.32.3
+sentence-transformers>=4.1.0
 smolagents[litellm,telemetry]>=1.14.0
 typing-extensions>=4.5.0
+hf-xet>=1.0.5
 wikipedia>=1.4.0
 wikipedia-api>=0.8.1
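The new pins can be sanity-checked after a `pip install -r requirements.txt`; a minimal smoke test (a sketch, not part of the commit; note that beautifulsoup4 installs as the bs4 module, and sentence-transformers/hf-xet are omitted here to keep the check light):

    import bs4
    import markdown
    import numpy

    print(bs4.__version__)       # expect >= 4.13.4
    print(markdown.__version__)  # expect >= 3.8
    print(numpy.__version__)     # expect >= 2.2.5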
tools/smart_search/tool.py CHANGED

@@ -1,49 +1,185 @@
 import logging
 import re
+from typing import List, Dict, Optional
 from smolagents import Tool
-from smolagents.default_tools import DuckDuckGoSearchTool, VisitWebpageTool
+from smolagents.default_tools import DuckDuckGoSearchTool
+import requests
+from bs4 import BeautifulSoup
 
 logger = logging.getLogger(__name__)
 
 
 class SmartSearchTool(Tool):
     name = "smart_search"
-    description = """A smart search tool that first performs a web search and then visits each URL to get its content."""
+    description = """A smart search tool that searches Wikipedia for information."""
     inputs = {"query": {"type": "string", "description": "The search query to find information"}}
     output_type = "string"
 
     def __init__(self):
         super().__init__()
         self.web_search_tool = DuckDuckGoSearchTool(max_results=1)
-        self.visit_webpage_tool = VisitWebpageTool(max_output_length=-1)
+        self.api_url = "https://en.wikipedia.org/w/api.php"
+        self.headers = {
+            'User-Agent': 'SmartSearchTool/1.0 (https://github.com/yourusername/yourproject; [email protected])'
+        }
+
+    def get_wikipedia_page(self, title: str) -> Optional[str]:
+        """Get the raw wiki markup of a Wikipedia page."""
+        try:
+            params = {
+                'action': 'query',
+                'prop': 'revisions',
+                'rvprop': 'content',
+                'rvslots': 'main',
+                'format': 'json',
+                'titles': title,
+                'redirects': 1
+            }
+            response = requests.get(self.api_url, params=params, headers=self.headers)
+            response.raise_for_status()
+            data = response.json()
+
+            # Extract page content
+            pages = data.get('query', {}).get('pages', {})
+            for page_id, page_data in pages.items():
+                if 'revisions' in page_data:
+                    return page_data['revisions'][0]['slots']['main']['*']
+            return None
+        except Exception as e:
+            logger.error(f"Error getting Wikipedia page: {e}")
+            return None
+
+    def clean_wiki_content(self, content: str) -> str:
+        """Clean Wikipedia content by removing markup and formatting."""
+        # Remove citations
+        content = re.sub(r'\[\d+\]', '', content)
+        # Remove edit links
+        content = re.sub(r'\[edit\]', '', content)
+        # Remove file links
+        content = re.sub(r'\[\[File:.*?\]\]', '', content)
+        # Convert links to just text
+        content = re.sub(r'\[\[(?:[^|\]]*\|)?([^\]]+)\]\]', r'\1', content)
+        # Remove HTML comments
+        content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
+        # Remove templates
+        content = re.sub(r'\{\{.*?\}\}', '', content)
+        # Remove small tags
+        content = re.sub(r'<small>.*?</small>', '', content)
+        # Normalize whitespace
+        content = re.sub(r'\n\s*\n', '\n\n', content)
+        return content.strip()
+
+    def format_wiki_table(self, table_content: str) -> str:
+        """Format a Wikipedia table into readable text."""
+        # Split into rows
+        rows = table_content.strip().split('\n')
+        formatted_rows = []
+        current_row = []
+
+        for row in rows:
+            # Skip empty rows and table structure markers
+            if not row.strip() or row.startswith('|-') or row.startswith('|+'):
+                if current_row:
+                    formatted_rows.append('\t'.join(current_row))
+                    current_row = []
+                continue
+
+            # Extract cells
+            cells = []
+            # Split the row into cells using | or ! as separators
+            cell_parts = re.split(r'[|!]', row)
+            for cell in cell_parts[1:]:  # Skip the first empty part
+                # Clean up the cell content
+                cell = cell.strip()
+                # Remove any remaining markup
+                cell = re.sub(r'<.*?>', '', cell)  # Remove HTML tags
+                cell = re.sub(r'\[\[.*?\|(.*?)\]\]', r'\1', cell)  # Convert links
+                cell = re.sub(r'\[\[(.*?)\]\]', r'\1', cell)  # Convert simple links
+                cell = re.sub(r'\{\{.*?\}\}', '', cell)  # Remove templates
+                cell = re.sub(r'<small>.*?</small>', '', cell)  # Remove small tags
+                cell = re.sub(r'rowspan="\d+"', '', cell)  # Remove rowspan
+                cell = re.sub(r'colspan="\d+"', '', cell)  # Remove colspan
+                cell = re.sub(r'class=".*?"', '', cell)  # Remove class attributes
+                cell = re.sub(r'style=".*?"', '', cell)  # Remove style attributes
+                cell = re.sub(r'align=".*?"', '', cell)  # Remove align attributes
+                cell = re.sub(r'width=".*?"', '', cell)  # Remove width attributes
+                cell = re.sub(r'bgcolor=".*?"', '', cell)  # Remove bgcolor attributes
+                cell = re.sub(r'valign=".*?"', '', cell)  # Remove valign attributes
+                cell = re.sub(r'border=".*?"', '', cell)  # Remove border attributes
+                cell = re.sub(r'cellpadding=".*?"', '', cell)  # Remove cellpadding attributes
+                cell = re.sub(r'cellspacing=".*?"', '', cell)  # Remove cellspacing attributes
+                cell = re.sub(r'<ref.*?</ref>', '', cell)  # Remove references
+                cell = re.sub(r'<ref.*?/>', '', cell)  # Remove empty references
+                cell = re.sub(r'<br\s*/?>', ' ', cell)  # Replace line breaks with spaces
+                cell = re.sub(r'\s+', ' ', cell)  # Normalize whitespace
+                cells.append(cell)
+
+            if cells:
+                current_row.extend(cells)
+
+        if current_row:
+            formatted_rows.append('\t'.join(current_row))
+
+        if formatted_rows:
+            return '\n'.join(formatted_rows)
+        return ''
+
+    def extract_wikipedia_title(self, search_result: str) -> Optional[str]:
+        """Extract Wikipedia page title from search result."""
+        # Look for Wikipedia links in the format [Title - Wikipedia](url)
+        wiki_match = re.search(r'\[([^\]]+)\s*-\s*Wikipedia\]\(https://en\.wikipedia\.org/wiki/[^)]+\)', search_result)
+        if wiki_match:
+            return wiki_match.group(1).strip()
+        return None
 
     def forward(self, query: str) -> str:
         logger.info(f"Starting smart search for query: {query}")
 
-        # Get web search results
-        web_search_results = self.web_search_tool.forward(query)
-        logger.info(f"Web search results: {web_search_results[:100]}...")
+        # First do a web search to find the Wikipedia page
+        search_result = self.web_search_tool.forward(query)
+        logger.info(f"Web search results: {search_result[:100]}...")
+
+        # Extract Wikipedia page title from search results
+        wiki_title = self.extract_wikipedia_title(search_result)
+        if not wiki_title:
+            return f"Could not find Wikipedia page in search results for '{query}'."
+
+        # Get Wikipedia page content
+        page_content = self.get_wikipedia_page(wiki_title)
+        if not page_content:
+            return f"Could not find Wikipedia page for '{wiki_title}'."
+
+        # Format tables and content
+        formatted_content = []
+        current_section = []
+        in_table = False
+        table_content = []
 
-        # Extract URLs from the web search result
-        urls = re.findall(r'https?://[^\s)]+', web_search_results)
-        if not urls:
-            logger.info("No URLs found in web search result")
-            return f"Web search results:\n{web_search_results}"
+        for line in page_content.split('\n'):
+            if line.startswith('{|'):
+                in_table = True
+                table_content = [line]
+            elif line.startswith('|}'):
+                in_table = False
+                table_content.append(line)
+                formatted_table = self.format_wiki_table('\n'.join(table_content))
+                if formatted_table:
+                    current_section.append(formatted_table)
+            elif in_table:
+                table_content.append(line)
+            else:
+                if line.strip():
+                    current_section.append(line)
+                elif current_section:
+                    formatted_content.append('\n'.join(current_section))
+                    current_section = []
 
-        # Visit each URL and get its content
-        contents = []
-        for url in urls:
-            logger.info(f"Visiting URL: {url}")
-            try:
-                content = self.visit_webpage_tool.forward(url)
-                if content:
-                    contents.append(f"\nContent from {url}:\n{content}")
-            except Exception as e:
-                logger.warning(f"Error visiting {url}: {e}")
-                contents.append(f"\nError visiting {url}: {e}")
+        if current_section:
+            formatted_content.append('\n'.join(current_section))
 
-        # Combine all results
-        return f"Web search results:\n{web_search_results}\n" + "\n".join(contents)
+        # Clean and return the formatted content
+        cleaned_content = self.clean_wiki_content('\n\n'.join(formatted_content))
+        return f"Wikipedia content for '{wiki_title}':\n\n{cleaned_content}"
 
 
 def main(query: str) -> str:
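To inspect what get_wikipedia_page() receives from the MediaWiki Action API, the same request can be issued standalone; a sketch using the exact parameters from the diff (the page title and User-Agent here are placeholders):

    import requests

    params = {
        'action': 'query',
        'prop': 'revisions',
        'rvprop': 'content',
        'rvslots': 'main',
        'format': 'json',
        'titles': 'Python (programming language)',  # hypothetical title
        'redirects': 1,
    }
    headers = {'User-Agent': 'SmartSearchTool/1.0 (placeholder contact info)'}
    response = requests.get('https://en.wikipedia.org/w/api.php', params=params, headers=headers)
    response.raise_for_status()
    pages = response.json()['query']['pages']
    for page_id, page_data in pages.items():
        if 'revisions' in page_data:
            # Raw wikitext lives under the 'main' slot, keyed '*' in this legacy format
            print(page_data['revisions'][0]['slots']['main']['*'][:300])

The 'redirects': 1 parameter matters here: DuckDuckGo result titles often point at redirect pages, and without it the API would return the redirect stub rather than the target article's wikitext.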