fikird committed
Commit 68c6844 · 1 Parent(s): 3f90511

Improve content processing and result formatting

Files changed (2):
  1. app.py +40 -40
  2. search_engine.py +91 -84
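At a high level, the commit renames app.py's `format_results` to `format_search_results` and rewrites search_engine.py's ContentProcessor to work on plain extracted text instead of a BeautifulSoup tree. Both files agree on one result shape; the sketch below shows it with invented sample values (only the key names come from the diff):

```python
# Shape of the dict returned by search_engine.search(query) and consumed by
# app.py's format_search_results(). Keys are taken from the diff below;
# the values are placeholders for illustration only.
results = {
    'results': [{                       # one entry per successfully processed URL
        'url': 'https://example.com/article',
        'title': 'Example Article',
        'snippet': 'Search-result snippet kept for context',
        'summary': 'Model-generated summary of the page',
        'key_points': ['First takeaway', 'Second takeaway'],
    }],
    'insights': 'Summary-of-summaries across all processed pages',
    'key_points': ['Deduplicated points pooled from every result'],
    'follow_up_questions': ['What are the recent breakthroughs in ...?'],
}
```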
app.py CHANGED
@@ -1,8 +1,8 @@
 import gradio as gr
 from search_engine import search
 
-def format_results(results):
-    """Format search results in a user-friendly way"""
+def format_search_results(results):
+    """Format search results into a clean markdown output"""
     if 'error' in results:
         return f"❌ Error: {results['error']}"
 
@@ -10,77 +10,77 @@ def format_results(results):
 
     # Add insights section
     if 'insights' in results and results['insights']:
-        output.append("# 💡 Key Insights\n")
+        output.append("# 🔍 Latest Developments Summary\n")
         output.append(results['insights'])
         output.append("\n")
 
     # Add key points section
     if 'key_points' in results and results['key_points']:
-        output.append("# 🎯 Key Points\n")
-        for i, point in enumerate(results['key_points'], 1):
-            output.append(f"{i}. {point}\n")
+        output.append("# 💡 Key Points\n")
+        for point in results['key_points'][:5]:  # Limit to top 5 points
+            output.append(f"• {point}\n")
         output.append("\n")
 
     # Add detailed results section
     if 'results' in results and results['results']:
-        output.append("# 📄 Detailed Results\n")
+        output.append("# 📄 Detailed Findings\n")
         for i, result in enumerate(results['results'], 1):
-            output.append(f"## {i}. [{result['title']}]({result['url']})\n")
-            if 'description' in result and result['description']:
-                output.append(f"*{result['description']}*\n")
-            if 'summary' in result and result['summary']:
-                output.append(f"{result['summary']}\n")
+            output.append(f"## {i}. {result.get('title', 'No Title')}\n")
+            if 'url' in result:
+                output.append(f"🔗 [Source]({result['url']})\n")
+            if 'summary' in result:
+                output.append(f"\n{result['summary']}\n")
             if 'key_points' in result and result['key_points']:
-                output.append("\nHighlights:\n")
-                for point in result['key_points']:
-                    output.append(f"- {point}\n")
+                output.append("\nKey Takeaways:")
+                for point in result['key_points'][:3]:  # Limit to top 3 points per result
+                    output.append(f"• {point}")
             output.append("\n")
 
     # Add follow-up questions section
     if 'follow_up_questions' in results and results['follow_up_questions']:
-        output.append("# ❓ Related Questions\n")
+        output.append("# ❓ Suggested Follow-up Questions\n")
         for question in results['follow_up_questions']:
-            output.append(f"- {question}\n")
+            output.append(f"• {question}\n")
 
     return "\n".join(output)
 
 def search_and_format(query):
     """Search and format results"""
+    if not query.strip():
+        return "Please enter a search query"
+
     try:
         results = search(query)
-        return format_results(results)
+        return format_search_results(results)
     except Exception as e:
-        return f"❌ Error: {str(e)}"
+        return f"❌ Error performing search: {str(e)}"
 
-# Create the Gradio interface
-interface = gr.Interface(
+# Create Gradio interface
+iface = gr.Interface(
     fn=search_and_format,
     inputs=gr.Textbox(
         label="Enter your search query",
-        placeholder="What would you like to learn about?",
-        lines=2
-    ),
-    outputs=gr.Markdown(
-        label="Search Results",
-        show_label=True
+        placeholder="Example: Latest developments in quantum computing"
     ),
-    title="🔍 AI-Powered Web Search",
+    outputs=gr.Markdown(label="Search Results"),
+    title="AI-Powered Research Assistant",
     description="""
-    This search engine uses AI to:
-    - Find relevant web pages
-    - Extract key information
-    - Generate insights and summaries
-    - Suggest follow-up questions
+    This tool helps you research topics by:
+    1. Finding relevant information from multiple sources
+    2. Summarizing key findings
+    3. Extracting important points
+    4. Suggesting follow-up questions
+
+    Try searching for topics in technology, science, or any other field!
     """,
     examples=[
-        ["What is quantum computing?"],
-        ["Latest developments in artificial intelligence"],
-        ["How does blockchain technology work?"],
-        ["Explain machine learning in simple terms"],
+        ["Latest developments in quantum computing"],
+        ["Artificial intelligence breakthroughs"],
+        ["Climate change solutions"],
+        ["Space exploration advancements"],
     ],
     theme=gr.themes.Soft()
 )
 
-# Launch the app
-if __name__ == "__main__":
-    interface.launch()
+# Launch for Spaces
+iface.launch()
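To sanity-check the new formatter by itself, a minimal sketch follows; the sample dict is invented, and the expected output mirrors the section headers built in the diff above. Note that with the `if __name__ == "__main__":` guard removed, importing app.py now launches the interface immediately, so this is best pasted into a REPL alongside the function rather than imported.

```python
# Hypothetical input; only the key names match what app.py expects.
sample = {
    'insights': 'Quantum hardware is scaling steadily.',
    'key_points': ['Error correction is improving'],
    'results': [{
        'title': 'Quantum Computing in 2024',
        'url': 'https://example.com/qc',
        'summary': 'An overview of recent progress.',
        'key_points': ['Logical qubits demonstrated'],
    }],
    'follow_up_questions': ['What changed since 2023?'],
}

print(format_search_results(sample))
# Prints, roughly:
#   # 🔍 Latest Developments Summary
#   Quantum hardware is scaling steadily.
#   # 💡 Key Points
#   • Error correction is improving
#   # 📄 Detailed Findings
#   ## 1. Quantum Computing in 2024
#   🔗 [Source](https://example.com/qc)
#   ...
```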
 
search_engine.py CHANGED
@@ -50,104 +50,95 @@ class ContentProcessor:
         # Remove extra whitespace
         text = ' '.join(text.split())
         # Remove common navigation elements
-        nav_patterns = [
+        nav_elements = [
             "skip to content",
-            "skip to navigation",
             "search",
             "menu",
+            "navigation",
             "subscribe",
             "sign in",
             "log in",
-            "browse",
             "submit",
+            "browse",
+            "explore",
         ]
-        for pattern in nav_patterns:
-            text = text.replace(pattern.lower(), "")
+        for element in nav_elements:
+            text = text.replace(element.lower(), "")
         return text.strip()
 
-    def extract_main_content(self, soup: BeautifulSoup) -> str:
-        """Extract main content from HTML"""
-        # Remove navigation, headers, footers
-        for elem in soup.find_all(['nav', 'header', 'footer', 'aside', 'script', 'style']):
-            elem.decompose()
-
-        # Try to find main content container
-        main_content = None
-        for tag in ['main', 'article', 'div[role="main"]', '.main-content', '#main-content']:
-            main_content = soup.select_one(tag)
-            if main_content:
-                break
-
-        if not main_content:
-            # Fallback to body content
-            main_content = soup.find('body')
-
-        if main_content:
-            text = main_content.get_text(separator=' ', strip=True)
-        else:
-            # Last resort: get all text
-            text = soup.get_text(separator=' ', strip=True)
-
-        return self.clean_text(text)
-
-    def extract_key_points(self, text: str, max_points: int = 5) -> List[str]:
-        """Extract key points from text using AI"""
-        try:
-            # Split text into smaller chunks
-            chunk_size = 1024
-            chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
-
-            all_points = []
-            for chunk in chunks[:3]:  # Process first 3 chunks to keep it manageable
-                summary = self.model_manager.models['summarizer'](
-                    chunk,
-                    max_length=100,
-                    min_length=30,
-                    do_sample=False
-                )[0]['summary_text']
-
-                # Split summary into sentences
-                points = [s.strip() for s in summary.split('.') if s.strip()]
-                all_points.extend(points)
-
-            # Return top points
-            return all_points[:max_points]
-
-        except Exception as e:
-            logger.error(f"Error extracting key points: {str(e)}")
-            return []
-
-    def process_content(self, content: str, soup: BeautifulSoup = None) -> Dict:
-        """Process content and generate insights"""
+    def extract_main_content(self, content: str) -> str:
+        """Extract main content from webpage text"""
+        # Split into paragraphs
+        paragraphs = [p.strip() for p in content.split('\n') if p.strip()]
+
+        # Filter out short lines and navigation elements
+        meaningful_paragraphs = []
+        for p in paragraphs:
+            # Skip if too short
+            if len(p.split()) < 5:
+                continue
+            # Skip if looks like navigation
+            if any(nav in p.lower() for nav in ["→", "↓", "menu", "search", "click"]):
+                continue
+            meaningful_paragraphs.append(p)
+
+        # Join remaining paragraphs
+        return ' '.join(meaningful_paragraphs)
+
+    def generate_insights(self, content: str) -> Dict[str, str]:
+        """Generate insights from content using AI"""
         try:
-            # Extract main content if HTML is available
-            if soup:
-                content = self.extract_main_content(soup)
-            else:
-                content = self.clean_text(content)
-
-            # Extract key points
-            key_points = self.extract_key_points(content)
-
-            # Generate overall summary
+            # Clean the content first
+            cleaned_content = self.clean_text(content)
+            main_content = self.extract_main_content(cleaned_content)
+
+            if not main_content:
+                return {
+                    'summary': "Could not extract meaningful content",
+                    'key_points': [],
+                    'content': content
+                }
+
+            # Generate summary
             summary = self.model_manager.models['summarizer'](
-                content[:1024],
+                main_content[:1024],
                 max_length=150,
                 min_length=50,
                 do_sample=False
             )[0]['summary_text']
 
+            # Extract key points using the same model
+            key_points_text = self.model_manager.models['summarizer'](
+                main_content[:1024],
+                max_length=200,
+                min_length=100,
+                num_beams=4,
+                do_sample=True
+            )[0]['summary_text']
+
+            # Split into bullet points
+            key_points = [
+                point.strip()
+                for point in key_points_text.split('.')
+                if point.strip() and len(point.split()) > 3
+            ]
+
             return {
                 'summary': summary,
                 'key_points': key_points,
-                'content': content
+                'content': main_content
             }
+
         except Exception as e:
             return {
                 'summary': f"Error processing content: {str(e)}",
                 'key_points': [],
                 'content': content
             }
+
+    def process_content(self, content: str) -> Dict:
+        """Process content and generate insights"""
+        return self.generate_insights(content)
 
 class WebSearchEngine:
     """Main search engine class"""
@@ -222,12 +213,20 @@ class WebSearchEngine:
         response = self.safe_get(url)
         soup = BeautifulSoup(response.text, 'lxml')
 
-        # Process content with HTML context
-        processed = self.processor.process_content("", soup)
+        # Extract text content
+        for script in soup(["script", "style"]):
+            script.decompose()
+        text = soup.get_text()
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+        content = ' '.join(chunk for chunk in chunks if chunk)
 
         # Get metadata
         metadata = self.get_metadata(soup)
 
+        # Process content
+        processed = self.processor.process_content(content)
+
         return {
             'url': url,
             'title': metadata['title'],
@@ -305,41 +304,49 @@ class WebSearchEngine:
                 return {'error': 'No results found'}
 
             results = []
-            all_key_points = []
+            all_insights = []
 
             for result in search_results:
                 if 'link' in result:
                     processed = self.process_url(result['link'])
                     if 'error' not in processed:
+                        # Add the snippet to help with context
+                        processed['snippet'] = result.get('snippet', '')
                         results.append(processed)
+
+                        # Collect insights
+                        if 'summary' in processed:
+                            all_insights.append(processed['summary'])
                        if 'key_points' in processed:
-                            all_key_points.extend(processed['key_points'])
+                            all_insights.extend(processed.get('key_points', []))
+
                     time.sleep(random.uniform(0.5, 1.0))
 
             if not results:
                 return {'error': 'Failed to process any search results'}
 
-            # Combine all summaries and key points
-            all_summaries = [r['summary'] for r in results if 'summary' in r]
-            combined_summary = " ".join(all_summaries)
-
-            # Generate final insights
+            # Combine and summarize all insights
+            combined_insights = ' '.join(all_insights)
             final_summary = self.processor.model_manager.models['summarizer'](
-                combined_summary[:1024],
+                combined_insights[:1024],
                 max_length=200,
                 min_length=100,
                 do_sample=False
             )[0]['summary_text']
 
+            # Generate specific follow-up questions
+            follow_ups = [
+                f"What are the recent breakthroughs in {query}?",
+                f"How does {query} impact industry and research?",
+                f"What are the challenges and limitations in {query}?",
+                f"What are the future prospects for {query}?"
+            ]
+
             return {
                 'results': results,
                 'insights': final_summary,
-                'key_points': list(set(all_key_points)),  # Remove duplicates
-                'follow_up_questions': [
-                    f"What are the key differences between {query} and related topics?",
-                    f"Can you explain {query} in simple terms?",
-                    f"What are the latest developments in {query}?"
-                ]
+                'key_points': list(set(all_insights)),  # Remove duplicates
+                'follow_up_questions': follow_ups
             }
 
         except Exception as e:
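For reference, the two summarizer passes that `generate_insights` now makes can be tried in isolation. A minimal sketch, assuming `models['summarizer']` is a Hugging Face summarization pipeline; the checkpoint name and sample text here are assumptions, not part of the commit:

```python
from transformers import pipeline

# Stand-in for model_manager.models['summarizer']; the diff never names the
# underlying model, so this checkpoint is an assumption.
summarizer = pipeline('summarization', model='sshleifer/distilbart-cnn-12-6')

# Invented sample standing in for cleaned page content.
main_content = (
    "Quantum processors crossed the thousand-qubit mark this year, while "
    "error-correction experiments showed logical qubits outperforming "
    "physical ones for the first time. Hybrid quantum-classical workflows "
    "are expected to dominate near-term applications."
)

# Pass 1: deterministic summary (parameters match the diff).
summary = summarizer(main_content[:1024], max_length=150, min_length=50,
                     do_sample=False)[0]['summary_text']

# Pass 2: longer, sampled output that is then split into key points.
key_points_text = summarizer(main_content[:1024], max_length=200, min_length=100,
                             num_beams=4, do_sample=True)[0]['summary_text']
key_points = [p.strip() for p in key_points_text.split('.')
              if p.strip() and len(p.split()) > 3]
```

One caveat worth noting: both passes truncate their input to the first 1024 characters, so long pages contribute only their opening text to the summary and key points.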