fikird committed

Commit 3f90511 · Parent(s): 2f58cc7

Enhance content processing and improve result formatting

Files changed (2):
  1. app.py +74 -114
  2. search_engine.py +89 -106
app.py CHANGED
@@ -1,126 +1,86 @@
 import gradio as gr
-from rag_engine import RAGEngine
-import torch
-import os
-import logging
-import traceback
-
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger(__name__)
-
-def safe_search(query, max_results):
-    """Wrapper function to handle errors gracefully"""
-    try:
-        rag = RAGEngine()
-        results = rag.search_and_process(query, max_results)
-
-        if 'error' in results:
-            return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{results['error']}\n```"
-
-        return format_results(results)
-    except Exception as e:
-        error_msg = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
-        logger.error(error_msg)
-        return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"
+from search_engine import search
 
 def format_results(results):
-    """Format search results for display"""
-    if not results or not results.get('results'):
-        return "# ⚠️ No Results\nNo search results were found. Please try a different query."
-
-    formatted = f"# 🔍 Search Results\n\n"
+    """Format search results in a user-friendly way"""
+    if 'error' in results:
+        return f"❌ Error: {results['error']}"
+
+    output = []
 
     # Add insights section
-    if 'insights' in results:
-        formatted += f"## 💡 Key Insights\n{results['insights']}\n\n"
+    if 'insights' in results and results['insights']:
+        output.append("# 💡 Key Insights\n")
+        output.append(results['insights'])
+        output.append("\n")
 
-    # Add follow-up questions
-    if 'follow_up_questions' in results:
-        formatted += "## ❓ Follow-up Questions\n"
-        for q in results['follow_up_questions']:
-            if q and q.strip():
-                formatted += f"- {q.strip()}\n"
-        formatted += "\n"
+    # Add key points section
+    if 'key_points' in results and results['key_points']:
+        output.append("# 🎯 Key Points\n")
+        for i, point in enumerate(results['key_points'], 1):
+            output.append(f"{i}. {point}\n")
+        output.append("\n")
 
-    # Add main results
-    if 'results' in results:
-        formatted += "## 📄 Detailed Results\n\n"
+    # Add detailed results section
+    if 'results' in results and results['results']:
+        output.append("# 📄 Detailed Results\n")
         for i, result in enumerate(results['results'], 1):
-            if not isinstance(result, dict):
-                continue
-
-            formatted += f"### {i}. "
-            if 'url' in result:
-                title = result.get('title', 'Untitled')
-                formatted += f"[{title}]({result['url']})\n"
-            if 'summary' in result:
-                formatted += f"\n{result['summary']}\n\n"
-
-    # Add similar chunks if available
-    if 'similar_chunks' in results:
-        formatted += "## 🔍 Related Content\n\n"
-        for i, chunk in enumerate(results['similar_chunks'], 1):
-            if not isinstance(chunk, dict):
-                continue
-
-            formatted += f"### Related {i}\n"
-            if 'metadata' in chunk:
-                meta = chunk['metadata']
-                if 'title' in meta and 'url' in meta:
-                    formatted += f"From [{meta['title']}]({meta['url']})\n"
-            if 'content' in chunk:
-                formatted += f"\n{chunk['content'][:200]}...\n\n"
+            output.append(f"## {i}. [{result['title']}]({result['url']})\n")
+            if 'description' in result and result['description']:
+                output.append(f"*{result['description']}*\n")
+            if 'summary' in result and result['summary']:
+                output.append(f"{result['summary']}\n")
+            if 'key_points' in result and result['key_points']:
+                output.append("\nHighlights:\n")
+                for point in result['key_points']:
+                    output.append(f"- {point}\n")
+            output.append("\n")
 
-    return formatted
-
-def create_demo():
-    """Create the Gradio interface"""
-
-    with gr.Blocks(title="Web Search + RAG") as demo:
-        gr.Markdown("# 🔍 Intelligent Web Search")
-        gr.Markdown("Search the web with AI-powered insights and analysis.")
-
-        with gr.Row():
-            with gr.Column():
-                query = gr.Textbox(
-                    label="Search Query",
-                    placeholder="Enter your search query...",
-                    lines=2
-                )
-                max_results = gr.Slider(
-                    minimum=1,
-                    maximum=10,
-                    value=5,
-                    step=1,
-                    label="Number of Results"
-                )
-                search_button = gr.Button("🔍 Search")
-
-        output = gr.Markdown()
-
-        search_button.click(
-            fn=safe_search,
-            inputs=[query, max_results],
-            outputs=output
-        )
-
-        gr.Examples(
-            examples=[
-                ["What is RAG in AI?", 5],
-                ["Latest developments in quantum computing", 3],
-                ["How does BERT work?", 5]
-            ],
-            inputs=[query, max_results]
-        )
-
-    return demo
-
-# Create the demo
-demo = create_demo()
-
-# Launch for Spaces
-demo.launch()
+    # Add follow-up questions section
+    if 'follow_up_questions' in results and results['follow_up_questions']:
+        output.append("# ❓ Related Questions\n")
+        for question in results['follow_up_questions']:
+            output.append(f"- {question}\n")
+
+    return "\n".join(output)
+
+def search_and_format(query):
+    """Search and format results"""
+    try:
+        results = search(query)
+        return format_results(results)
+    except Exception as e:
+        return f"❌ Error: {str(e)}"
+
+# Create the Gradio interface
+interface = gr.Interface(
+    fn=search_and_format,
+    inputs=gr.Textbox(
+        label="Enter your search query",
+        placeholder="What would you like to learn about?",
+        lines=2
+    ),
+    outputs=gr.Markdown(
+        label="Search Results",
+        show_label=True
+    ),
+    title="🔍 AI-Powered Web Search",
+    description="""
+    This search engine uses AI to:
+    - Find relevant web pages
+    - Extract key information
+    - Generate insights and summaries
+    - Suggest follow-up questions
+    """,
+    examples=[
+        ["What is quantum computing?"],
+        ["Latest developments in artificial intelligence"],
+        ["How does blockchain technology work?"],
+        ["Explain machine learning in simple terms"],
+    ],
+    theme=gr.themes.Soft()
+)
+
+# Launch the app
+if __name__ == "__main__":
+    interface.launch()
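
A quick way to sanity-check the new renderer is to call format_results on a small hand-built dict. A minimal sketch with hypothetical data, using only the keys the function actually reads (importing from app should be safe here because interface.launch() sits behind the __main__ guard):

from app import format_results

# Hypothetical payload, shaped like the dict search_engine.search() returns
sample = {
    'insights': 'Qubits can represent 0 and 1 at the same time.',
    'key_points': ['Superposition enables parallel evaluation'],
    'results': [{
        'title': 'Quantum computing',
        'url': 'https://example.com/quantum',
        'description': 'A short introduction',
        'summary': 'An overview of qubits and gates.',
        'key_points': ['Qubits', 'Entanglement'],
    }],
    'follow_up_questions': ['What is a qubit?'],
}

print(format_results(sample))  # prints the Markdown the app shows in its output pane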
search_engine.py CHANGED
@@ -50,101 +50,102 @@ class ContentProcessor:
         # Remove extra whitespace
         text = ' '.join(text.split())
         # Remove common navigation elements
-        nav_elements = [
-            "Skip to content",
-            "Search",
-            "Menu",
-            "Navigation",
-            "Subscribe",
-            "Browse",
-            "Submit",
-            "More",
-            "About",
-            "Contact",
-            "Privacy Policy",
-            "Terms of Use"
+        nav_patterns = [
+            "skip to content",
+            "skip to navigation",
+            "search",
+            "menu",
+            "subscribe",
+            "sign in",
+            "log in",
+            "browse",
+            "submit",
         ]
-        for element in nav_elements:
-            text = text.replace(element, "")
+        for pattern in nav_patterns:
+            text = text.replace(pattern.lower(), "")
         return text.strip()
 
     def extract_main_content(self, soup: BeautifulSoup) -> str:
         """Extract main content from HTML"""
         # Remove navigation, headers, footers
-        for elem in soup.find_all(['nav', 'header', 'footer', 'script', 'style', 'meta', 'link']):
+        for elem in soup.find_all(['nav', 'header', 'footer', 'aside', 'script', 'style']):
             elem.decompose()
 
         # Try to find main content container
         main_content = None
-        content_tags = ['article', 'main', '[role="main"]', '.content', '#content', '.post', '.entry']
-
-        for tag in content_tags:
+        for tag in ['main', 'article', 'div[role="main"]', '.main-content', '#main-content']:
            main_content = soup.select_one(tag)
            if main_content:
                break
 
         if not main_content:
-            main_content = soup
-
-        # Extract text from paragraphs
-        paragraphs = main_content.find_all('p')
-        if paragraphs:
-            return ' '.join(p.get_text(strip=True) for p in paragraphs)
-
-        # Fallback to all text if no paragraphs found
-        return main_content.get_text(strip=True)
-
-    def process_content(self, content: str, html_content: str = None) -> Dict:
-        """Process content and generate insights"""
-        try:
-            # Clean content
-            cleaned_content = self.clean_text(content)
-
-            # If HTML content is provided, try to extract main content
-            if html_content:
-                soup = BeautifulSoup(html_content, 'lxml')
-                main_content = self.extract_main_content(soup)
-                if main_content:
-                    cleaned_content = self.clean_text(main_content)
-
-            # Generate summary in chunks if content is too long
-            chunks = [cleaned_content[i:i+1024] for i in range(0, len(cleaned_content), 1024)]
-            summaries = []
-
-            for chunk in chunks[:3]:  # Process up to 3 chunks to avoid too long processing
-                try:
-                    summary = self.model_manager.models['summarizer'](
-                        chunk,
-                        max_length=150,
-                        min_length=50,
-                        do_sample=False
-                    )[0]['summary_text']
-                    summaries.append(summary)
-                except Exception as e:
-                    logger.warning(f"Error summarizing chunk: {str(e)}")
-                    continue
-
-            # Combine summaries
-            final_summary = ' '.join(summaries)
-
-            # Extract key points using bullet points
-            key_points = self.model_manager.models['summarizer'](
-                cleaned_content[:1024],
-                max_length=100,
-                min_length=30,
-                num_beams=4,
-                do_sample=True
+            # Fallback to body content
+            main_content = soup.find('body')
+
+        if main_content:
+            text = main_content.get_text(separator=' ', strip=True)
+        else:
+            # Last resort: get all text
+            text = soup.get_text(separator=' ', strip=True)
+
+        return self.clean_text(text)
+
+    def extract_key_points(self, text: str, max_points: int = 5) -> List[str]:
+        """Extract key points from text using AI"""
+        try:
+            # Split text into smaller chunks
+            chunk_size = 1024
+            chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
+
+            all_points = []
+            for chunk in chunks[:3]:  # Process first 3 chunks to keep it manageable
+                summary = self.model_manager.models['summarizer'](
+                    chunk,
+                    max_length=100,
+                    min_length=30,
+                    do_sample=False
+                )[0]['summary_text']
+
+                # Split summary into sentences
+                points = [s.strip() for s in summary.split('.') if s.strip()]
+                all_points.extend(points)
+
+            # Return top points
+            return all_points[:max_points]
+
+        except Exception as e:
+            logger.error(f"Error extracting key points: {str(e)}")
+            return []
+
+    def process_content(self, content: str, soup: BeautifulSoup = None) -> Dict:
+        """Process content and generate insights"""
+        try:
+            # Extract main content if HTML is available
+            if soup:
+                content = self.extract_main_content(soup)
+            else:
+                content = self.clean_text(content)
+
+            # Extract key points
+            key_points = self.extract_key_points(content)
+
+            # Generate overall summary
+            summary = self.model_manager.models['summarizer'](
+                content[:1024],
+                max_length=150,
+                min_length=50,
+                do_sample=False
             )[0]['summary_text']
 
             return {
-                'summary': final_summary,
+                'summary': summary,
                 'key_points': key_points,
-                'content': cleaned_content
+                'content': content
             }
         except Exception as e:
             return {
                 'summary': f"Error processing content: {str(e)}",
-                'key_points': "",
+                'key_points': [],
                 'content': content
             }
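
The selector fallback in the new extract_main_content can be exercised in isolation. A minimal sketch with a toy page (hypothetical HTML; html.parser swapped in for lxml so it runs without the extra dependency):

from bs4 import BeautifulSoup

html = """
<html><body>
  <nav>Menu Search</nav>
  <article><p>Qubits can hold superpositions of 0 and 1.</p></article>
  <footer>Privacy Policy</footer>
</body></html>
"""

soup = BeautifulSoup(html, 'html.parser')

# Drop chrome elements first, as the committed code does
for elem in soup.find_all(['nav', 'header', 'footer', 'aside', 'script', 'style']):
    elem.decompose()

# Walk the selector list until one matches; <article> wins for this page
main_content = None
for tag in ['main', 'article', 'div[role="main"]', '.main-content', '#main-content']:
    main_content = soup.select_one(tag)
    if main_content:
        break

print(main_content.get_text(separator=' ', strip=True))
# Qubits can hold superpositions of 0 and 1.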
 
@@ -221,15 +222,12 @@ class WebSearchEngine:
             response = self.safe_get(url)
             soup = BeautifulSoup(response.text, 'lxml')
 
+            # Process content with HTML context
+            processed = self.processor.process_content("", soup)
+
             # Get metadata
             metadata = self.get_metadata(soup)
 
-            # Process content with both text and HTML
-            processed = self.processor.process_content(
-                soup.get_text(),
-                html_content=response.text
-            )
-
             return {
                 'url': url,
                 'title': metadata['title'],
@@ -242,35 +240,6 @@ class WebSearchEngine:
         except Exception as e:
             return {'error': f"Error processing {url}: {str(e)}"}
 
-    def format_results(self, results: List[Dict]) -> Dict:
-        """Format search results in a user-friendly way"""
-        formatted_insights = []
-        formatted_results = []
-
-        for result in results:
-            if 'error' not in result:
-                # Format key points
-                if result.get('key_points'):
-                    points = result['key_points'].split('. ')
-                    formatted_points = [f"• {point.strip()}" for point in points if point.strip()]
-                    formatted_insights.extend(formatted_points)
-
-                # Format detailed result
-                formatted_result = {
-                    'title': result['title'],
-                    'url': result['url'],
-                    'summary': result['summary'],
-                }
-                formatted_results.append(formatted_result)
-
-        # Remove duplicates while preserving order
-        formatted_insights = list(dict.fromkeys(formatted_insights))
-
-        return {
-            'insights': '\n'.join(formatted_insights[:10]),  # Top 10 insights
-            'results': formatted_results
-        }
-
     def search_duckduckgo(self, query: str, max_results: int = 5) -> List[Dict]:
         """Search DuckDuckGo and parse HTML results"""
         search_results = []
@@ -336,26 +305,40 @@ class WebSearchEngine:
             return {'error': 'No results found'}
 
         results = []
+        all_key_points = []
+
         for result in search_results:
             if 'link' in result:
                 processed = self.process_url(result['link'])
                 if 'error' not in processed:
                     results.append(processed)
+                    if 'key_points' in processed:
+                        all_key_points.extend(processed['key_points'])
                 time.sleep(random.uniform(0.5, 1.0))
 
         if not results:
             return {'error': 'Failed to process any search results'}
 
-        # Format results in a user-friendly way
-        formatted = self.format_results(results)
+        # Combine all summaries and key points
+        all_summaries = [r['summary'] for r in results if 'summary' in r]
+        combined_summary = " ".join(all_summaries)
+
+        # Generate final insights
+        final_summary = self.processor.model_manager.models['summarizer'](
+            combined_summary[:1024],
+            max_length=200,
+            min_length=100,
+            do_sample=False
+        )[0]['summary_text']
 
         return {
-            'results': formatted['results'],
-            'insights': formatted['insights'],
+            'results': results,
+            'insights': final_summary,
+            'key_points': list(set(all_key_points)),  # Remove duplicates
             'follow_up_questions': [
-                f"What are the recent breakthroughs in {query}?",
-                f"How does {query} impact various industries?",
-                f"What are the future prospects of {query}?"
+                f"What are the key differences between {query} and related topics?",
+                f"Can you explain {query} in simple terms?",
+                f"What are the latest developments in {query}?"
             ]
         }
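
One behavioral note on the aggregation above: the deleted format_results deduplicated insights with dict.fromkeys, which keeps first-seen order, while the new code deduplicates key points with list(set(...)), which does not guarantee any order. A quick illustration:

points = ['uses qubits', 'runs in superposition', 'uses qubits']

print(list(dict.fromkeys(points)))  # ['uses qubits', 'runs in superposition'], first-seen order kept
print(list(set(points)))            # same two items, in arbitrary order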
 
 