fikird committed on
Commit 636f8ae · 1 Parent(s): 68c6844

Enhance content processing with better extraction and summarization

Files changed (2):
  1. app.py +110 -70
  2. search_engine.py +68 -93
app.py CHANGED
@@ -1,86 +1,126 @@
 import gradio as gr
-from search_engine import search
-
-def format_search_results(results):
-    """Format search results into a clean markdown output"""
-    if 'error' in results:
-        return f"❌ Error: {results['error']}"
-
-    output = []
-
     # Add insights section
-    if 'insights' in results and results['insights']:
-        output.append("# 🔍 Latest Developments Summary\n")
-        output.append(results['insights'])
-        output.append("\n")
-
-    # Add key points section
-    if 'key_points' in results and results['key_points']:
-        output.append("# 💡 Key Points\n")
-        for point in results['key_points'][:5]:  # Limit to top 5 points
-            output.append(f"• {point}\n")
-        output.append("\n")
-
-    # Add detailed results section
-    if 'results' in results and results['results']:
-        output.append("# 📄 Detailed Findings\n")
         for i, result in enumerate(results['results'], 1):
-            output.append(f"## {i}. {result.get('title', 'No Title')}\n")
             if 'url' in result:
-                output.append(f"🔗 [Source]({result['url']})\n")
             if 'summary' in result:
-                output.append(f"\n{result['summary']}\n")
-            if 'key_points' in result and result['key_points']:
-                output.append("\nKey Takeaways:")
-                for point in result['key_points'][:3]:  # Limit to top 3 points per result
-                    output.append(f"• {point}")
-                output.append("\n")
-
-    # Add follow-up questions section
-    if 'follow_up_questions' in results and results['follow_up_questions']:
-        output.append("# ❓ Suggested Follow-up Questions\n")
-        for question in results['follow_up_questions']:
-            output.append(f"• {question}\n")
-
-    return "\n".join(output)
-
-def search_and_format(query):
-    """Search and format results"""
-    if not query.strip():
-        return "Please enter a search query"
-
-    try:
-        results = search(query)
-        return format_search_results(results)
-    except Exception as e:
-        return f"❌ Error performing search: {str(e)}"
-
-# Create Gradio interface
-iface = gr.Interface(
-    fn=search_and_format,
-    inputs=gr.Textbox(
-        label="Enter your search query",
-        placeholder="Example: Latest developments in quantum computing"
-    ),
-    outputs=gr.Markdown(label="Search Results"),
-    title="AI-Powered Research Assistant",
-    description="""
-    This tool helps you research topics by:
-    1. Finding relevant information from multiple sources
-    2. Summarizing key findings
-    3. Extracting important points
-    4. Suggesting follow-up questions
-
-    Try searching for topics in technology, science, or any other field!
-    """,
-    examples=[
-        ["Latest developments in quantum computing"],
-        ["Artificial intelligence breakthroughs"],
-        ["Climate change solutions"],
-        ["Space exploration advancements"],
-    ],
-    theme=gr.themes.Soft()
-)
 # Launch for Spaces
-iface.launch()
 import gradio as gr
+from rag_engine import RAGEngine
+import torch
+import os
+import logging
+import traceback
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+def safe_search(query, max_results):
+    """Wrapper function to handle errors gracefully"""
+    try:
+        rag = RAGEngine()
+        results = rag.search_and_process(query, max_results)
+
+        if 'error' in results:
+            return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{results['error']}\n```"
+
+        return format_results(results)
+    except Exception as e:
+        error_msg = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
+        logger.error(error_msg)
+        return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"
+
+def format_results(results):
+    """Format search results for display"""
+    if not results or not results.get('results'):
+        return "# ⚠️ No Results\nNo search results were found. Please try a different query."
+
+    formatted = f"# 🔍 Search Results\n\n"
+
     # Add insights section
+    if 'insights' in results:
+        formatted += f"## 💡 Key Insights\n{results['insights']}\n\n"
+
+    # Add follow-up questions
+    if 'follow_up_questions' in results:
+        formatted += "## ❓ Follow-up Questions\n"
+        for q in results['follow_up_questions']:
+            if q and q.strip():
+                formatted += f"- {q.strip()}\n"
+        formatted += "\n"
+
+    # Add main results
+    if 'results' in results:
+        formatted += "## 📄 Detailed Results\n\n"
     for i, result in enumerate(results['results'], 1):
+        if not isinstance(result, dict):
+            continue
+
+        formatted += f"### {i}. "
         if 'url' in result:
+            title = result.get('title', 'Untitled')
+            formatted += f"[{title}]({result['url']})\n"
         if 'summary' in result:
+            formatted += f"\n{result['summary']}\n\n"
+
+    # Add similar chunks if available
+    if 'similar_chunks' in results:
+        formatted += "## 🔍 Related Content\n\n"
+        for i, chunk in enumerate(results['similar_chunks'], 1):
+            if not isinstance(chunk, dict):
+                continue
+
+            formatted += f"### Related {i}\n"
+            if 'metadata' in chunk:
+                meta = chunk['metadata']
+                if 'title' in meta and 'url' in meta:
+                    formatted += f"From [{meta['title']}]({meta['url']})\n"
+            if 'content' in chunk:
+                formatted += f"\n{chunk['content'][:200]}...\n\n"
+
+    return formatted
+
+def create_demo():
+    """Create the Gradio interface"""
+
+    with gr.Blocks(title="Web Search + RAG") as demo:
+        gr.Markdown("# 🔍 Intelligent Web Search")
+        gr.Markdown("Search the web with AI-powered insights and analysis.")
+
+        with gr.Row():
+            with gr.Column():
+                query = gr.Textbox(
+                    label="Search Query",
+                    placeholder="Enter your search query...",
+                    lines=2
+                )
+                max_results = gr.Slider(
+                    minimum=1,
+                    maximum=10,
+                    value=5,
+                    step=1,
+                    label="Number of Results"
+                )
+                search_button = gr.Button("🔍 Search")
+
+        output = gr.Markdown()
+
+        search_button.click(
+            fn=safe_search,
+            inputs=[query, max_results],
+            outputs=output
+        )
+
+        gr.Examples(
+            examples=[
+                ["What is RAG in AI?", 5],
+                ["Latest developments in quantum computing", 3],
+                ["How does BERT work?", 5]
+            ],
+            inputs=[query, max_results]
+        )
+
+    return demo
+
+# Create the demo
+demo = create_demo()
+
 # Launch for Spaces
+demo.launch()
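
Note that rag_engine.py is not part of this commit, so the contract app.py now relies on has to be inferred from the diff: `safe_search` instantiates `RAGEngine` and calls `search_and_process(query, max_results)`, and `format_results` reads the keys `results`, `insights`, `follow_up_questions`, and (optionally) `similar_chunks` from the returned dict. A minimal sketch of a stub satisfying that inferred shape (the class name, URLs, and sample values here are hypothetical, not from the repo):

```python
# Hypothetical stand-in for the RAGEngine interface app.py now assumes.
# rag_engine.py is not in this commit, so the return shape is inferred
# from the keys format_results() reads; treat this as a test stub.

class StubRAGEngine:
    """Stands in for rag_engine.RAGEngine when testing the UI offline."""

    def search_and_process(self, query: str, max_results: int) -> dict:
        # format_results() reads 'results', 'insights',
        # 'follow_up_questions', and optionally 'similar_chunks'.
        return {
            'results': [{
                'url': 'https://example.com',
                'title': f'Result about {query}',
                'summary': f'A short summary of {query}.',
            }][:max_results],
            'insights': f'Aggregated insights about {query}.',
            'follow_up_questions': [f'How is {query} used in practice?'],
            'similar_chunks': [{
                'metadata': {'title': 'Source page', 'url': 'https://example.com'},
                'content': 'A related passage from the indexed pages. ' * 8,
            }],
        }

if __name__ == '__main__':
    stub = StubRAGEngine()
    print(sorted(stub.search_and_process('quantum computing', 5).keys()))
```

Swapping such a stub in for `RAGEngine` (for example, by monkeypatching it in a test) exercises `safe_search` and `format_results` without loading models or touching the network.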
search_engine.py CHANGED
@@ -49,96 +49,78 @@ class ContentProcessor:
         """Clean and normalize text content"""
         # Remove extra whitespace
         text = ' '.join(text.split())
-        # Remove common navigation elements
-        nav_elements = [
-            "skip to content",
-            "search",
-            "menu",
-            "navigation",
-            "subscribe",
-            "sign in",
-            "log in",
-            "submit",
-            "browse",
-            "explore",
-        ]
-        for element in nav_elements:
-            text = text.replace(element.lower(), "")
-        return text.strip()
-
-    def extract_main_content(self, content: str) -> str:
-        """Extract main content from webpage text"""
-        # Split into paragraphs
-        paragraphs = [p.strip() for p in content.split('\n') if p.strip()]
-
-        # Filter out short lines and navigation elements
-        meaningful_paragraphs = []
-        for p in paragraphs:
-            # Skip if too short
-            if len(p.split()) < 5:
-                continue
-            # Skip if looks like navigation
-            if any(nav in p.lower() for nav in ["→", "↓", "menu", "search", "click"]):
-                continue
-            meaningful_paragraphs.append(p)
-
-        # Join remaining paragraphs
-        return ' '.join(meaningful_paragraphs)
-
-    def generate_insights(self, content: str) -> Dict[str, str]:
-        """Generate insights from content using AI"""
         try:
-            # Clean the content first
             cleaned_content = self.clean_text(content)
-            main_content = self.extract_main_content(cleaned_content)
-
-            if not main_content:
-                return {
-                    'summary': "Could not extract meaningful content",
-                    'key_points': [],
-                    'content': content
-                }
-
-            # Generate summary
-            summary = self.model_manager.models['summarizer'](
-                main_content[:1024],
-                max_length=150,
-                min_length=50,
-                do_sample=False
-            )[0]['summary_text']
-
-            # Extract key points using the same model
-            key_points_text = self.model_manager.models['summarizer'](
-                main_content[:1024],
                 max_length=200,
                 min_length=100,
                 num_beams=4,
-                do_sample=True
             )[0]['summary_text']
-
-            # Split into bullet points
-            key_points = [
-                point.strip()
-                for point in key_points_text.split('.')
-                if point.strip() and len(point.split()) > 3
-            ]
-
             return {
                 'summary': summary,
                 'key_points': key_points,
-                'content': main_content
             }
-
         except Exception as e:
             return {
                 'summary': f"Error processing content: {str(e)}",
                 'key_points': [],
                 'content': content
             }
-
-    def process_content(self, content: str) -> Dict:
-        """Process content and generate insights"""
-        return self.generate_insights(content)

 class WebSearchEngine:
     """Main search engine class"""
@@ -225,7 +207,7 @@ class WebSearchEngine:
         metadata = self.get_metadata(soup)

         # Process content
-        processed = self.processor.process_content(content)

         return {
             'url': url,
@@ -304,49 +286,42 @@ class WebSearchEngine:
             return {'error': 'No results found'}

         results = []
-        all_insights = []

         for result in search_results:
             if 'link' in result:
                 processed = self.process_url(result['link'])
                 if 'error' not in processed:
-                    # Add the snippet to help with context
                     processed['snippet'] = result.get('snippet', '')
                     results.append(processed)
-
-                    # Collect insights
-                    if 'summary' in processed:
-                        all_insights.append(processed['summary'])
                     if 'key_points' in processed:
-                        all_insights.extend(processed.get('key_points', []))
-
             time.sleep(random.uniform(0.5, 1.0))

         if not results:
             return {'error': 'Failed to process any search results'}

-        # Combine and summarize all insights
-        combined_insights = ' '.join(all_insights)
-        final_summary = self.processor.model_manager.models['summarizer'](
-            combined_insights[:1024],
-            max_length=200,
-            min_length=100,
-            do_sample=False
-        )[0]['summary_text']

-        # Generate specific follow-up questions
-        follow_ups = [
-            f"What are the recent breakthroughs in {query}?",
             f"How does {query} impact industry and research?",
-            f"What are the challenges and limitations in {query}?",
-            f"What are the future prospects for {query}?"
         ]

         return {
             'results': results,
-            'insights': final_summary,
-            'key_points': list(set(all_insights)),  # Remove duplicates
-            'follow_up_questions': follow_ups
         }

         except Exception as e:
         """Clean and normalize text content"""
         # Remove extra whitespace
         text = ' '.join(text.split())
+        # Remove redundant headers and navigation text
+        common_headers = ['skip to content', 'search', 'menu', 'navigation', 'subscribe']
+        lines = []
+        for line in text.split('\n'):
+            line = line.strip().lower()
+            if not any(header in line for header in common_headers) and len(line) > 20:
+                lines.append(line)
+        return ' '.join(lines)
+
+    def extract_key_points(self, content: str) -> List[str]:
+        """Extract key points from content using AI"""
+        try:
+            # Split content into chunks for processing
+            chunks = [content[i:i+1024] for i in range(0, len(content), 1024)]
+            key_points = []
+
+            for chunk in chunks:
+                # Generate focused summary for each chunk
+                summary = self.model_manager.models['summarizer'](
+                    chunk,
+                    max_length=150,
+                    min_length=50,
+                    do_sample=False,
+                    num_beams=4,
+                    length_penalty=2.0,
+                    early_stopping=True
+                )[0]['summary_text']
+
+                key_points.append(summary)
+
+            return key_points
+        except Exception as e:
+            logger.error(f"Error extracting key points: {str(e)}")
+            return []
+
+    def process_content(self, content: str, title: str = "", description: str = "") -> Dict:
+        """Process content and generate insights"""
         try:
+            # Clean the content
             cleaned_content = self.clean_text(content)
+
+            # Combine title and description with content for context
+            if title:
+                cleaned_content = f"{title}. {cleaned_content}"
+            if description:
+                cleaned_content = f"{description}. {cleaned_content}"
+
+            # Extract key points
+            key_points = self.extract_key_points(cleaned_content)
+
+            # Generate overall summary
+            summary = self.model_manager.models['summarizer'](
+                ' '.join(key_points)[:1024],
                 max_length=200,
                 min_length=100,
+                do_sample=False,
                 num_beams=4,
+                length_penalty=2.0,
+                early_stopping=True
             )[0]['summary_text']

             return {
                 'summary': summary,
                 'key_points': key_points,
+                'content': cleaned_content
             }
         except Exception as e:
             return {
                 'summary': f"Error processing content: {str(e)}",
                 'key_points': [],
                 'content': content
             }

 class WebSearchEngine:
     """Main search engine class"""

         metadata = self.get_metadata(soup)

         # Process content
+        processed = self.processor.process_content(content, metadata['title'], metadata['description'])

         return {
             'url': url,

             return {'error': 'No results found'}

         results = []
+        all_key_points = []

         for result in search_results:
             if 'link' in result:
                 processed = self.process_url(result['link'])
                 if 'error' not in processed:
+                    # Add original search snippet
                     processed['snippet'] = result.get('snippet', '')
                     results.append(processed)
+                    # Collect key points
                     if 'key_points' in processed:
+                        all_key_points.extend(processed['key_points'])

             time.sleep(random.uniform(0.5, 1.0))

         if not results:
             return {'error': 'Failed to process any search results'}

+        # Generate comprehensive insights
+        insights = []
+        if all_key_points:
+            # Group similar points and remove duplicates
+            unique_points = list(set(all_key_points))
+            insights = self.processor.extract_key_points(' '.join(unique_points))

+        # Generate relevant follow-up questions
+        follow_up_questions = [
+            f"What are the practical applications of {query}?",
             f"How does {query} impact industry and research?",
+            f"What challenges and limitations exist in {query}?",
+            f"What future developments are expected in {query}?"
         ]

         return {
             'results': results,
+            'insights': insights if insights else ["No comprehensive insights available."],
+            'follow_up_questions': follow_up_questions
         }

         except Exception as e:
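
The core change in `ContentProcessor` is a map-reduce summarization pass: `extract_key_points` summarizes fixed 1024-character chunks one at a time, and `process_content` then summarizes the concatenated chunk summaries. A standalone sketch of that pattern, assuming a Hugging Face `transformers` summarization pipeline (the checkpoint name is an assumption; the repo's `ModelManager` may load a different summarizer):

```python
# Sketch of the chunk-then-reduce summarization this commit introduces.
# Assumes the transformers summarization pipeline; the model checkpoint
# below is illustrative, not taken from this repo's ModelManager.
from transformers import pipeline

summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

def summarize_long(text: str, chunk_size: int = 1024) -> dict:
    if not text:
        return {"summary": "", "key_points": []}

    # Map step: one focused summary per chunk (mirrors extract_key_points).
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    key_points = [
        summarizer(
            chunk,
            max_length=150,
            min_length=50,
            do_sample=False,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True,
        )[0]["summary_text"]
        for chunk in chunks
    ]

    # Reduce step: summarize the joined chunk summaries
    # (mirrors process_content's overall summary).
    summary = summarizer(
        " ".join(key_points)[:chunk_size],
        max_length=200,
        min_length=100,
        do_sample=False,
        num_beams=4,
    )[0]["summary_text"]
    return {"summary": summary, "key_points": key_points}
```

Note that the reduce step keeps the same 1024-character truncation as the committed code, so for very long pages detail beyond the first few chunk summaries is still dropped; raising the cap or reducing hierarchically would be the natural next refinement.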