Spaces: Build error

fikird committed · ae8bccc
1 parent: 25a3d88

Improve content processing and result formatting

Browse files
- app.py +46 -21
- search_engine.py +117 -22
app.py
CHANGED
@@ -32,50 +32,72 @@ def format_results(results):
     if not results or not results.get('results'):
         return "# ⚠️ No Results\nNo search results were found. Please try a different query."
 
+    insights = results.get('insights', {})
+    output = []
+
+    # Main Summary
+    output.append("📝 Executive Summary")
+    output.append("-" * 50)
+    output.append(insights.get('main_summary', ''))
+    output.append("\n")
+
+    # Key Findings
+    output.append("🔑 Key Findings")
+    output.append("-" * 50)
+    for i, point in enumerate(insights.get('key_findings', []), 1):
+        output.append(f"{i}. {point}")
+    output.append("\n")
+
+    # Sources
+    output.append("📚 Sources")
+    output.append("-" * 50)
+    for source in insights.get('sources', []):
+        output.append(f"• {source.get('title', '')}")
+        output.append(f"  {source.get('url', '')}")
+    output.append("\n")
+
+    # Follow-up Questions
+    output.append("❓ Suggested Questions")
+    output.append("-" * 50)
+    for question in results.get('follow_up_questions', []):
+        output.append(f"• {question}")
 
     # Add main results
     if 'results' in results:
+        output.append("\n")
+        output.append("📄 Detailed Results")
+        output.append("-" * 50)
+        output.append("\n")
         for i, result in enumerate(results['results'], 1):
             if not isinstance(result, dict):
                 continue
 
+            output.append(f"### {i}. ")
             if 'url' in result:
                 title = result.get('title', 'Untitled')
+                output.append(f"[{title}]({result['url']})\n")
             if 'summary' in result:
+                output.append(f"\n{result['summary']}\n\n")
 
     # Add similar chunks if available
     if 'similar_chunks' in results:
+        output.append("\n")
+        output.append("🔗 Related Content")
+        output.append("-" * 50)
+        output.append("\n")
         for i, chunk in enumerate(results['similar_chunks'], 1):
             if not isinstance(chunk, dict):
                 continue
 
+            output.append(f"### Related {i}\n")
             if 'metadata' in chunk:
                 meta = chunk['metadata']
                 if 'title' in meta and 'url' in meta:
+                    output.append(f"From [{meta['title']}]({meta['url']})\n")
            if 'content' in chunk:
+                output.append(f"\n{chunk['content'][:200]}...\n\n")
 
-    return
+    return "\n".join(output)
 
 def create_demo():
     """Create the Gradio interface"""
@@ -100,7 +122,10 @@ def create_demo():
         )
         search_button = gr.Button("🔍 Search")
 
-        output = gr.
+        output = gr.Textbox(
+            label="Search Results",
+            lines=20
+        )
 
         search_button.click(
             fn=safe_search,
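For reference, a minimal sketch of how the new format_results renders a result set. It assumes app.py is importable as a module and uses invented sample data; the dict keys mirror the ones the function reads above (results, insights, follow_up_questions).

# Hypothetical usage sketch; the dict shape mirrors what format_results reads.
from app import format_results  # assumes app.py is on the import path

sample = {
    'results': [{
        'url': 'https://example.com/article',
        'title': 'Example Article',
        'summary': 'A short model-generated summary of the page.',
    }],
    'insights': {
        'main_summary': 'Combined summary across all processed sources.',
        'key_findings': ['First finding', 'Second finding'],
        'sources': [{'title': 'Example Article', 'url': 'https://example.com/article'}],
    },
    'follow_up_questions': ['What are the practical applications of X?'],
}

print(format_results(sample))  # sections joined by newlines, ready for the Textbox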
search_engine.py
CHANGED
@@ -44,25 +44,115 @@ class ContentProcessor:
 
     def __init__(self):
         self.model_manager = ModelManager()
+
+    def clean_text(self, text: str) -> str:
+        """Clean and normalize text content"""
+        # Remove extra whitespace
+        text = ' '.join(text.split())
+        # Remove common navigation elements
+        nav_elements = [
+            "skip to content",
+            "skip to navigation",
+            "search",
+            "menu",
+            "subscribe",
+            "sign in",
+            "log in",
+            "submit",
+            "browse",
+            "explore",
+        ]
+        for element in nav_elements:
+            text = text.replace(element.lower(), "")
+        return text.strip()
+
+    def extract_main_content(self, soup: BeautifulSoup) -> str:
+        """Extract main content from HTML soup"""
+        # Remove navigation, headers, footers, and sidebars
+        for elem in soup.find_all(['nav', 'header', 'footer', 'aside']):
+            elem.decompose()
+
+        # Remove script and style elements
+        for elem in soup.find_all(['script', 'style']):
+            elem.decompose()
+
+        # Try to find main content area
+        main_content = None
+        content_tags = ['article', 'main', '[role="main"]', '#content', '.content', '.post-content']
+
+        for tag in content_tags:
+            main_content = soup.select_one(tag)
+            if main_content:
+                break
+
+        # If no main content found, use body
+        if not main_content:
+            main_content = soup.find('body')
+
+        if main_content:
+            text = main_content.get_text(separator=' ', strip=True)
+        else:
+            text = soup.get_text(separator=' ', strip=True)
+
+        return self.clean_text(text)
 
+    def extract_key_points(self, text: str, max_points: int = 5) -> List[str]:
+        """Extract key points from text using AI"""
+        try:
+            # Split text into chunks for processing
+            chunks = [text[i:i + 1024] for i in range(0, len(text), 1024)]
+            all_points = []
+
+            for chunk in chunks[:3]:  # Process first 3 chunks to keep it manageable
+                summary = self.model_manager.models['summarizer'](
+                    chunk,
+                    max_length=100,
+                    min_length=30,
+                    do_sample=False
+                )[0]['summary_text']
+
+                # Split summary into sentences
+                points = [s.strip() for s in summary.split('.') if s.strip()]
+                all_points.extend(points)
+
+            # Return top points
+            return all_points[:max_points]
+
+        except Exception as e:
+            logger.error(f"Error extracting key points: {str(e)}")
+            return []
+
-    def
+    def process_content(self, content: str, soup: BeautifulSoup = None) -> Dict:
         """Process content and generate insights"""
         try:
+            # Extract main content if soup is provided
+            if soup:
+                content = self.extract_main_content(soup)
+            else:
+                content = self.clean_text(content)
+
+            # Extract key points
+            key_points = self.extract_key_points(content)
+
+            # Generate overall summary
             summary = self.model_manager.models['summarizer'](
                 content[:1024],
-                max_length=
-                min_length=
+                max_length=150,
+                min_length=50,
                 do_sample=False
             )[0]['summary_text']
 
             return {
                 'summary': summary,
+                'key_points': key_points,
                 'content': content
             }
+
         except Exception as e:
+            logger.error(f"Error processing content: {str(e)}")
             return {
                 'summary': f"Error processing content: {str(e)}",
+                'key_points': [],
                 'content': content
             }
 
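To see the selector-fallback strategy from extract_main_content in isolation, here is a self-contained sketch; the HTML snippet is invented for illustration, and the selector list mirrors the one added above.

# Standalone sketch of the extraction strategy: strip chrome tags, then
# fall back through likely content selectors before defaulting to <body>.
from bs4 import BeautifulSoup

html = """
<html><body>
  <nav>Menu</nav>
  <article><h1>Title</h1><p>The actual content.</p></article>
  <footer>Footer</footer>
</body></html>
"""
soup = BeautifulSoup(html, 'lxml')

# Drop navigation/chrome and non-content elements
for elem in soup.find_all(['nav', 'header', 'footer', 'aside', 'script', 'style']):
    elem.decompose()

# First matching selector wins; <body> is the last resort
main = None
for selector in ['article', 'main', '[role="main"]', '#content', '.content', '.post-content']:
    main = soup.select_one(selector)
    if main:
        break

text = (main or soup.find('body') or soup).get_text(separator=' ', strip=True)
print(text)  # -> "Title The actual content."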
@@ -139,25 +229,18 @@ class WebSearchEngine:
         response = self.safe_get(url)
         soup = BeautifulSoup(response.text, 'lxml')
 
-        for script in soup(["script", "style"]):
-            script.decompose()
-        text = soup.get_text()
-        lines = (line.strip() for line in text.splitlines())
-        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-        content = ' '.join(chunk for chunk in chunks if chunk)
+        # Process content with BeautifulSoup object
+        processed = self.processor.process_content("", soup)
 
         # Get metadata
         metadata = self.get_metadata(soup)
 
-        # Process content
-        processed = self.processor.process_content(content)
-
         return {
             'url': url,
             'title': metadata['title'],
             'description': metadata['description'],
             'summary': processed['summary'],
+            'key_points': processed['key_points'],
             'content': processed['content']
         }
 
@@ -229,26 +312,38 @@ class WebSearchEngine:
             return {'error': 'No results found'}
 
         results = []
+        all_key_points = []
+
         for result in search_results:
             if 'link' in result:
                 processed = self.process_url(result['link'])
                 if 'error' not in processed:
                     results.append(processed)
+                    if 'key_points' in processed:
+                        all_key_points.extend(processed['key_points'])
                 time.sleep(random.uniform(0.5, 1.0))
 
         if not results:
            return {'error': 'Failed to process any search results'}
+
+        # Combine all summaries and key points
+        all_summaries = [r['summary'] for r in results if 'summary' in r]
+        combined_summary = " ".join(all_summaries)
+
+        # Format insights
+        insights = {
+            'main_summary': combined_summary[:500],
+            'key_findings': list(set(all_key_points))[:7],  # Remove duplicates and limit to top 7
+            'sources': [{'title': r['title'], 'url': r['url']} for r in results]
+        }
 
         return {
             'results': results,
-            'insights':
+            'insights': insights,
             'follow_up_questions': [
-                f"What are the
-                f"
-                f"What
+                f"What are the practical applications of {query}?",
+                f"How has {query} evolved over the past year?",
+                f"What challenges remain in {query}?"
            ]
         }
 
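One note on the aggregation step in search(): list(set(all_key_points)) deduplicates but discards the original ordering, so which seven findings survive the [:7] cap is arbitrary. Below is a minimal sketch with invented data, using dict.fromkeys as an order-stable alternative.

# Stand-in for processed pages; the shape matches process_url's return value.
results = [
    {'title': 'A', 'url': 'https://a.example', 'summary': 'Alpha summary.',
     'key_points': ['point one', 'point two']},
    {'title': 'B', 'url': 'https://b.example', 'summary': 'Beta summary.',
     'key_points': ['point two', 'point three']},
]

all_key_points = [p for r in results for p in r.get('key_points', [])]
combined_summary = " ".join(r['summary'] for r in results if 'summary' in r)

insights = {
    'main_summary': combined_summary[:500],
    'key_findings': list(dict.fromkeys(all_key_points))[:7],  # order-stable dedupe
    'sources': [{'title': r['title'], 'url': r['url']} for r in results],
}
print(insights['key_findings'])  # ['point one', 'point two', 'point three']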