fikird committed on
Commit f2c01c1 · 1 Parent(s): ae8bccc

Enhance content processing with better extraction and summarization

Files changed (2)
  1. app.py +21 -46
  2. search_engine.py +75 -72
app.py CHANGED
@@ -32,72 +32,50 @@ def format_results(results):
     if not results or not results.get('results'):
         return "# ⚠️ No Results\nNo search results were found. Please try a different query."
 
-    insights = results.get('insights', {})
-    output = []
-
-    # Main Summary
-    output.append("📝 Executive Summary")
-    output.append("-" * 50)
-    output.append(insights.get('main_summary', ''))
-    output.append("\n")
-
-    # Key Findings
-    output.append("🔑 Key Findings")
-    output.append("-" * 50)
-    for i, point in enumerate(insights.get('key_findings', []), 1):
-        output.append(f"{i}. {point}")
-    output.append("\n")
-
-    # Sources
-    output.append("📚 Sources")
-    output.append("-" * 50)
-    for source in insights.get('sources', []):
-        output.append(f"• {source.get('title', '')}")
-        output.append(f" {source.get('url', '')}")
-    output.append("\n")
-
-    # Follow-up Questions
-    output.append("❓ Suggested Questions")
-    output.append("-" * 50)
-    for question in results.get('follow_up_questions', []):
-        output.append(f"• {question}")
+    formatted = f"# 🔍 Search Results\n\n"
+
+    # Add insights section
+    if 'insights' in results:
+        formatted += f"## 💡 Key Insights\n{results['insights']}\n\n"
+
+    # Add follow-up questions
+    if 'follow_up_questions' in results:
+        formatted += "## ❓ Follow-up Questions\n"
+        for q in results['follow_up_questions']:
+            if q and q.strip():
+                formatted += f"- {q.strip()}\n"
+        formatted += "\n"
 
     # Add main results
     if 'results' in results:
-        output.append("\n")
-        output.append("📄 Detailed Results")
-        output.append("-" * 50)
-        output.append("\n")
+        formatted += "## 📄 Detailed Results\n\n"
         for i, result in enumerate(results['results'], 1):
             if not isinstance(result, dict):
                 continue
 
-            output.append(f"### {i}. ")
+            formatted += f"### {i}. "
             if 'url' in result:
                 title = result.get('title', 'Untitled')
-                output.append(f"[{title}]({result['url']})\n")
+                formatted += f"[{title}]({result['url']})\n"
             if 'summary' in result:
-                output.append(f"\n{result['summary']}\n\n")
+                formatted += f"\n{result['summary']}\n\n"
 
     # Add similar chunks if available
     if 'similar_chunks' in results:
-        output.append("\n")
-        output.append("🔍 Related Content")
-        output.append("-" * 50)
-        output.append("\n")
+        formatted += "## 🔍 Related Content\n\n"
         for i, chunk in enumerate(results['similar_chunks'], 1):
             if not isinstance(chunk, dict):
                 continue
 
-            output.append(f"### Related {i}\n")
+            formatted += f"### Related {i}\n"
            if 'metadata' in chunk:
                meta = chunk['metadata']
                if 'title' in meta and 'url' in meta:
-                    output.append(f"From [{meta['title']}]({meta['url']})\n")
            if 'content' in chunk:
-                output.append(f"\n{chunk['content'][:200]}...\n\n")
+                formatted += f"From [{meta['title']}]({meta['url']})\n"
+                formatted += f"\n{chunk['content'][:200]}...\n\n"
 
-    return "\n".join(output)
+    return formatted
 
 def create_demo():
     """Create the Gradio interface"""
@@ -122,10 +100,7 @@ def create_demo():
     )
     search_button = gr.Button("🔍 Search")
 
-    output = gr.Textbox(
-        label="Search Results",
-        lines=20
-    )
+    output = gr.Markdown()
 
     search_button.click(
         fn=safe_search,
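With `format_results` returning a single markdown string and the output component switched from `gr.Textbox` to `gr.Markdown`, headings and links now render in the UI instead of appearing as raw text. A minimal sketch of the input shape the new formatter expects (run inside app.py, or after importing `format_results` from it); the keys mirror the diff, while the sample values are invented:

```python
# Hypothetical results dict; keys match what format_results reads after this commit.
sample = {
    "insights": "A short synthesized overview of the topic.",
    "follow_up_questions": ["What are the recent breakthroughs in RAG?"],
    "results": [
        {
            "url": "https://example.com/article",
            "title": "Example Article",
            "summary": "One-paragraph summary of the page.",
        }
    ],
    "similar_chunks": [
        {
            "metadata": {"title": "Example Article", "url": "https://example.com/article"},
            "content": "A related passage retrieved from the index ...",
        }
    ],
}

print(format_results(sample))
# Produces "# 🔍 Search Results", then "## 💡 Key Insights",
# "## ❓ Follow-up Questions", "## 📄 Detailed Results" and "## 🔍 Related Content".
```

Note that `insights` is now expected to be a plain string rather than the old dict of `main_summary`/`key_findings`/`sources`, matching the change in `search_engine.py` below.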
 
search_engine.py CHANGED
@@ -52,72 +52,76 @@ class ContentProcessor:
         # Remove common navigation elements
         nav_elements = [
             "skip to content",
-            "skip to navigation",
             "search",
             "menu",
+            "navigation",
             "subscribe",
             "sign in",
             "log in",
             "submit",
             "browse",
-            "explore",
         ]
         for element in nav_elements:
             text = text.replace(element.lower(), "")
         return text.strip()
 
     def extract_main_content(self, soup: BeautifulSoup) -> str:
-        """Extract main content from HTML soup"""
-        # Remove navigation, headers, footers, and sidebars
-        for elem in soup.find_all(['nav', 'header', 'footer', 'aside']):
-            elem.decompose()
-
-        # Remove script and style elements
-        for elem in soup.find_all(['script', 'style']):
-            elem.decompose()
-
-        # Try to find main content area
-        main_content = None
-        content_tags = ['article', 'main', '[role="main"]', '#content', '.content', '.post-content']
-
-        for tag in content_tags:
-            main_content = soup.select_one(tag)
-            if main_content:
-                break
-
-        # If no main content found, use body
-        if not main_content:
-            main_content = soup.find('body')
-
-        if main_content:
-            text = main_content.get_text(separator=' ', strip=True)
-        else:
-            text = soup.get_text(separator=' ', strip=True)
-
-        return self.clean_text(text)
+        """Extract main content from HTML, prioritizing article content"""
+        content = ""
+
+        # Try to find main content containers
+        priority_tags = [
+            ('article', {}),
+            ('div', {'class': ['article', 'post', 'content', 'main']}),
+            ('div', {'id': ['article', 'post', 'content', 'main']}),
+            ('main', {}),
+        ]
+
+        for tag, attrs in priority_tags:
+            elements = soup.find_all(tag, attrs)
+            if elements:
+                content = " ".join(elem.get_text(strip=True) for elem in elements)
+                if content:
+                    break
+
+        # If no main content found, try extracting paragraphs
+        if not content:
+            paragraphs = soup.find_all('p')
+            content = " ".join(p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 100)
+
+        return self.clean_text(content)
 
     def extract_key_points(self, text: str, max_points: int = 5) -> List[str]:
-        """Extract key points from text using AI"""
+        """Extract key points from text using sentence transformers"""
         try:
-            # Split text into chunks for processing
-            chunks = [text[i:i + 1024] for i in range(0, len(text), 1024)]
-            all_points = []
-
-            for chunk in chunks[:3]:  # Process first 3 chunks to keep it manageable
-                summary = self.model_manager.models['summarizer'](
-                    chunk,
-                    max_length=100,
-                    min_length=30,
-                    do_sample=False
-                )[0]['summary_text']
-
-                # Split summary into sentences
-                points = [s.strip() for s in summary.split('.') if s.strip()]
-                all_points.extend(points)
-
-            # Return top points
-            return all_points[:max_points]
+            # Split into sentences
+            sentences = [s.strip() for s in text.split('.') if len(s.strip()) > 20]
+            if not sentences:
+                return []
+
+            # Get embeddings for sentences
+            embeddings = self.model_manager.models['embeddings'].embed_documents(sentences)
+
+            # Use simple clustering to find diverse sentences
+            selected_indices = [0]  # Start with first sentence
+            for _ in range(min(max_points - 1, len(sentences) - 1)):
+                # Find sentence most different from selected ones
+                max_diff = -1
+                max_idx = -1
+                for i in range(len(sentences)):
+                    if i not in selected_indices:
+                        # Calculate average difference from selected sentences
+                        diffs = [sum((embeddings[i][j] - embeddings[k][j])**2
+                                     for j in range(len(embeddings[i])))
+                                 for k in selected_indices]
+                        avg_diff = sum(diffs) / len(diffs)
+                        if avg_diff > max_diff:
+                            max_diff = avg_diff
+                            max_idx = i
+                if max_idx != -1:
+                    selected_indices.append(max_idx)
+
+            return [sentences[i] for i in selected_indices]
         except Exception as e:
             logger.error(f"Error extracting key points: {str(e)}")
             return []
@@ -125,16 +129,13 @@ class ContentProcessor:
     def process_content(self, content: str, soup: BeautifulSoup = None) -> Dict:
         """Process content and generate insights"""
         try:
-            # Extract main content if soup is provided
+            # Extract main content if HTML is available
             if soup:
                 content = self.extract_main_content(soup)
             else:
                 content = self.clean_text(content)
-
-            # Extract key points
-            key_points = self.extract_key_points(content)
 
-            # Generate overall summary
+            # Generate summary
             summary = self.model_manager.models['summarizer'](
                 content[:1024],
                 max_length=150,
@@ -142,18 +143,19 @@ class ContentProcessor:
                 do_sample=False
             )[0]['summary_text']
 
+            # Extract key points
+            key_points = self.extract_key_points(content)
+
             return {
                 'summary': summary,
-                'key_points': key_points,
-                'content': content
+                'content': content,
+                'key_points': key_points
             }
-
         except Exception as e:
-            logger.error(f"Error processing content: {str(e)}")
             return {
                 'summary': f"Error processing content: {str(e)}",
-                'key_points': [],
-                'content': content
+                'content': content,
+                'key_points': []
             }
 
 class WebSearchEngine:
@@ -229,12 +231,12 @@ class WebSearchEngine:
             response = self.safe_get(url)
             soup = BeautifulSoup(response.text, 'lxml')
 
-            # Process content with BeautifulSoup object
-            processed = self.processor.process_content("", soup)
-
             # Get metadata
             metadata = self.get_metadata(soup)
 
+            # Process content
+            processed = self.processor.process_content("", soup=soup)
+
             return {
                 'url': url,
                 'title': metadata['title'],
@@ -326,24 +328,25 @@ class WebSearchEngine:
             if not results:
                 return {'error': 'Failed to process any search results'}
 
-            # Combine all summaries and key points
-            all_summaries = [r['summary'] for r in results if 'summary' in r]
-            combined_summary = " ".join(all_summaries)
+            # Combine insights from all results
+            combined_summary = " ".join([r['summary'] for r in results if 'summary' in r])
 
-            # Format insights
-            insights = {
-                'main_summary': combined_summary[:500],
-                'key_findings': list(set(all_key_points))[:7],  # Remove duplicates and limit to top 7
-                'sources': [{'title': r['title'], 'url': r['url']} for r in results]
-            }
+            # Generate overall insights
+            insights = self.processor.model_manager.models['summarizer'](
+                combined_summary,
+                max_length=200,
+                min_length=100,
+                do_sample=False
+            )[0]['summary_text']
 
             return {
                 'results': results,
                 'insights': insights,
+                'key_points': all_key_points[:10],  # Top 10 key points
                 'follow_up_questions': [
-                    f"What are the practical applications of {query}?",
-                    f"How has {query} evolved over the past year?",
-                    f"What challenges remain in {query}?"
+                    f"What are the recent breakthroughs in {query}?",
+                    f"How does {query} impact various industries?",
+                    f"What are the future prospects of {query}?"
                 ]
             }
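A note on the rewritten `extract_main_content`: it drops the old decompose-and-select approach in favor of checking likely content containers in priority order, falling back to concatenating long `<p>` elements. The lookup can be exercised standalone; a sketch against an invented HTML snippet, assuming only `beautifulsoup4` and `lxml`, both already used by `search_engine.py`:

```python
from bs4 import BeautifulSoup

html = """
<html><body>
  <nav>menu</nav>
  <div class="content"><p>Main body text of the page.</p></div>
  <p>A stray paragraph outside any recognized container.</p>
</body></html>
"""
soup = BeautifulSoup(html, "lxml")

# Same priority order as the diff: <article>, then divs by class/id, then <main>.
priority_tags = [
    ("article", {}),
    ("div", {"class": ["article", "post", "content", "main"]}),
    ("div", {"id": ["article", "post", "content", "main"]}),
    ("main", {}),
]

content = ""
for tag, attrs in priority_tags:
    elements = soup.find_all(tag, attrs)
    if elements:
        content = " ".join(el.get_text(strip=True) for el in elements)
        if content:
            break

print(content)  # "Main body text of the page."
```

One trade-off worth noting: unlike the removed version, nothing decomposes `<script>` or `<style>` first, so a matched container that embeds scripts will leak that text into `content`.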
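In `extract_key_points`, the chunk-summarization loop gives way to embedding-based selection: start from the first sentence, then greedily add the sentence with the largest average squared distance to those already picked (a diversity heuristic rather than true clustering, despite the comment). The pure-Python inner loops are O(n²·d); an equivalent vectorized sketch with numpy, using invented 2-D embeddings in place of the real ones from `model_manager.models['embeddings']`:

```python
import numpy as np

def select_diverse(embeddings: np.ndarray, max_points: int = 5) -> list:
    """Greedy pick: start at index 0, then repeatedly take the sentence whose
    average squared distance to the already-selected set is largest."""
    selected = [0]
    for _ in range(min(max_points - 1, len(embeddings) - 1)):
        # Squared distances from every sentence to each selected one: shape (n, k)
        diffs = ((embeddings[:, None, :] - embeddings[selected][None, :, :]) ** 2).sum(-1)
        avg = diffs.mean(axis=1)
        avg[selected] = -np.inf  # never re-pick an already-selected sentence
        selected.append(int(avg.argmax()))
    return selected

# Toy check with invented 2-D "embeddings":
emb = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [10.0, 0.0]])
print(select_diverse(emb, max_points=3))  # [0, 3, 2]
```

Masking already-selected rows with `-inf` before `argmax` is equivalent to the diff's `if i not in selected_indices` guard.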
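Finally, `search` now builds `insights` by running the concatenated per-page summaries through the summarizer a second time, so it comes back as a plain string, which is exactly what the new `format_results` renders under "Key Insights". A sketch of that second pass, assuming a `transformers` summarization pipeline comparable to whatever `model_manager.models['summarizer']` holds (the checkpoint name is an assumption):

```python
from transformers import pipeline

# Stand-in for model_manager.models['summarizer']; the checkpoint is assumed.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

page_summaries = [
    "First page summary ...",   # invented; real entries are a paragraph each
    "Second page summary ...",
]
combined = " ".join(page_summaries)

# Same call shape as the diff. The diff passes combined_summary untruncated,
# which can exceed the model's input window; truncation=True guards against that.
insights = summarizer(
    combined,
    max_length=200,
    min_length=100,
    do_sample=False,
    truncation=True,
)[0]["summary_text"]
print(insights)
```

Note that `key_points` in the returned dict relies on `all_key_points` being accumulated in the unchanged lines above this hunk.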